Chimeras / Scripts / Compare_Method1_Method2 / compare_filter_fastqs_vs_filter_R_methods_ddh2oA.Rmd
compare_filter_fastqs_vs_filter_R_methods_ddh2oA.Rmd
Raw
---
title: "Compare method 1 (awk, now with uniq) and method 2 (subset in R)"
author: "Nadine Bestard"
date: "17/05/2022"
output:
  html_document:
    code_folding: hide
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

Method 1 old: the awk

Method 1: the awk filtering duplicates 

Method 2: subset in R


```{r libs, message=FALSE, warning=FALSE}
library(dplyr)
library(scuttle)
library(here)
library(ggplot2)
```


```{r load}
# load meth2
source(here("Scripts/debug_separation/import_and_filter_ddH20_A.R"))
ddH2O_meth2 <- ddH2O
rm(ddH2O)
#load meth1 old
ddH2O_old_meth1 <- readRDS(here("outs/filter_70/ddH2O_A_UMI_test/ddH2O_A_initial_srt.rds"))
#load updated meth1
counts <- Read10X(here("outs", "filter_70", "CellRanger-onlyCB-uniq-reads", "human", "19880WApool01__ddH2O_A_S3", "outs", "raw_feature_bc_matrix"))
ddH2O_meth1 <- CreateSeuratObject(counts = counts, project = "ddH2O_old", min.cells = 3, min.features = 200)
ddH2O_list <- list(meth1=ddH2O_meth1,old_meth1=ddH2O_old_meth1,meth2=ddH2O_meth2)
```

## General dimensions the 3 methods

```{r explore}
print("dimensions")
# Compare some metrics
lapply(ddH2O_list,function(X)dim(X))
#$meth1
#[1] 15768   717
#
#$old_meth1
#[1] 15769   717
#
#$meth2
#[1] 17995   717

print("sum umis")
lapply(ddH2O_list,function(X)sum(X@meta.data$nCount_RNA))



print("sum genes")
lapply(ddH2O_list,function(X)sum(X@meta.data$nFeature_RNA))
```


## Compare the old method 1 with method 2 (subset in R)

```{r compare-old}
# compare cells
df_compare <- (ddH2O_meth2@meta.data)
df_compare$nFeature_RNA_old_meth1 <- ddH2O_old_meth1@meta.data$nFeature_RNA
df_compare$nCount_RNA_old_meth1 <- (ddH2O_old_meth1@meta.data$nCount_RNA)


# plot cells
plot(df_compare$nCount_RNA_old_meth1, df_compare$nCount_RNA)
plot(df_compare$nFeature_RNA_old_meth1, df_compare$nFeature_RNA)

# compare genes
df_compare_genes <- as.data.frame(rowSums(ddH2O_old_meth1))
df_compare_genes<- merge(as.data.frame(rowSums(ddH2O_meth2)), df_compare_genes, by="row.names")
colnames(df_compare_genes) <- c("genes", "meth2", "old_meth1")
# add ratio between old and new method
df_compare_genes$ratio <- df_compare_genes$old_meth1/df_compare_genes$meth2
# add number cells afected
number_cells <- data.frame(num_cells_old = rowSums(GetAssayData(ddH2O_old_meth1, "counts")>0) )
number_cells$genes <- row.names(number_cells)

df_compare_genes <- merge(number_cells, df_compare_genes, by="genes")


# plot genes
plot(df_compare_genes$meth2, df_compare_genes$old_meth1, main = "Cells") 

# plot the ratio
hist(df_compare_genes$ratio, breaks=seq(0,10,0.2))
plot(df_compare_genes$ratio)
# with ggplot (to set the base if I need to modify)
df_compare_genes %>% arrange(ratio) %>% 
  mutate(genes=factor(genes, levels=genes)) %>%   # This trick update the factor levels
  ggplot(aes(y=ratio, x=genes)) + geom_point() + scale_y_continuous(breaks=seq(0,10,0.5)) + ggtitle( "gene counts old_meth1 / meth2")


is_outlier_gene_ratio <- isOutlier( as.numeric(df_compare_genes$ratio))
genes_outlier <- df_compare_genes[is_outlier_gene_ratio,]
# see the threshold
threshold <- attr(is_outlier_gene_ratio, "thresholds")

# how many would be more than 1.1?
sum(df_compare_genes$ratio>1.1)
#[1] 1693 # this is between old meth1 and 2 
# for the difference new 1 and 2 it's only 24

#exclude most expressed gene
df_compare_genes <- df_compare_genes[!(df_compare_genes$genes == "MALAT1"),]
# plot genes again
plot(df_compare_genes$old_meth1, df_compare_genes$meth2, main="delete the most expressed gene to see better the correlation")
```

any gene where the countoldmeth1/countmeth2 is less than `r threshold[1]` or more than `r threshold[1]` is considered outlier.
This is a total of `r dim(genes_outlier)[1]` genes.

## Compare method 1 ( not duplicates) with method 2 (subset in R)

```{r compare}
# compare cells
df_compare <- (ddH2O_meth2@meta.data)
df_compare$nFeature_RNA_meth1 <- ddH2O_meth1@meta.data$nFeature_RNA
df_compare$nCount_RNA_meth1 <- (ddH2O_meth1@meta.data$nCount_RNA)


# plot cells
plot(df_compare$nCount_RNA_meth1, df_compare$nCount_RNA)
plot(df_compare$nFeature_RNA_meth1, df_compare$nFeature_RNA)

# compare genes
df_compare_genes <- as.data.frame(rowSums(ddH2O_meth1))
df_compare_genes<- merge(as.data.frame(rowSums(ddH2O_meth2)), df_compare_genes, by="row.names")
colnames(df_compare_genes) <- c("genes", "meth2", "meth1")
# add ratio between old and new method
df_compare_genes$ratio <- df_compare_genes$meth2/df_compare_genes$meth1
# add number cells afected
number_cells <- data.frame(num_cells_old = rowSums(GetAssayData(ddH2O_meth1, "counts")>0) )
number_cells$genes <- row.names(number_cells)

df_compare_genes <- merge(number_cells, df_compare_genes, by="genes")


# plot genes
plot(df_compare_genes$meth2, df_compare_genes$meth1, main = "Cells") 

# plot the ratio
hist(df_compare_genes$ratio, breaks=seq(0,10,0.2))
plot(df_compare_genes$ratio)
# with ggplot (to set the base if I need to modify)
df_compare_genes %>% arrange(ratio) %>% 
  mutate(genes=factor(genes, levels=genes)) %>%   # This trick update the factor levels
  ggplot(aes(y=ratio, x=genes)) + geom_point() + scale_y_continuous(breaks=seq(0,10,0.5)) + ggtitle( "gene counts meth2 / meth1")


is_outlier_gene_ratio <- isOutlier( as.numeric(df_compare_genes$ratio))
genes_outlier <- df_compare_genes[is_outlier_gene_ratio,]
# see the threshold
threshold <- attr(is_outlier_gene_ratio, "thresholds")

# how many would be more than 1.1?
sum(df_compare_genes$ratio>1.1)
#[1] 1693 # this is between old meth1 and 2 
# for the difference new 1 and 2 it's only 24



# top genes overeepresented in old
df_compare_genes <- arrange(df_compare_genes, ratio )
genes_top <- tail(df_compare_genes,100)

# save)
write.csv(genes_outlier, here("outs/filter_70/ddH2O_A_UMI_test/genes_outlier_meth1vsmeth2.csv"))
write.csv(genes_top, here("outs/filter_70/ddH2O_A_UMI_test/genes_top_meth1vsmeth2.csv"))

#exclude it
df_compare_genes <- df_compare_genes[!(df_compare_genes$genes == "MALAT1"),]
# plot genes again
plot(df_compare_genes$meth1, df_compare_genes$meth2, main="delete the most expressed gene to see better the correlation")
```

any gene where the countmeth1/countmeth2 is less than `r threshold[1]` or more than `r threshold[1]` is considered outlier.
This is a total of `r dim(genes_outlier)[1]` genes.

#### Genes with more than 1.1. ratio

```{r plot-genes}
# plot all genes with more than 1.1 ratio
genes_1.1 <- df_compare_genes$genes[df_compare_genes$ratio>1.1]
for(gene in genes_1.1){
meth1 <- colSums(GetAssayData(ddH2O_meth1[row.names(ddH2O_meth1)==gene,], slot = "counts"))

meth2 <- colSums(GetAssayData(ddH2O_meth2[row.names(ddH2O_meth2)==gene,], slot = "counts"))

ratio <- df_compare_genes$ratio[df_compare_genes$genes == gene]
num_cells_old <- df_compare_genes$num_cells_old[df_compare_genes$genes == gene]
print(plot(meth1, meth2 , main = paste(gene, ", ratio meth2/meth1 =", round(ratio,2), ", num cells =", num_cells_old)))

}

```




## Compare method 1 ( not duplicates) with old method 1 

```{r compare-2}
# compare cells
df_compare <- (ddH2O_old_meth1@meta.data)
df_compare$nFeature_RNA_meth1 <- ddH2O_meth1@meta.data$nFeature_RNA
df_compare$nCount_RNA_meth1 <- (ddH2O_meth1@meta.data$nCount_RNA)


# plot cells
plot(df_compare$nCount_RNA_meth1, df_compare$nCount_RNA)
plot(df_compare$nFeature_RNA_meth1, df_compare$nFeature_RNA)

# compare genes
df_compare_genes <- as.data.frame(rowSums(ddH2O_meth1))
df_compare_genes<- merge(as.data.frame(rowSums(ddH2O_old_meth1)), df_compare_genes, by="row.names")
colnames(df_compare_genes) <- c("genes", "old_meth1", "meth1")
# add ratio between old and new method
df_compare_genes$ratio <- df_compare_genes$old_meth1/df_compare_genes$meth1
# add number cells afected
number_cells <- data.frame(num_cells_old = rowSums(GetAssayData(ddH2O_meth1, "counts")>0) )
number_cells$genes <- row.names(number_cells)

df_compare_genes <- merge(number_cells, df_compare_genes, by="genes")


# plot genes
plot(df_compare_genes$old_meth1, df_compare_genes$meth1) 

# plot the ratio
hist(df_compare_genes$ratio, breaks=seq(0,10,0.2))
#plot(df_compare_genes$ratio)
# with ggplot (to set the base if I need to modify)
df_compare_genes %>% arrange(ratio) %>% 
  mutate(genes=factor(genes, levels=genes)) %>%   # This trick update the factor levels
  ggplot(aes(y=ratio, x=genes)) + geom_point() + scale_y_continuous(breaks=seq(0,10,0.5)) + ggtitle( "gene counts old_meth1 / meth1")


is_outlier_gene_ratio <- isOutlier( as.numeric(df_compare_genes$ratio))
genes_outlier <- df_compare_genes[is_outlier_gene_ratio,]
# see the threshold
threshold <- attr(is_outlier_gene_ratio, "thresholds")
(threshold)
# how many would be more than 1.1?
sum(df_compare_genes$ratio>1.1)
#[1] 1693 # this is between old meth1 and meth2 
# for the difference newmeth1 1 and meth22 it's only 24
# meth1 vs old meth2 it's 1699



# top genes overeepresented in old
df_compare_genes <- arrange(df_compare_genes, ratio )
genes_top <- tail(df_compare_genes,100)

# save
write.csv(genes_outlier, here("outs/filter_70/ddH2O_A_UMI_test/genes_outlier_meth1vsmeth1_old.csv"))
write.csv(genes_top, here("outs/filter_70/ddH2O_A_UMI_test/genes_top_meth1vsmeth1_old.csv"))

# most expressed gene
tail(arrange(df_compare_genes, meth1),1)
#exclude it
df_compare_genes <- df_compare_genes[!(df_compare_genes$genes == "MALAT1"),]
# plot genes again
plot(df_compare_genes$meth1, df_compare_genes$old_meth1, main="delete the most expressed gene to see better the correlation")



```

any gene where the countmeth1/countmeth2 is less than `r threshold[1]` or more than `r threshold[1]` is considered outlier.
This is a total of `r dim(genes_outlier)[1]` genes.