Chimeras / Scripts / Compare_Method1_Method2 / compare_filter_fastqs_vs_filter_R_methods.Rmd
compare_filter_fastqs_vs_filter_R_methods.Rmd
Raw
---
title: "compare filtering methods"
author: "Nadine Bestard"
date: "17/05/2022"
output:
  html_document:
    code_folding: hide
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

Method 1 old: the awk

Method 1: the awk filtering duplicates 

Method 2: subset in R


```{r libs, message=FALSE, warning=FALSE}
library(dplyr)
library(scuttle)
library(here)
library(ggplot2)
library(Seurat)
```

### Load objects 

These are preprocessed objects. Results looked weird, 
As we do not delete the same number of cells for all the objects, 
non of the ratios is 1 anymore. 

TODO: redo from scrath, only filter genes, not cells.  
TODDO: make sure the cells with 0 expression are out of meth2

```{r load, warning=FALSE}
meth1_old_sce <- readRDS(here("data", "Human_R_Analysis_with_duplicates", "Metformin_ddH2O", "met_human_sce_norm.rds"))

meth1_sce <- readRDS(here("data", "Human_R_Analysis", "Metformin_ddH2O",  "met_human_sce_norm.rds"))
#meth1_sce <- as.Seurat(meth1_sce)

meth2_sce <- readRDS(here("data", "Human_R_Analysis", "Metformin_ddH2O", "CellRanger-combined",  "met_human_sce_QC_chimeras.rds"))
#meth2_sce <- as.Seurat(meth2_sce)


```


```{r modifysrt2}
barcodes <- colnames(meth2_sce)

library(stringr)
barcodes <- str_sub(barcodes,1,nchar(barcodes)-2)
meth2_sce$Barcode <- barcodes

```


## Dimensions and umis
```{r}
print("dimensions meth1, meth1_old and meth2")
dim(meth1_sce)
dim(meth1_old_sce)
dim(meth2_sce)

print("sum umis meth1, meth1_old and meth2")
sum(meth1_sce$total)
sum(meth1_old_sce$total)
sum(meth2_sce$total)

print("sum genes meth1, meth1_old and meth2")
sum(meth1_sce$detected)
sum(meth1_old_sce$detected)
sum(meth2_sce$detected)
```




## Compare the old method 1 with method 2 (subset in R)

```{r compare-old}
# compare cells

orig_comp_df  <- colData(meth1_old_sce)
second_comp_df <- colData(meth2_sce)
df_compare <- merge(orig_comp_df, second_comp_df, by = "Barcode", suffixes = c("_meth1_old","_meth2"))


#df_compare <- (meth2_sce)
#df_compare$detected_meth1 <- meth1_sce$detected
#df_compare$total_meth1 <- (meth1_sce$total)


# plot cells
plot(df_compare$total_meth1_old, df_compare$total_meth2, main = "Cells")
plot(df_compare$detected_meth1_old, df_compare$detected_meth2, main = "Cells")

# compare genes
df_compare_genes <- as.data.frame(rowSums(counts(meth1_old_sce)))
df_compare_genes<- merge(as.data.frame(rowSums(counts(meth2_sce))), df_compare_genes, by="row.names")
colnames(df_compare_genes) <- c("genes", "meth2", "meth1_old")
# add ratio between old and new method
df_compare_genes$ratio <- df_compare_genes$meth1_old/df_compare_genes$meth2
# add number cells afected
number_cells <- data.frame(num_cells_old = rowSums(counts(meth1_old_sce)>0) )
number_cells$genes <- row.names(number_cells)

df_compare_genes <- merge(number_cells, df_compare_genes, by="genes")


# plot genes
plot(df_compare_genes$meth2, df_compare_genes$meth1_old, main = "Genes") 
#exclude malat
df_compare_genes <- df_compare_genes[!(df_compare_genes$genes == "MALAT1"),]
# plot genes again
plot(df_compare_genes$meth2, df_compare_genes$meth1, main="delete the most expressed gene to see better the correlation", ylim=c(0,80000), xlim=c(0,80000))


# plot the ratio
hist(df_compare_genes$ratio, breaks=seq(0,45,0.2))
plot(df_compare_genes$ratio)
# with ggplot (to set the base if I need to modify)
df_compare_genes %>% arrange(ratio) %>% 
  mutate(genes=factor(genes, levels=genes)) %>%   # This trick update the factor levels
  ggplot(aes(y=ratio, x=genes)) + geom_point() + scale_y_continuous(breaks=seq(0,45,0.5)) + ggtitle( "gene counts meth1_old / meth2")


is_outlier_gene_ratio <- isOutlier( as.numeric(df_compare_genes$ratio))
genes_outlier <- df_compare_genes[is_outlier_gene_ratio,]
# see the threshold
threshold <- attr(is_outlier_gene_ratio, "thresholds")

# how many would be more than 1.1?
sum(na.omit(df_compare_genes$ratio)>1.1)
#[1] [1] 3928




# top genes overeepresented in old
df_compare_genes <- arrange(df_compare_genes, ratio )
genes_top <- tail(df_compare_genes,100)

# save)
write.csv(genes_outlier, here("outs/debug/genes_outlier_meth1_oldvsmeth2.csv"))
write.csv(genes_top, here("outs/debug/genes_top_meth1_oldvsmeth2.csv"))


```


any gene where the countoldmeth1/countmeth2 is less than `r threshold[1]` or more than `r threshold[1]` is considered outlier.
This is a total of `r dim(genes_outlier)[1]` genes.

## Compare method 1 ( not duplicates) with method 2 (subset in R)

```{r compare}

# compare cells
meth1_comp_df  <- colData(meth1_sce)
second_comp_df <- colData(meth2_sce)
df_compare <- merge(meth1_comp_df, second_comp_df, by = "Barcode", suffixes = c("_meth1","_meth2"))

# plot cells
plot(df_compare$total_meth1, df_compare$total_meth2, main = "Cells")
plot(df_compare$detected_meth1, df_compare$detected_meth2, main = "Cells")

# compare genes
df_compare_genes <- as.data.frame(rowSums(counts(meth1_sce)))
df_compare_genes<- merge(as.data.frame(rowSums(counts(meth2_sce))), df_compare_genes, by="row.names")
colnames(df_compare_genes) <- c("genes", "meth2", "meth1")
# add ratio between old and new method
df_compare_genes$ratio <- df_compare_genes$meth1/df_compare_genes$meth2
# add number cells afected
number_cells <- data.frame(num_cells_old = rowSums(counts(meth1_sce)>0) )
number_cells$genes <- row.names(number_cells)

df_compare_genes <- merge(number_cells, df_compare_genes, by="genes")


# plot genes
plot(df_compare_genes$meth2, df_compare_genes$meth1, main = "Genes") 
#exclude malat
df_compare_genes <- df_compare_genes[!(df_compare_genes$genes == "MALAT1"),]
# plot genes again
plot(df_compare_genes$meth2, df_compare_genes$meth1, main="delete the most expressed gene to see better the correlation", ylim=c(0,80000), xlim=c(0,80000))


# plot the ratio
hist(df_compare_genes$ratio, breaks=seq(0,45,0.2))
plot(df_compare_genes$ratio)
# with ggplot (to set the base if I need to modify)
df_compare_genes %>% arrange(ratio) %>% 
  mutate(genes=factor(genes, levels=genes)) %>%   # This trick update the factor levels
  ggplot(aes(y=ratio, x=genes)) + geom_point() + scale_y_continuous(breaks=seq(0,45,0.5)) + ggtitle( "gene counts meth1 / meth2")


is_outlier_gene_ratio <- isOutlier( as.numeric(df_compare_genes$ratio))
genes_outlier <- df_compare_genes[is_outlier_gene_ratio,]
# see the threshold
threshold <- attr(is_outlier_gene_ratio, "thresholds")

# how many would be more than 1.1?
sum(na.omit(df_compare_genes$ratio)>1.1)
#[1] [1] 3928




# top genes overeepresented in old
df_compare_genes <- arrange(df_compare_genes, ratio )
genes_top <- tail(df_compare_genes,100)

# save)
write.csv(genes_outlier, here("outs/debug/genes_outlier_meth1_oldvsmeth2.csv"))
write.csv(genes_top, here("outs/debug/genes_top_meth1_oldvsmeth2.csv"))
```

any gene where the countmeth1/countmeth2 is less than `r threshold[1]` or more than `r threshold[1]` is considered outlier.
This is a total of `r dim(genes_outlier)[1]` genes.

#### Genes with more than 1.1. ratio

<details>

<summary>

Click to expand

</summary>

```{r plot-genes, eval=FALSE}
# plot all genes with more than 1.1 ratio
genes_1.1 <- df_compare_genes$genes[df_compare_genes$ratio>1.1]
for(gene in genes_1.1){
meth1 <- colSums(GetAssayData(meth1_sce[row.names(meth1_sce)==gene,], slot = "counts"))

meth2 <- colSums(GetAssayData(meth2_sce[row.names(meth2_sce)==gene,], slot = "counts"))

ratio <- df_compare_genes$ratio[df_compare_genes$genes == gene]
num_cells_old <- df_compare_genes$num_cells_old[df_compare_genes$genes == gene]
print(plot(meth1, meth2 , main = paste(gene, ", ratio meth2/meth1 =", round(ratio,2), ", num cells =", num_cells_old)))

}

```



</details>

## Compare method 1 ( not duplicates) with old method 1 

```{r compare-2, eval=FALSE}

# compare cells
meth1_old_comp_df  <- colData(meth1_old_sce)
meth1_comp_df <- colData(meth1_sce)
df_compare <- merge(meth1_old_comp_df, meth1_comp_df, by = "Barcode", suffixes = c("_meth1","_meth2"))


# plot cells
plot(df_compare$total_meth1, df_compare$total)
plot(df_compare$detected_meth1, df_compare$detected)

# compare genes
df_compare_genes <- as.data.frame(rowSums(counts(meth1_sce)))
df_compare_genes<- merge(as.data.frame(rowSums(counts(meth1_old_sce))), df_compare_genes, by="row.names")
colnames(df_compare_genes) <- c("genes", "old_meth1", "meth1")
# add ratio between old and new method
df_compare_genes$ratio <- df_compare_genes$old_meth1/df_compare_genes$meth1
# add number cells afected
number_cells <- data.frame(num_cells_old = rowSums(GetAssayData(meth1_sce, "counts")>0) )
number_cells$genes <- row.names(number_cells)

df_compare_genes <- merge(number_cells, df_compare_genes, by="genes")


# plot genes
plot(df_compare_genes$old_meth1, df_compare_genes$meth1) 

# plot the ratio
hist(df_compare_genes$ratio, breaks=seq(0,45,0.2))
#plot(df_compare_genes$ratio)
# with ggplot (to set the base if I need to modify)
df_compare_genes %>% arrange(ratio) %>% 
  mutate(genes=factor(genes, levels=genes)) %>%   # This trick update the factor levels
  ggplot(aes(y=ratio, x=genes)) + geom_point() + scale_y_continuous(breaks=seq(0,10,0.5)) + ggtitle( "gene counts old_meth1 / meth1")


is_outlier_gene_ratio <- isOutlier( as.numeric(df_compare_genes$ratio))
genes_outlier <- df_compare_genes[is_outlier_gene_ratio,]
# see the threshold
threshold <- attr(is_outlier_gene_ratio, "thresholds")
(threshold)
# how many would be more than 1.1?
sum(df_compare_genes$ratio>1.1)
#[1] 1693 # this is between old meth1 and meth2 
# for the difference newmeth1 1 and meth22 it's only 24
# meth1 vs old meth2 it's 1699



# top genes overeepresented in old
df_compare_genes <- arrange(df_compare_genes, ratio )
genes_top <- tail(df_compare_genes,100)

# save
write.csv(genes_outlier, here("outs/debug/genes_outlier_meth1vsmeth1_old.csv"))
write.csv(genes_top, here("outs/debug/genes_top_meth1vsmeth1_old.csv"))

# most expressed gene
tail(arrange(df_compare_genes, meth1),1)
#exclude it
df_compare_genes <- df_compare_genes[!(df_compare_genes$genes == "MALAT1"),]
# plot genes again
plot(df_compare_genes$meth1, df_compare_genes$old_meth1, main="delete the most expressed gene to see better the correlation")
```


<!-- any gene where the countmeth1/countmeth2 is less than `r threshold[1]` or more than `r threshold[1]` is considered outlier. -->
<!-- This is a total of `r dim(genes_outlier)[1]` genes. -->