--- title: "compare filtering methods" author: "Nadine Bestard" date: "17/05/2022" output: html_document: code_folding: hide --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` Method 1 old: the awk Method 1: the awk filtering duplicates Method 2: subset in R ```{r libs, message=FALSE, warning=FALSE} library(dplyr) library(scuttle) library(here) library(ggplot2) library(Seurat) ``` ### Load objects These are preprocessed objects. Results looked weird, As we do not delete the same number of cells for all the objects, non of the ratios is 1 anymore. TODO: redo from scrath, only filter genes, not cells. TODDO: make sure the cells with 0 expression are out of meth2 ```{r load, warning=FALSE} meth1_old_sce <- readRDS(here("data", "Human_R_Analysis_with_duplicates", "Metformin_ddH2O", "met_human_sce_norm.rds")) meth1_sce <- readRDS(here("data", "Human_R_Analysis", "Metformin_ddH2O", "met_human_sce_norm.rds")) #meth1_sce <- as.Seurat(meth1_sce) meth2_sce <- readRDS(here("data", "Human_R_Analysis", "Metformin_ddH2O", "CellRanger-combined", "met_human_sce_QC_chimeras.rds")) #meth2_sce <- as.Seurat(meth2_sce) ``` ```{r modifysrt2} barcodes <- colnames(meth2_sce) library(stringr) barcodes <- str_sub(barcodes,1,nchar(barcodes)-2) meth2_sce$Barcode <- barcodes ``` ## Dimensions and umis ```{r} print("dimensions meth1, meth1_old and meth2") dim(meth1_sce) dim(meth1_old_sce) dim(meth2_sce) print("sum umis meth1, meth1_old and meth2") sum(meth1_sce$total) sum(meth1_old_sce$total) sum(meth2_sce$total) print("sum genes meth1, meth1_old and meth2") sum(meth1_sce$detected) sum(meth1_old_sce$detected) sum(meth2_sce$detected) ``` ## Compare the old method 1 with method 2 (subset in R) ```{r compare-old} # compare cells orig_comp_df <- colData(meth1_old_sce) second_comp_df <- colData(meth2_sce) df_compare <- merge(orig_comp_df, second_comp_df, by = "Barcode", suffixes = c("_meth1_old","_meth2")) #df_compare <- (meth2_sce) #df_compare$detected_meth1 <- meth1_sce$detected #df_compare$total_meth1 <- (meth1_sce$total) # plot cells plot(df_compare$total_meth1_old, df_compare$total_meth2, main = "Cells") plot(df_compare$detected_meth1_old, df_compare$detected_meth2, main = "Cells") # compare genes df_compare_genes <- as.data.frame(rowSums(counts(meth1_old_sce))) df_compare_genes<- merge(as.data.frame(rowSums(counts(meth2_sce))), df_compare_genes, by="row.names") colnames(df_compare_genes) <- c("genes", "meth2", "meth1_old") # add ratio between old and new method df_compare_genes$ratio <- df_compare_genes$meth1_old/df_compare_genes$meth2 # add number cells afected number_cells <- data.frame(num_cells_old = rowSums(counts(meth1_old_sce)>0) ) number_cells$genes <- row.names(number_cells) df_compare_genes <- merge(number_cells, df_compare_genes, by="genes") # plot genes plot(df_compare_genes$meth2, df_compare_genes$meth1_old, main = "Genes") #exclude malat df_compare_genes <- df_compare_genes[!(df_compare_genes$genes == "MALAT1"),] # plot genes again plot(df_compare_genes$meth2, df_compare_genes$meth1, main="delete the most expressed gene to see better the correlation", ylim=c(0,80000), xlim=c(0,80000)) # plot the ratio hist(df_compare_genes$ratio, breaks=seq(0,45,0.2)) plot(df_compare_genes$ratio) # with ggplot (to set the base if I need to modify) df_compare_genes %>% arrange(ratio) %>% mutate(genes=factor(genes, levels=genes)) %>% # This trick update the factor levels ggplot(aes(y=ratio, x=genes)) + geom_point() + scale_y_continuous(breaks=seq(0,45,0.5)) + ggtitle( "gene counts meth1_old / meth2") is_outlier_gene_ratio <- isOutlier( as.numeric(df_compare_genes$ratio)) genes_outlier <- df_compare_genes[is_outlier_gene_ratio,] # see the threshold threshold <- attr(is_outlier_gene_ratio, "thresholds") # how many would be more than 1.1? sum(na.omit(df_compare_genes$ratio)>1.1) #[1] [1] 3928 # top genes overeepresented in old df_compare_genes <- arrange(df_compare_genes, ratio ) genes_top <- tail(df_compare_genes,100) # save) write.csv(genes_outlier, here("outs/debug/genes_outlier_meth1_oldvsmeth2.csv")) write.csv(genes_top, here("outs/debug/genes_top_meth1_oldvsmeth2.csv")) ``` any gene where the countoldmeth1/countmeth2 is less than `r threshold[1]` or more than `r threshold[1]` is considered outlier. This is a total of `r dim(genes_outlier)[1]` genes. ## Compare method 1 ( not duplicates) with method 2 (subset in R) ```{r compare} # compare cells meth1_comp_df <- colData(meth1_sce) second_comp_df <- colData(meth2_sce) df_compare <- merge(meth1_comp_df, second_comp_df, by = "Barcode", suffixes = c("_meth1","_meth2")) # plot cells plot(df_compare$total_meth1, df_compare$total_meth2, main = "Cells") plot(df_compare$detected_meth1, df_compare$detected_meth2, main = "Cells") # compare genes df_compare_genes <- as.data.frame(rowSums(counts(meth1_sce))) df_compare_genes<- merge(as.data.frame(rowSums(counts(meth2_sce))), df_compare_genes, by="row.names") colnames(df_compare_genes) <- c("genes", "meth2", "meth1") # add ratio between old and new method df_compare_genes$ratio <- df_compare_genes$meth1/df_compare_genes$meth2 # add number cells afected number_cells <- data.frame(num_cells_old = rowSums(counts(meth1_sce)>0) ) number_cells$genes <- row.names(number_cells) df_compare_genes <- merge(number_cells, df_compare_genes, by="genes") # plot genes plot(df_compare_genes$meth2, df_compare_genes$meth1, main = "Genes") #exclude malat df_compare_genes <- df_compare_genes[!(df_compare_genes$genes == "MALAT1"),] # plot genes again plot(df_compare_genes$meth2, df_compare_genes$meth1, main="delete the most expressed gene to see better the correlation", ylim=c(0,80000), xlim=c(0,80000)) # plot the ratio hist(df_compare_genes$ratio, breaks=seq(0,45,0.2)) plot(df_compare_genes$ratio) # with ggplot (to set the base if I need to modify) df_compare_genes %>% arrange(ratio) %>% mutate(genes=factor(genes, levels=genes)) %>% # This trick update the factor levels ggplot(aes(y=ratio, x=genes)) + geom_point() + scale_y_continuous(breaks=seq(0,45,0.5)) + ggtitle( "gene counts meth1 / meth2") is_outlier_gene_ratio <- isOutlier( as.numeric(df_compare_genes$ratio)) genes_outlier <- df_compare_genes[is_outlier_gene_ratio,] # see the threshold threshold <- attr(is_outlier_gene_ratio, "thresholds") # how many would be more than 1.1? sum(na.omit(df_compare_genes$ratio)>1.1) #[1] [1] 3928 # top genes overeepresented in old df_compare_genes <- arrange(df_compare_genes, ratio ) genes_top <- tail(df_compare_genes,100) # save) write.csv(genes_outlier, here("outs/debug/genes_outlier_meth1_oldvsmeth2.csv")) write.csv(genes_top, here("outs/debug/genes_top_meth1_oldvsmeth2.csv")) ``` any gene where the countmeth1/countmeth2 is less than `r threshold[1]` or more than `r threshold[1]` is considered outlier. This is a total of `r dim(genes_outlier)[1]` genes. #### Genes with more than 1.1. ratio <details> <summary> Click to expand </summary> ```{r plot-genes, eval=FALSE} # plot all genes with more than 1.1 ratio genes_1.1 <- df_compare_genes$genes[df_compare_genes$ratio>1.1] for(gene in genes_1.1){ meth1 <- colSums(GetAssayData(meth1_sce[row.names(meth1_sce)==gene,], slot = "counts")) meth2 <- colSums(GetAssayData(meth2_sce[row.names(meth2_sce)==gene,], slot = "counts")) ratio <- df_compare_genes$ratio[df_compare_genes$genes == gene] num_cells_old <- df_compare_genes$num_cells_old[df_compare_genes$genes == gene] print(plot(meth1, meth2 , main = paste(gene, ", ratio meth2/meth1 =", round(ratio,2), ", num cells =", num_cells_old))) } ``` </details> ## Compare method 1 ( not duplicates) with old method 1 ```{r compare-2, eval=FALSE} # compare cells meth1_old_comp_df <- colData(meth1_old_sce) meth1_comp_df <- colData(meth1_sce) df_compare <- merge(meth1_old_comp_df, meth1_comp_df, by = "Barcode", suffixes = c("_meth1","_meth2")) # plot cells plot(df_compare$total_meth1, df_compare$total) plot(df_compare$detected_meth1, df_compare$detected) # compare genes df_compare_genes <- as.data.frame(rowSums(counts(meth1_sce))) df_compare_genes<- merge(as.data.frame(rowSums(counts(meth1_old_sce))), df_compare_genes, by="row.names") colnames(df_compare_genes) <- c("genes", "old_meth1", "meth1") # add ratio between old and new method df_compare_genes$ratio <- df_compare_genes$old_meth1/df_compare_genes$meth1 # add number cells afected number_cells <- data.frame(num_cells_old = rowSums(GetAssayData(meth1_sce, "counts")>0) ) number_cells$genes <- row.names(number_cells) df_compare_genes <- merge(number_cells, df_compare_genes, by="genes") # plot genes plot(df_compare_genes$old_meth1, df_compare_genes$meth1) # plot the ratio hist(df_compare_genes$ratio, breaks=seq(0,45,0.2)) #plot(df_compare_genes$ratio) # with ggplot (to set the base if I need to modify) df_compare_genes %>% arrange(ratio) %>% mutate(genes=factor(genes, levels=genes)) %>% # This trick update the factor levels ggplot(aes(y=ratio, x=genes)) + geom_point() + scale_y_continuous(breaks=seq(0,10,0.5)) + ggtitle( "gene counts old_meth1 / meth1") is_outlier_gene_ratio <- isOutlier( as.numeric(df_compare_genes$ratio)) genes_outlier <- df_compare_genes[is_outlier_gene_ratio,] # see the threshold threshold <- attr(is_outlier_gene_ratio, "thresholds") (threshold) # how many would be more than 1.1? sum(df_compare_genes$ratio>1.1) #[1] 1693 # this is between old meth1 and meth2 # for the difference newmeth1 1 and meth22 it's only 24 # meth1 vs old meth2 it's 1699 # top genes overeepresented in old df_compare_genes <- arrange(df_compare_genes, ratio ) genes_top <- tail(df_compare_genes,100) # save write.csv(genes_outlier, here("outs/debug/genes_outlier_meth1vsmeth1_old.csv")) write.csv(genes_top, here("outs/debug/genes_top_meth1vsmeth1_old.csv")) # most expressed gene tail(arrange(df_compare_genes, meth1),1) #exclude it df_compare_genes <- df_compare_genes[!(df_compare_genes$genes == "MALAT1"),] # plot genes again plot(df_compare_genes$meth1, df_compare_genes$old_meth1, main="delete the most expressed gene to see better the correlation") ``` <!-- any gene where the countmeth1/countmeth2 is less than `r threshold[1]` or more than `r threshold[1]` is considered outlier. --> <!-- This is a total of `r dim(genes_outlier)[1]` genes. -->