--- title: "Compare method 1 (awk, now with uniq) and method 2 (subset in R)" author: "Nadine Bestard" date: "17/05/2022" output: html_document: code_folding: hide --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` Method 1 old: the awk Method 1: the awk filtering duplicates Method 2: subset in R ```{r libs, message=FALSE, warning=FALSE} library(dplyr) library(scuttle) library(here) library(ggplot2) ``` ```{r load} # load meth2 source(here("Scripts/debug_separation/import_and_filter_ddH20_A.R")) ddH2O_meth2 <- ddH2O rm(ddH2O) #load meth1 old ddH2O_old_meth1 <- readRDS(here("outs/filter_70/ddH2O_A_UMI_test/ddH2O_A_initial_srt.rds")) #load updated meth1 counts <- Read10X(here("outs", "filter_70", "CellRanger-onlyCB-uniq-reads", "human", "19880WApool01__ddH2O_A_S3", "outs", "raw_feature_bc_matrix")) ddH2O_meth1 <- CreateSeuratObject(counts = counts, project = "ddH2O_old", min.cells = 3, min.features = 200) ddH2O_list <- list(meth1=ddH2O_meth1,old_meth1=ddH2O_old_meth1,meth2=ddH2O_meth2) ``` ## General dimensions the 3 methods ```{r explore} print("dimensions") # Compare some metrics lapply(ddH2O_list,function(X)dim(X)) #$meth1 #[1] 15768 717 # #$old_meth1 #[1] 15769 717 # #$meth2 #[1] 17995 717 print("sum umis") lapply(ddH2O_list,function(X)sum(X@meta.data$nCount_RNA)) print("sum genes") lapply(ddH2O_list,function(X)sum(X@meta.data$nFeature_RNA)) ``` ## Compare the old method 1 with method 2 (subset in R) ```{r compare-old} # compare cells df_compare <- (ddH2O_meth2@meta.data) df_compare$nFeature_RNA_old_meth1 <- ddH2O_old_meth1@meta.data$nFeature_RNA df_compare$nCount_RNA_old_meth1 <- (ddH2O_old_meth1@meta.data$nCount_RNA) # plot cells plot(df_compare$nCount_RNA_old_meth1, df_compare$nCount_RNA) plot(df_compare$nFeature_RNA_old_meth1, df_compare$nFeature_RNA) # compare genes df_compare_genes <- as.data.frame(rowSums(ddH2O_old_meth1)) df_compare_genes<- merge(as.data.frame(rowSums(ddH2O_meth2)), df_compare_genes, by="row.names") colnames(df_compare_genes) <- c("genes", "meth2", "old_meth1") # add ratio between old and new method df_compare_genes$ratio <- df_compare_genes$old_meth1/df_compare_genes$meth2 # add number cells afected number_cells <- data.frame(num_cells_old = rowSums(GetAssayData(ddH2O_old_meth1, "counts")>0) ) number_cells$genes <- row.names(number_cells) df_compare_genes <- merge(number_cells, df_compare_genes, by="genes") # plot genes plot(df_compare_genes$meth2, df_compare_genes$old_meth1, main = "Cells") # plot the ratio hist(df_compare_genes$ratio, breaks=seq(0,10,0.2)) plot(df_compare_genes$ratio) # with ggplot (to set the base if I need to modify) df_compare_genes %>% arrange(ratio) %>% mutate(genes=factor(genes, levels=genes)) %>% # This trick update the factor levels ggplot(aes(y=ratio, x=genes)) + geom_point() + scale_y_continuous(breaks=seq(0,10,0.5)) + ggtitle( "gene counts old_meth1 / meth2") is_outlier_gene_ratio <- isOutlier( as.numeric(df_compare_genes$ratio)) genes_outlier <- df_compare_genes[is_outlier_gene_ratio,] # see the threshold threshold <- attr(is_outlier_gene_ratio, "thresholds") # how many would be more than 1.1? sum(df_compare_genes$ratio>1.1) #[1] 1693 # this is between old meth1 and 2 # for the difference new 1 and 2 it's only 24 #exclude most expressed gene df_compare_genes <- df_compare_genes[!(df_compare_genes$genes == "MALAT1"),] # plot genes again plot(df_compare_genes$old_meth1, df_compare_genes$meth2, main="delete the most expressed gene to see better the correlation") ``` any gene where the countoldmeth1/countmeth2 is less than `r threshold[1]` or more than `r threshold[1]` is considered outlier. This is a total of `r dim(genes_outlier)[1]` genes. ## Compare method 1 ( not duplicates) with method 2 (subset in R) ```{r compare} # compare cells df_compare <- (ddH2O_meth2@meta.data) df_compare$nFeature_RNA_meth1 <- ddH2O_meth1@meta.data$nFeature_RNA df_compare$nCount_RNA_meth1 <- (ddH2O_meth1@meta.data$nCount_RNA) # plot cells plot(df_compare$nCount_RNA_meth1, df_compare$nCount_RNA) plot(df_compare$nFeature_RNA_meth1, df_compare$nFeature_RNA) # compare genes df_compare_genes <- as.data.frame(rowSums(ddH2O_meth1)) df_compare_genes<- merge(as.data.frame(rowSums(ddH2O_meth2)), df_compare_genes, by="row.names") colnames(df_compare_genes) <- c("genes", "meth2", "meth1") # add ratio between old and new method df_compare_genes$ratio <- df_compare_genes$meth2/df_compare_genes$meth1 # add number cells afected number_cells <- data.frame(num_cells_old = rowSums(GetAssayData(ddH2O_meth1, "counts")>0) ) number_cells$genes <- row.names(number_cells) df_compare_genes <- merge(number_cells, df_compare_genes, by="genes") # plot genes plot(df_compare_genes$meth2, df_compare_genes$meth1, main = "Cells") # plot the ratio hist(df_compare_genes$ratio, breaks=seq(0,10,0.2)) plot(df_compare_genes$ratio) # with ggplot (to set the base if I need to modify) df_compare_genes %>% arrange(ratio) %>% mutate(genes=factor(genes, levels=genes)) %>% # This trick update the factor levels ggplot(aes(y=ratio, x=genes)) + geom_point() + scale_y_continuous(breaks=seq(0,10,0.5)) + ggtitle( "gene counts meth2 / meth1") is_outlier_gene_ratio <- isOutlier( as.numeric(df_compare_genes$ratio)) genes_outlier <- df_compare_genes[is_outlier_gene_ratio,] # see the threshold threshold <- attr(is_outlier_gene_ratio, "thresholds") # how many would be more than 1.1? sum(df_compare_genes$ratio>1.1) #[1] 1693 # this is between old meth1 and 2 # for the difference new 1 and 2 it's only 24 # top genes overeepresented in old df_compare_genes <- arrange(df_compare_genes, ratio ) genes_top <- tail(df_compare_genes,100) # save) write.csv(genes_outlier, here("outs/filter_70/ddH2O_A_UMI_test/genes_outlier_meth1vsmeth2.csv")) write.csv(genes_top, here("outs/filter_70/ddH2O_A_UMI_test/genes_top_meth1vsmeth2.csv")) #exclude it df_compare_genes <- df_compare_genes[!(df_compare_genes$genes == "MALAT1"),] # plot genes again plot(df_compare_genes$meth1, df_compare_genes$meth2, main="delete the most expressed gene to see better the correlation") ``` any gene where the countmeth1/countmeth2 is less than `r threshold[1]` or more than `r threshold[1]` is considered outlier. This is a total of `r dim(genes_outlier)[1]` genes. #### Genes with more than 1.1. ratio ```{r plot-genes} # plot all genes with more than 1.1 ratio genes_1.1 <- df_compare_genes$genes[df_compare_genes$ratio>1.1] for(gene in genes_1.1){ meth1 <- colSums(GetAssayData(ddH2O_meth1[row.names(ddH2O_meth1)==gene,], slot = "counts")) meth2 <- colSums(GetAssayData(ddH2O_meth2[row.names(ddH2O_meth2)==gene,], slot = "counts")) ratio <- df_compare_genes$ratio[df_compare_genes$genes == gene] num_cells_old <- df_compare_genes$num_cells_old[df_compare_genes$genes == gene] print(plot(meth1, meth2 , main = paste(gene, ", ratio meth2/meth1 =", round(ratio,2), ", num cells =", num_cells_old))) } ``` ## Compare method 1 ( not duplicates) with old method 1 ```{r compare-2} # compare cells df_compare <- (ddH2O_old_meth1@meta.data) df_compare$nFeature_RNA_meth1 <- ddH2O_meth1@meta.data$nFeature_RNA df_compare$nCount_RNA_meth1 <- (ddH2O_meth1@meta.data$nCount_RNA) # plot cells plot(df_compare$nCount_RNA_meth1, df_compare$nCount_RNA) plot(df_compare$nFeature_RNA_meth1, df_compare$nFeature_RNA) # compare genes df_compare_genes <- as.data.frame(rowSums(ddH2O_meth1)) df_compare_genes<- merge(as.data.frame(rowSums(ddH2O_old_meth1)), df_compare_genes, by="row.names") colnames(df_compare_genes) <- c("genes", "old_meth1", "meth1") # add ratio between old and new method df_compare_genes$ratio <- df_compare_genes$old_meth1/df_compare_genes$meth1 # add number cells afected number_cells <- data.frame(num_cells_old = rowSums(GetAssayData(ddH2O_meth1, "counts")>0) ) number_cells$genes <- row.names(number_cells) df_compare_genes <- merge(number_cells, df_compare_genes, by="genes") # plot genes plot(df_compare_genes$old_meth1, df_compare_genes$meth1) # plot the ratio hist(df_compare_genes$ratio, breaks=seq(0,10,0.2)) #plot(df_compare_genes$ratio) # with ggplot (to set the base if I need to modify) df_compare_genes %>% arrange(ratio) %>% mutate(genes=factor(genes, levels=genes)) %>% # This trick update the factor levels ggplot(aes(y=ratio, x=genes)) + geom_point() + scale_y_continuous(breaks=seq(0,10,0.5)) + ggtitle( "gene counts old_meth1 / meth1") is_outlier_gene_ratio <- isOutlier( as.numeric(df_compare_genes$ratio)) genes_outlier <- df_compare_genes[is_outlier_gene_ratio,] # see the threshold threshold <- attr(is_outlier_gene_ratio, "thresholds") (threshold) # how many would be more than 1.1? sum(df_compare_genes$ratio>1.1) #[1] 1693 # this is between old meth1 and meth2 # for the difference newmeth1 1 and meth22 it's only 24 # meth1 vs old meth2 it's 1699 # top genes overeepresented in old df_compare_genes <- arrange(df_compare_genes, ratio ) genes_top <- tail(df_compare_genes,100) # save write.csv(genes_outlier, here("outs/filter_70/ddH2O_A_UMI_test/genes_outlier_meth1vsmeth1_old.csv")) write.csv(genes_top, here("outs/filter_70/ddH2O_A_UMI_test/genes_top_meth1vsmeth1_old.csv")) # most expressed gene tail(arrange(df_compare_genes, meth1),1) #exclude it df_compare_genes <- df_compare_genes[!(df_compare_genes$genes == "MALAT1"),] # plot genes again plot(df_compare_genes$meth1, df_compare_genes$old_meth1, main="delete the most expressed gene to see better the correlation") ``` any gene where the countmeth1/countmeth2 is less than `r threshold[1]` or more than `r threshold[1]` is considered outlier. This is a total of `r dim(genes_outlier)[1]` genes.