Chimeras / Scripts / Human_Mouse_Separation / separate_barcodes.R
separate_barcodes.R
Raw
library(Seurat)
library(here)
library(stringr) # to extract pattern
library(dplyr) # manipulate df
library(ggplot2) # plotting


# Loop through all samples
samples <- dir(here("Data", "STARsolo"))
# Test different percentages
pcts <- c(70, 80, 90)

# empty df to hold summaries
df_all <- data.frame()

for(pct in pcts){
  for(sample in samples){
    print(paste("pct:",pct,";Sample:", sample))
    # path to matrix
    path_matrix <- here("Data", "STARsolo", sample, "Solo.out", "Gene", "filtered")
    # read10x expects the genes to be saved in genes.tsv instead of feature.tsv
    file.rename(here(path_matrix, "features.tsv"), here(path_matrix,"genes.tsv"))
    # load reads
    matrix <- Read10X(path_matrix)
    
    
    # inspired from https://bustools.github.io/BUS_notebooks_R/10xv2.html
    # Divide the umi by the species they mapped to
    gene_species <- ifelse(str_detect(rownames(matrix), "^mm10_"), "mouse", "human")
    mouse_inds <- gene_species == "mouse"
    human_inds <- gene_species == "human"
    # mark cells as mouse or human
    cell_species <- tibble(
      barcode = colnames(matrix),
      n_mouse_umi = Matrix::colSums(matrix[mouse_inds, ]),
      n_human_umi = Matrix::colSums(matrix[human_inds, ]),
      tot_umi = Matrix::colSums(matrix),
      prop_mouse = n_mouse_umi / tot_umi,
      prop_human = n_human_umi / tot_umi
    )
    
    # Classify species based on proportion of UMI, with cutoff of 80%
    cell_species <- cell_species %>%
      mutate(species = case_when(
        prop_mouse > pct/100 ~ "mouse",
        prop_human > pct/100 ~ "human",
        TRUE ~ "mixed"
      ))
    
    # create plot and summary
    p <- ggplot(cell_species, aes(n_human_umi, n_mouse_umi, color = species)) +
      geom_point(size = 1) + ggtitle(sample)
    
    summary <- table(cell_species$species)
    df_summary <- data.frame(mouse= summary["mouse"], 
               human = summary["human"], 
               mixed = summary["mixed"], 
               row.names = sample)
    
    # filter the human and mouse barcodes
    cell_human <- cell_species %>%
      filter(cell_species$species == "human")
    cell_mouse <- cell_species %>%
      filter(cell_species$species == "mouse")
    
    # save
    # create directories
    plot_path <- here("outs", paste0("filter_",pct),"Human_Mouse_Separation", "plots")
    barcodes_path <- here("outs", paste0("filter_",pct), "Human_Mouse_Separation","barcodes")
    robj_path <- here("Data", "Human_Mouse_Separation", paste0("filter_",pct))
    
    dir.create(here("outs", paste0("filter_",pct)))
    dir.create(plot_path)
    dir.create(barcodes_path)
    dir.create(robj_path)
    
    
    ggsave(filename = here(plot_path, paste0(sample,"_species.pdf")),
           plot = p)

    write.table(cell_human$barcode,
                here(barcodes_path, paste0(sample, "_human_barcodes.txt")),
                quote = FALSE,
                row.names = FALSE,
                col.names = FALSE
    )
    
    write.table(cell_mouse$barcode,
                here(barcodes_path, paste0(sample, "_mouse_barcodes.txt")),
                quote = FALSE,
                row.names = FALSE,
                col.names = FALSE
    )
    
    saveRDS(cell_species, here(robj_path, paste0(sample, "cell_species.rds")))
    
  }
  
  # save summaries
  df_all <- rbind(df_all, df_summary) 
  write.table(df_all,
              here("outs", paste0("summary_", pct, ".txt")),
              quote = FALSE)
}