src/01_saveSCE_splicedUnspliced.R · Luise-Seeker-Human-WM-Glia

# Script for converting loom files to single cell experiments
# for all spliced and all unspliced matrices respectively
# LASeeker
# 20200418

# LOAD LIBRARIES
library(lattice)
library(matrix)
library(KernSmooth)
library(MASS)
library(nlme)
library(cluster)
library(survival)
library(devtools)
library(Seurat)
library(sctransform)
library(hdf5r)
library(pcaMethods)
library(loomR)
library(velocyto.R)
library(ggplot2)
library(cowplot)
library(dplyr)
library(future)



e <- 40000 * 1024^2
options(future.globals.maxSize = e)


####### MANUAL INPUT REQUIRED HERE #####################
####### Set working directory############################

setwd("/exports/eddie/scratch/lseeker")


########### Find input file locations####################
# loom files should be saved in a directory called "loom" and
# the metadata in a directory called "metadata" within
# the working directory

loom_names <- list.files(path = "loom", pattern = ".loom")
met_data <- list.files(path = "metadata", pattern = ".csv")

############ Set variables for creating Seurat object########
min_cells <- 0 # you can set QC thresholds here if you like
# (for example enter a 3 here and remove all genes that were
# expressed in less cells)

min_features <- 0 # if you want to filter some more, you can
# remove here genes that are not frequently expressed
# (for example all genes below 200 copies detected across all
# cells)

# I decided to so all filtering later.


############# create output folders#########################

dir.create(file.path(getwd(),
  "splice_control_out",
  "datasets",
  "raw_matrices",
  "unspliced",
  sep = ""
), recursive = TRUE)

dir.create(file.path(getwd(),
  "splice_control_out",
  "datasets",
  "raw_matrices",
  "spliced",
  sep = ""
))

dir.create(file.path(
  "splice_control_out",
  "dupl_genes"
), recursive = TRUE)



##############################################################
#################### START ###################################


## read in raw metadata
raw_met_data <- read.csv(paste(getwd(), "/metadata/",
  met_data,
  sep = ""
))


for (i in 1:length(loom_names)) {
  process_number <- sapply(strsplit(loom_names[i], "__"), `[`, 2)
  process_number <- sapply(strsplit(process_number, "_"), `[`, 2)

  # generate a sample name that will be used for saving data
  # and naming Seurat/SCE objects
  sample_name <- sapply(strsplit(loom_names[i], "__"), `[`, 2)
  sample_name <- sub(
    pattern = "(.*)\\..*$",
    replacement = "\\1", basename(sample_name)
  )


  # read in .loom file
  loom_matrices <- read.loom.matrices(file.path(
    getwd(),
    "loom",
    loom_names[i]
  ))

  # extract a matrix with counts for spliced mRNAs
  loom_spliced <- loom_matrices$spliced
  # extract a matrix with counts for unspliced mRNAs
  loom_unspliced <- loom_matrices$unspliced



  ############## Save duplicated gene names######################

  # depending on the used reference genome, it is common that some
  # gene names are duplicated. It is not completely clear to us why
  # this happens. We tested several ways of dealing with this
  # issue and decided to remember duplicated genes so that they
  # can be checked further downstream. If they happen to be genes
  # of interest, differen solutions need to be explored. The combined
  # matrix is only created here for the sake of saving gene counts
  # for the duplicated genes. make.unique maked names unique by
  # appending an ".1" to the name of the second item.

  comb_matrix <- loom_spliced + loom_unspliced

  rownames(comb_matrix) <- make.unique(rownames(comb_matrix))

  d <- rownames(loom_spliced)[duplicated(rownames(loom_spliced))] 
  # gene names in all matrices within a .loom file are the same


  d <- c(d, paste(d, ".1", sep = ""))
  s <- as.data.frame(rowSums(comb_matrix)[which(rownames(comb_matrix)
  %in% d)])
  
  s$genes <- rownames(s)
  rownames(s) <- NULL
  names(s)[1] <- paste("count",
                     sample_name,
                     sep = "_")

  
  if(i == 1){
    t <- s
  } else {
    t <- merge(t, s, by = "genes", all = TRUE)
  }




  ###################################################################
  #### here we continue with the spliced and unspliced matrices #######

  rownames(loom_unspliced) <- make.unique(rownames(loom_unspliced))
  rownames(loom_spliced) <- make.unique(rownames(loom_spliced))


  # prepare to link count matrix to metadata:

  df <- data.frame(
    Barcode = colnames(loom_spliced),
    process_number = as.integer(paste(process_number))
  )

  # Link barcodes to metadata
  metadata <- merge(df, raw_met_data, by = "process_number", all.x = TRUE)

  # rownames must be the barcodes to allow merging

  rownames(metadata) <- metadata$Barcode


  # Convert the combined matrix to a Seurat object
  y_unspliced <- CreateSeuratObject(
    counts = loom_unspliced,
    min.cells = 0,
    min.features = 0,
    project = sample_name,
    assay = "RNA",
    names.field = 1,
    names.delim = ".",
    meta.data = metadata
  )

  y_unspliced[["percent.mt"]] <-
    PercentageFeatureSet(y_unspliced, pattern = "^MT-")


  # same for spliced
  y_spliced <- CreateSeuratObject(
    counts = loom_spliced,
    min.cells = 0,
    min.features = 0,
    project = sample_name,
    assay = "RNA", 
    names.field = 1,
    names.delim = ".",
    meta.data = metadata
  )

  y_spliced[["percent.mt"]] <-
    PercentageFeatureSet(y_spliced, pattern = "^MT-")

  # Save the first one and then apend the others to it
  if (i == 1) {
    x_unspliced <- y_unspliced
    x_spliced <- y_spliced
  } else {
    x_unspliced <- merge(
      x = x_unspliced,
      y = y_unspliced, project = "HCA_raw_unspliced"
    )
    x_spliced <- merge(
      x = x_spliced,
      y = y_spliced, project = "HCA_raw_spliced"
    )
  }
  
  write.csv(t, file.path(
    getwd(),
    "splice_control_out",
    "dupl_genes",
    paste("dupl_gene_counts_",
          ".csv",
          sep = ""
    )
  ))
}



sample_count <- i




sing_cell_unspliced <- as.SingleCellExperiment(x_unspliced)
sing_cell_spliced <- as.SingleCellExperiment(x_spliced)

saveRDS(sing_cell_unspliced, file.path(
  getwd(),
  "splice_control_out",
  "datasets",
  "raw_matrices",
  "unspliced",
  paste("Combined_sce_",
    sample_count,
    "_unspliced_samples.RDS",
    sep = ""
  )
))

saveRDS(sing_cell_spliced, file.path(
  getwd(),
  "splice_control_out",
  "datasets",
  "raw_matrices",
  "spliced",
  paste("Combined_sce_",
    sample_count,
    "_spliced_samples.RDS",
    sep = ""
  )
))




# save session info to file
sink(file.path(
  getwd(),
  "splice_control_out",
  paste(Sys.Date(),
    "_session_info.txt",
    sep = ""
  )
))
sessionInfo()
sink()


# End of Script