#' iSoMAs: iSoform expression and somatic Mutation Association #' #' \code{iSoMAs} is an efficient computational pipeline based on principal component analysis (PCA) techniques #' for exploring the role of somatic mutations in shaping the landscape of gene isoform expression at the #' transcriptome level. #' #' iSoMAs integrates sample-matched RNA-seq and DNA-seq data to study the association between gene somatic #' mutation and gene isoform expression via both cis- and trans-acting mechanisms. #' The iSoMAs workflow consists of two steps: #' In the first step, the high-dimensional gene isoform expression matrix (d=59,866 derived from the 15,448 #' multi-isoform genes) for each cancer type is trimmed by mean.var.plot method (built in the Seurat toolkit) #' into a more informative expression matrix, which keeps only the most variable isoforms but is still #' high-dimensional (d~3,500). After that, the PCA is performed to further reduce the dimension of the #' informative expression matrix into a much lower-dimensional PC score matrix (d<=50) by calculating a PC #' loading matrix. Each column of the PC loading matrix performs a particular linear combination of the top #' variable isoforms into a meta-isoform, with the combination coefficients stored in the corresponding column #' of the PC loading matrix. All the meta-isoforms comprise the coordinates of the new low-dimensional space. #' In the second step, a differential PC score analysis is conducted along each of the 50 PC-coordinates based #' on the mutation status of the studied gene by Wilcoxon rank-sum test. Following the differential PC score #' analysis, the significant genes (termed iSoMAs genes) are determined if the minimum of the 50 p-values is #' smaller than a predefined threshold (Pmin<1e-3). #' #' @param data.iso Gene expression matrix at the isoform-level. Each row represents a transcript (isoform) and #' each column represents a sample. #' @param data.maf Gene somatic mutation tab in mutation annotation format (maf) #' @param gene_to_iso A list of list that maps each gene to its isoforms (transcripts) #' @param iso_to_gene A list mapping each isoform to its gene #' @param genes_test A list of genes to test. If NULL, iSoMAs will automatically determines a qualified list of genes #' for test based on the input data.maf, mut.num.thres and mut.percent.thres. #' @param stat.method Statistical method used for the differential PC score analysis, choose one of: #' \itemize{ #' \item wilcox for Wilcoxon's rank-sum test. #' \item t.test for Student's t-test. #' } #' @param mut.num.thres Threshold number of samples for determining genes_test. #' @param mut.percent.thres Threshold percent of samples for determing genes_test. iSoMAs will take the maximum of #' mut.num.thres and mut.percent.thres to determine the final mut.samp.thres. #' @param varType Designate the specific Variant_Type of somatic mutation in data.maf. #' @param varClassification Designate the specific Variant_Classification of somatic mutation in data.maf. #' @param res.cluster Argument used in \code{Seurat::FindClusters}. #' @param scale.factor Argument used in \code{Seurat::NormalizeData}. #' @param normalization.method Argument used in \code{Seurat::NormalizeData}. #' @param selection.method Argument used in \code{Seurat::FindVariableFeatures}. #' @param nPCs Number of PCs in PCA. #' @param return.Seurat Return the Seurat object. #' @minP.sorted Sort genes based on minP across the nPCs coordinates. #' @param project Name of the project. #' @param filename Name of file for saving the iSoMAs results. #' @ntop.gene Number of top genes from genes_test for test. #' #' @return This function returns a list with the following 20 elements: #' pca, pvals_all, pvals_all_sorted, genes_test, min.cells, min.features, dim_data.iso, dim_iso, project, #' mut.samp.thres, varType, varClassification, group1, group2, nPCs, res.cluster, scale.factor, #' normalization.method, selection.method, iso, time_stamp #' #' @references Hua Tan, ..., Laura Elnitski (2022): iSoMAs: Finding isoform expression and somatic mutation associations in human cancers #' @import Seurat PCA process #' @export iSoMAs <- function(data.iso,data.maf,gene_to_iso=NULL, iso_to_gene=NULL, genes_test=NULL, stat.method="wilcox", mut.num.thres = 5,mut.percent.thres = 0.02, varType = "SNP",varClassification = NULL, res.cluster = 0.5, scale.factor = 10000, normalization.method = "LogNormalize", selection.method = "mvp", nPCs = 50,return.Seurat=F,minP.sorted=F, project = "TCGA-LUAD", filename=NULL, ntop.gene=100){ # iSoMAs.R: main function for iSoMAs pipeline # copyright (c) Hua Tan, warm.tan@gmail.com # 11/2/2022 @4C08 library(Matrix) library(Seurat) library(pbapply) ###-----global variables jPC1=4 #the PC-coordinates starts with column 4, the first 3 columns are sample number of {WildType,Other,Mutant} group1 = "WildType" #test differential PC score between WildType and Mutant groups group2 = "Mutant" data.iso = trim_features_and_samples_TCGA(data.iso,gene_to_iso,iso_to_gene) ###-----numbers in comments are based on LUAD cancer-----### min.features = max(1,floor(nrow(data.iso)*0.1)) #5986 min.cells = max(10,floor(ncol(data.iso)*0.05)) #25 cat(paste0('min.cells=',min.cells,"; min.features=",min.features,"\n")) iso = run_PCA_Seurat(data.iso,res.cluster = res.cluster, scale.factor = scale.factor, normalization.method = normalization.method, selection.method = selection.method, min.cells=min.cells, min.features=min.features, nPCs=nPCs,project = project) pca = iso@reductions$pca myCoord = data.frame(pca@cell.embeddings) PCI = ncol(myCoord) if(any(rownames(myCoord) != colnames(data.iso))){ stop("sample names after PCA are not consistent with data.iso") }else{ mySamples = colnames(data.iso) } cat("Conducting MAF_PCA...\n") ##get genes_test from maf data if(is.null(genes_test)){ cat("genes_test is NULL, generating genes_test from data.maf") mut.samp.thres = max(mut.num.thres,ceiling(ncol(data.iso)*mut.percent.thres)) #11 cat(paste0('mut.samp.thres = ',mut.samp.thres,'\n')) data.maf.tmp = get_maf_of_interest_TCGA(data.maf,varType = varType, varClassification = varClassification)#199580, 120 genes_maf = sort(table(data.maf.tmp$Hugo_Symbol),decreasing = T) #18888 genes_maf = genes_maf[genes_maf>=mut.samp.thres] #5725 cat('top 10 mutated genes:\n') print(head(genes_maf,10)) cat('bottom 10 mutated genes:\n') print(tail(genes_maf,10)) genes_test = names(genes_maf) rm(data.maf.tmp) } if(!is.null(ntop.gene)){ cat(paste0('truncating to top ',ntop.gene,' genes\n')) genes_test = genes_test[1:min(length(genes_test),ntop.gene)] } cat(paste0("genes_test: ",length(genes_test),"---",paste(head(genes_test,20),collapse = ", "),"...\n")) if(length(genes_test)>0){ pvals_all = pblapply(genes_test,function(myGene.mut){ myCoord$Mutation = get_maf_of_interest_TCGA(data.maf,myGene=myGene.mut, mySamples=mySamples, varType=varType, varClassification=varClassification, verbose = F)$Mutation tabMut = table(myCoord$Mutation) # Mutant Other WildType # 213 37 267 if(any(!c("WildType","Mutant") %in% names(tabMut)) || tabMut[group1]<2 || tabMut[group2]<2){ #by HT on 6/24/2022 @13547 cat(paste0(Sys.time(),"---no sufficient samples for differential analysis for ",myGene.mut,"***\n")) print(tabMut) return(NULL) } ###-----in case there is no 'Other' samples tabMut0 = c(Mutant=0,Other=0,WildType=0) tabMut0[names(tabMut)] = tabMut tabMut = tabMut0 # print(tabMut) if(tolower(stat.method)=="wilcox"){ pvals = unlist(lapply(seq(PCI),function(j){ wilcox.test(myCoord[myCoord$Mutation==group1,j], myCoord[myCoord$Mutation==group2,j], alternative="two.sided")$p.value })) }else if(tolower(stat.method)=="t.test"){ pvals = unlist(lapply(seq(PCI),function(j){ t.test(myCoord[myCoord$Mutation==group1,j], myCoord[myCoord$Mutation==group2,j], alternative="two.sided")$p.value })) }else{ cat("stat.method must be {wilcox, t.test}\n") } return(c(tabMut,signif(pvals,2))) }) names(pvals_all) = genes_test pvals_all = pvals_all[!unlist(lapply(pvals_all,is.null))] pvals_all = do.call("rbind",pvals_all) pvals_all = as.data.frame(pvals_all) colnames(pvals_all)[jPC1:ncol(pvals_all)] = paste0("PC_",seq(ncol(pvals_all)-jPC1+1)) # cat("head of pvals_all:\n") # print(head(pvals_all[,])) # cat("tail of pvals_all:\n") # print(tail(pvals_all)) }else{ cat("since genes_test is empty, skipping MAF_PCA + Wilcoxon\n") pvals_all = NULL } if(return.Seurat){ iso.rt = iso }else{ iso.rt = NULL } if(minP.sorted){ a = pvals_all[,jPC1:ncol(pvals_all)] a.min = apply(a,1,min) a.sorted = sort(a.min) pvals_all_sorted = pvals_all[names(a.sorted),,drop=F] pvals_all_sorted$minP = a.sorted }else{ pvals_all_sorted = NULL } res_isomas = list(pca=pca, pvals_all=pvals_all, pvals_all_sorted=pvals_all_sorted, genes_test=genes_test, min.cells=min.cells,min.features=min.features, dim_data.iso=dim(data.iso), dim_iso=dim(iso), project=project, mut.samp.thres=mut.samp.thres, varType=varType,varClassification=varClassification, group1=group1,group2=group2, nPCs=nPCs,res.cluster=res.cluster,scale.factor=scale.factor, normalization.method = normalization.method, selection.method=selection.method, iso=iso.rt, time_stamp=Sys.time()) if(is.null(filename)){ filename = paste0("res_SoMAS_",project,".RData") } save(res_isomas, file = filename) cat("OK\n") return(res_isomas) }