From 58d73fb58ef1757ec6a8217f0a78fce5f020757c Mon Sep 17 00:00:00 2001 From: Jiri Novotny Date: Mon, 28 Jun 2021 07:35:17 +0200 Subject: [PATCH] rewritten parallelism to BiocParallel package - Parallel processing is now controlled by the user through a BiocParallel::BiocParallelParam object. This gives more freedom in choosing a parallel backend, and it also brings better performance when a computational cluster is started at the beginning and reused multiple times. This is also the case for the sc3() function, which now reuses the same BPPARAM for each of the pipeline's subfunctions. Also, BiocParallel is itself a robust wrapper around the parallel and snow packages, providing a unified interface to backend parallel methods, and comes bundled with powerful logging capabilities (error tracing) and some handy features, such as a native progress bar. - Vignette, DESCRIPTION, and NAMESPACE have been updated accordingly. - Some other files were changed due to formatting. This is caused by the use of the devtools package, which tries to unify the code and documentation. 
--- DESCRIPTION | 84 +++-- NAMESPACE | 7 - R/AllGenerics.R | 41 ++- R/CoreMethods.R | 519 +++++++++++++----------------- man/ann.Rd | 4 +- man/sc3.Rd | 75 +++-- man/sc3_calc_biology.Rd | 48 +-- man/sc3_calc_consens.Rd | 16 +- man/sc3_calc_dists.Rd | 10 +- man/sc3_calc_transfs.Rd | 20 +- man/sc3_estimate_k.Rd | 3 - man/sc3_export_results_xls.Rd | 5 +- man/sc3_interactive.Rd | 3 - man/sc3_kmeans.Rd | 12 +- man/sc3_plot_cluster_stability.Rd | 3 - man/sc3_plot_consensus.Rd | 6 +- man/sc3_plot_de_genes.Rd | 6 +- man/sc3_plot_expression.Rd | 6 +- man/sc3_plot_markers.Rd | 6 +- man/sc3_plot_silhouette.Rd | 3 - man/sc3_prepare.Rd | 65 ++-- man/sc3_run_svm.Rd | 9 +- man/yan.Rd | 4 +- vignettes/SC3.Rmd | 57 ++-- 24 files changed, 480 insertions(+), 532 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5659be3..509025f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,34 +1,66 @@ -Package: SC3 Type: Package +Package: SC3 Title: Single-Cell Consensus Clustering Version: 1.15.1 +Authors@R: + c(person(given = "Vladimir", + family = "Kiselev", + role = c("cre", "aut"), + email = "vladimir.yu.kiselev@gmail.com"), + person(given = "Andrew", + family = "Yiu", + role = "ctb"), + person(given = "Tallulah", + family = "Andrews", + role = "ctb"), + person(given = "Martin", + family = "Hemberg", + role = "aut")) Author: Vladimir Kiselev Maintainer: Vladimir Kiselev -Authors@R: c(person("Vladimir", "Kiselev", - email = "vladimir.yu.kiselev@gmail.com", - role=c("cre", "aut")), - person("Andrew", "Yiu", - role=c("ctb")), - person("Tallulah", "Andrews", - role=c("ctb")), - person("Martin", "Hemberg", - role=c("aut"))) -Description: A tool for unsupervised clustering and analysis of single cell RNA-Seq data. +Description: A tool for unsupervised clustering and analysis of single + cell RNA-Seq data. 
License: GPL-3 -Imports: graphics, stats, utils, methods, e1071, parallel, foreach, - doParallel, doRNG, shiny, ggplot2, pheatmap (>= 1.0.8), - ROCR, robustbase, rrcov, cluster, WriteXLS, - Rcpp (>= 0.11.1), SummarizedExperiment, SingleCellExperiment, - BiocGenerics, S4Vectors -Depends: R(>= 3.3) -LinkingTo: Rcpp, RcppArmadillo -LazyData: TRUE -RoxygenNote: 6.0.1 -Suggests: knitr, rmarkdown, mclust, scater -VignetteBuilder: knitr -biocViews: ImmunoOncology, SingleCell, Software, Classification, Clustering, DimensionReduction, - SupportVectorMachine, RNASeq, Visualization, Transcriptomics, - DataRepresentation, GUI, DifferentialExpression, Transcription -NeedsCompilation: no URL: https://github.com/hemberg-lab/SC3 BugReports: https://support.bioconductor.org/t/sc3/ +Depends: + R (>= 3.3) +Imports: + BiocGenerics, + BiocParallel, + cluster, + e1071, + ggplot2, + graphics, + methods, + pheatmap (>= 1.0.8), + Rcpp (>= 0.11.1), + robustbase, + ROCR, + rrcov, + S4Vectors, + shiny, + SingleCellExperiment, + stats, + SummarizedExperiment, + utils, + WriteXLS +Suggests: + BiocStyle, + knitr, + mclust, + rmarkdown, + scater +LinkingTo: + Rcpp, + RcppArmadillo +VignetteBuilder: + knitr +biocViews: ImmunoOncology, SingleCell, Software, Classification, + Clustering, DimensionReduction, SupportVectorMachine, RNASeq, + Visualization, Transcriptomics, DataRepresentation, GUI, + DifferentialExpression, Transcription +Encoding: UTF-8 +LazyData: TRUE +NeedsCompilation: no +RoxygenNote: 7.1.1 diff --git a/NAMESPACE b/NAMESPACE index 167add9..7df85a5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -39,11 +39,7 @@ importFrom(SummarizedExperiment,assayNames) importFrom(SummarizedExperiment,colData) importFrom(SummarizedExperiment,rowData) importFrom(WriteXLS,WriteXLS) -importFrom(doParallel,registerDoParallel) -importFrom(doRNG,"%dorng%") importFrom(e1071,svm) -importFrom(foreach,"%dopar%") -importFrom(foreach,foreach) importFrom(ggplot2,aes) importFrom(ggplot2,geom_bar) 
importFrom(ggplot2,ggplot) @@ -53,9 +49,6 @@ importFrom(ggplot2,ylim) importFrom(graphics,plot) importFrom(methods,as) importFrom(methods,new) -importFrom(parallel,detectCores) -importFrom(parallel,makeCluster) -importFrom(parallel,stopCluster) importFrom(pheatmap,pheatmap) importFrom(robustbase,covMcd) importFrom(rrcov,PcaHubert) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 60cb602..b3ee6ea 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -1,49 +1,48 @@ #' @export -setGeneric("sc3", signature = "object", function(object, ks = NULL, - gene_filter = TRUE, pct_dropout_min = 10, pct_dropout_max = 90, - d_region_min = 0.04, d_region_max = 0.07, svm_num_cells = NULL, - svm_train_inds = NULL, svm_max = 5000, n_cores = NULL, kmeans_nstart = NULL, - kmeans_iter_max = 1e+09, k_estimator = FALSE, biology = FALSE, rand_seed = 1) { +setGeneric("sc3", signature = "object", function(object, ks = NULL, + gene_filter = TRUE, pct_dropout_min = 10, pct_dropout_max = 90, + d_region_min = 0.04, d_region_max = 0.07, svm_num_cells = NULL, + svm_train_inds = NULL, svm_max = 5000, kmeans_nstart = NULL, + kmeans_iter_max = 1e+09, k_estimator = FALSE, biology = FALSE, + BPPARAM = BiocParallel::bpparam()) { standardGeneric("sc3") }) #' @export -setGeneric("sc3_estimate_k", signature = "object", function(object) { +setGeneric("sc3_estimate_k", signature = "object", function(object, BPPARAM = BiocParallel::bpparam()) { standardGeneric("sc3_estimate_k") }) #' @export -setGeneric("sc3_prepare", function(object, gene_filter = TRUE, - pct_dropout_min = 10, pct_dropout_max = 90, d_region_min = 0.04, - d_region_max = 0.07, svm_num_cells = NULL, svm_train_inds = NULL, - svm_max = 5000, n_cores = NULL, kmeans_nstart = NULL, - kmeans_iter_max = 1e+09, rand_seed = 1) { +setGeneric("sc3_prepare", function(object, gene_filter = TRUE, + pct_dropout_min = 10, pct_dropout_max = 90, d_region_min = 0.04, + d_region_max = 0.07, svm_num_cells = NULL, svm_train_inds = NULL, + svm_max = 5000, 
kmeans_nstart = NULL, kmeans_iter_max = 1e+09) { standardGeneric("sc3_prepare") }) #' @export -setGeneric("sc3_calc_dists", signature = "object", function(object) { +setGeneric("sc3_calc_dists", signature = "object", function(object, BPPARAM = BiocParallel::bpparam()) { standardGeneric("sc3_calc_dists") }) #' @export -setGeneric("sc3_calc_transfs", signature = "object", function(object) { +setGeneric("sc3_calc_transfs", signature = "object", function(object, BPPARAM = BiocParallel::bpparam()) { standardGeneric("sc3_calc_transfs") }) #' @export -setGeneric("sc3_kmeans", signature = "object", function(object, ks = NULL) { +setGeneric("sc3_kmeans", signature = "object", function(object, ks = NULL, BPPARAM = BiocParallel::bpparam()) { standardGeneric("sc3_kmeans") }) #' @export -setGeneric("sc3_calc_consens", signature = "object", function(object) { +setGeneric("sc3_calc_consens", signature = "object", function(object, BPPARAM = BiocParallel::bpparam()) { standardGeneric("sc3_calc_consens") }) #' @export -setGeneric("sc3_calc_biology", signature = "object", function(object, ks = NULL, - regime = NULL) { +setGeneric("sc3_calc_biology", signature = "object", function(object, ks = NULL, regime = NULL, BPPARAM = BiocParallel::bpparam()) { standardGeneric("sc3_calc_biology") }) @@ -58,7 +57,7 @@ setGeneric("sc3_run_svm", signature = "object", function(object, ks = NULL) { }) #' @export -setGeneric("sc3_plot_consensus", signature = "object", function(object, k, +setGeneric("sc3_plot_consensus", signature = "object", function(object, k, show_pdata = NULL) { standardGeneric("sc3_plot_consensus") }) @@ -74,13 +73,13 @@ setGeneric("sc3_plot_expression", signature = "object", function(object, k, show }) #' @export -setGeneric("sc3_plot_de_genes", signature = "object", function(object, +setGeneric("sc3_plot_de_genes", signature = "object", function(object, k, p.val = 0.01, show_pdata = NULL) { standardGeneric("sc3_plot_de_genes") }) #' @export -setGeneric("sc3_plot_markers", 
signature = "object", function(object, k, auroc = 0.85, +setGeneric("sc3_plot_markers", signature = "object", function(object, k, auroc = 0.85, p.val = 0.01, show_pdata = NULL) { standardGeneric("sc3_plot_markers") }) @@ -91,7 +90,7 @@ setGeneric("sc3_plot_cluster_stability", signature = "object", function(object, }) #' @export -setGeneric("sc3_export_results_xls", signature = "object", function(object, +setGeneric("sc3_export_results_xls", signature = "object", function(object, filename = "sc3_results.xls") { standardGeneric("sc3_export_results_xls") }) diff --git a/R/CoreMethods.R b/R/CoreMethods.R index 46b12f4..1243654 100644 --- a/R/CoreMethods.R +++ b/R/CoreMethods.R @@ -1,49 +1,46 @@ #' Run all steps of \code{SC3} in one go -#' +#' #' This function is a wrapper that executes all steps of \code{SC3} analysis in one go. -#' +#' #' @param object an object of \code{SingleCellExperiment} class. #' @param ks a range of the number of clusters \code{k} used for \code{SC3} clustering. #' Can also be a single integer. -#' @param gene_filter a boolen variable which defines whether to perform gene +#' @param gene_filter a boolen variable which defines whether to perform gene #' filtering before SC3 clustering. -#' @param pct_dropout_min if \code{gene_filter = TRUE}, then genes with percent of dropouts smaller than +#' @param pct_dropout_min if \code{gene_filter = TRUE}, then genes with percent of dropouts smaller than #' \code{pct_dropout_min} are filtered out before clustering. -#' @param pct_dropout_max if \code{gene_filter = TRUE}, then genes with percent of dropouts larger than +#' @param pct_dropout_max if \code{gene_filter = TRUE}, then genes with percent of dropouts larger than #' \code{pct_dropout_max} are filtered out before clustering. -#' @param d_region_min defines the minimum number of eigenvectors used for +#' @param d_region_min defines the minimum number of eigenvectors used for #' kmeans clustering as a fraction of the total number of cells. 
Default is \code{0.04}. #' See \code{SC3} paper for more details. -#' @param d_region_max defines the maximum number of eigenvectors used for +#' @param d_region_max defines the maximum number of eigenvectors used for #' kmeans clustering as a fraction of the total number of cells. Default is \code{0.07}. #' See \code{SC3} paper for more details. -#' @param svm_num_cells number of randomly selected training cells to be used +#' @param svm_num_cells number of randomly selected training cells to be used #' for SVM prediction. The default is \code{NULL}. -#' @param svm_train_inds a numeric vector defining indeces of training cells +#' @param svm_train_inds a numeric vector defining indeces of training cells #' that should be used for SVM training. The default is \code{NULL}. #' @param svm_max define the maximum number of cells below which SVM is not run. -#' @param n_cores defines the number of cores to be used on the user's machine. If not set, `SC3` will use all but one cores of your machine. -#' @param kmeans_nstart nstart parameter passed to \code{\link[stats]{kmeans}} function. Can be set manually. By default it is +#' @param kmeans_nstart nstart parameter passed to \code{\link[stats]{kmeans}} function. Can be set manually. By default it is #' \code{1000} for up to \code{2000} cells and \code{50} for more than \code{2000} cells. -#' @param kmeans_iter_max iter.max parameter passed to \code{\link[stats]{kmeans}} +#' @param kmeans_iter_max iter.max parameter passed to \code{\link[stats]{kmeans}} #' function. #' @param k_estimator boolean parameter, defines whether to estimate an optimal number of clusters \code{k}. If user has already defined the ks parameter the estimation does not affect the user's paramater. -#' @param biology boolean parameter, defines whether to compute differentially expressed genes, marker +#' @param biology boolean parameter, defines whether to compute differentially expressed genes, marker #' genes and cell outliers. 
-#' @param rand_seed sets the seed of the random number generator. \code{SC3} is a stochastic -#' method, so setting the \code{rand_seed} to a fixed values can be used for reproducibility -#' purposes. -#' +#' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}} object specifying a type of parallelism and allocated resources, +#' including a seed for random number generator. +#' #' @name sc3 #' @aliases sc3 -#' +#' #' @return an object of \code{SingleCellExperiment} class -sc3.SingleCellExperiment <- function(object, ks, gene_filter, pct_dropout_min, pct_dropout_max, d_region_min, - d_region_max, svm_num_cells, svm_train_inds, svm_max, n_cores, kmeans_nstart, kmeans_iter_max, - k_estimator, biology, rand_seed) { - object <- sc3_prepare(object, gene_filter, pct_dropout_min, pct_dropout_max, - d_region_min, d_region_max, svm_num_cells, svm_train_inds, svm_max, n_cores, kmeans_nstart, - kmeans_iter_max, rand_seed) +sc3.SingleCellExperiment <- function(object, ks, gene_filter, pct_dropout_min, pct_dropout_max, d_region_min, + d_region_max, svm_num_cells, svm_train_inds, svm_max, kmeans_nstart, kmeans_iter_max, + k_estimator, biology, BPPARAM) { + object <- sc3_prepare(object, gene_filter, pct_dropout_min, pct_dropout_max, + d_region_min, d_region_max, svm_num_cells, svm_train_inds, svm_max, kmeans_nstart, kmeans_iter_max) if (k_estimator) { object <- sc3_estimate_k(object) # Do not override cluster if user has set a k @@ -52,12 +49,17 @@ sc3.SingleCellExperiment <- function(object, ks, gene_filter, pct_dropout_min, p ks <- metadata(object)$sc3$k_estimation } } - object <- sc3_calc_dists(object) - object <- sc3_calc_transfs(object) - object <- sc3_kmeans(object, ks) - object <- sc3_calc_consens(object) + + if (!BiocParallel::bpisup(BPPARAM)) { + BiocParallel::bpstart(BPPARAM) + } + + object <- sc3_calc_dists(object, BPPARAM = BPPARAM) + object <- sc3_calc_transfs(object, BPPARAM = BPPARAM) + object <- sc3_kmeans(object, ks = ks, BPPARAM = BPPARAM) + object <- 
sc3_calc_consens(object, BPPARAM = BPPARAM) if (biology) { - object <- sc3_calc_biology(object, ks) + object <- sc3_calc_biology(object, ks, BPPARAM = BPPARAM) } return(object) } @@ -67,7 +69,7 @@ sc3.SingleCellExperiment <- function(object, ks, gene_filter, pct_dropout_min, p setMethod("sc3", signature(object = "SingleCellExperiment"), sc3.SingleCellExperiment) #' Prepare the \code{SingleCellExperiment} object for \code{SC3} clustering. -#' +#' #' This function prepares an object of \code{SingleCellExperiment} class for \code{SC3} clustering. It #' creates and populates the following items of the \code{sc3} slot of the \code{metadata(object)}: #' \itemize{ @@ -75,55 +77,48 @@ setMethod("sc3", signature(object = "SingleCellExperiment"), sc3.SingleCellExper #' \item \code{kmeans_nstart} - the same as the \code{kmeans_nstart} argument. #' \item \code{n_dim} - contains numbers of the number of eigenvectors to be used #' in \code{\link[stats]{kmeans}} clustering. -#' \item \code{rand_seed} - the same as the \code{rand_seed} argument. -#' \item \code{svm_train_inds} - if SVM is used this item contains indexes of the +#' \item \code{svm_train_inds} - if SVM is used this item contains indexes of the #' training cells to be used for SC3 clustering and further SVM prediction. #' \item \code{svm_study_inds} - if SVM is used this item contains indexes of the #' cells to be predicted by SVM. -#' \item \code{n_cores} - the same as the \code{n_cores} argument. #' } -#' +#' #' @param object an object of \code{SingleCellExperiment} class. -#' @param gene_filter a boolen variable which defines whether to perform gene +#' @param gene_filter a boolen variable which defines whether to perform gene #' filtering before SC3 clustering. 
-#' @param pct_dropout_min if \code{gene_filter = TRUE}, then genes with percent of dropouts smaller than +#' @param pct_dropout_min if \code{gene_filter = TRUE}, then genes with percent of dropouts smaller than #' \code{pct_dropout_min} are filtered out before clustering. -#' @param pct_dropout_max if \code{gene_filter = TRUE}, then genes with percent of dropouts larger than +#' @param pct_dropout_max if \code{gene_filter = TRUE}, then genes with percent of dropouts larger than #' \code{pct_dropout_max} are filtered out before clustering. -#' @param d_region_min defines the minimum number of eigenvectors used for +#' @param d_region_min defines the minimum number of eigenvectors used for #' kmeans clustering as a fraction of the total number of cells. Default is \code{0.04}. #' See \code{SC3} paper for more details. -#' @param d_region_max defines the maximum number of eigenvectors used for +#' @param d_region_max defines the maximum number of eigenvectors used for #' kmeans clustering as a fraction of the total number of cells. Default is \code{0.07}. #' See \code{SC3} paper for more details. -#' @param svm_num_cells number of randomly selected training cells to be used +#' @param svm_num_cells number of randomly selected training cells to be used #' for SVM prediction. The default is \code{NULL}. -#' @param svm_train_inds a numeric vector defining indeces of training cells +#' @param svm_train_inds a numeric vector defining indeces of training cells #' that should be used for SVM training. The default is \code{NULL}. #' @param svm_max define the maximum number of cells below which SVM is not run. -#' @param n_cores defines the number of cores to be used on the user's machine. If not set, `SC3` will use all but one cores of your machine. -#' @param kmeans_nstart nstart parameter passed to \code{\link[stats]{kmeans}} function. Default is +#' @param kmeans_nstart nstart parameter passed to \code{\link[stats]{kmeans}} function. 
Default is #' \code{1000} for up to \code{2000} cells and \code{50} for more than \code{2000} cells. -#' @param kmeans_iter_max iter.max parameter passed to \code{\link[stats]{kmeans}} +#' @param kmeans_iter_max iter.max parameter passed to \code{\link[stats]{kmeans}} #' function. Default is \code{1e+09}. -#' @param rand_seed sets the seed of the random number generator. \code{SC3} is a stochastic -#' method, so setting the \code{rand_seed} to a fixed values can be used for reproducibility -#' purposes. -#' +#' #' @name sc3_prepare #' @aliases sc3_prepare sc3_prepare,SingleCellExperiment-method -#' +#' #' @return an object of \code{SingleCellExperiment} class -#' -#' @importFrom parallel detectCores +#' #' @importFrom SummarizedExperiment colData colData<- rowData rowData<- assayNames #' @importFrom S4Vectors metadata metadata<- #' @importFrom utils capture.output #' @importFrom methods new #' @importFrom BiocGenerics counts -sc3_prepare.SingleCellExperiment <- function(object, gene_filter, pct_dropout_min, pct_dropout_max, - d_region_min, d_region_max, svm_num_cells, svm_train_inds, svm_max, n_cores, kmeans_nstart, - kmeans_iter_max, rand_seed) { +sc3_prepare.SingleCellExperiment <- function(object, gene_filter, pct_dropout_min, pct_dropout_max, + d_region_min, d_region_max, svm_num_cells, svm_train_inds, svm_max, kmeans_nstart, kmeans_iter_max) { + if (is.null(rowData(object)$feature_symbol)) { stop("There is no `feature_symbol` column in the `rowData` slot of your dataset! Please write your gene/transcript names to `rowData(object)$feature_symbol`!") return(object) @@ -136,14 +131,14 @@ sc3_prepare.SingleCellExperiment <- function(object, gene_filter, pct_dropout_mi stop("There is no `logcounts` slot in your input SingleCellExperiment object! SC3 operates on `logcounts` slot, which is supposed to contain both normalised and log-transformed expression values! 
Please write these values the slot by setting `logcounts(object) <- log_norm_counts`!") return(object) } - + message("Setting SC3 parameters...") - + # clean up after the previous SC3 run sc3 slot metadata(object)$sc3 <- list() colData(object) <- colData(object)[, !grepl("sc3_", colnames(colData(object))), drop = FALSE] rowData(object) <- rowData(object)[, !grepl("sc3_", colnames(rowData(object))), drop = FALSE] - + # gene filter f_data <- rowData(object) f_data$sc3_gene_filter <- TRUE @@ -156,7 +151,7 @@ sc3_prepare.SingleCellExperiment <- function(object, gene_filter, pct_dropout_mi } } rowData(object) <- as(f_data, "DataFrame") - + metadata(object)$sc3$kmeans_iter_max <- kmeans_iter_max if (is.null(kmeans_nstart)) { if (ncol(object) > 2000) { @@ -168,14 +163,14 @@ sc3_prepare.SingleCellExperiment <- function(object, gene_filter, pct_dropout_mi } else { metadata(object)$sc3$kmeans_nstart <- kmeans_nstart } - + # define number of cells and region of dimensions n_dim <- floor(d_region_min * ncol(object)):ceiling(d_region_max * ncol(object)) # for large datasets restrict the region of dimensions to 15 if (length(n_dim) > 15) { n_dim <- sample(n_dim, 15) } - + # prepare for SVM if (!is.null(svm_num_cells) | !is.null(svm_train_inds) | ncol(object) > svm_max) { # handle all possible errors @@ -183,7 +178,7 @@ sc3_prepare.SingleCellExperiment <- function(object, gene_filter, pct_dropout_mi if (!is.null(svm_train_inds)) { return(message("You have set both svm_num_cells and svm_train_inds parameters for SVM training. Please set only one of them and rerun sc3_prepare().")) } - if (svm_num_cells >= ncol(object) - 1) + if (svm_num_cells >= ncol(object) - 1) return(message("Number of cells used for SVM training is larger (or equal) than the total number of cells in your dataset. Please make svm_num_cells parameter smaller and rerun sc3_prepare().")) if (svm_num_cells < 10) { return(message("Number of cells used for SVM training is less than 10. 
Please make sure the number of clusters k is smaller than 10 or increase the number of training cells.")) @@ -199,10 +194,10 @@ sc3_prepare.SingleCellExperiment <- function(object, gene_filter, pct_dropout_mi } # run SVM tmp <- prepare_for_svm(ncol(object), svm_num_cells, svm_train_inds, svm_max) - + metadata(object)$sc3$svm_train_inds <- tmp$svm_train_inds metadata(object)$sc3$svm_study_inds <- tmp$svm_study_inds - + # update kmeans_nstart after defining SVM training indeces if (is.null(kmeans_nstart)) { if (length(tmp$svm_train_inds) <= 2000) { @@ -211,7 +206,7 @@ sc3_prepare.SingleCellExperiment <- function(object, gene_filter, pct_dropout_mi } else { metadata(object)$sc3$kmeans_nstart <- kmeans_nstart } - + # update the region of dimensions n_dim <- floor(d_region_min * length(tmp$svm_train_inds)):ceiling(d_region_max * length(tmp$svm_train_inds)) # for large datasets restrict the region of dimensions to 15 @@ -219,25 +214,9 @@ sc3_prepare.SingleCellExperiment <- function(object, gene_filter, pct_dropout_mi n_dim <- sample(n_dim, 15) } } - + metadata(object)$sc3$n_dim <- n_dim - - metadata(object)$sc3$rand_seed <- rand_seed - - # register computing cluster (N-1 CPUs) on a local machine - if (is.null(n_cores)) { - n_cores <- parallel::detectCores() - if (is.null(n_cores)) { - return("Cannot define a number of available CPU cores that can be used by SC3. Try to set the n_cores parameter in the sc3() function call.") - } - # leave one core for the user - if (n_cores > 1) { - n_cores <- n_cores - 1 - } - } - - metadata(object)$sc3$n_cores <- n_cores - + return(object) } @@ -246,14 +225,14 @@ sc3_prepare.SingleCellExperiment <- function(object, gene_filter, pct_dropout_mi setMethod("sc3_prepare", signature(object = "SingleCellExperiment"), sc3_prepare.SingleCellExperiment) #' Estimate the optimal number of cluster \code{k} for a scRNA-Seq expression matrix -#' +#' #' Uses Tracy-Widom theory on random matrices to estimate the optimal number of #' clusters \code{k}. 
It creates and populates the \code{k_estimation} item of the #' \code{sc3} slot of the \code{metadata(object)}. -#' +#' #' @name sc3_estimate_k #' @aliases sc3_estimate_k sc3_estimate_k,SingleCellExperiment-method -#' +#' #' @param object an object of \code{SingleCellExperiment} class #' @return an estimated value of k sc3_estimate_k.SingleCellExperiment <- function(object) { @@ -269,61 +248,46 @@ sc3_estimate_k.SingleCellExperiment <- function(object) { setMethod("sc3_estimate_k", signature(object = "SingleCellExperiment"), sc3_estimate_k.SingleCellExperiment) #' Calculate distances between the cells. -#' +#' #' This function calculates distances between the cells. It #' creates and populates the following items of the \code{sc3} slot of the \code{metadata(object)}: #' \itemize{ #' \item \code{distances} - contains a list of distance matrices corresponding to #' Euclidean, Pearson and Spearman distances. #' } -#' +#' #' @name sc3_calc_dists #' @aliases sc3_calc_dists, sc3_calc_dists,SingleCellExperiment-method -#' +#' #' @param object an object of \code{SingleCellExperiment} class -#' +#' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}} object specifying a type of parallelism and allocated resources, +#' including a seed for random number generator. 
+#' #' @return an object of \code{SingleCellExperiment} class -#' -#' @importFrom doRNG %dorng% -#' @importFrom foreach foreach %dopar% -#' @importFrom parallel makeCluster stopCluster -#' @importFrom doParallel registerDoParallel -sc3_calc_dists.SingleCellExperiment <- function(object) { +sc3_calc_dists.SingleCellExperiment <- function(object, BPPARAM) { dataset <- get_processed_dataset(object) - + # check whether in the SVM regime if (!is.null(metadata(object)$sc3$svm_train_inds)) { dataset <- dataset[, metadata(object)$sc3$svm_train_inds] } - + # NULLing the variables to avoid notes in R CMD CHECK i <- NULL - + distances <- c("euclidean", "pearson", "spearman") - + message("Calculating distances between the cells...") - - if (metadata(object)$sc3$n_cores > length(distances)) { - n_cores <- length(distances) - } else { - n_cores <- metadata(object)$sc3$n_cores - } - - cl <- parallel::makeCluster(n_cores, outfile = "") - doParallel::registerDoParallel(cl, cores = n_cores) - + # calculate distances in parallel - dists <- foreach::foreach(i = distances) %dorng% { + dists <- BiocParallel::bplapply(distances, BPPARAM = BPPARAM, FUN = function(i, dataset) { try({ calculate_distance(dataset, i) }) - } - - # stop local cluster - parallel::stopCluster(cl) - + }, dataset = dataset) + names(dists) <- distances - + metadata(object)$sc3$distances <- dists return(object) } @@ -333,69 +297,54 @@ sc3_calc_dists.SingleCellExperiment <- function(object) { setMethod("sc3_calc_dists", signature(object = "SingleCellExperiment"), sc3_calc_dists.SingleCellExperiment) #' Calculate transformations of the distance matrices. 
-#' -#' This function transforms all \code{distances} items of the \code{sc3} slot of -#' the \code{metadata(object)} using either principal component analysis (PCA) +#' +#' This function transforms all \code{distances} items of the \code{sc3} slot of +#' the \code{metadata(object)} using either principal component analysis (PCA) #' or by calculating the eigenvectors of the associated graph Laplacian. -#' The columns of the resulting matrices are then sorted in descending order -#' by their corresponding eigenvalues. The first \code{d} columns -#' (where \code{d = max(metadata(object)$sc3$n_dim)}) of each transformation are then +#' The columns of the resulting matrices are then sorted in descending order +#' by their corresponding eigenvalues. The first \code{d} columns +#' (where \code{d = max(metadata(object)$sc3$n_dim)}) of each transformation are then #' written to the \code{transformations} item of the \code{sc3} slot. #' Additionally, this function also removes the previously calculated \code{distances} from #' the \code{sc3} slot, as they are not needed for further analysis. -#' +#' #' @name sc3_calc_transfs #' @aliases sc3_calc_transfs, sc3_calc_transfs,SingleCellExperiment-method -#' +#' #' @param object an object of \code{SingleCellExperiment} class -#' +#' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}} object specifying a type of parallelism and allocated resources, +#' including a seed for random number generator. 
+#' #' @return an object of \code{SingleCellExperiment} class -#' -#' @importFrom doRNG %dorng% -#' @importFrom foreach foreach -#' @importFrom parallel makeCluster stopCluster -#' @importFrom doParallel registerDoParallel -sc3_calc_transfs.SingleCellExperiment <- function(object) { +sc3_calc_transfs.SingleCellExperiment <- function(object, BPPARAM) { dists <- metadata(object)$sc3$distances if (is.null(dists)) { stop(paste0("Please run sc3_calc_dists() first!")) return(object) } - + # NULLing the variables to avoid notes in R CMD CHECK i <- NULL - + distances <- names(dists) transformations <- c("pca", "laplacian") - + n_dim <- metadata(object)$sc3$n_dim - + hash.table <- expand.grid(dists = distances, transfs = transformations, stringsAsFactors = FALSE) - + message("Performing transformations and calculating eigenvectors...") - - if (metadata(object)$sc3$n_cores > nrow(hash.table)) { - n_cores <- nrow(hash.table) - } else { - n_cores <- metadata(object)$sc3$n_cores - } - - cl <- parallel::makeCluster(n_cores, outfile = "") - doParallel::registerDoParallel(cl, cores = n_cores) - + # calculate the 6 distinct transformations in parallel - transfs <- foreach::foreach(i = 1:nrow(hash.table)) %dorng% { + transfs <- BiocParallel::bplapply(1:nrow(hash.table), BPPARAM = BPPARAM, FUN = function(i, hash.table, dists, n_dim) { try({ tmp <- transformation(get(hash.table[i, 1], dists), hash.table[i, 2]) tmp[, 1:max(n_dim)] }) - } - - # stop local cluster - parallel::stopCluster(cl) - + }, hash.table = hash.table, dists = dists, n_dim = n_dim) + names(transfs) <- paste(hash.table[, 1], hash.table[, 2], sep = "_") - + metadata(object)$sc3$transformations <- transfs # remove distances after calculating transformations metadata(object)$sc3$distances <- NULL @@ -412,77 +361,67 @@ sc3_calc_transfs.SingleCellExperiment <- function(object) { setMethod("sc3_calc_transfs", signature(object = "SingleCellExperiment"), sc3_calc_transfs.SingleCellExperiment) #' \code{kmeans} clustering of 
cells. -#' -#' This function performs \code{\link[stats]{kmeans}} clustering of the matrices +#' +#' This function performs \code{\link[stats]{kmeans}} clustering of the matrices #' contained in the \code{transformations} item of the \code{sc3} slot of the \code{metadata(object)}. It then #' creates and populates the following items of the \code{sc3} slot: #' \itemize{ #' \item \code{kmeans} - contains a list of kmeans clusterings. #' } -#' +#' #' @name sc3_kmeans #' @aliases sc3_kmeans, sc3_kmeans,SingleCellExperiment-method -#' +#' #' @param object an object of \code{SingleCellExperiment} class #' @param ks a continuous range of integers - the number of clusters \code{k} to be used for SC3 clustering. #' Can also be a single integer. -#' +#' #' @return an object of \code{SingleCellExperiment} class -#' -#' @importFrom doRNG %dorng% -#' @importFrom foreach foreach -#' @importFrom parallel makeCluster stopCluster -#' @importFrom doParallel registerDoParallel +#' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}} object specifying a type of parallelism and allocated resources, +#' including a seed for random number generator. 
+#' #' @importFrom utils setTxtProgressBar txtProgressBar #' @importFrom stats kmeans -sc3_kmeans.SingleCellExperiment <- function(object, ks) { +sc3_kmeans.SingleCellExperiment <- function(object, ks, BPPARAM) { if (is.null(ks)) { stop(paste0("Please provide a range of the number of clusters `ks` to be used by SC3!")) return(object) } - + transfs <- metadata(object)$sc3$transformations if (is.null(transfs)) { stop(paste0("Please run sc3_calc_transfs() first!")) return(object) } - + # NULLing the variables to avoid notes in R CMD CHECK i <- NULL - + n_dim <- metadata(object)$sc3$n_dim - + hash.table <- expand.grid(transf = names(transfs), ks = ks, n_dim = n_dim, stringsAsFactors = FALSE) - + message("Performing k-means clustering...") - - n_cores <- metadata(object)$sc3$n_cores - + kmeans_iter_max <- metadata(object)$sc3$kmeans_iter_max kmeans_nstart <- metadata(object)$sc3$kmeans_nstart - - cl <- parallel::makeCluster(n_cores, outfile = "") - doParallel::registerDoParallel(cl, cores = n_cores) - + pb <- utils::txtProgressBar(min = 1, max = nrow(hash.table), style = 3) - + # calculate the 6 distinct transformations in parallel - labs <- foreach::foreach(i = 1:nrow(hash.table)) %dorng% { + labs <- BiocParallel::bplapply(1:nrow(hash.table), BPPARAM = BPPARAM, FUN = function(i, hash.table, transfs, n_dim, kmeans_iter_max, kmeans_nstart, pb) { try({ utils::setTxtProgressBar(pb, i) transf <- get(hash.table$transf[i], transfs) - stats::kmeans(transf[, 1:hash.table$n_dim[i]], hash.table$ks[i], iter.max = kmeans_iter_max, - nstart = kmeans_nstart)$cluster + stats::kmeans(transf[, 1:hash.table$n_dim[i]], hash.table$ks[i], iter.max = kmeans_iter_max, + nstart = kmeans_nstart)$cluster }) - } - + }, hash.table = hash.table, transfs = transfs, n_dim = n_dim, kmeans_iter_max = kmeans_iter_max, kmeans_nstart = kmeans_nstart, pb = pb) + close(pb) - - # stop local cluster - parallel::stopCluster(cl) - + names(labs) <- paste(hash.table$transf, hash.table$ks, hash.table$n_dim, sep = 
"_") - + metadata(object)$sc3$kmeans <- labs return(object) } @@ -492,60 +431,49 @@ sc3_kmeans.SingleCellExperiment <- function(object, ks) { setMethod("sc3_kmeans", signature(object = "SingleCellExperiment"), sc3_kmeans.SingleCellExperiment) #' Calculate consensus matrix. -#' +#' #' This function calculates consensus matrices based on the clustering solutions #' contained in the \code{kmeans} item of the \code{sc3} slot of the \code{metadata(object)}. It then -#' creates and populates the \code{consensus} item of the \code{sc3} slot with +#' creates and populates the \code{consensus} item of the \code{sc3} slot with #' consensus matrices, their hierarchical clusterings in \code{hclust} objects, -#' and Silhouette indeces of the clusters. It also removes the previously +#' and Silhouette indeces of the clusters. It also removes the previously #' calculated \code{kmeans} clusterings from #' the \code{sc3} slot, as they are not needed for further analysis. -#' +#' #' Additionally, it also adds new columns to the \code{colData} slot of the #' input \code{object}. The column names correspond to the consensus cell labels -#' and have the following format: \code{sc3_k_clusters}, where \code{k} is the +#' and have the following format: \code{sc3_k_clusters}, where \code{k} is the #' number of clusters. -#' +#' #' @name sc3_calc_consens #' @aliases sc3_calc_consens, sc3_calc_consens,SingleCellExperiment-method -#' +#' #' @param object an object of \code{SingleCellExperiment} class -#' +#' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}} object specifying a type of parallelism and allocated resources, +#' including a seed for random number generator. 
+#' #' @return an object of \code{SingleCellExperiment} class -#' -#' @importFrom doRNG %dorng% -#' @importFrom foreach foreach -#' @importFrom parallel makeCluster stopCluster -#' @importFrom doParallel registerDoParallel +#' #' @import cluster #' @importFrom stats hclust dist as.dist -#' +#' #' @useDynLib SC3 #' @import Rcpp -sc3_calc_consens.SingleCellExperiment <- function(object) { +sc3_calc_consens.SingleCellExperiment <- function(object, BPPARAM) { k.means <- metadata(object)$sc3$kmeans if (is.null(k.means)) { stop(paste0("Please run sc3_kmeans() first!")) return(object) } - + # NULLing the variables to avoid notes in R CMD CHECK i <- NULL - + ks <- as.numeric(unique(unlist(lapply(strsplit(names(k.means), "_"), "[[", 3)))) - - if (metadata(object)$sc3$n_cores > length(ks)) { - n_cores <- length(ks) - } else { - n_cores <- metadata(object)$sc3$n_cores - } - + message("Calculating consensus matrix...") - - cl <- parallel::makeCluster(n_cores, outfile = "") - doParallel::registerDoParallel(cl, cores = n_cores) - - cons <- foreach::foreach(i = ks) %dorng% { + + cons <- BiocParallel::bplapply(ks, BPPARAM = BPPARAM, FUN = function(i, k.means) { try({ d <- k.means[grep(paste0("_", i, "_"), names(k.means))] d <- matrix(unlist(d), nrow = length(d[[1]])) @@ -556,16 +484,13 @@ sc3_calc_consens.SingleCellExperiment <- function(object) { diss <- stats::as.dist(as.matrix(stats::as.dist(tmp))) hc <- stats::hclust(diss) clusts <- reindex_clusters(hc, i) - + silh <- cluster::silhouette(clusts, diss) - + list(consensus = dat, hc = hc, silhouette = silh) }) - } - - # stop local cluster - parallel::stopCluster(cl) - + }, k.means = k.means) + names(cons) <- ks if(is.null(metadata(object)$sc3$consensus)) { metadata(object)$sc3$consensus <- list() @@ -573,10 +498,10 @@ sc3_calc_consens.SingleCellExperiment <- function(object) { for (n in names(cons)) { metadata(object)$sc3$consensus[[n]] <- cons[[n]] } - + # remove kmeans results after calculating consensus 
metadata(object)$sc3$kmeans <- NULL - + p_data <- colData(object) for (k in ks) { hc <- metadata(object)$sc3$consensus[[as.character(k)]]$hc @@ -590,7 +515,7 @@ sc3_calc_consens.SingleCellExperiment <- function(object) { p_data[, paste0("sc3_", k, "_clusters")] <- factor(clusts, levels = sort(unique(clusts))) } colData(object) <- as(p_data, "DataFrame") - + return(object) } @@ -600,54 +525,52 @@ setMethod("sc3_calc_consens", signature(object = "SingleCellExperiment"), sc3_ca #' Calculate DE genes, marker genes and cell outliers. -#' -#' This function calculates differentially expressed (DE) genes, marker genes +#' +#' This function calculates differentially expressed (DE) genes, marker genes #' and cell outliers based on the consensus \code{SC3} clusterings. -#' -#' DE genes are calculated using \code{\link{get_de_genes}}. Results of the DE -#' analysis are saved as new columns in the -#' \code{featureData} slot of the input \code{object}. The column names correspond -#' to the adjusted \code{p-value}s of the genes and have the following format: +#' +#' DE genes are calculated using \code{\link{get_de_genes}}. Results of the DE +#' analysis are saved as new columns in the +#' \code{featureData} slot of the input \code{object}. The column names correspond +#' to the adjusted \code{p-value}s of the genes and have the following format: #' \code{sc3_k_de_padj}, where \code{k} is the number of clusters. -#' -#' Marker genes are calculated using \code{\link{get_marker_genes}}. -#' Results of the marker gene analysis are saved as three new -#' columns (for each \code{k}) to the -#' \code{featureData} slot of the input \code{object}. The column names correspond -#' to the \code{SC3} cluster labels, to the adjusted \code{p-value}s of the genes +#' +#' Marker genes are calculated using \code{\link{get_marker_genes}}. +#' Results of the marker gene analysis are saved as three new +#' columns (for each \code{k}) to the +#' \code{featureData} slot of the input \code{object}. 
The column names correspond +#' to the \code{SC3} cluster labels, to the adjusted \code{p-value}s of the genes #' and to the area under the ROC curve -#' and have the following format: \code{sc3_k_markers_clusts}, -#' \code{sc3_k_markers_padj} and \code{sc3_k_markers_auroc}, where \code{k} is +#' and have the following format: \code{sc3_k_markers_clusts}, +#' \code{sc3_k_markers_padj} and \code{sc3_k_markers_auroc}, where \code{k} is #' the number of clusters. -#' -#' Outlier cells are calculated using \code{\link{get_outl_cells}}. Results of the -#' cell outlier analysis are saved as new columns in the -#' \code{phenoData} slot of the input \code{object}. The column names correspond -#' to the \code{log2(outlier_score)} and have the following format: +#' +#' Outlier cells are calculated using \code{\link{get_outl_cells}}. Results of the +#' cell outlier analysis are saved as new columns in the +#' \code{phenoData} slot of the input \code{object}. The column names correspond +#' to the \code{log2(outlier_score)} and have the following format: #' \code{sc3_k_log2_outlier_score}, where \code{k} is the number of clusters. -#' +#' #' Additionally, \code{biology} item is added to the \code{sc3} slot and is set to #' \code{TRUE} indicating that the biological analysis of the dataset has been #' performed. -#' +#' #' @name sc3_calc_biology #' @aliases sc3_calc_biology, sc3_calc_biology,SingleCellExperiment-method -#' +#' #' @param object an object of \code{SingleCellExperiment} class #' @param ks a continuous range of integers - the number of clusters \code{k} to be used for SC3 clustering. #' Can also be a single integer. #' @param regime defines what biological analysis to perform. "marker" for #' marker genes, "de" for differentiall expressed genes and "outl" for outlier #' cells -#' +#' @param BPPARAM a \code{\link[BiocParallel]{BiocParallelParam}} object specifying a type of parallelism and allocated resources, +#' including a seed for random number generator. 
+#' #' @return an object of \code{SingleCellExperiment} class -#' -#' @importFrom doRNG %dorng% -#' @importFrom foreach foreach -#' @importFrom parallel makeCluster stopCluster -#' @importFrom doParallel registerDoParallel +#' #' @importFrom methods as -sc3_calc_biology.SingleCellExperiment <- function(object, ks, regime) { +sc3_calc_biology.SingleCellExperiment <- function(object, ks, regime, BPPARAM) { if (is.null(metadata(object)$sc3$consensus)) { stop(paste0("Please run sc3_consensus() first!")) return(object) @@ -667,11 +590,11 @@ sc3_calc_biology.SingleCellExperiment <- function(object, ks, regime) { stop(paste0("Regime value must be either 'marker', 'de' or 'outl', or any combination of these three!")) return(object) } - + message("Calculating biology...") - + hash.table <- expand.grid(ks = ks, regime = regime, stringsAsFactors = FALSE) - + dataset <- get_processed_dataset(object) p_data <- colData(object) clusts <- as.data.frame(p_data[, grep("sc3_.*_clusters", colnames(p_data))]) @@ -682,30 +605,18 @@ sc3_calc_biology.SingleCellExperiment <- function(object, ks, regime) { dataset <- dataset[, metadata(object)$sc3$svm_train_inds] clusts <- clusts[metadata(object)$sc3$svm_train_inds, ] } - + # NULLing the variables to avoid notes in R CMD CHECK i <- NULL - - if (metadata(object)$sc3$n_cores > nrow(hash.table)) { - n_cores <- nrow(hash.table) - } else { - n_cores <- metadata(object)$sc3$n_cores - } - - cl <- parallel::makeCluster(n_cores, outfile = "") - doParallel::registerDoParallel(cl, cores = n_cores) - - biol <- foreach::foreach(i = 1:nrow(hash.table)) %dorng% { + + biol <- BiocParallel::bplapply(1:nrow(hash.table), BPPARAM = BPPARAM, FUN = function(i, dataset, hash.table, clusts) { try({ get_biolgy(dataset, clusts[, paste0("sc3_", hash.table[i, 1], "_clusters")], hash.table[i, 2]) }) - } - - # stop local cluster - parallel::stopCluster(cl) - + }, dataset = dataset, hash.table = hash.table, clusts = clusts) + names(biol) <- paste(hash.table$ks, 
hash.table$regime, sep = "_") - + f_data <- as.data.frame(rowData(object)) p_data <- as.data.frame(colData(object)) for (b in names(biol)) { @@ -721,11 +632,11 @@ sc3_calc_biology.SingleCellExperiment <- function(object, ks, regime) { f_data[, paste0("sc3_", k, "_markers_clusts")] <- NA f_data[, paste0("sc3_", k, "_markers_padj")] <- NA f_data[, paste0("sc3_", k, "_markers_auroc")] <- NA - f_data[, paste0("sc3_", k, "_markers_clusts")][which(f_data$sc3_gene_filter)] <- biol[[b]][, + f_data[, paste0("sc3_", k, "_markers_clusts")][which(f_data$sc3_gene_filter)] <- biol[[b]][, 2] - f_data[, paste0("sc3_", k, "_markers_padj")][which(f_data$sc3_gene_filter)] <- biol[[b]][, + f_data[, paste0("sc3_", k, "_markers_padj")][which(f_data$sc3_gene_filter)] <- biol[[b]][, 3] - f_data[, paste0("sc3_", k, "_markers_auroc")][which(f_data$sc3_gene_filter)] <- biol[[b]][, + f_data[, paste0("sc3_", k, "_markers_auroc")][which(f_data$sc3_gene_filter)] <- biol[[b]][, 1] } # save cell outliers @@ -742,9 +653,9 @@ sc3_calc_biology.SingleCellExperiment <- function(object, ks, regime) { } rowData(object) <- as(f_data, "DataFrame") colData(object) <- as(p_data, "DataFrame") - + metadata(object)$sc3$biology <- TRUE - + return(object) } @@ -753,24 +664,24 @@ sc3_calc_biology.SingleCellExperiment <- function(object, ks, regime) { setMethod("sc3_calc_biology", signature(object = "SingleCellExperiment"), sc3_calc_biology.SingleCellExperiment) #' Run the hybrid \code{SVM} approach. -#' +#' #' This method parallelize \code{SVM} prediction for each \code{k} (the number -#' of clusters). Namely, for each \code{k}, \code{\link{support_vector_machines}} +#' of clusters). Namely, for each \code{k}, \code{\link{support_vector_machines}} #' function is utilized to predict the labels of study cells. Training cells are #' selected using \code{svm_train_inds} item of the \code{sc3} slot of the #' \code{metadata(object)}. 
-#' -#' Results are written to the \code{sc3_k_clusters} columns to the -#' \code{colData} slot of the input \code{object}, where \code{k} is the +#' +#' Results are written to the \code{sc3_k_clusters} columns to the +#' \code{colData} slot of the input \code{object}, where \code{k} is the #' number of clusters. -#' +#' #' @name sc3_run_svm #' @aliases sc3_run_svm, sc3_run_svm,SingleCellExperiment-method -#' +#' #' @param object an object of \code{SingleCellExperiment} class #' @param ks a continuous range of integers - the number of clusters \code{k} to be used for SC3 clustering. #' Can also be a single integer. -#' +#' #' @return an object of \code{SingleCellExperiment} class sc3_run_svm.SingleCellExperiment <- function(object, ks) { if (is.null(metadata(object)$sc3$svm_train_inds)) { @@ -781,23 +692,23 @@ sc3_run_svm.SingleCellExperiment <- function(object, ks) { stop(paste0("Please provide a range of the number of clusters `ks` to be used by SC3!")) return(object) } - + dataset <- get_processed_dataset(object) p_data <- colData(object) svm_train_inds <- metadata(object)$sc3$svm_train_inds svm_study_inds <- metadata(object)$sc3$svm_study_inds - + for (k in ks) { clusts <- p_data[, paste0("sc3_", k, "_clusters")] clusts <- clusts[svm_train_inds] - + train.dataset <- dataset[, svm_train_inds] colnames(train.dataset) <- clusts - + study.labs <- support_vector_machines(train.dataset, dataset[, svm_study_inds], "linear") svm.labs <- c(clusts, study.labs) ord <- order(c(svm_train_inds, svm_study_inds)) - + p_data[, paste0("sc3_", k, "_clusters")] <- svm.labs[ord] } colData(object) <- as(p_data, "DataFrame") @@ -809,26 +720,26 @@ sc3_run_svm.SingleCellExperiment <- function(object, ks) { setMethod("sc3_run_svm", signature(object = "SingleCellExperiment"), sc3_run_svm.SingleCellExperiment) #' Write \code{SC3} results to Excel file -#' +#' #' This function writes all \code{SC3} results to an excel file. 
-#' +#' #' @param object an object of \code{SingleCellExperiment} class #' @param filename name of the excel file, to which the results will be written -#' +#' #' @name sc3_export_results_xls #' @aliases sc3_export_results_xls -#' +#' #' @importFrom WriteXLS WriteXLS sc3_export_results_xls.SingleCellExperiment <- function(object, filename) { if (is.null(metadata(object)$sc3$consensus)) { stop(paste0("Please run sc3_consensus() first!")) } - + p_data <- colData(object) f_data <- rowData(object) - + res <- list() - + if(length(grep("sc3_", colnames(p_data))) != 0) { cells <- as.data.frame(p_data[, grep("sc3_", colnames(p_data))]) colnames(cells) <- colnames(p_data)[grep("sc3_", colnames(p_data))] @@ -845,9 +756,9 @@ sc3_export_results_xls.SingleCellExperiment <- function(object, filename) { } else { warning("There is no gene data provided by SC3!") } - + if(length(res) != 0) { - WriteXLS(res, ExcelFileName = filename, SheetNames = names(res), + WriteXLS(res, ExcelFileName = filename, SheetNames = names(res), row.names = TRUE, AdjWidth = TRUE) } else { warning("There are no SC3 results in your data object, the Excel file will not be produced. Please run SC3 first!") diff --git a/man/ann.Rd b/man/ann.Rd index 74a7ba0..3cf9f87 100644 --- a/man/ann.Rd +++ b/man/ann.Rd @@ -4,7 +4,9 @@ \name{ann} \alias{ann} \title{Cell type annotations for data extracted from a publication by Yan et al.} -\format{An object of class \code{data.frame} with 90 rows and 1 columns.} +\format{ +An object of class \code{data.frame} with 90 rows and 1 columns. 
+} \source{ \url{http://dx.doi.org/10.1038/nsmb.2660} diff --git a/man/sc3.Rd b/man/sc3.Rd index ca4c06f..3700de5 100644 --- a/man/sc3.Rd +++ b/man/sc3.Rd @@ -1,24 +1,46 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/CoreMethods.R -\docType{methods} \name{sc3} \alias{sc3} \alias{sc3.SingleCellExperiment} \alias{sc3,SingleCellExperiment-method} -\alias{sc3} \title{Run all steps of \code{SC3} in one go} \usage{ -sc3.SingleCellExperiment(object, ks, gene_filter, pct_dropout_min, - pct_dropout_max, d_region_min, d_region_max, svm_num_cells, svm_train_inds, - svm_max, n_cores, kmeans_nstart, kmeans_iter_max, k_estimator, biology, - rand_seed) - -\S4method{sc3}{SingleCellExperiment}(object, ks = NULL, gene_filter = TRUE, - pct_dropout_min = 10, pct_dropout_max = 90, d_region_min = 0.04, - d_region_max = 0.07, svm_num_cells = NULL, svm_train_inds = NULL, - svm_max = 5000, n_cores = NULL, kmeans_nstart = NULL, - kmeans_iter_max = 1e+09, k_estimator = FALSE, biology = FALSE, - rand_seed = 1) +sc3.SingleCellExperiment( + object, + ks, + gene_filter, + pct_dropout_min, + pct_dropout_max, + d_region_min, + d_region_max, + svm_num_cells, + svm_train_inds, + svm_max, + kmeans_nstart, + kmeans_iter_max, + k_estimator, + biology, + BPPARAM +) + +\S4method{sc3}{SingleCellExperiment}( + object, + ks = NULL, + gene_filter = TRUE, + pct_dropout_min = 10, + pct_dropout_max = 90, + d_region_min = 0.04, + d_region_max = 0.07, + svm_num_cells = NULL, + svm_train_inds = NULL, + svm_max = 5000, + kmeans_nstart = NULL, + kmeans_iter_max = 1e+09, + k_estimator = FALSE, + biology = FALSE, + BPPARAM = BiocParallel::bpparam() +) } \arguments{ \item{object}{an object of \code{SingleCellExperiment} class.} @@ -26,47 +48,44 @@ sc3.SingleCellExperiment(object, ks, gene_filter, pct_dropout_min, \item{ks}{a range of the number of clusters \code{k} used for \code{SC3} clustering. 
Can also be a single integer.} -\item{gene_filter}{a boolen variable which defines whether to perform gene +\item{gene_filter}{a boolen variable which defines whether to perform gene filtering before SC3 clustering.} -\item{pct_dropout_min}{if \code{gene_filter = TRUE}, then genes with percent of dropouts smaller than +\item{pct_dropout_min}{if \code{gene_filter = TRUE}, then genes with percent of dropouts smaller than \code{pct_dropout_min} are filtered out before clustering.} -\item{pct_dropout_max}{if \code{gene_filter = TRUE}, then genes with percent of dropouts larger than +\item{pct_dropout_max}{if \code{gene_filter = TRUE}, then genes with percent of dropouts larger than \code{pct_dropout_max} are filtered out before clustering.} -\item{d_region_min}{defines the minimum number of eigenvectors used for +\item{d_region_min}{defines the minimum number of eigenvectors used for kmeans clustering as a fraction of the total number of cells. Default is \code{0.04}. See \code{SC3} paper for more details.} -\item{d_region_max}{defines the maximum number of eigenvectors used for +\item{d_region_max}{defines the maximum number of eigenvectors used for kmeans clustering as a fraction of the total number of cells. Default is \code{0.07}. See \code{SC3} paper for more details.} -\item{svm_num_cells}{number of randomly selected training cells to be used +\item{svm_num_cells}{number of randomly selected training cells to be used for SVM prediction. The default is \code{NULL}.} -\item{svm_train_inds}{a numeric vector defining indeces of training cells +\item{svm_train_inds}{a numeric vector defining indeces of training cells that should be used for SVM training. The default is \code{NULL}.} \item{svm_max}{define the maximum number of cells below which SVM is not run.} -\item{n_cores}{defines the number of cores to be used on the user's machine. 
If not set, `SC3` will use all but one cores of your machine.} - -\item{kmeans_nstart}{nstart parameter passed to \code{\link[stats]{kmeans}} function. Can be set manually. By default it is +\item{kmeans_nstart}{nstart parameter passed to \code{\link[stats]{kmeans}} function. Can be set manually. By default it is \code{1000} for up to \code{2000} cells and \code{50} for more than \code{2000} cells.} -\item{kmeans_iter_max}{iter.max parameter passed to \code{\link[stats]{kmeans}} +\item{kmeans_iter_max}{iter.max parameter passed to \code{\link[stats]{kmeans}} function.} \item{k_estimator}{boolean parameter, defines whether to estimate an optimal number of clusters \code{k}. If user has already defined the ks parameter the estimation does not affect the user's paramater.} -\item{biology}{boolean parameter, defines whether to compute differentially expressed genes, marker +\item{biology}{boolean parameter, defines whether to compute differentially expressed genes, marker genes and cell outliers.} -\item{rand_seed}{sets the seed of the random number generator. 
\code{SC3} is a stochastic -method, so setting the \code{rand_seed} to a fixed values can be used for reproducibility -purposes.} +\item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}} object specifying a type of parallelism and allocated resources, +including a seed for random number generator.} } \value{ an object of \code{SingleCellExperiment} class diff --git a/man/sc3_calc_biology.Rd b/man/sc3_calc_biology.Rd index ee02197..7a887f8 100644 --- a/man/sc3_calc_biology.Rd +++ b/man/sc3_calc_biology.Rd @@ -1,19 +1,20 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/CoreMethods.R -\docType{methods} \name{sc3_calc_biology} \alias{sc3_calc_biology} \alias{sc3_calc_biology.SingleCellExperiment} \alias{sc3_calc_biology,} \alias{sc3_calc_biology,SingleCellExperiment-method} -\alias{sc3_calc_biology,SingleCellExperiment-method} -\alias{sc3_calc_biology} \title{Calculate DE genes, marker genes and cell outliers.} \usage{ -sc3_calc_biology.SingleCellExperiment(object, ks, regime) +sc3_calc_biology.SingleCellExperiment(object, ks, regime, BPPARAM) -\S4method{sc3_calc_biology}{SingleCellExperiment}(object, ks = NULL, - regime = NULL) +\S4method{sc3_calc_biology}{SingleCellExperiment}( + object, + ks = NULL, + regime = NULL, + BPPARAM = BiocParallel::bpparam() +) } \arguments{ \item{object}{an object of \code{SingleCellExperiment} class} @@ -24,35 +25,38 @@ Can also be a single integer.} \item{regime}{defines what biological analysis to perform. 
"marker" for marker genes, "de" for differentiall expressed genes and "outl" for outlier cells} + +\item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}} object specifying a type of parallelism and allocated resources, +including a seed for random number generator.} } \value{ an object of \code{SingleCellExperiment} class } \description{ -This function calculates differentially expressed (DE) genes, marker genes +This function calculates differentially expressed (DE) genes, marker genes and cell outliers based on the consensus \code{SC3} clusterings. } \details{ -DE genes are calculated using \code{\link{get_de_genes}}. Results of the DE -analysis are saved as new columns in the -\code{featureData} slot of the input \code{object}. The column names correspond -to the adjusted \code{p-value}s of the genes and have the following format: +DE genes are calculated using \code{\link{get_de_genes}}. Results of the DE +analysis are saved as new columns in the +\code{featureData} slot of the input \code{object}. The column names correspond +to the adjusted \code{p-value}s of the genes and have the following format: \code{sc3_k_de_padj}, where \code{k} is the number of clusters. -Marker genes are calculated using \code{\link{get_marker_genes}}. -Results of the marker gene analysis are saved as three new -columns (for each \code{k}) to the -\code{featureData} slot of the input \code{object}. The column names correspond -to the \code{SC3} cluster labels, to the adjusted \code{p-value}s of the genes +Marker genes are calculated using \code{\link{get_marker_genes}}. +Results of the marker gene analysis are saved as three new +columns (for each \code{k}) to the +\code{featureData} slot of the input \code{object}. 
The column names correspond +to the \code{SC3} cluster labels, to the adjusted \code{p-value}s of the genes and to the area under the ROC curve -and have the following format: \code{sc3_k_markers_clusts}, -\code{sc3_k_markers_padj} and \code{sc3_k_markers_auroc}, where \code{k} is +and have the following format: \code{sc3_k_markers_clusts}, +\code{sc3_k_markers_padj} and \code{sc3_k_markers_auroc}, where \code{k} is the number of clusters. -Outlier cells are calculated using \code{\link{get_outl_cells}}. Results of the -cell outlier analysis are saved as new columns in the -\code{phenoData} slot of the input \code{object}. The column names correspond -to the \code{log2(outlier_score)} and have the following format: +Outlier cells are calculated using \code{\link{get_outl_cells}}. Results of the +cell outlier analysis are saved as new columns in the +\code{phenoData} slot of the input \code{object}. The column names correspond +to the \code{log2(outlier_score)} and have the following format: \code{sc3_k_log2_outlier_score}, where \code{k} is the number of clusters. 
Additionally, \code{biology} item is added to the \code{sc3} slot and is set to diff --git a/man/sc3_calc_consens.Rd b/man/sc3_calc_consens.Rd index 7a7c4b1..443b8ec 100644 --- a/man/sc3_calc_consens.Rd +++ b/man/sc3_calc_consens.Rd @@ -1,21 +1,21 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/CoreMethods.R -\docType{methods} \name{sc3_calc_consens} \alias{sc3_calc_consens} \alias{sc3_calc_consens.SingleCellExperiment} \alias{sc3_calc_consens,} \alias{sc3_calc_consens,SingleCellExperiment-method} -\alias{sc3_calc_consens,SingleCellExperiment-method} -\alias{sc3_calc_consens} \title{Calculate consensus matrix.} \usage{ -sc3_calc_consens.SingleCellExperiment(object) +sc3_calc_consens.SingleCellExperiment(object, BPPARAM) -\S4method{sc3_calc_consens}{SingleCellExperiment}(object) +\S4method{sc3_calc_consens}{SingleCellExperiment}(object, BPPARAM = BiocParallel::bpparam()) } \arguments{ \item{object}{an object of \code{SingleCellExperiment} class} + +\item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}} object specifying a type of parallelism and allocated resources, +including a seed for random number generator.} } \value{ an object of \code{SingleCellExperiment} class @@ -23,15 +23,15 @@ an object of \code{SingleCellExperiment} class \description{ This function calculates consensus matrices based on the clustering solutions contained in the \code{kmeans} item of the \code{sc3} slot of the \code{metadata(object)}. It then -creates and populates the \code{consensus} item of the \code{sc3} slot with +creates and populates the \code{consensus} item of the \code{sc3} slot with consensus matrices, their hierarchical clusterings in \code{hclust} objects, -and Silhouette indeces of the clusters. It also removes the previously +and Silhouette indeces of the clusters. It also removes the previously calculated \code{kmeans} clusterings from the \code{sc3} slot, as they are not needed for further analysis. 
} \details{ Additionally, it also adds new columns to the \code{colData} slot of the input \code{object}. The column names correspond to the consensus cell labels -and have the following format: \code{sc3_k_clusters}, where \code{k} is the +and have the following format: \code{sc3_k_clusters}, where \code{k} is the number of clusters. } diff --git a/man/sc3_calc_dists.Rd b/man/sc3_calc_dists.Rd index 53337a0..6c439a6 100644 --- a/man/sc3_calc_dists.Rd +++ b/man/sc3_calc_dists.Rd @@ -1,21 +1,21 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/CoreMethods.R -\docType{methods} \name{sc3_calc_dists} \alias{sc3_calc_dists} \alias{sc3_calc_dists.SingleCellExperiment} \alias{sc3_calc_dists,} \alias{sc3_calc_dists,SingleCellExperiment-method} -\alias{sc3_calc_dists,SingleCellExperiment-method} -\alias{sc3_calc_dists} \title{Calculate distances between the cells.} \usage{ -sc3_calc_dists.SingleCellExperiment(object) +sc3_calc_dists.SingleCellExperiment(object, BPPARAM) -\S4method{sc3_calc_dists}{SingleCellExperiment}(object) +\S4method{sc3_calc_dists}{SingleCellExperiment}(object, BPPARAM = BiocParallel::bpparam()) } \arguments{ \item{object}{an object of \code{SingleCellExperiment} class} + +\item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}} object specifying a type of parallelism and allocated resources, +including a seed for random number generator.} } \value{ an object of \code{SingleCellExperiment} class diff --git a/man/sc3_calc_transfs.Rd b/man/sc3_calc_transfs.Rd index 04b8a37..5da93eb 100644 --- a/man/sc3_calc_transfs.Rd +++ b/man/sc3_calc_transfs.Rd @@ -1,32 +1,32 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/CoreMethods.R -\docType{methods} \name{sc3_calc_transfs} \alias{sc3_calc_transfs} \alias{sc3_calc_transfs.SingleCellExperiment} \alias{sc3_calc_transfs,} \alias{sc3_calc_transfs,SingleCellExperiment-method} -\alias{sc3_calc_transfs,SingleCellExperiment-method} 
-\alias{sc3_calc_transfs} \title{Calculate transformations of the distance matrices.} \usage{ -sc3_calc_transfs.SingleCellExperiment(object) +sc3_calc_transfs.SingleCellExperiment(object, BPPARAM) -\S4method{sc3_calc_transfs}{SingleCellExperiment}(object) +\S4method{sc3_calc_transfs}{SingleCellExperiment}(object, BPPARAM = BiocParallel::bpparam()) } \arguments{ \item{object}{an object of \code{SingleCellExperiment} class} + +\item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}} object specifying a type of parallelism and allocated resources, +including a seed for random number generator.} } \value{ an object of \code{SingleCellExperiment} class } \description{ -This function transforms all \code{distances} items of the \code{sc3} slot of -the \code{metadata(object)} using either principal component analysis (PCA) +This function transforms all \code{distances} items of the \code{sc3} slot of +the \code{metadata(object)} using either principal component analysis (PCA) or by calculating the eigenvectors of the associated graph Laplacian. -The columns of the resulting matrices are then sorted in descending order -by their corresponding eigenvalues. The first \code{d} columns -(where \code{d = max(metadata(object)$sc3$n_dim)}) of each transformation are then +The columns of the resulting matrices are then sorted in descending order +by their corresponding eigenvalues. The first \code{d} columns +(where \code{d = max(metadata(object)$sc3$n_dim)}) of each transformation are then written to the \code{transformations} item of the \code{sc3} slot. Additionally, this function also removes the previously calculated \code{distances} from the \code{sc3} slot, as they are not needed for further analysis. 
diff --git a/man/sc3_estimate_k.Rd b/man/sc3_estimate_k.Rd index 46dc3e6..9b4a7ec 100644 --- a/man/sc3_estimate_k.Rd +++ b/man/sc3_estimate_k.Rd @@ -1,12 +1,9 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/CoreMethods.R -\docType{methods} \name{sc3_estimate_k} \alias{sc3_estimate_k} \alias{sc3_estimate_k.SingleCellExperiment} \alias{sc3_estimate_k,SingleCellExperiment-method} -\alias{sc3_estimate_k,SingleCellExperiment-method} -\alias{sc3_estimate_k} \title{Estimate the optimal number of cluster \code{k} for a scRNA-Seq expression matrix} \usage{ sc3_estimate_k.SingleCellExperiment(object) diff --git a/man/sc3_export_results_xls.Rd b/man/sc3_export_results_xls.Rd index 6de31ef..b6dca7b 100644 --- a/man/sc3_export_results_xls.Rd +++ b/man/sc3_export_results_xls.Rd @@ -1,17 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/CoreMethods.R -\docType{methods} \name{sc3_export_results_xls} \alias{sc3_export_results_xls} \alias{sc3_export_results_xls.SingleCellExperiment} \alias{sc3_export_results_xls,SingleCellExperiment-method} -\alias{sc3_export_results_xls} \title{Write \code{SC3} results to Excel file} \usage{ sc3_export_results_xls.SingleCellExperiment(object, filename) -\S4method{sc3_export_results_xls}{SingleCellExperiment}(object, - filename = "sc3_results.xls") +\S4method{sc3_export_results_xls}{SingleCellExperiment}(object, filename = "sc3_results.xls") } \arguments{ \item{object}{an object of \code{SingleCellExperiment} class} diff --git a/man/sc3_interactive.Rd b/man/sc3_interactive.Rd index f2434b1..ad39254 100644 --- a/man/sc3_interactive.Rd +++ b/man/sc3_interactive.Rd @@ -1,13 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/ShinyMethods.R -\docType{methods} \name{sc3_interactive} \alias{sc3_interactive} \alias{sc3_interactive.SingleCellExperiment} \alias{sc3_interactive,} \alias{sc3_interactive,SingleCellExperiment-method} 
-\alias{sc3_interactive,SingleCellExperiment-method} -\alias{sc3_interactive} \title{Opens \code{SC3} results in an interactive session in a web browser.} \usage{ sc3_interactive.SingleCellExperiment(object) diff --git a/man/sc3_kmeans.Rd b/man/sc3_kmeans.Rd index aa48727..92776ee 100644 --- a/man/sc3_kmeans.Rd +++ b/man/sc3_kmeans.Rd @@ -1,30 +1,30 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/CoreMethods.R -\docType{methods} \name{sc3_kmeans} \alias{sc3_kmeans} \alias{sc3_kmeans.SingleCellExperiment} \alias{sc3_kmeans,} \alias{sc3_kmeans,SingleCellExperiment-method} -\alias{sc3_kmeans,SingleCellExperiment-method} -\alias{sc3_kmeans} \title{\code{kmeans} clustering of cells.} \usage{ -sc3_kmeans.SingleCellExperiment(object, ks) +sc3_kmeans.SingleCellExperiment(object, ks, BPPARAM) -\S4method{sc3_kmeans}{SingleCellExperiment}(object, ks = NULL) +\S4method{sc3_kmeans}{SingleCellExperiment}(object, ks = NULL, BPPARAM = BiocParallel::bpparam()) } \arguments{ \item{object}{an object of \code{SingleCellExperiment} class} \item{ks}{a continuous range of integers - the number of clusters \code{k} to be used for SC3 clustering. Can also be a single integer.} + +\item{BPPARAM}{a \code{\link[BiocParallel]{BiocParallelParam}} object specifying a type of parallelism and allocated resources, +including a seed for random number generator.} } \value{ an object of \code{SingleCellExperiment} class } \description{ -This function performs \code{\link[stats]{kmeans}} clustering of the matrices +This function performs \code{\link[stats]{kmeans}} clustering of the matrices contained in the \code{transformations} item of the \code{sc3} slot of the \code{metadata(object)}. 
It then creates and populates the following items of the \code{sc3} slot: \itemize{ diff --git a/man/sc3_plot_cluster_stability.Rd b/man/sc3_plot_cluster_stability.Rd index 3bb2b41..e963375 100644 --- a/man/sc3_plot_cluster_stability.Rd +++ b/man/sc3_plot_cluster_stability.Rd @@ -1,13 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/PlotMethods.R -\docType{methods} \name{sc3_plot_cluster_stability} \alias{sc3_plot_cluster_stability} \alias{sc3_plot_cluster_stability.SingleCellExperiment} \alias{sc3_plot_cluster_stability,} \alias{sc3_plot_cluster_stability,SingleCellExperiment-method} -\alias{sc3_plot_cluster_stability,SingleCellExperiment-method} -\alias{sc3_plot_cluster_stability} \title{Plot stability of the clusters} \usage{ sc3_plot_cluster_stability.SingleCellExperiment(object, k) diff --git a/man/sc3_plot_consensus.Rd b/man/sc3_plot_consensus.Rd index b359342..8a9c3c3 100644 --- a/man/sc3_plot_consensus.Rd +++ b/man/sc3_plot_consensus.Rd @@ -1,19 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/PlotMethods.R -\docType{methods} \name{sc3_plot_consensus} \alias{sc3_plot_consensus} \alias{sc3_plot_consensus.SingleCellExperiment} \alias{sc3_plot_consensus,} \alias{sc3_plot_consensus,SingleCellExperiment-method} -\alias{sc3_plot_consensus,SingleCellExperiment-method} -\alias{sc3_plot_consensus} \title{Plot consensus matrix as a heatmap} \usage{ sc3_plot_consensus.SingleCellExperiment(object, k, show_pdata) -\S4method{sc3_plot_consensus}{SingleCellExperiment}(object, k, - show_pdata = NULL) +\S4method{sc3_plot_consensus}{SingleCellExperiment}(object, k, show_pdata = NULL) } \arguments{ \item{object}{an object of 'SingleCellExperiment' class} diff --git a/man/sc3_plot_de_genes.Rd b/man/sc3_plot_de_genes.Rd index 8e7d97a..bff4d5f 100644 --- a/man/sc3_plot_de_genes.Rd +++ b/man/sc3_plot_de_genes.Rd @@ -1,19 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in 
R/PlotMethods.R -\docType{methods} \name{sc3_plot_de_genes} \alias{sc3_plot_de_genes} \alias{sc3_plot_de_genes.SingleCellExperiment} \alias{sc3_plot_de_genes,} \alias{sc3_plot_de_genes,SingleCellExperiment-method} -\alias{sc3_plot_de_genes,SingleCellExperiment-method} -\alias{sc3_plot_de_genes} \title{Plot expression of DE genes of the clusters identified by \code{SC3} as a heatmap} \usage{ sc3_plot_de_genes.SingleCellExperiment(object, k, p.val, show_pdata) -\S4method{sc3_plot_de_genes}{SingleCellExperiment}(object, k, p.val = 0.01, - show_pdata = NULL) +\S4method{sc3_plot_de_genes}{SingleCellExperiment}(object, k, p.val = 0.01, show_pdata = NULL) } \arguments{ \item{object}{an object of 'SingleCellExperiment' class} diff --git a/man/sc3_plot_expression.Rd b/man/sc3_plot_expression.Rd index b728c28..6b91783 100644 --- a/man/sc3_plot_expression.Rd +++ b/man/sc3_plot_expression.Rd @@ -1,19 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/PlotMethods.R -\docType{methods} \name{sc3_plot_expression} \alias{sc3_plot_expression} \alias{sc3_plot_expression.SingleCellExperiment} \alias{sc3_plot_expression,} \alias{sc3_plot_expression,SingleCellExperiment-method} -\alias{sc3_plot_expression,SingleCellExperiment-method} -\alias{sc3_plot_expression} \title{Plot expression matrix used for SC3 clustering as a heatmap} \usage{ sc3_plot_expression.SingleCellExperiment(object, k, show_pdata) -\S4method{sc3_plot_expression}{SingleCellExperiment}(object, k, - show_pdata = NULL) +\S4method{sc3_plot_expression}{SingleCellExperiment}(object, k, show_pdata = NULL) } \arguments{ \item{object}{an object of 'SingleCellExperiment' class} diff --git a/man/sc3_plot_markers.Rd b/man/sc3_plot_markers.Rd index 50a0028..29c203e 100644 --- a/man/sc3_plot_markers.Rd +++ b/man/sc3_plot_markers.Rd @@ -1,19 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/PlotMethods.R -\docType{methods} \name{sc3_plot_markers} 
\alias{sc3_plot_markers} \alias{sc3_plot_markers.SingleCellExperiment} \alias{sc3_plot_markers,} \alias{sc3_plot_markers,SingleCellExperiment-method} -\alias{sc3_plot_markers,SingleCellExperiment-method} -\alias{sc3_plot_markers} \title{Plot expression of marker genes identified by \code{SC3} as a heatmap.} \usage{ sc3_plot_markers.SingleCellExperiment(object, k, auroc, p.val, show_pdata) -\S4method{sc3_plot_markers}{SingleCellExperiment}(object, k, auroc = 0.85, - p.val = 0.01, show_pdata = NULL) +\S4method{sc3_plot_markers}{SingleCellExperiment}(object, k, auroc = 0.85, p.val = 0.01, show_pdata = NULL) } \arguments{ \item{object}{an object of 'SingleCellExperiment' class} diff --git a/man/sc3_plot_silhouette.Rd b/man/sc3_plot_silhouette.Rd index 0f0f0c9..76467f2 100644 --- a/man/sc3_plot_silhouette.Rd +++ b/man/sc3_plot_silhouette.Rd @@ -1,13 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/PlotMethods.R -\docType{methods} \name{sc3_plot_silhouette} \alias{sc3_plot_silhouette} \alias{sc3_plot_silhouette.SingleCellExperiment} \alias{sc3_plot_silhouette,} \alias{sc3_plot_silhouette,SingleCellExperiment-method} -\alias{sc3_plot_silhouette,SingleCellExperiment-method} -\alias{sc3_plot_silhouette} \title{Plot silhouette indexes of the cells} \usage{ sc3_plot_silhouette.SingleCellExperiment(object, k) diff --git a/man/sc3_prepare.Rd b/man/sc3_prepare.Rd index c24bf56..0b15fee 100644 --- a/man/sc3_prepare.Rd +++ b/man/sc3_prepare.Rd @@ -1,63 +1,72 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/CoreMethods.R -\docType{methods} \name{sc3_prepare} \alias{sc3_prepare} \alias{sc3_prepare.SingleCellExperiment} \alias{sc3_prepare,SingleCellExperiment-method} -\alias{sc3_prepare,SingleCellExperiment-method} -\alias{sc3_prepare} \title{Prepare the \code{SingleCellExperiment} object for \code{SC3} clustering.} \usage{ -sc3_prepare.SingleCellExperiment(object, gene_filter, pct_dropout_min, - pct_dropout_max, 
d_region_min, d_region_max, svm_num_cells, svm_train_inds, - svm_max, n_cores, kmeans_nstart, kmeans_iter_max, rand_seed) +sc3_prepare.SingleCellExperiment( + object, + gene_filter, + pct_dropout_min, + pct_dropout_max, + d_region_min, + d_region_max, + svm_num_cells, + svm_train_inds, + svm_max, + kmeans_nstart, + kmeans_iter_max +) -\S4method{sc3_prepare}{SingleCellExperiment}(object, gene_filter = TRUE, - pct_dropout_min = 10, pct_dropout_max = 90, d_region_min = 0.04, - d_region_max = 0.07, svm_num_cells = NULL, svm_train_inds = NULL, - svm_max = 5000, n_cores = NULL, kmeans_nstart = NULL, - kmeans_iter_max = 1e+09, rand_seed = 1) +\S4method{sc3_prepare}{SingleCellExperiment}( + object, + gene_filter = TRUE, + pct_dropout_min = 10, + pct_dropout_max = 90, + d_region_min = 0.04, + d_region_max = 0.07, + svm_num_cells = NULL, + svm_train_inds = NULL, + svm_max = 5000, + kmeans_nstart = NULL, + kmeans_iter_max = 1e+09 +) } \arguments{ \item{object}{an object of \code{SingleCellExperiment} class.} -\item{gene_filter}{a boolen variable which defines whether to perform gene +\item{gene_filter}{a boolen variable which defines whether to perform gene filtering before SC3 clustering.} -\item{pct_dropout_min}{if \code{gene_filter = TRUE}, then genes with percent of dropouts smaller than +\item{pct_dropout_min}{if \code{gene_filter = TRUE}, then genes with percent of dropouts smaller than \code{pct_dropout_min} are filtered out before clustering.} -\item{pct_dropout_max}{if \code{gene_filter = TRUE}, then genes with percent of dropouts larger than +\item{pct_dropout_max}{if \code{gene_filter = TRUE}, then genes with percent of dropouts larger than \code{pct_dropout_max} are filtered out before clustering.} -\item{d_region_min}{defines the minimum number of eigenvectors used for +\item{d_region_min}{defines the minimum number of eigenvectors used for kmeans clustering as a fraction of the total number of cells. Default is \code{0.04}. 
See \code{SC3} paper for more details.} -\item{d_region_max}{defines the maximum number of eigenvectors used for +\item{d_region_max}{defines the maximum number of eigenvectors used for kmeans clustering as a fraction of the total number of cells. Default is \code{0.07}. See \code{SC3} paper for more details.} -\item{svm_num_cells}{number of randomly selected training cells to be used +\item{svm_num_cells}{number of randomly selected training cells to be used for SVM prediction. The default is \code{NULL}.} -\item{svm_train_inds}{a numeric vector defining indeces of training cells +\item{svm_train_inds}{a numeric vector defining indeces of training cells that should be used for SVM training. The default is \code{NULL}.} \item{svm_max}{define the maximum number of cells below which SVM is not run.} -\item{n_cores}{defines the number of cores to be used on the user's machine. If not set, `SC3` will use all but one cores of your machine.} - -\item{kmeans_nstart}{nstart parameter passed to \code{\link[stats]{kmeans}} function. Default is +\item{kmeans_nstart}{nstart parameter passed to \code{\link[stats]{kmeans}} function. Default is \code{1000} for up to \code{2000} cells and \code{50} for more than \code{2000} cells.} -\item{kmeans_iter_max}{iter.max parameter passed to \code{\link[stats]{kmeans}} +\item{kmeans_iter_max}{iter.max parameter passed to \code{\link[stats]{kmeans}} function. Default is \code{1e+09}.} - -\item{rand_seed}{sets the seed of the random number generator. \code{SC3} is a stochastic -method, so setting the \code{rand_seed} to a fixed values can be used for reproducibility -purposes.} } \value{ an object of \code{SingleCellExperiment} class @@ -70,11 +79,9 @@ creates and populates the following items of the \code{sc3} slot of the \code{me \item \code{kmeans_nstart} - the same as the \code{kmeans_nstart} argument. \item \code{n_dim} - contains numbers of the number of eigenvectors to be used in \code{\link[stats]{kmeans}} clustering. 
- \item \code{rand_seed} - the same as the \code{rand_seed} argument. - \item \code{svm_train_inds} - if SVM is used this item contains indexes of the + \item \code{svm_train_inds} - if SVM is used this item contains indexes of the training cells to be used for SC3 clustering and further SVM prediction. \item \code{svm_study_inds} - if SVM is used this item contains indexes of the cells to be predicted by SVM. - \item \code{n_cores} - the same as the \code{n_cores} argument. } } diff --git a/man/sc3_run_svm.Rd b/man/sc3_run_svm.Rd index 11bd707..dc8a961 100644 --- a/man/sc3_run_svm.Rd +++ b/man/sc3_run_svm.Rd @@ -1,13 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/CoreMethods.R -\docType{methods} \name{sc3_run_svm} \alias{sc3_run_svm} \alias{sc3_run_svm.SingleCellExperiment} \alias{sc3_run_svm,} \alias{sc3_run_svm,SingleCellExperiment-method} -\alias{sc3_run_svm,SingleCellExperiment-method} -\alias{sc3_run_svm} \title{Run the hybrid \code{SVM} approach.} \usage{ sc3_run_svm.SingleCellExperiment(object, ks) @@ -25,13 +22,13 @@ an object of \code{SingleCellExperiment} class } \description{ This method parallelize \code{SVM} prediction for each \code{k} (the number -of clusters). Namely, for each \code{k}, \code{\link{support_vector_machines}} +of clusters). Namely, for each \code{k}, \code{\link{support_vector_machines}} function is utilized to predict the labels of study cells. Training cells are selected using \code{svm_train_inds} item of the \code{sc3} slot of the \code{metadata(object)}. } \details{ -Results are written to the \code{sc3_k_clusters} columns to the -\code{colData} slot of the input \code{object}, where \code{k} is the +Results are written to the \code{sc3_k_clusters} columns to the +\code{colData} slot of the input \code{object}, where \code{k} is the number of clusters. 
} diff --git a/man/yan.Rd b/man/yan.Rd index e02a0ee..a0db34d 100644 --- a/man/yan.Rd +++ b/man/yan.Rd @@ -4,7 +4,9 @@ \name{yan} \alias{yan} \title{Single cell RNA-Seq data extracted from a publication by Yan et al.} -\format{An object of class \code{data.frame} with 20214 rows and 90 columns.} +\format{ +An object of class \code{data.frame} with 20214 rows and 90 columns. +} \source{ \url{http://dx.doi.org/10.1038/nsmb.2660} diff --git a/vignettes/SC3.Rmd b/vignettes/SC3.Rmd index 40eec2d..aa356bf 100644 --- a/vignettes/SC3.Rmd +++ b/vignettes/SC3.Rmd @@ -54,7 +54,7 @@ sce <- SingleCellExperiment( assays = list( counts = as.matrix(yan), logcounts = log2(as.matrix(yan) + 1) - ), + ), colData = ann ) @@ -70,6 +70,23 @@ sce <- runPCA(sce) plotPCA(sce, colour_by = "cell_type1") ``` +## Speeding up the computations + +`SC3` uses the [BiocParallel](https://bioconductor.org/packages/release/bioc/html/BiocParallel.html) package for parallel computations. +To utilize its capabilities, you have to create an object of class `BiocParallelParam`, which defines the parallel backend, its allocated +resources, and also a seed for the random number generator. The most common choice is `SnowParam()`: +```{r} +library(BiocParallel) +BPPARAM <- SnowParam(workers = 4, type = "SOCK", RNGseed = 1, progressbar = TRUE) +bpstart(BPPARAM) +BiocParallel::register(BPPARAM) +``` + +Instead of calling the `register()` function, you can also pass `BPPARAM` directly to the `SC3` methods that support it. + +> `type = "SOCK"` works on all platforms because it uses external R processes to distribute the parallel computations. +> On the other hand, `type = "FORK"` works only on Unix platforms, and could be faster in exchange for increased memory consumption.
+ ## Run SC3 If you would like to explore clustering of your data in the range of `k`s (the number of clusters) from 2 to 4, you just need to run the main `sc3` method and define the range of `k`s using the `ks` parameter (here we also ask `SC3` to calculate biological features based on the identified cell clusters): @@ -77,8 +94,6 @@ If you would like to explore clustering of your data in the range of `k`s (the n sce <- sc3(sce, ks = 2:4, biology = TRUE) ``` -> By default `SC3` will use all but one cores of your machine. You can manually set the number of cores to be used by setting the `n_cores` parameter in the `sc3` call. - To quickly and easily explore the `SC3` solutions using an interactive Shiny application use the following method: ```{r, eval=FALSE} sc3_interactive(sce) @@ -104,8 +119,8 @@ Additionally, having `SC3` results stored in the same slot makes it possible to ```{r} sce <- runPCA(sce) plotPCA( - sce, - colour_by = "sc3_3_clusters", + sce, + colour_by = "sc3_3_clusters", size_by = "sc3_3_log2_outlier_score" ) ``` @@ -123,7 +138,7 @@ Because the biological features were also calculated for each `k`, one can find # Number of Сells -The default settings of `SC3` allow to cluster (using a single `k`) a dataset of 2,000 cells in about 20-30 minutes. +The default settings of `SC3` allow to cluster (using a single `k`) a dataset of 2,000 cells in about 20-30 minutes. For datasets with more than 2,000 cells `SC3` automatically adjusts some of its parameters (see below). This allows to cluster a dataset of 5,000 cells in about 20-30 minutes. The parameters can also be manually adjusted for datasets with any number of cells. @@ -143,11 +158,11 @@ sc3_plot_consensus(sce, k = 3) It is also possible to annotate cells (columns of the consensus matrix) with any column of the `colData` slot of the `sce` object. 
```{r, fig.height=6, fig.width=8} sc3_plot_consensus( - sce, k = 3, + sce, k = 3, show_pdata = c( - "cell_type1", + "cell_type1", "log10_total_features", - "sc3_3_clusters", + "sc3_3_clusters", "sc3_3_log2_outlier_score" ) ) @@ -170,11 +185,11 @@ sc3_plot_expression(sce, k = 3) It is also possible to annotate cells (columns of the expression matrix) with any column of the `colData` slot of the `sce` object. ```{r, fig.height=6, fig.width=8} sc3_plot_expression( - sce, k = 3, + sce, k = 3, show_pdata = c( - "cell_type1", + "cell_type1", "log10_total_features", - "sc3_3_clusters", + "sc3_3_clusters", "sc3_3_log2_outlier_score" ) ) @@ -197,11 +212,11 @@ sc3_plot_de_genes(sce, k = 3) It is also possible to annotate cells (columns of the matrix containing DE genes) with any column of the `colData` slot of the `sce` object. ```{r, fig.height=9, fig.width=8} sc3_plot_de_genes( - sce, k = 3, + sce, k = 3, show_pdata = c( - "cell_type1", + "cell_type1", "log10_total_features", - "sc3_3_clusters", + "sc3_3_clusters", "sc3_3_log2_outlier_score" ) ) @@ -217,11 +232,11 @@ sc3_plot_markers(sce, k = 3) It is also possible to annotate cells (columns of the matrix containing marker genes) with any column of the `colData` slot of the `sce` object. ```{r, fig.height=6, fig.width=8} sc3_plot_markers( - sce, k = 3, + sce, k = 3, show_pdata = c( - "cell_type1", + "cell_type1", "log10_total_features", - "sc3_3_clusters", + "sc3_3_clusters", "sc3_3_log2_outlier_score" ) ) @@ -243,15 +258,13 @@ Let us go through each of them independently. ## `sc3_prepare` -We start with `sc3_prepare`. This method prepares an object of `sce` class for `SC3` clustering. This method also defines all parameters needed for clustering and stores them in the `sc3` slot. The parameters have their own defaults but can be manually changed. For more information on the parameters please use `?sc3_prepare`. +We start with `sc3_prepare`. This method prepares an object of `sce` class for `SC3` clustering. 
This method also defines all parameters needed for clustering and stores them in the `sc3` slot. The parameters have their own defaults but can be manually changed. For more information on the parameters please use `?sc3_prepare`. ```{r} sce <- sc3_prepare(sce) str(metadata(sce)$sc3) ``` -> By default `SC3` will use all but one cores of your machine. You can manually set the number of cores to be used by setting the `n_cores` parameter in the `sc3_prepare` call. - ## _(optional)_ `sc3_estimate_k` When the `sce` object is prepared for clustering, `SC3` can also estimate the optimal number of clusters `k` in the dataset. `SC3` utilizes the Tracy-Widom theory on random matrices to estimate `k`. `sc3_estimate_k` method creates and populates the following items of the `sc3` slot: @@ -274,7 +287,7 @@ names(metadata(sce)$sc3$distances) ## `sc3_calc_transfs` -Next the distance matrices are transformed using PCA and graph Laplacian. Method `sc3_calc_transfs` calculates transforamtions of the distance matrices contained in +Next the distance matrices are transformed using PCA and graph Laplacian. Method `sc3_calc_transfs` calculates transformations of the distance matrices contained in the `distances` item of the `sc3` slot. It then creates and populates the following items of the `sc3` slot: * `transformations` - contains a list of transformations of the distance matrices corresponding to PCA and graph Laplacian transformations.