From 13c37606f1952f14ea86b22a0399981118eab459 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 30 Oct 2024 19:00:33 -0700 Subject: [PATCH 01/22] [r] add lsi, var feature selection --- r/R/transforms.R | 72 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/r/R/transforms.R b/r/R/transforms.R index 8a2bd25b..7a110677 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -983,4 +983,76 @@ normalize_tfidf <- function( idf <- 1 / feature_means tf_idf_mat <- tf %>% multiply_rows(idf) return(log1p(tf_idf_mat * scale_factor)) +} +#' Compute LSI For a peak matrix +#' @param mat PeakMatrix +#' @param n_dimensions Number of dimensions to keep during PCA +#' @param scale_factor Scale factor for the tf-idf log transform +#' @param verbose Whether to print out progress +#' @param threads Number of threads to use +#' @return dgCMatrix of shape (n_dimensions, ncol(mat)) +#' @details Compute LSI through first doing a tf-idf transform, a z-score normalization, then PCA. +#' Tf-idf implementation is from Stuart & Butler et al. 2019. +#' @export +compute_lsi <- function(mat, n_dimensions = 50, scale_factor = 1e-4, verbose = FALSE, threads = 1) { + assert_is(mat, "IterableMatrix") # Should be a peak matrix, should we enforce this? 
+ assert_is(n_dimensions, "integer") + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + + # Signac implementation + npeaks <- colSums(mat) + tf <- mat %>% multiply_cols(1/npeaks) + idf_ <- ncol(mat) / rowSums(mat) + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + + # run z-score norm + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance")$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_lsi_norm <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + + # Run pca + svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions) + pca_res <- t(svd_attr$u) %*% mat_lsi_norm + return(pca_res) +} + +#' Get most variable features, given a non-log normalized matrix +highly_variable_features <- function(mat, num_feats, n_bins, verbose = FALSE) { + assert_is(mat, "IterableMatrix") + assert_is(num_feats, "integer") + assert_greater_than_zero(num_feats) + assert_len(num_feats, 1) + assert_is(n_bins, "integer") + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + if (verbose) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), + returning all features", nrow(mat), num_feats)) + return(mat) + } + # Calculate the mean and variance of each feature + # should we set entries that are 0 to NA? 
+ feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats['mean', ] + feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats['variance', ] + feature_dispersion <- feature_vars / feature_means + feature_dispersion <- log(feature_dispersion) + feature_means <- log1p(feature_means) + mean_bins <- cut(feature_means, n_bins, labels = FALSE) + + # Calculate the mean and variance of dispersion of each bin + bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x)) + bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x)) + # Set bin_sd value to bin_mean if bin_sd is NA, results in norm dispersion of 1 + bin_sd[is.na(bin_sd)] <- bin_mean[is.na(bin_sd)] + # map mean_bins indices to bin_stats + feature_dispersion_norm <- (feature_dispersion - bin_mean[mean_bins]) / bin_sd[mean_bins] + names(feature_dispersion_norm) <- names(feature_dispersion) + variable_features_ <- sort(feature_dispersion_norm)[nrow(mat)-num_feats:nrow(mat)] + return(mat[names(variable_features_), ]) } \ No newline at end of file From 36a8983a53de0bbe558743d9ca862413882b6d91 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 4 Nov 2024 15:14:26 -0800 Subject: [PATCH 02/22] [r] add lsi, variable feature selection --- r/NAMESPACE | 2 + r/NEWS.md | 3 + r/R/singlecell_utils.R | 106 +++++++++++++++++++++++ r/man/highly_variable_features.Rd | 36 ++++++++ r/man/lsi.Rd | 46 ++++++++++ r/pkgdown/_pkgdown.yml | 2 + r/tests/testthat/test-singlecell_utils.R | 49 +++++++++++ 7 files changed, 244 insertions(+) create mode 100644 r/man/highly_variable_features.Rd create mode 100644 r/man/lsi.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index c90e1a15..c7e4fd1c 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -48,6 +48,7 @@ export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) export(get_trackplot_height) +export(highly_variable_features) export(import_matrix_market) export(import_matrix_market_10x) export(knn_annoy) @@ -55,6 +56,7 @@ 
export(knn_hnsw) export(knn_to_geodesic_graph) export(knn_to_snn_graph) export(log1p_slow) +export(lsi) export(marker_features) export(match_gene_symbol) export(matrix_stats) diff --git a/r/NEWS.md b/r/NEWS.md index 73ca847d..24b6b0d9 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -24,6 +24,9 @@ Contributions welcome :) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) - Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) +- Add MACS2/3 input creation and peak calling through `call_macs_peaks()` (pull request #118) +- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #156). +- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #156). ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 32e45934..c4d7fbf3 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -233,4 +233,110 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { } } return(res) +} + +#' Perform latent semantic indexing (LSI) on a matrix. +#' @param mat (IterableMatrix) dimensions features x cells +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' @param save_in_memory (logical) If TRUE, save the log(tf-idf) matrix in memory. +#' If FALSE, save to a temporary location in disk. 
Saving in memory will result in faster downstream operations, +#' but will require in higher memory usage. Comparison of memory usage and speed is in the details section. +#' @param threads (integer) Number of threads to use. +#' @return dgCMatrix of shape (n_dimensions, ncol(mat)). +#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. +#' +#' ** Saving in memory vs disk: ** +#' Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. +#' This is done to prevent re-calculation of queued operations during PCA optimization. +#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - Saving in memory: 233 MB memory usage, 22.7 seconds runtime +#' - Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +#' +#' @export +lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FALSE, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_is_wholenumber(threads) + + # log(tf-idf) transform + npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + tf <- mat %>% multiply_cols(1 / npeaks) + idf_ <- ncol(mat) / rowSums(mat) + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + # Save to prevent re-calculation of queued operations + if (save_in_memory) { + mat_log_tfidf <- write_matrix_memory(mat_log_tfidf, compress = FALSE) + } else { + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + } + # Z-score normalization + cell_peak_stats <- matrix_stats(mat_log_tfidf, 
col_stats="variance", threads = threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_lsi_norm <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + # Run pca + svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_lsi_norm + return(pca_res) +} + +#' Get the most variable features within a matrix +#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' ll features will be returned. +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @returns IterableMatrix subset of the most variable features. +#' @inheritParams lsi +#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). +#' +#' Calculate using the following process: +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. 
Bin the features by their means, and normalize dispersion within each bin +#' @export +highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + return(mat) + } + + feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats["mean", ] + feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats["variance", ] + feature_means[feature_means == 0] <- 1e-12 + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log(feature_dispersion) + feature_means <- log1p(feature_means) + mean_bins <- cut(feature_means, n_bins, labels = FALSE) + + bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x, na.rm = TRUE)) + bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x, na.rm = TRUE)) + # Set feats that are in bins with only one feat to have a norm dispersion of 1 + one_gene_bin <- is.na(bin_sd) + bin_sd[one_gene_bin] <- bin_mean[one_gene_bin] + bin_mean[one_gene_bin] <- 0 + # map mean_bins indices to bin_stats + # Do a character search as bins without features mess up numeric indexing + feature_dispersion_norm <- (feature_dispersion - bin_mean[as.character(mean_bins)]) / bin_sd[as.character(mean_bins)] + names(feature_dispersion_norm) <- names(feature_dispersion) + feature_dispersion_norm <- sort(feature_dispersion_norm) # sorting automatically removes NA values + if (length(feature_dispersion_norm) < num_feats) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all non-zero features", length(feature_dispersion_norm), num_feats)) + 
variable_features_ <- feature_dispersion_norm[max(1, (length(feature_dispersion_norm) - num_feats + 1)):length(feature_dispersion_norm)] + return(mat[names(variable_features_), ]) } \ No newline at end of file diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd new file mode 100644 index 00000000..54e67599 --- /dev/null +++ b/r/man/highly_variable_features.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{highly_variable_features} +\alias{highly_variable_features} +\title{Get the most variable features within a matrix} +\usage{ +highly_variable_features(mat, num_feats, n_bins, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +ll features will be returned.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +IterableMatrix subset of the most variable features. +} +\description{ +Get the most variable features within a matrix +} +\details{ +The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
+ +Calculate using the following process: +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} +} diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd new file mode 100644 index 00000000..38c0d73b --- /dev/null +++ b/r/man/lsi.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{lsi} +\alias{lsi} +\title{Perform latent semantic indexing (LSI) on a matrix.} +\usage{ +lsi( + mat, + n_dimensions = 50L, + scale_factor = 10000, + save_in_memory = FALSE, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} + +\item{save_in_memory}{(logical) If TRUE, save the log(tf-idf) matrix in memory. +If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, +but will require in higher memory usage. Comparison of memory usage and speed is in the details section.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +dgCMatrix of shape (n_dimensions, ncol(mat)). +} +\description{ +Perform latent semantic indexing (LSI) on a matrix. +} +\details{ +Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. + +** Saving in memory vs disk: ** +Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. +This is done to prevent re-calculation of queued operations during PCA optimization. 
+ +Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +\itemize{ +\item Saving in memory: 233 MB memory usage, 22.7 seconds runtime +\item Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 526018e4..3cf6509a 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -136,6 +136,8 @@ reference: - select_features_by_variance - select_features_by_dispersion - select_features_by_mean + - lsi + - highly_variable_features - IterableMatrix-methods - pseudobulk_matrix diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 111324b7..c699d67b 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -13,6 +13,10 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = as(m, "dgCMatrix") } +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) +} + test_that("select_features works general case", { m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { @@ -31,6 +35,26 @@ test_that("select_features works general case", { } }) + +test_that("select_features works general case", { + m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") + for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { + res <- do.call(fn, list(m1, num_feats = 10)) + expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting + expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable + expect_setequal(res$names, rownames(m1)) + res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows + res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + 
expect_identical(res_more_feats_than_rows, res_feats_equal_rows) + if (fn != "select_features_by_mean") { + # Check that normalization actually does something + res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) + expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) + } + } +}) + + test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -180,3 +204,28 @@ test_that("Pseudobulk aggregation works with multiple return types", { } }) + + +test_that("Highly variable feature selection works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat + res <- highly_variable_features(mat, num_feats = 10, n_bins = 5, threads = 1) + res_t <- highly_variable_features(t(mat), num_feats = 10, n_bins = 5, threads = 1) + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) + expect_equal(nrow(res_t), 10) + expect_equal(ncol(res_t), 500) +}) + +test_that("LSI works", { + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR + lsi_res <- lsi(mat, n_dimensions = 5) + lsi_res_t <- lsi(t(mat), n_dimensions = 5) + expect_equal(nrow(lsi_res), 5) + expect_equal(ncol(lsi_res), ncol(mat)) + expect_equal(nrow(lsi_res_t), 5) + expect_equal(ncol(lsi_res_t), nrow(mat)) +}) \ No newline at end of file From 2be2efebe4f53f03f9d895bdecc33f762babef13 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 16:53:53 -0800 Subject: [PATCH 03/22] [r] parametrize z_score_norm, create temp option to return more info `save_lsi`, remove `save_in_memory` in `lsi()` --- r/R/singlecell_utils.R | 157 +++++++++++++++++++++++++++++------------ r/man/lsi.Rd | 24 ++++--- 2 files changed, 126 insertions(+), 55 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index c4d7fbf3..969acf35 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -237,25 +237,28 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' Perform latent semantic indexing (LSI) on a matrix. #' @param mat (IterableMatrix) dimensions features x cells +#' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' @param save_in_memory (logical) If TRUE, save the log(tf-idf) matrix in memory. -#' If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, -#' but will require in higher memory usage. Comparison of memory usage and speed is in the details section. #' @param threads (integer) Number of threads to use. -#' @return dgCMatrix of shape (n_dimensions, ncol(mat)). +#' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. 
+#' @return +#' - If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +#' - If save_lsi is TRUE, return a list with the following elements: +#' - **pca_res**: dgCMatrix of shape (n_dimensions, ncol(mat)) +#' - **svd_attr**: List of SVD attributes +#' - **idf**: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. #' -#' ** Saving in memory vs disk: ** -#' Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. -#' This is done to prevent re-calculation of queued operations during PCA optimization. -#' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: -#' - Saving in memory: 233 MB memory usage, 22.7 seconds runtime -#' - Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime -#' +#' - 17.1 MB memory usage, 25.1 seconds runtime #' @export -lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FALSE, threads = 1L) { +lsi <- function( + mat, + z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 1e4, + save_lsi = FALSE, + threads = 1L +) { assert_is(mat, "IterableMatrix") assert_is_wholenumber(n_dimensions) assert_len(n_dimensions, 1) @@ -264,27 +267,34 @@ lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FA assert_is_wholenumber(threads) # log(tf-idf) transform + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / rowSums(mat) mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations - if 
(save_in_memory) { - mat_log_tfidf <- write_matrix_memory(mat_log_tfidf, compress = FALSE) - } else { - mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) - } + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) # Z-score normalization - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance", threads = threads)$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_lsi_norm <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) + if (z_score_norm) { + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_log_tfidf <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + } # Run pca - svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions, threads = threads) - pca_res <- t(svd_attr_$u) %*% mat_lsi_norm + svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_log_tfidf + if(save_lsi) { + return(list( + pca_res = pca_res, + svd_attr = svd_attr_, + idf = idf_ + )) + } return(pca_res) } @@ -315,28 +325,87 @@ highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) return(mat) } - - feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats["mean", ] - feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats["variance", ] + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] feature_means[feature_means == 0] <- 1e-12 feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- 
NA feature_dispersion <- log(feature_dispersion) feature_means <- log1p(feature_means) - mean_bins <- cut(feature_means, n_bins, labels = FALSE) - - bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x, na.rm = TRUE)) - bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x, na.rm = TRUE)) - # Set feats that are in bins with only one feat to have a norm dispersion of 1 - one_gene_bin <- is.na(bin_sd) - bin_sd[one_gene_bin] <- bin_mean[one_gene_bin] - bin_mean[one_gene_bin] <- 0 - # map mean_bins indices to bin_stats - # Do a character search as bins without features mess up numeric indexing - feature_dispersion_norm <- (feature_dispersion - bin_mean[as.character(mean_bins)]) / bin_sd[as.character(mean_bins)] - names(feature_dispersion_norm) <- names(feature_dispersion) - feature_dispersion_norm <- sort(feature_dispersion_norm) # sorting automatically removes NA values - if (length(feature_dispersion_norm) < num_feats) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all non-zero features", length(feature_dispersion_norm), num_feats)) - variable_features_ <- feature_dispersion_norm[max(1, (length(feature_dispersion_norm) - num_feats + 1)):length(feature_dispersion_norm)] - return(mat[names(variable_features_), ]) + features_df <- data.frame( + name = names(feature_means), + vars = feature_vars, + means = feature_means, + dispersion = feature_dispersion + ) + features_df <- features_df %>% + dplyr::mutate(bin = cut(means, n_bins, labels=FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + bin_mean = mean(dispersion, na.rm = TRUE), + bin_sd = sd(dispersion, na.rm = TRUE), + bin_sd_is_na = is.na(bin_sd), + bin_sd = ifelse(bin_sd_is_na, bin_mean, bin_sd), # Set feats that are in bins with only one feat to have a norm dispersion of 1 + bin_mean = ifelse(bin_sd_is_na, 0, bin_mean), + feature_dispersion_norm = (dispersion - bin_mean) / bin_sd + ) %>% + dplyr::ungroup() %>% + dplyr::select(name, 
feature_dispersion_norm) %>% + dplyr::arrange(desc(feature_dispersion_norm)) %>% + dplyr::slice(1:min(num_feats, nrow(.))) + return(mat[features_df$name,]) +} + + +#' Aggregate counts matrices by cell group or feature. +#' +#' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each +#' feature. +#' @param cell_groups (Character/factor) Vector of group/cluster assignments for each cell. Length must be `ncol(mat)`. +#' @param method (Character vector) Method(s) to aggregate counts. If one method is provided, the output will be a matrix. If multiple methods are provided, the output will be a named list of matrices. +#' +#' Current options are: `nonzeros`, `sum`, `mean`, `variance`. +#' @param threads (integer) Number of threads to use. +#' @return +#' - If `method` is length `1`, returns a matrix of shape `(features x groups)`. +#' - If `method` is greater than length `1`, returns a list of matrices with each matrix representing a pseudobulk matrix with a different aggregation method. +#' Each matrix is of shape `(features x groups)`, and names are one of `nonzeros`, `sum`, `mean`, `variance`. +#' @details Some simpler stats are calculated in the process of calculating more complex +#' statistics. So when calculating `variance`, `nonzeros` and `mean` can be included with no +#' extra calculation time, and when calculating `mean`, adding `nonzeros` will take no extra time. 
+#' @inheritParams marker_features +#' @export +pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is(cell_groups, c("factor", "character", "numeric")) + assert_true(length(cell_groups) == ncol(mat)) + cell_groups <- as.factor(cell_groups) + assert_is(method, "character") + methods <- c("variance", "mean", "sum", "nonzeros") + for (m in method) { + if (!(m %in% methods)) { + rlang::abort(sprintf("method must be one of: %s", paste(methods, collapse = ", "))) + } + } + assert_is(threads, "integer") + # if multiple methods are provided, only need to pass in the top method as it will also calculate the less complex stats + iter <- iterate_matrix(parallel_split(mat, threads, threads*4)) + res <- pseudobulk_matrix_cpp(iter, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) + # if res is a single matrix, return with colnames and rownames + if (length(method) == 1) { + colnames(res[[method]]) <- levels(cell_groups) + rownames(res[[method]]) <- rownames(mat) + return(res[[method]]) + } + # give colnames and rownames for each matrix in res, which is a named list + for (res_slot in names(res)) { + if ((length(res[[res_slot]]) == 0) || !(res_slot %in% method)) { + res[[res_slot]] <- NULL + } else { + colnames(res[[res_slot]]) <- levels(cell_groups) + rownames(res[[res_slot]]) <- rownames(mat) + } + } + return(res) } \ No newline at end of file diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 38c0d73b..137e4437 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -6,27 +6,34 @@ \usage{ lsi( mat, + z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 10000, - save_in_memory = FALSE, + save_lsi = FALSE, threads = 1L ) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} +\item{z_score_norm}{(logical) If TRUE, z-score normalize the matrix before PCA.} + \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} \item{scale_factor}{(integer) Scale 
factor for the tf-idf log transform.} -\item{save_in_memory}{(logical) If TRUE, save the log(tf-idf) matrix in memory. -If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, -but will require in higher memory usage. Comparison of memory usage and speed is in the details section.} +\item{save_lsi}{(logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector.} \item{threads}{(integer) Number of threads to use.} } \value{ -dgCMatrix of shape (n_dimensions, ncol(mat)). +\itemize{ +\item If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +\item If save_lsi is TRUE, return a list with the following elements: +\item \strong{pca_res}: dgCMatrix of shape (n_dimensions, ncol(mat)) +\item \strong{svd_attr}: List of SVD attributes +\item \strong{idf}: Inverse document frequency vector +} } \description{ Perform latent semantic indexing (LSI) on a matrix. @@ -34,13 +41,8 @@ Perform latent semantic indexing (LSI) on a matrix. \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. -** Saving in memory vs disk: ** -Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. -This is done to prevent re-calculation of queued operations during PCA optimization. 
- Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: \itemize{ -\item Saving in memory: 233 MB memory usage, 22.7 seconds runtime -\item Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +\item 17.1 MB memory usage, 25.1 seconds runtime } } From dccc3a56e67f104aa4e3c9e46e65fa0ba7c8c9d8 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 16:54:18 -0800 Subject: [PATCH 04/22] [r] add test case for LSI comparing to archr --- r/tests/real_data/ArchR_LSI.R | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 r/tests/real_data/ArchR_LSI.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R new file mode 100644 index 00000000..b78d700f --- /dev/null +++ b/r/tests/real_data/ArchR_LSI.R @@ -0,0 +1,50 @@ +devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") +devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") + +#' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. +#' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. +#' As the SVD implementation is not necessarily the same between the two packages, we take the SVD matrix +#' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation +#' we can compare between the two packages. +#' @param proj An archr project. +test_lsi_similarity_to_archr <- function() { + # Set up temp dir + int_dir <- file.path(tempdir(), "insertion_test") + dir.create(int_dir) + setwd(int_dir) + # add iterative lsi for dim reduction + proj <- getTestProject() + proj <- addPeakMatrix(proj) + # Get the peak matrix + test_mat <- assay(getMatrixFromProject(proj, useMatrix = "PeakMatrix")) + # Calculate LSI on ArchR + # running LSI without binarizing, as we don't do this in the BPCells implementation + # we also don't filter quantile outliers. 
+ lsi_archr <- .computeLSI( + mat = test_mat, + LSIMethod = 2, + nDimensions = 2, + binarize = FALSE, + outlierQuantiles = NULL, + test_mat = test_mat + ) + svd_archr <- lsi_archr$svd + lsi_mat_archr <- t(lsi_archr$matSVD) + # set rownames to NA, as we don't have rownames in the BPCells implementation + rownames(lsi_mat_archr) <- NULL + # PCA Matrix = T(u) * Pre-SVD Matrix + # u * PCA Matrix = u * T(u) * Pre-SVD Matrix + # u * PCA Matrix = Pre-SVD Matrix + pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr + # Calculate LSI on BPCells + # Do not use z-score normalization, as this isn't done with ArchR + lsi_bpcells <- lsi( + test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), + z_score_norm = FALSE, + n_dimensions = 2, + save_lsi = TRUE + ) + pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) +} +test_lsi_similarity_to_archr() \ No newline at end of file From 183dd40a18cddae2d353a1a439d73cfb26af6e5b Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:29:32 -0800 Subject: [PATCH 05/22] [r] clean up var gene selection, lsi docstring --- r/R/singlecell_utils.R | 38 ++++++++++++++++++++++++------- r/man/highly_variable_features.Rd | 17 ++++++++++++-- r/man/lsi.Rd | 6 ++--- 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 969acf35..5a00462b 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -236,6 +236,8 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { } #' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. 
#' @param mat (IterableMatrix) dimensions features x cells #' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. @@ -243,9 +245,9 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' @param threads (integer) Number of threads to use. #' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @return -#' - If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +#' - If save_lsi is FALSE, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. #' - If save_lsi is TRUE, return a list with the following elements: -#' - **pca_res**: dgCMatrix of shape (n_dimensions, ncol(mat)) +#' - **pca_res**: dgCMatrix of shape `(n_dimensions, ncol(mat))`` #' - **svd_attr**: List of SVD attributes #' - **idf**: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. @@ -304,7 +306,12 @@ lsi <- function( #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @returns IterableMatrix subset of the most variable features. +#' @param save_feat_selection (logical) If TRUE, save the dispersions, means, and the features selected. +#' @returns +#' - If `save_feat_selection` is False, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - If `save_feat_selection` is True, return a list with the following elements: +#' - **mat**: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. 
+#' - **feature_selection**: Dataframe with the following columns: #' @inheritParams lsi #' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). #' @@ -313,7 +320,11 @@ lsi <- function( #' 2. Log normalize dispersion and mean #' 3. Bin the features by their means, and normalize dispersion within each bin #' @export -highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { +highly_variable_features <- function( + mat, num_feats, n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) { assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) @@ -325,22 +336,27 @@ highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) return(mat) } + # Calculate row information for dispersion mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) feature_means <- mat_stats$row_stats["mean", ] feature_vars <- mat_stats$row_stats["variance", ] + # Give a small value to features with 0 mean, helps with 0 division feature_means[feature_means == 0] <- 1e-12 + # Calculate dispersion, and log normalize feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA feature_dispersion <- log(feature_dispersion) feature_means <- log1p(feature_means) features_df <- data.frame( name = names(feature_means), - vars = feature_vars, - means = feature_means, + var = feature_vars, + mean = feature_means, dispersion = feature_dispersion ) + + # Bin by mean, and normalize dispersion with each bin features_df <- features_df %>% - dplyr::mutate(bin = cut(means, n_bins, labels=FALSE)) %>% + dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( bin_mean = mean(dispersion, na.rm = TRUE), @@ -351,9 +367,15 @@ highly_variable_features <- 
function(mat, num_feats, n_bins, threads = 1L) { feature_dispersion_norm = (dispersion - bin_mean) / bin_sd ) %>% dplyr::ungroup() %>% - dplyr::select(name, feature_dispersion_norm) %>% + dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% dplyr::arrange(desc(feature_dispersion_norm)) %>% dplyr::slice(1:min(num_feats, nrow(.))) + if (save_feat_selection) { + return(list( + mat = mat[features_df$name,], + feature_selection = features_df + )) + } return(mat[features_df$name,]) } diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd index 54e67599..943d986b 100644 --- a/r/man/highly_variable_features.Rd +++ b/r/man/highly_variable_features.Rd @@ -4,7 +4,13 @@ \alias{highly_variable_features} \title{Get the most variable features within a matrix} \usage{ -highly_variable_features(mat, num_feats, n_bins, threads = 1L) +highly_variable_features( + mat, + num_feats, + n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} @@ -16,10 +22,17 @@ ll features will be returned.} and if the number of features within a bin is less than 2, the dispersion is set to 1.} +\item{save_feat_selection}{(logical) If TRUE, save the dispersions, means, and the features selected.} + \item{threads}{(integer) Number of threads to use.} } \value{ -IterableMatrix subset of the most variable features. +\itemize{ +\item If \code{save_feat_selection} is False, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. +\item If \code{save_feat_selection} is True, return a list with the following elements: +\item \strong{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
+\item \strong{feature_selection}: Dataframe with the following columns: +} } \description{ Get the most variable features within a matrix diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 137e4437..299513df 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -28,15 +28,15 @@ lsi( } \value{ \itemize{ -\item If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +\item If save_lsi is FALSE, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. \item If save_lsi is TRUE, return a list with the following elements: -\item \strong{pca_res}: dgCMatrix of shape (n_dimensions, ncol(mat)) +\item \strong{pca_res}: dgCMatrix of shape `(n_dimensions, ncol(mat))`` \item \strong{svd_attr}: List of SVD attributes \item \strong{idf}: Inverse document frequency vector } } \description{ -Perform latent semantic indexing (LSI) on a matrix. +Given a \verb{(features x cells)} matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. } \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
From 4972f3430ab9da12bbde231699a9c52b92ad0cec Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:30:00 -0800 Subject: [PATCH 06/22] [r] add variable gene selection test --- r/tests/real_data/ArchR_LSI.R | 9 +++- .../Scanpy_variable_feat_selection.py | 46 ++++++++++++++++++ .../scanpy_variable_feat_selection.R | 47 +++++++++++++++++++ 3 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 r/tests/real_data/Scanpy_variable_feat_selection.py create mode 100644 r/tests/real_data/scanpy_variable_feat_selection.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index b78d700f..2ad74610 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -1,3 +1,11 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") @@ -47,4 +55,3 @@ test_lsi_similarity_to_archr <- function() { pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) } -test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/Scanpy_variable_feat_selection.py b/r/tests/real_data/Scanpy_variable_feat_selection.py new file mode 100644 index 00000000..6ae28c1b --- /dev/null +++ b/r/tests/real_data/Scanpy_variable_feat_selection.py @@ -0,0 +1,46 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+
+import sys, tempfile, os
+import numpy as np
+import pandas as pd
+import scanpy as sc
+
+
+def call_highly_var_genes_single_batch(temp_dir: str) -> None:
+    """
+    Call highly_variable genes on a single batch of PBMC68k data using their interpretation of
+    the Seurat implementation.
+    Write the input anndata object csv at `/highly_var_genes_scanpy_input.csv`
+    Write the output as a csv, at `/highly_var_genes_scanpy_output.csv`
+
+    Args:
+        temp_dir (str): Path to the temporary directory to write the input and output files.
+    """
+    # Dataset is only (765, 700)
+    adata = sc.datasets.pbmc68k_reduced()
+    adata.var_names_make_unique()
+    res = sc.pp._highly_variable_genes.highly_variable_genes(adata,
+                                                             n_top_genes = 50,
+                                                             n_bins = 20,
+                                                             flavor = "seurat",
+                                                             inplace = False,
+                                                             check_values = False).drop(columns = 'mean_bin')
+    # remove mean_bin
+    adata.to_df().to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_input.csv"))
+    res.to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_output.csv"))
+
+
+if __name__ == "__main__":
+    # We use the first argument as the temporary directory
+    if len(sys.argv) < 2:
+        # If no argument is provided, use the current directory
+        call_highly_var_genes_single_batch(".")
+    else:
+        call_highly_var_genes_single_batch(sys.argv[1])
+    
\ No newline at end of file
diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R
new file mode 100644
index 00000000..da873a7b
--- /dev/null
+++ b/r/tests/real_data/scanpy_variable_feat_selection.R
@@ -0,0 +1,47 @@
+# Copyright 2024 BPCells contributors
+#
+# Licensed under the Apache License, Version 2.0 or the MIT license
+# , at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+ +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") + +compare_feat_selection_to_scanpy <- function(dir = NULL) { + # Set up temp dir + if (is.null(dir)) { + dir <- file.path(tempdir(), "feat_selection_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + + # Call python script + system2("python", c("Scanpy_variable_feat_selection.py", dir)) + + # read in input csv + input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) + output_mat_scanpy <- read.csv(file.path(dir, "highly_var_genes_scanpy_output.csv"), row.names = 1) + # filter output mat to only where "highly_variable" is true + output_mat_scanpy$highly_variable <- as.logical(output_mat_scanpy$highly_variable) + output_mat_scanpy <- output_mat_scanpy[output_mat_scanpy$highly_variable,] %>% + dplyr::arrange(desc(dispersions_norm)) %>% + dplyr::select(-highly_variable) %>% # convert rownames to a column + tibble::rownames_to_column("name") %>% + dplyr::as_tibble() + # unlog the input_mat + input_mat_bpcells <- expm1(input_mat_scanpy) + output_bpcells <- highly_variable_features( + input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), + num_feats = 50, + n_bins = 20, + save_feat_selection = TRUE + ) + output_mat_bpcells <- output_bpcells$feature_selection + expect_true(all.equal(output_mat_bpcells$name, output_mat_scanpy$name)) + expect_true(all.equal(output_mat_bpcells$mean, output_mat_scanpy$means, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$dispersion, output_mat_scanpy$dispersions, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$feature_dispersion_norm, output_mat_scanpy$dispersions_norm, tolerance = 1e-6)) +} + +compare_feat_selection_to_scanpy() From e4d5cb01a67f7b2c1ea40fd489c825161bb78e83 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:35:03 -0800 Subject: [PATCH 07/22] [r] provide more colour to scanpy feat selection test --- 
r/tests/real_data/scanpy_variable_feat_selection.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index da873a7b..3bac07ab 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -8,6 +8,11 @@ devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") + +# Compare the feature selection output of BPCells to that of Scanpy. +# Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. +# This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. +# It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. compare_feat_selection_to_scanpy <- function(dir = NULL) { # Set up temp dir if (is.null(dir)) { @@ -29,8 +34,10 @@ compare_feat_selection_to_scanpy <- function(dir = NULL) { dplyr::select(-highly_variable) %>% # convert rownames to a column tibble::rownames_to_column("name") %>% dplyr::as_tibble() - # unlog the input_mat + + # Scanpy undoes a log1p transformation on the input matrix, so we do the same here input_mat_bpcells <- expm1(input_mat_scanpy) + output_bpcells <- highly_variable_features( input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), num_feats = 50, From 99470e06b04bd269468e093688e67b25e0a47ffb Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 15:57:13 -0800 Subject: [PATCH 08/22] [r] cleanup real data tests --- r/tests/real_data/ArchR_LSI.R | 17 +++++++------- r/tests/real_data/ArchR_insertions.R | 13 +++++++++-- r/tests/real_data/config.csv | 3 +++ .../scanpy_variable_feat_selection.R | 13 ++++------- r/tests/real_data/test_helpers.R | 22 +++++++++++++++++++ 5 files changed, 48 
insertions(+), 20 deletions(-) create mode 100644 r/tests/real_data/config.csv create mode 100644 r/tests/real_data/test_helpers.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 2ad74610..e79ee98b 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -6,8 +6,9 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) +devtools::load_all(config[["path_archr"]]) #' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. #' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. @@ -15,11 +16,9 @@ devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") #' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation #' we can compare between the two packages. #' @param proj An archr project. 
-test_lsi_similarity_to_archr <- function() { - # Set up temp dir - int_dir <- file.path(tempdir(), "insertion_test") - dir.create(int_dir) - setwd(int_dir) +test_lsi_similarity_to_archr <- function(dir = NULL) { + dir <- create_temp_dir(dir) + setwd(dir) # add iterative lsi for dim reduction proj <- getTestProject() proj <- addPeakMatrix(proj) @@ -33,8 +32,7 @@ test_lsi_similarity_to_archr <- function() { LSIMethod = 2, nDimensions = 2, binarize = FALSE, - outlierQuantiles = NULL, - test_mat = test_mat + outlierQuantiles = NULL ) svd_archr <- lsi_archr$svd lsi_mat_archr <- t(lsi_archr$matSVD) @@ -55,3 +53,4 @@ test_lsi_similarity_to_archr <- function() { pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) } +test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 150939d4..810133c1 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -1,5 +1,14 @@ -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) +devtools::load_all(config[["path_archr"]]) fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) diff --git a/r/tests/real_data/config.csv b/r/tests/real_data/config.csv new file mode 100644 index 00000000..915de4af --- /dev/null +++ b/r/tests/real_data/config.csv @@ -0,0 +1,3 @@ +key,value +path_archr,/mnt/c/users/Imman/PycharmProjects/ArchR/ +path_bpcells,/mnt/c/users/Imman/PycharmProjects/BPCells/r diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index 3bac07ab..8b608773 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -6,23 +6,18 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") - +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) # Compare the feature selection output of BPCells to that of Scanpy. # Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. # This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. # It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. 
compare_feat_selection_to_scanpy <- function(dir = NULL) { - # Set up temp dir - if (is.null(dir)) { - dir <- file.path(tempdir(), "feat_selection_test") - if (dir.exists(dir)) unlink(dir, recursive = TRUE) - dir.create(dir) - } + dir <- create_temp_dir(dir) # Call python script - system2("python", c("Scanpy_variable_feat_selection.py", dir)) + system2("python3", c("Scanpy_variable_feat_selection.py", dir)) # read in input csv input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) diff --git a/r/tests/real_data/test_helpers.R b/r/tests/real_data/test_helpers.R new file mode 100644 index 00000000..2da80072 --- /dev/null +++ b/r/tests/real_data/test_helpers.R @@ -0,0 +1,22 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# Set up k-v pairs for other tests +config_csv <- read.csv("config.csv") +config <- as.list(config_csv)$value +names(config) <- as.list(config_csv)$key + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} \ No newline at end of file From 3bf8914f987f5d88d558f48539e6d65ef349b889 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 16:15:54 -0800 Subject: [PATCH 09/22] [r] clean up lsi, var features docstrings --- r/R/singlecell_utils.R | 28 ++++++++++++++-------------- r/man/highly_variable_features.Rd | 18 ++++++++++-------- r/man/lsi.Rd | 19 ++++++++++--------- 3 files changed, 34 insertions(+), 31 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 5a00462b..13c697af 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -239,17 +239,17 @@ pseudobulk_matrix <- function(mat, 
cell_groups, method = "sum", threads = 1L) { #' #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells -#' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. +#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. -#' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @return -#' - If save_lsi is FALSE, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. -#' - If save_lsi is TRUE, return a list with the following elements: -#' - **pca_res**: dgCMatrix of shape `(n_dimensions, ncol(mat))`` -#' - **svd_attr**: List of SVD attributes -#' - **idf**: Inverse document frequency vector +#' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. +#' - If `save_lsi` is `TRUE`, return a list with the following elements: +#' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` +#' - `svd_attr`: List of SVD attributes +#' - `idf`: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. #' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: @@ -300,18 +300,18 @@ lsi <- function( return(pca_res) } -#' Get the most variable features within a matrix +#' Get the most variable features within a matrix. 
#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, -#' ll features will be returned. +#' all features will be returned. #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @param save_feat_selection (logical) If TRUE, save the dispersions, means, and the features selected. +#' @param save_feat_selection (logical) If `TRUE`, save the dispersions, means, and the features selected. #' @returns -#' - If `save_feat_selection` is False, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - If `save_feat_selection` is True, return a list with the following elements: -#' - **mat**: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - **feature_selection**: Dataframe with the following columns: +#' - If `save_feat_selection` is `FALSE`, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - If `save_feat_selection` is `TRUE`, return a list with the following elements: +#' - `mat`: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - `feature_selection`: Dataframe with columns `name`, `mean`, `dispersion`, `bin`, `feature_dispersion_norm`. #' @inheritParams lsi #' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
#' diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd index 943d986b..005a4e37 100644 --- a/r/man/highly_variable_features.Rd +++ b/r/man/highly_variable_features.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/singlecell_utils.R \name{highly_variable_features} \alias{highly_variable_features} -\title{Get the most variable features within a matrix} +\title{Get the most variable features within a matrix.} \usage{ highly_variable_features( mat, @@ -16,26 +16,28 @@ highly_variable_features( \item{mat}{(IterableMatrix) dimensions features x cells} \item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -ll features will be returned.} +all features will be returned.} \item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, and if the number of features within a bin is less than 2, the dispersion is set to 1.} -\item{save_feat_selection}{(logical) If TRUE, save the dispersions, means, and the features selected.} +\item{save_feat_selection}{(logical) If \code{TRUE}, save the dispersions, means, and the features selected.} \item{threads}{(integer) Number of threads to use.} } \value{ \itemize{ -\item If \code{save_feat_selection} is False, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item If \code{save_feat_selection} is True, return a list with the following elements: -\item \strong{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item \strong{feature_selection}: Dataframe with the following columns: +\item If \code{save_feat_selection} is \code{FALSE}, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
+\item If \code{save_feat_selection} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. +\item \code{feature_selection}: Dataframe with columns \code{name}, \code{mean}, \code{dispersion}, \code{bin}, \code{feature_dispersion_norm}. +} } } \description{ -Get the most variable features within a matrix +Get the most variable features within a matrix. } \details{ The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 299513df..ea2687f9 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -16,23 +16,24 @@ lsi( \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} -\item{z_score_norm}{(logical) If TRUE, z-score normalize the matrix before PCA.} +\item{z_score_norm}{(logical) If \code{TRUE}, z-score normalize the matrix before PCA.} \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} -\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} - -\item{save_lsi}{(logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector.} +\item{scale_factor}{(integer) Scale factor for the tf-idf log transform. +#' @param save_lsi (logical) If \code{TRUE}, save the SVD attributes for the matrix, as well as the idf normalization vector.} \item{threads}{(integer) Number of threads to use.} } \value{ \itemize{ -\item If save_lsi is FALSE, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. -\item If save_lsi is TRUE, return a list with the following elements: -\item \strong{pca_res}: dgCMatrix of shape `(n_dimensions, ncol(mat))`` -\item \strong{svd_attr}: List of SVD attributes -\item \strong{idf}: Inverse document frequency vector +\item If \code{save_lsi} is \code{FALSE}, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. 
+\item If \code{save_lsi} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{pca_res}: dgCMatrix of shape \verb{(n_dimensions, ncol(mat))} +\item \code{svd_attr}: List of SVD attributes +\item \code{idf}: Inverse document frequency vector +} } } \description{ From a7c617986ef86e6ce50a7db7af882f8d3e283f15 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 18:45:32 -0800 Subject: [PATCH 10/22] [r] add in more lsi real data tests --- r/tests/real_data/ArchR_LSI.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index e79ee98b..09c04f03 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -30,7 +30,7 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { lsi_archr <- .computeLSI( mat = test_mat, LSIMethod = 2, - nDimensions = 2, + nDimensions = 20, binarize = FALSE, outlierQuantiles = NULL ) @@ -47,10 +47,16 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { lsi_bpcells <- lsi( test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), z_score_norm = FALSE, - n_dimensions = 2, + n_dimensions = 20, save_lsi = TRUE ) pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res - testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4)) + # convert signs + lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$pca_res[,1] > 0) - 1), `*`) + # Check for post-pca matrix similarity + testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$pca_res, tolerance = 1e-4)) + # also check for correlation between the two matrices in PC space + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$pca_res)) > 0.999) } test_lsi_similarity_to_archr() \ No newline at end of file From 
acf35b2520efc9aa70fe005565837bede6888e8d Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 18 Nov 2024 12:56:18 -0800 Subject: [PATCH 11/22] [r] remove unused variable from `lsi()` --- r/R/singlecell_utils.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 13c697af..a59d1d73 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -269,8 +269,6 @@ lsi <- function( assert_is_wholenumber(threads) # log(tf-idf) transform - mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / rowSums(mat) From 47256dbaf0407641c73a3c36a33479ee6fe40fc2 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 2 Dec 2024 02:12:19 -0800 Subject: [PATCH 12/22] [r] add requested changes --- r/R/singlecell_utils.R | 37 ++++++++++--------- r/tests/real_data/ArchR_LSI.R | 15 ++++++-- r/tests/real_data/config.csv | 3 -- .../scanpy_variable_feat_selection.R | 13 ++++++- r/tests/real_data/test_helpers.R | 22 ----------- 5 files changed, 42 insertions(+), 48 deletions(-) delete mode 100644 r/tests/real_data/config.csv delete mode 100644 r/tests/real_data/test_helpers.R diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index a59d1d73..9be501fc 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -239,8 +239,8 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells -#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. 
#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. #' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. @@ -257,7 +257,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' @export lsi <- function( mat, - z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 1e4, + n_dimensions = 50L, z_score_norm = TRUE, scale_factor = 1e4, save_lsi = FALSE, threads = 1L ) { @@ -269,13 +269,17 @@ lsi <- function( assert_is_wholenumber(threads) # log(tf-idf) transform - npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) - idf_ <- ncol(mat) / rowSums(mat) + idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations - mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + mat_log_tfidf <- write_matrix_dir( + convert_matrix_type(mat_log_tfidf, type = "float"), + tempfile("mat_log_tfidf"), compress = TRUE + ) # Z-score normalization if (z_score_norm) { cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats @@ -338,12 +342,11 @@ highly_variable_features <- function( mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) feature_means <- mat_stats$row_stats["mean", ] feature_vars <- mat_stats$row_stats["variance", ] - # Give 
a small value to features with 0 mean, helps with 0 division - feature_means[feature_means == 0] <- 1e-12 # Calculate dispersion, and log normalize feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA feature_dispersion <- log(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 feature_means <- log1p(feature_means) features_df <- data.frame( name = names(feature_means), @@ -357,24 +360,22 @@ highly_variable_features <- function( dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( - bin_mean = mean(dispersion, na.rm = TRUE), - bin_sd = sd(dispersion, na.rm = TRUE), - bin_sd_is_na = is.na(bin_sd), - bin_sd = ifelse(bin_sd_is_na, bin_mean, bin_sd), # Set feats that are in bins with only one feat to have a norm dispersion of 1 - bin_mean = ifelse(bin_sd_is_na, 0, bin_mean), - feature_dispersion_norm = (dispersion - bin_mean) / bin_sd - ) %>% + feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), + feature_dispersion_norm = dplyr::if_else(n() == 1, 1, feature_dispersion_norm) # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% dplyr::ungroup() %>% dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% - dplyr::arrange(desc(feature_dispersion_norm)) %>% - dplyr::slice(1:min(num_feats, nrow(.))) + dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats) if (save_feat_selection) { + # get rownames that are in features_df$name + feats_of_interest <- which(rownames(mat) %in% features_df$name) return(list( - mat = mat[features_df$name,], + mat = mat[feats_of_interest,], feature_selection = features_df )) } - return(mat[features_df$name,]) + return(mat[feats_of_interest,]) + #return(mat[features_df$name,]) } diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 09c04f03..ff697b4c 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -6,9 +6,18 
@@ # option. This file may not be copied, modified, or distributed # except according to those terms. -source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) -devtools::load_all(config[["path_archr"]]) +library("BPCells") +library("ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} #' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. #' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. diff --git a/r/tests/real_data/config.csv b/r/tests/real_data/config.csv deleted file mode 100644 index 915de4af..00000000 --- a/r/tests/real_data/config.csv +++ /dev/null @@ -1,3 +0,0 @@ -key,value -path_archr,/mnt/c/users/Imman/PycharmProjects/ArchR/ -path_bpcells,/mnt/c/users/Imman/PycharmProjects/BPCells/r diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index 8b608773..d2eb08b2 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -6,8 +6,17 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) +library("BPCells") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} # Compare the feature selection output of BPCells to that of Scanpy. # Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. 
diff --git a/r/tests/real_data/test_helpers.R b/r/tests/real_data/test_helpers.R deleted file mode 100644 index 2da80072..00000000 --- a/r/tests/real_data/test_helpers.R +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 BPCells contributors -# -# Licensed under the Apache License, Version 2.0 or the MIT license -# , at your -# option. This file may not be copied, modified, or distributed -# except according to those terms. - -# Set up k-v pairs for other tests -config_csv <- read.csv("config.csv") -config <- as.list(config_csv)$value -names(config) <- as.list(config_csv)$key - -# Set up temp dir in case it's not already set -create_temp_dir <- function(dir = NULL) { - if (is.null(dir)) { - dir <- file.path(tempdir(), "lsi_test") - if (dir.exists(dir)) unlink(dir, recursive = TRUE) - dir.create(dir) - } - return(dir) -} \ No newline at end of file From 004499a0a7119f05bb5fc5cf3fc46058ebb3f94c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 2 Dec 2024 13:54:25 -0800 Subject: [PATCH 13/22] [r] fix requested changes --- r/R/singlecell_utils.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 9be501fc..b3d873db 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -270,6 +270,7 @@ lsi <- function( # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) @@ -360,22 +361,20 @@ highly_variable_features <- function( dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( - feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), - feature_dispersion_norm = dplyr::if_else(n() == 1, 1, feature_dispersion_norm) # Set feats that are in bins with only one feat to have a norm dispersion of 1 + feature_dispersion_norm = 
(dispersion - mean(dispersion)) / sd(dispersion), + feature_dispersion_norm = if (dplyr::n() == 1) {1} else {feature_dispersion_norm} # Set feats that are in bins with only one feat to have a norm dispersion of 1 ) %>% dplyr::ungroup() %>% - dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% - dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats) + dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats, with_ties = FALSE) + feats_of_interest <- which(rownames(mat) %in% features_df$name) # get rownames to get original sorted order if (save_feat_selection) { # get rownames that are in features_df$name - feats_of_interest <- which(rownames(mat) %in% features_df$name) return(list( mat = mat[feats_of_interest,], feature_selection = features_df )) } return(mat[feats_of_interest,]) - #return(mat[features_df$name,]) } From dd801651258686fa71b357fdd5d25345ce4a5d5b Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 3 Dec 2024 15:56:25 -0800 Subject: [PATCH 14/22] [r] fix lsi docstring, idf_ logic --- r/R/singlecell_utils.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index b3d873db..cdee4537 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -242,7 +242,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. +#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. 
#' @return #' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. @@ -270,10 +270,9 @@ lsi <- function( # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) - idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) + idf_ <- 1 / mat_stats$row_stats["mean",] mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations From 8891981744295f0687e92cf12c6d181a08349a8f Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 6 Dec 2024 16:32:58 -0800 Subject: [PATCH 15/22] [r] replace z-score norm with corr cutoffs --- r/R/singlecell_utils.R | 31 ++++++++++++++++------------ r/tests/real_data/ArchR_LSI.R | 1 - r/tests/real_data/ArchR_insertions.R | 15 +++++++++++--- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index cdee4537..28fa45b7 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -240,7 +240,8 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells #' @param n_dimensions (integer) Number of dimensions to keep during PCA. -#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. 
#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. @@ -248,6 +249,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. #' - If `save_lsi` is `TRUE`, return a list with the following elements: #' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` +#' - `unused_pcs`: Integer vector of PCs that were filtered out due to high correlation with sequencing depth #' - `svd_attr`: List of SVD attributes #' - `idf`: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. @@ -257,7 +259,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' @export lsi <- function( mat, - n_dimensions = 50L, z_score_norm = TRUE, scale_factor = 1e4, + n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, save_lsi = FALSE, threads = 1L ) { @@ -266,12 +268,13 @@ lsi <- function( assert_len(n_dimensions, 1) assert_greater_than_zero(n_dimensions) assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) assert_is_wholenumber(threads) # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- mat_stats$col_stats["mean",] * nrow(mat) - tf <- mat %>% multiply_cols(1 / npeaks) + read_depth <- mat_stats$col_stats["mean",] * nrow(mat) + tf <- mat %>% multiply_cols(1 / read_depth) idf_ <- 1 / mat_stats$row_stats["mean",] mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) @@ -280,22 +283,24 @@ lsi <- function( convert_matrix_type(mat_log_tfidf, type = "float"), tempfile("mat_log_tfidf"), compress = TRUE ) - # Z-score normalization - if 
(z_score_norm) { - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_log_tfidf <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) - } # Run pca svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) pca_res <- t(svd_attr_$u) %*% mat_log_tfidf + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + # not sure if this is the route we want to take. Should we just leave the PCs in, + # and not use them in downstream analysis? + pca_res <- t(svd_attr_$u[, pca_feats_to_keep]) %*% mat_log_tfidf + } + if(save_lsi) { return(list( pca_res = pca_res, svd_attr = svd_attr_, + unused_pcs <- which(pca_corrs >= corr_cutoff), idf = idf_ )) } diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index ff697b4c..084645d3 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -55,7 +55,6 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { # Do not use z-score normalization, as this isn't done with ArchR lsi_bpcells <- lsi( test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), - z_score_norm = FALSE, n_dimensions = 20, save_lsi = TRUE ) diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 810133c1..8818d7e7 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -6,9 +6,18 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. 
-source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) -devtools::load_all(config[["path_archr"]]) +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) From 1e7c6d062dcc50c40a1bfcd5344cb0835cd7889f Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 8 Jan 2025 17:30:41 -0800 Subject: [PATCH 16/22] [r] update LSI to use norm, feature selection helpers --- r/NAMESPACE | 9 +- r/R/singlecell_utils.R | 426 ++++++++++-------- r/R/transforms.R | 72 --- r/man/DimReduction.Rd | 21 + r/man/lsi.Rd | 37 +- r/man/project.Rd | 25 + ...> select_features_by_binned_dispersion.Rd} | 21 +- r/pkgdown/_pkgdown.yml | 9 +- r/tests/real_data/ArchR_LSI.R | 17 +- r/tests/testthat/test-singlecell_utils.R | 40 +- 10 files changed, 340 insertions(+), 337 deletions(-) create mode 100644 r/man/DimReduction.Rd create mode 100644 r/man/project.Rd rename r/man/{highly_variable_features.Rd => select_features_by_binned_dispersion.Rd} (59%) diff --git a/r/NAMESPACE b/r/NAMESPACE index c7e4fd1c..b296a27f 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -2,11 +2,16 @@ S3method(base::as.data.frame,IterableFragments) S3method(base::as.matrix,IterableMatrix) +S3method(project,DimReduction) +S3method(project,LSI) +S3method(project,default) S3method(svds,IterableMatrix) S3method(svds,default) export("all_matrix_inputs<-") export("cellNames<-") export("chrNames<-") +export(DimReduction) +export(LSI) export(add_cols) export(add_rows) export(all_matrix_inputs) @@ -48,7 +53,6 @@ export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) 
export(get_trackplot_height) -export(highly_variable_features) export(import_matrix_market) export(import_matrix_market_10x) export(knn_annoy) @@ -56,7 +60,6 @@ export(knn_hnsw) export(knn_to_geodesic_graph) export(knn_to_snn_graph) export(log1p_slow) -export(lsi) export(marker_features) export(match_gene_symbol) export(matrix_stats) @@ -87,6 +90,7 @@ export(plot_tf_footprint) export(plot_tss_profile) export(plot_tss_scatter) export(prefix_cell_names) +export(project) export(pseudobulk_matrix) export(qc_scATAC) export(range_distance_to_nearest) @@ -112,6 +116,7 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) +export(select_features_by_binned_dispersion) export(select_features_by_mean) export(select_features_by_variance) export(select_regions) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 28fa45b7..c6baece6 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -118,6 +118,233 @@ select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { } +#' Get the most variable features within a matrix. +#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @inheritParams select_features_by_variance +#' @returns +#' Return a dataframe with the following columns, sorted descending by bin-normalized dispersion: +#' - `names`: Feature name. +#' - `score`: Bin-normalized dispersion of the feature. +#' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
+#' +#' Calculate using the following process: +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. Bin the features by their means, and normalize dispersion within each bin +#' @export +select_features_by_binned_dispersion <- function( + mat, num_feats = 25000, n_bins = 20, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + features_df <- tibble::tibble( + names = rownames(mat), + score = rep(0, nrow(mat)), + highly_variable = rep(TRUE, nrow(mat)) + ) + return(mat) + } + # Calculate row information for dispersion + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] + # Calculate dispersion, and log normalize + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log1p(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 + feature_means <- log1p(feature_means) + features_df <- tibble::tibble( + names = names(feature_means), + var = feature_vars, + mean = feature_means, + dispersion = feature_dispersion + ) + # Bin by mean, and normalize dispersion with each bin + features_df <- features_df %>% + dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + score = (dispersion - mean(dispersion)) / sd(dispersion), + score = if (dplyr::n() == 1) {1} else {score} # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% + dplyr::ungroup() %>% + dplyr::arrange(desc(score)) %>% + 
dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) %>% + dplyr::select(c("names", "score", "highly_variable")) + return(features_df) +} + + +################# +# DimReduction Class Definition +################# + +#' Barebones definition of a DimReduction class. +#' +#' Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +#' Child classes should implement a `project` method to allow for the projection of other matrices using +#' the fitted transformation object. +#' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) The projected data +#' @field fitted_params (list) A list of parameters used for the transformation of a matrix. +#' @export +DimReduction <- function(x, fitted_params = list(), ...) { + assert_is(x, c("IterableMatrix", "dgCMatrix", "matrix")) + assert_is(fitted_params, "list") + structure( + list( + cell_embeddings = x, + fitted_params = fitted_params, + ... + ), + class = "DimReduction" + ) +} + +#' Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +#' @param x DimReduction object. +#' @param mat IterableMatrix object. +#' @return IterableMatrix object of the projected data. +#' @details DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. +#' All required information to run a projection should be held in x$fitted_params, including pertinent parameters when construction the DimReduction subclass object. +#' If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. +#' If there are rownames, reorder the matrix to match the order of the original matrix +#' @export +project <- function(x, mat, ...) { + UseMethod("project") +} +#' @export +project.default <- function(x, mat, ...) 
{ + rlang::abort("project method not implemented for BPCells objects.") +} +#' @export +project.DimReduction <- function(x, mat, ...) { + rlang::abort("project method not implemented for base DimReduction object.") +} + +################# +# LSI Implementation +################# + + +#' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. +#' @param mat (IterableMatrix) dimensions features x cells. +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. +#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. +#' @param threads (integer) Number of threads to use. +#' @returns An object of class `c("LSI", "DimReduction")` with the following attributes: +#' - `cell_embeddings`: The projected data +#' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: +#' - `normalization_method`: The normalization method used +#' - `feature_means`: The means of the features used for normalization +#' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth +#' - `svd_params`: The matrix calculated for SVD +#' - `feature_names`: The names of the features in the matrix +#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
+#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - 17.1 MB memory usage, 25.1 seconds runtime +#' @seealso `project()` `DimReduction()` `normalize_tfidf()` +#' @export +LSI <- function( + mat, n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) + assert_is_wholenumber(threads) + + # Normalization + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + + # Save to prevent re-calculation of queued operations + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + tempfile("mat"), compress = TRUE + ) + # Run pca + svd_attr <- svds(mat, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr$u) %*% mat + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) + pca_res <- pca_res[pca_feats_to_keep, ] + } + fitted_params <- list( + normalization_method = normalize, + feature_means = mat_stats$row_stats["mean", ], + pcs_to_keep = pca_feats_to_keep, + svd_params = svd_attr + ) + res <- DimReduction( + x = pca_res, + fitted_params = fitted_params, + feature_names = rownames(mat) + ) + class(res) <- c("LSI", class(res)) + return(res) +} + + +#' @export +project.LSI <- function(x, mat, threads = 1L, ...) 
{ + assert_is(mat, "IterableMatrix") + assert_is(x, "LSI") + + fitted_params <- x$fitted_params + # Do a check to make sure that the number of rows in the matrix is the same as the number of rows in SVD$u + assert_true(nrow(mat) == nrow(fitted_params$svd_params$u)) + if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { + assert_true(all(x$feature_names %in% rownames(mat))) + mat <- mat[x$feature_names, ] + } + + if (!is.null(fitted_params$normalization_method)) { + mat <- fitted_params$normalization_method( + mat, + feature_means = fitted_params$feature_means, + threads = threads + ) + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + tempfile("mat"), compress = TRUE + ) + } + pca_attr <- fitted_params$svd_params + res <- t(pca_attr$u) %*% mat + if (length(fitted_params$pcs_to_keep) != nrow(res)) { + res <- res[fitted_params$pcs_to_keep, ] + } + return(res) +} + + #' Test for marker features #' #' Given a features x cells matrix, perform one-vs-all differential @@ -182,205 +409,6 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } - -#' Aggregate counts matrices by cell group or feature. -#' -#' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each -#' feature. -#' @param cell_groups (Character/factor) Vector of group/cluster assignments for each cell. Length must be `ncol(mat)`. -#' @param method (Character vector) Method(s) to aggregate counts. If one method is provided, the output will be a matrix. If multiple methods are provided, the output will be a named list of matrices. -#' -#' Current options are: `nonzeros`, `sum`, `mean`, `variance`. -#' @param threads (integer) Number of threads to use. -#' @return -#' - If `method` is length `1`, returns a matrix of shape `(features x groups)`. -#' - If `method` is greater than length `1`, returns a list of matrices with each matrix representing a pseudobulk matrix with a different aggregation method. 
-#' Each matrix is of shape `(features x groups)`, and names are one of `nonzeros`, `sum`, `mean`, `variance`. -#' @details Some simpler stats are calculated in the process of calculating more complex -#' statistics. So when calculating `variance`, `nonzeros` and `mean` can be included with no -#' extra calculation time, and when calculating `mean`, adding `nonzeros` will take no extra time. -#' @inheritParams marker_features -#' @export -pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { - assert_is(mat, "IterableMatrix") - assert_is(cell_groups, c("factor", "character", "numeric")) - assert_true(length(cell_groups) == ncol(mat)) - cell_groups <- as.factor(cell_groups) - assert_is(method, "character") - methods <- c("variance", "mean", "sum", "nonzeros") - for (m in method) { - if (!(m %in% methods)) { - rlang::abort(sprintf("method must be one of: %s", paste(methods, collapse = ", "))) - } - } - assert_is(threads, "integer") - # if multiple methods are provided, only need to pass in the top method as it will also calculate the less complex stats - iter <- iterate_matrix(parallel_split(mat, threads, threads*4)) - res <- pseudobulk_matrix_cpp(iter, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) - # if res is a single matrix, return with colnames and rownames - if (length(method) == 1) { - colnames(res[[method]]) <- levels(cell_groups) - rownames(res[[method]]) <- rownames(mat) - return(res[[method]]) - } - # give colnames and rownames for each matrix in res, which is a named list - for (res_slot in names(res)) { - if ((length(res[[res_slot]]) == 0) || !(res_slot %in% method)) { - res[[res_slot]] <- NULL - } else { - colnames(res[[res_slot]]) <- levels(cell_groups) - rownames(res[[res_slot]]) <- rownames(mat) - } - } - return(res) -} - -#' Perform latent semantic indexing (LSI) on a matrix. 
-#' -#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. -#' @param mat (IterableMatrix) dimensions features x cells -#' @param n_dimensions (integer) Number of dimensions to keep during PCA. -#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to -#' the corr_cutoff, it will be excluded from the final PCA matrix. -#' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. -#' @param threads (integer) Number of threads to use. -#' @return -#' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. -#' - If `save_lsi` is `TRUE`, return a list with the following elements: -#' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` -#' - `unused_pcs`: Integer vector of PCs that were filtered out due to high correlation with sequencing depth -#' - `svd_attr`: List of SVD attributes -#' - `idf`: Inverse document frequency vector -#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
-#' -#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: -#' - 17.1 MB memory usage, 25.1 seconds runtime -#' @export -lsi <- function( - mat, - n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, - save_lsi = FALSE, - threads = 1L -) { - assert_is(mat, "IterableMatrix") - assert_is_wholenumber(n_dimensions) - assert_len(n_dimensions, 1) - assert_greater_than_zero(n_dimensions) - assert_true(n_dimensions < min(ncol(mat), nrow(mat))) - assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) - assert_is_wholenumber(threads) - - # log(tf-idf) transform - mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - read_depth <- mat_stats$col_stats["mean",] * nrow(mat) - tf <- mat %>% multiply_cols(1 / read_depth) - idf_ <- 1 / mat_stats$row_stats["mean",] - mat_tfidf <- tf %>% multiply_rows(idf_) - mat_log_tfidf <- log1p(scale_factor * mat_tfidf) - # Save to prevent re-calculation of queued operations - mat_log_tfidf <- write_matrix_dir( - convert_matrix_type(mat_log_tfidf, type = "float"), - tempfile("mat_log_tfidf"), compress = TRUE - ) - # Run pca - svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) - pca_res <- t(svd_attr_$u) %*% mat_log_tfidf - - # Filter out PCs that are highly correlated with sequencing depth - pca_corrs <- abs(cor(read_depth, t(pca_res))) - pca_feats_to_keep <- which(pca_corrs < corr_cutoff) - if (length(pca_feats_to_keep) != n_dimensions) { - # not sure if this is the route we want to take. Should we just leave the PCs in, - # and not use them in downstream analysis? - pca_res <- t(svd_attr_$u[, pca_feats_to_keep]) %*% mat_log_tfidf - } - - if(save_lsi) { - return(list( - pca_res = pca_res, - svd_attr = svd_attr_, - unused_pcs <- which(pca_corrs >= corr_cutoff), - idf = idf_ - )) - } - return(pca_res) -} - -#' Get the most variable features within a matrix. -#' @param num_feats (integer) Number of features to return. 
If the number is higher than the number of features in the matrix, -#' all features will be returned. -#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, -#' and if the number of features -#' within a bin is less than 2, the dispersion is set to 1. -#' @param save_feat_selection (logical) If `TRUE`, save the dispersions, means, and the features selected. -#' @returns -#' - If `save_feat_selection` is `FALSE`, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - If `save_feat_selection` is `TRUE`, return a list with the following elements: -#' - `mat`: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - `feature_selection`: Dataframe with columns `name`, `mean`, `dispersion`, `bin`, `feature_dispersion_norm`. -#' @inheritParams lsi -#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). -#' -#' Calculate using the following process: -#' 1. Calculate the dispersion of each feature (variance / mean) -#' 2. Log normalize dispersion and mean -#' 3. 
Bin the features by their means, and normalize dispersion within each bin -#' @export -highly_variable_features <- function( - mat, num_feats, n_bins = 20, - save_feat_selection = FALSE, - threads = 1L -) { - assert_is(mat, "IterableMatrix") - assert_greater_than_zero(num_feats) - assert_is_wholenumber(num_feats) - assert_len(num_feats, 1) - assert_is_wholenumber(n_bins) - assert_len(n_bins, 1) - assert_greater_than_zero(n_bins) - if (nrow(mat) <= num_feats) { - log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) - return(mat) - } - # Calculate row information for dispersion - mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) - feature_means <- mat_stats$row_stats["mean", ] - feature_vars <- mat_stats$row_stats["variance", ] - # Calculate dispersion, and log normalize - feature_dispersion <- feature_vars / feature_means - feature_dispersion[feature_dispersion == 0] <- NA - feature_dispersion <- log(feature_dispersion) - feature_dispersion[feature_means == 0] <- 0 - feature_means <- log1p(feature_means) - features_df <- data.frame( - name = names(feature_means), - var = feature_vars, - mean = feature_means, - dispersion = feature_dispersion - ) - - # Bin by mean, and normalize dispersion with each bin - features_df <- features_df %>% - dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% - dplyr::group_by(bin) %>% - dplyr::mutate( - feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), - feature_dispersion_norm = if (dplyr::n() == 1) {1} else {feature_dispersion_norm} # Set feats that are in bins with only one feat to have a norm dispersion of 1 - ) %>% - dplyr::ungroup() %>% - dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats, with_ties = FALSE) - feats_of_interest <- which(rownames(mat) %in% features_df$name) # get rownames to get original sorted order - if (save_feat_selection) { - # get rownames that are in features_df$name 
- return(list( - mat = mat[feats_of_interest,], - feature_selection = features_df - )) - } - return(mat[feats_of_interest,]) -} - #' Aggregate counts matrices by cell group or feature. #' diff --git a/r/R/transforms.R b/r/R/transforms.R index 7a110677..8a2bd25b 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -983,76 +983,4 @@ normalize_tfidf <- function( idf <- 1 / feature_means tf_idf_mat <- tf %>% multiply_rows(idf) return(log1p(tf_idf_mat * scale_factor)) -} -#' Compute LSI For a peak matrix -#' @param mat PeakMatrix -#' @param n_dimensions Number of dimensions to keep during PCA -#' @param scale_factor Scale factor for the tf-idf log transform -#' @param verbose Whether to print out progress -#' @param threads Number of threads to use -#' @return dgCMatrix of shape (n_dimensions, ncol(mat)) -#' @details Compute LSI through first doing a tf-idf transform, a z-score normalization, then PCA. -#' Tf-idf implementation is from Stuart & Butler et al. 2019. -#' @export -compute_lsi <- function(mat, n_dimensions = 50, scale_factor = 1e-4, verbose = FALSE, threads = 1) { - assert_is(mat, "IterableMatrix") # Should be a peak matrix, should we enforce this? 
- assert_is(n_dimensions, "integer") - assert_len(n_dimensions, 1) - assert_greater_than_zero(n_dimensions) - assert_true(n_dimensions < min(ncol(mat), nrow(mat))) - - # Signac implementation - npeaks <- colSums(mat) - tf <- mat %>% multiply_cols(1/npeaks) - idf_ <- ncol(mat) / rowSums(mat) - mat_tfidf <- tf %>% multiply_rows(idf_) - mat_log_tfidf <- log1p(scale_factor * mat_tfidf) - - # run z-score norm - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance")$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_lsi_norm <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) - - # Run pca - svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions) - pca_res <- t(svd_attr$u) %*% mat_lsi_norm - return(pca_res) -} - -#' Get most variable features, given a non-log normalized matrix -highly_variable_features <- function(mat, num_feats, n_bins, verbose = FALSE) { - assert_is(mat, "IterableMatrix") - assert_is(num_feats, "integer") - assert_greater_than_zero(num_feats) - assert_len(num_feats, 1) - assert_is(n_bins, "integer") - assert_len(n_bins, 1) - assert_greater_than_zero(n_bins) - if (nrow(mat) <= num_feats) { - if (verbose) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), - returning all features", nrow(mat), num_feats)) - return(mat) - } - # Calculate the mean and variance of each feature - # should we set entries that are 0 to NA? 
- feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats['mean', ] - feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats['variance', ] - feature_dispersion <- feature_vars / feature_means - feature_dispersion <- log(feature_dispersion) - feature_means <- log1p(feature_means) - mean_bins <- cut(feature_means, n_bins, labels = FALSE) - - # Calculate the mean and variance of dispersion of each bin - bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x)) - bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x)) - # Set bin_sd value to bin_mean if bin_sd is NA, results in norm dispersion of 1 - bin_sd[is.na(bin_sd)] <- bin_mean[is.na(bin_sd)] - # map mean_bins indices to bin_stats - feature_dispersion_norm <- (feature_dispersion - bin_mean[mean_bins]) / bin_sd[mean_bins] - names(feature_dispersion_norm) <- names(feature_dispersion) - variable_features_ <- sort(feature_dispersion_norm)[nrow(mat)-num_feats:nrow(mat)] - return(mat[names(variable_features_), ]) } \ No newline at end of file diff --git a/r/man/DimReduction.Rd b/r/man/DimReduction.Rd new file mode 100644 index 00000000..2bb7486c --- /dev/null +++ b/r/man/DimReduction.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{DimReduction} +\alias{DimReduction} +\title{Barebones definition of a DimReduction class.} +\usage{ +DimReduction(x, fitted_params = list(), ...) +} +\description{ +Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +Child classes should implement a \code{project} method to allow for the projection of other matrices using +the fitted transformation object. 
+} +\section{Fields}{ + +\describe{ +\item{\code{cell_embeddings}}{(IterableMatrix, dgCMatrix, matrix) The projected data} + +\item{\code{fitted_params}}{(list) A list of parameters used for the transformation of a matrix.} +}} + diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index ea2687f9..13348e23 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -1,39 +1,41 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/singlecell_utils.R -\name{lsi} -\alias{lsi} +\name{LSI} +\alias{LSI} \title{Perform latent semantic indexing (LSI) on a matrix.} \usage{ -lsi( +LSI( mat, - z_score_norm = TRUE, n_dimensions = 50L, - scale_factor = 10000, - save_lsi = FALSE, + corr_cutoff = 1, + normalize = normalize_tfidf, threads = 1L ) } \arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{z_score_norm}{(logical) If \code{TRUE}, z-score normalize the matrix before PCA.} +\item{mat}{(IterableMatrix) dimensions features x cells.} \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} -\item{scale_factor}{(integer) Scale factor for the tf-idf log transform. -#' @param save_lsi (logical) If \code{TRUE}, save the SVD attributes for the matrix, as well as the idf normalization vector.} +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +the corr_cutoff, it will be excluded from the final PCA matrix.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} \item{threads}{(integer) Number of threads to use.} } \value{ +An object of class \code{c("LSI", "DimReduction")} with the following attributes: \itemize{ -\item If \code{save_lsi} is \code{FALSE}, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. 
-\item If \code{save_lsi} is \code{TRUE}, return a list with the following elements: +\item \code{cell_embeddings}: The projected data +\item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: \itemize{ -\item \code{pca_res}: dgCMatrix of shape \verb{(n_dimensions, ncol(mat))} -\item \code{svd_attr}: List of SVD attributes -\item \code{idf}: Inverse document frequency vector +\item \code{normalization_method}: The normalization method used +\item \code{feature_means}: The means of the features used for normalization +\item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth +\item \code{svd_params}: The matrix calculated for SVD } +\item \code{feature_names}: The names of the features in the matrix } } \description{ @@ -47,3 +49,6 @@ Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: \item 17.1 MB memory usage, 25.1 seconds runtime } } +\seealso{ +\code{project()} \code{DimReduction()} \code{normalize_tfidf()} +} diff --git a/r/man/project.Rd b/r/man/project.Rd new file mode 100644 index 00000000..dcab31b5 --- /dev/null +++ b/r/man/project.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{project} +\alias{project} +\title{Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object.} +\usage{ +project(x, mat, ...) +} +\arguments{ +\item{x}{DimReduction object.} + +\item{mat}{IterableMatrix object.} +} +\value{ +IterableMatrix object of the projected data. +} +\description{ +Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +} +\details{ +DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. 
+All required information to run a projection should be held in x$fitted_params, including pertinent parameters when constructing the DimReduction subclass object.
+If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features.
+If there are rownames, reorder the matrix to match the order of the original matrix.
+}
diff --git a/r/man/highly_variable_features.Rd b/r/man/select_features_by_binned_dispersion.Rd
similarity index 59%
rename from r/man/highly_variable_features.Rd
rename to r/man/select_features_by_binned_dispersion.Rd
index 005a4e37..b3c573ae 100644
--- a/r/man/highly_variable_features.Rd
+++ b/r/man/select_features_by_binned_dispersion.Rd
@@ -1,14 +1,13 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/singlecell_utils.R
-\name{highly_variable_features}
-\alias{highly_variable_features}
+\name{select_features_by_binned_dispersion}
+\alias{select_features_by_binned_dispersion}
 \title{Get the most variable features within a matrix.}
 \usage{
-highly_variable_features(
+select_features_by_binned_dispersion(
   mat,
-  num_feats,
+  num_feats = 25000,
   n_bins = 20,
-  save_feat_selection = FALSE,
   threads = 1L
 )
 }
@@ -22,18 +21,14 @@ all features will be returned.}
 and if the number of features
 within a bin is less than 2, the dispersion is set to 1.}
 
-\item{save_feat_selection}{(logical) If \code{TRUE}, save the dispersions, means, and the features selected.}
-
 \item{threads}{(integer) Number of threads to use.}
 }
 \value{
+Return a dataframe with the following columns, sorted descending by bin-normalized dispersion:
 \itemize{
-\item If \code{save_feat_selection} is \code{FALSE}, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
-\item If \code{save_feat_selection} is \code{TRUE}, return a list with the following elements: -\itemize{ -\item \code{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item \code{feature_selection}: Dataframe with columns \code{name}, \code{mean}, \code{dispersion}, \code{bin}, \code{feature_dispersion_norm}. -} +\item \code{names}: Feature name. +\item \code{score}: Bin-normalized dispersion of the feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. } } \description{ diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 3cf6509a..64f1f119 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -135,9 +135,9 @@ reference: - normalize_tfidf - select_features_by_variance - select_features_by_dispersion + - select_features_by_binned_dispersion - select_features_by_mean - - lsi - - highly_variable_features + - LSI - IterableMatrix-methods - pseudobulk_matrix @@ -156,6 +156,11 @@ reference: - knn_to_graph - cluster_membership_matrix +- title: "Dimensionality Reductions" +- contents: + - DimReduction + - project + - title: "Plots" diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 084645d3..de775f4f 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -36,7 +36,7 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { # Calculate LSI on ArchR # running LSI without binarizing, as we don't do this in the BPCells implementation # we also don't filter quantile outliers. 
- lsi_archr <- .computeLSI( + lsi_archr <- ArchR:::.computeLSI( mat = test_mat, LSIMethod = 2, nDimensions = 20, @@ -53,18 +53,17 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr # Calculate LSI on BPCells # Do not use z-score normalization, as this isn't done with ArchR - lsi_bpcells <- lsi( - test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), - n_dimensions = 20, - save_lsi = TRUE + lsi_bpcells <- LSI( + test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), + n_dimensions = 20 ) - pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res + pre_svd_mat_approx_bpcells <- lsi_bpcells$fitted_params$svd_params$u %*% lsi_bpcells$cell_embeddings testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4)) # convert signs - lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$pca_res[,1] > 0) - 1), `*`) + lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$cell_embeddings[,1] > 0) - 1), `*`) # Check for post-pca matrix similarity - testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$pca_res, tolerance = 1e-4)) + testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$cell_embeddings, tolerance = 1e-4)) # also check for correlation between the two matrices in PC space - testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$pca_res)) > 0.999) + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$cell_embeddings)) > 0.999) } test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index c699d67b..93c62a72 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -36,25 +36,6 @@ test_that("select_features works general case", { }) -test_that("select_features works general 
case", { - m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") - for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { - res <- do.call(fn, list(m1, num_feats = 10)) - expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting - expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable - expect_setequal(res$names, rownames(m1)) - res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows - res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) - expect_identical(res_more_feats_than_rows, res_feats_equal_rows) - if (fn != "select_features_by_mean") { - # Check that normalization actually does something - res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) - expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) - } - } -}) - - test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -206,11 +187,16 @@ test_that("Pseudobulk aggregation works with multiple return types", { -test_that("Highly variable feature selection works", { +test_that("Feature selection by bin variance works", { mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat - res <- highly_variable_features(mat, num_feats = 10, n_bins = 5, threads = 1) - res_t <- highly_variable_features(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_table <- select_features_by_bin_variance(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_by_bin_variance(t(mat), num_feats = 10, n_bins = 5, threads = 1) + browser() + res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res <- mat[res_feats,] + res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res_t <- t(mat[,res_feats_t]) expect_equal(nrow(res), 10) expect_equal(ncol(res), 26) expect_equal(nrow(res_t), 10) @@ -222,10 +208,16 @@ test_that("LSI works", { rownames(mat) <- paste0("feat", seq_len(nrow(mat))) colnames(mat) <- paste0("cell", seq_len(ncol(mat))) # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to ArchR - lsi_res <- lsi(mat, n_dimensions = 5) - lsi_res_t <- lsi(t(mat), n_dimensions = 5) + lsi_res_obj <- LSI(mat, n_dimensions = 5) + lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) + lsi_res <- lsi_res_obj$cell_embeddings + lsi_res_t <- lsi_res_t_obj$cell_embeddings + # Check that projection results in the same output if used on the same input matrix + lsi_res_proj <- project(lsi_res_obj, mat) + expect_equal(nrow(lsi_res), 5) expect_equal(ncol(lsi_res), ncol(mat)) expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) + expect_equal(lsi_res, lsi_res_proj) }) \ No newline at end of file From e9c302e5e9b601722659b34b566584e1ae8191e6 Mon Sep 17 00:00:00 2001 From: Immanuel Abdi Date: Thu, 9 Jan 2025 22:25:46 -0800 Subject: [PATCH 17/22] [r] update `NEWS.md` --- r/NEWS.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 24b6b0d9..86d75556 100644 --- a/r/NEWS.md +++ b/r/NEWS.md 
@@ -24,9 +24,8 @@ Contributions welcome :) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) - Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) -- Add MACS2/3 input creation and peak calling through `call_macs_peaks()` (pull request #118) -- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #156). -- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #156). +- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #181). +- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #181). ## Improvements - `trackplot_loop()` now accepts discrete color scales From 7ed6bd7d59074faaef6016a431acff31e3d5a2ba Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 10 Jan 2025 02:32:12 -0800 Subject: [PATCH 18/22] [r] remove test artifacts --- r/tests/testthat/test-singlecell_utils.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 93c62a72..85416f71 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -192,11 +192,11 @@ test_that("Feature selection by bin variance works", { # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat res_table <- select_features_by_bin_variance(mat, num_feats = 10, n_bins = 5, threads = 1) res_table_t <- select_features_by_bin_variance(t(mat), num_feats = 10, n_bins = 5, threads = 1) - browser() res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res <- mat[res_feats,] res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res_t <- t(mat[,res_feats_t]) + expect_equal(nrow(res), 10) expect_equal(ncol(res), 26) expect_equal(nrow(res_t), 10) @@ -220,4 +220,5 @@ test_that("LSI works", { expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) expect_equal(lsi_res, lsi_res_proj) -}) \ No newline at end of file +}) + From d67b7db51f7dba0eae8ecb1b02463042780a8542 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 13:38:25 -0800 Subject: [PATCH 19/22] [r] add logging, partial args --- r/R/singlecell_utils.R | 17 ++++++++++++++--- r/tests/testthat/test-singlecell_utils.R | 7 +++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index c6baece6..6ec76e37 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -265,8 +265,16 @@ project.DimReduction <- function(x, mat, ...) 
{ #' @export LSI <- function( mat, n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf, - threads = 1L + threads = 1L, verbose = FALSE ) { + if (rlang::is_missing(mat)) { + return( + purrr::partial( + LSI, n_dimensions = n_dimensions, corr_cutoff = corr_cutoff, + normalize = normalize, threads = threads, verbose = verbose + ) + ) + } assert_is(mat, "IterableMatrix") assert_is_wholenumber(n_dimensions) assert_len(n_dimensions, 1) @@ -274,8 +282,10 @@ LSI <- function( assert_true(n_dimensions < min(ncol(mat), nrow(mat))) assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) assert_is_wholenumber(threads) - + + if (verbose) log_progress("Starting LSI") # Normalization + if (verbose) log_progress("Normalizing matrix") mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) if (!is.null(normalize)) mat <- normalize(mat, threads = threads) @@ -286,6 +296,7 @@ LSI <- function( tempfile("mat"), compress = TRUE ) # Run pca + if (verbose) log_progress("Calculating SVD") svd_attr <- svds(mat, k = n_dimensions, threads = threads) pca_res <- t(svd_attr$u) %*% mat @@ -293,7 +304,7 @@ LSI <- function( pca_corrs <- abs(cor(read_depth, t(pca_res))) pca_feats_to_keep <- which(pca_corrs < corr_cutoff) if (length(pca_feats_to_keep) != n_dimensions) { - log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) + if (verbose) log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) pca_res <- pca_res[pca_feats_to_keep, ] } fitted_params <- list( diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 85416f71..034d765a 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -190,8 +190,8 @@ 
test_that("Pseudobulk aggregation works with multiple return types", { test_that("Feature selection by bin variance works", { mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat - res_table <- select_features_by_bin_variance(mat, num_feats = 10, n_bins = 5, threads = 1) - res_table_t <- select_features_by_bin_variance(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_table <- select_features_by_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_by_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res <- mat[res_feats,] res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) @@ -210,6 +210,8 @@ test_that("LSI works", { # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR lsi_res_obj <- LSI(mat, n_dimensions = 5) lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) + # Also check partial args + lsi_res_obj_partial <- LSI(n_dimensions = 5)(mat) lsi_res <- lsi_res_obj$cell_embeddings lsi_res_t <- lsi_res_t_obj$cell_embeddings # Check that projection results in the same output if used on the same input matrix @@ -219,6 +221,7 @@ test_that("LSI works", { expect_equal(ncol(lsi_res), ncol(mat)) expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) + expect_equal(lsi_res_obj, lsi_res_obj_partial) expect_equal(lsi_res, lsi_res_proj) }) From 76f4c7d2955c24b463a58b00f2299f331441a09c Mon Sep 17 00:00:00 2001 From: Immanuel Abdi <56730419+immanuelazn@users.noreply.github.com> Date: Fri, 10 Jan 2025 12:58:07 -0800 Subject: [PATCH 20/22] [r] change check for `pseudobulk_matrix()` to use whole number instead of integer (#183) --- r/R/singlecell_utils.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 34ed7dd6..b222a931 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -456,10 +456,14 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { rlang::abort(sprintf("method must be one of: %s", paste(methods, collapse = ", "))) } } - assert_is(threads, "integer") - # if multiple methods are provided, only need to pass in the top method as it will also calculate the less complex stats - iter <- iterate_matrix(parallel_split(mat, threads, threads*4)) - res <- pseudobulk_matrix_cpp(iter, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) + assert_is_wholenumber(threads) + + it <- mat %>% + convert_matrix_type("double") %>% + parallel_split(threads, threads*4) %>% + iterate_matrix() + + res <- pseudobulk_matrix_cpp(it, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = 
mat@transpose) # if res is a single matrix, return with colnames and rownames if (length(method) == 1) { colnames(res[[method]]) <- levels(cell_groups) From 5e3a7fed33e786d7a550394043a615f9128da457 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 17 Jan 2025 21:34:51 -0800 Subject: [PATCH 21/22] [r] update feature selection documentation --- r/R/singlecell_utils.R | 16 ++----- r/man/feature_selection.Rd | 23 ++++++++++ r/man/lsi.Rd | 3 +- r/man/select_features_by_binned_dispersion.Rd | 46 ------------------- r/pkgdown/_pkgdown.yml | 1 - 5 files changed, 29 insertions(+), 60 deletions(-) delete mode 100644 r/man/select_features_by_binned_dispersion.Rd diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index b222a931..d2ce7bad 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -121,22 +121,14 @@ select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { return(features_df) } - -#' Get the most variable features within a matrix. -#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, -#' all features will be returned. +#' @rdname feature_selection #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @inheritParams select_features_by_variance #' @returns -#' Return a dataframe with the following columns, sorted descending by bin-normalized dispersion: -#' - `names`: Feature name. -#' - `score`: Bin-normalized dispersion of the feature. -#' - `highly_variable`: Logical vector of whether the feature is highly variable. -#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
-#' -#' Calculate using the following process: +#' - `select_features_by_binned_dispersion`: Score representing the bin normalized dispersion of each feature. +#' @details +#' `select_features_by_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): #' 1. Calculate the dispersion of each feature (variance / mean) #' 2. Log normalize dispersion and mean #' 3. Bin the features by their means, and normalize dispersion within each bin diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index c57a8e40..007ba177 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -4,6 +4,7 @@ \alias{select_features_by_variance} \alias{select_features_by_dispersion} \alias{select_features_by_mean} +\alias{select_features_by_binned_dispersion} \title{Feature selection functions} \usage{ select_features_by_variance( @@ -21,6 +22,13 @@ select_features_by_dispersion( ) select_features_by_mean(mat, num_feats = 25000, threads = 1L) + +select_features_by_binned_dispersion( + mat, + num_feats = 25000, + n_bins = 20, + threads = 1L +) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} @@ -31,6 +39,10 @@ all features will be returned.} \item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} \item{threads}{(integer) Number of threads to use.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} } \value{ Return a dataframe with the following columns, sorted descending by score: @@ -52,6 +64,10 @@ Each different feature selection method will have a different scoring method: \itemize{ \item \code{select_features_by_mean}: Score representing the mean accessibility of each feature. 
} + +\itemize{ +\item \code{select_features_by_binned_dispersion}: Score representing the bin normalized dispersion of each feature. +} } \description{ Apply a feature selection method to a \verb{(features x cells)} matrix. @@ -75,4 +91,11 @@ Apply a feature selection method to a \verb{(features x cells)} matrix. \item Get the sum of each binarized feature. \item Find \code{num_feats} features with the highest accessibility. } + +\code{select_features_by_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} } diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 13348e23..dff5c88e 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -9,7 +9,8 @@ LSI( n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf, - threads = 1L + threads = 1L, + verbose = FALSE ) } \arguments{ diff --git a/r/man/select_features_by_binned_dispersion.Rd b/r/man/select_features_by_binned_dispersion.Rd deleted file mode 100644 index b3c573ae..00000000 --- a/r/man/select_features_by_binned_dispersion.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{select_features_by_binned_dispersion} -\alias{select_features_by_binned_dispersion} -\title{Get the most variable features within a matrix.} -\usage{ -select_features_by_binned_dispersion( - mat, - num_feats = 25000, - n_bins = 20, - threads = 1L -) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -all features will be returned.} - -\item{n_bins}{(integer) Number of bins for binning mean gene expression. 
Normalizing dispersion is done with respect to each bin, -and if the number of features -within a bin is less than 2, the dispersion is set to 1.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -Return a dataframe with the following columns, sorted descending by bin-normalized dispersion: -\itemize{ -\item \code{names}: Feature name. -\item \code{score}: Bin-normalized dispersion of the feature. -\item \code{highly_variable}: Logical vector of whether the feature is highly variable. -} -} -\description{ -Get the most variable features within a matrix. -} -\details{ -The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). - -Calculate using the following process: -\enumerate{ -\item Calculate the dispersion of each feature (variance / mean) -\item Log normalize dispersion and mean -\item Bin the features by their means, and normalize dispersion within each bin -} -} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index b34d528c..c83d52d4 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -132,7 +132,6 @@ reference: - checksum - apply_by_row - regress_out - - select_features_by_binned_dispersion - LSI - IterableMatrix-methods - pseudobulk_matrix From 6c4285b0647a27b7c0b43a5c6fffebfd07a6e872 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 24 Jan 2025 17:46:12 -0800 Subject: [PATCH 22/22] [r] fix binned dispersion naming --- r/NAMESPACE | 2 +- r/R/singlecell_utils.R | 19 ++++++++++++------- r/man/feature_selection.Rd | 7 ++++--- r/tests/testthat/test-singlecell_utils.R | 8 ++++---- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 314978df..2f78fe51 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -116,7 +116,7 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) -export(select_features_by_binned_dispersion) +export(select_features_binned_dispersion) 
export(select_features_mean) export(select_features_variance) export(select_regions) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index e429a257..773c5fc6 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -150,14 +150,14 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. #' @returns -#' - `select_features_by_binned_dispersion`: Score representing the bin normalized dispersion of each feature. +#' - `select_features_binned_dispersion`: Score representing the bin normalized dispersion of each feature. #' @details -#' `select_features_by_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): +#' `select_features_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satija et al. 2015): #' 1. Calculate the dispersion of each feature (variance / mean) #' 2. Log normalize dispersion and mean #' 3. 
Bin the features by their means, and normalize dispersion within each bin #' @export -select_features_by_binned_dispersion <- function( +select_features_binned_dispersion <- function( mat, num_feats = 25000, n_bins = 20, threads = 1L ) { @@ -289,9 +289,14 @@ LSI <- function( ) { if (rlang::is_missing(mat)) { return( - purrr::partial( - LSI, n_dimensions = n_dimensions, corr_cutoff = corr_cutoff, - normalize = normalize, threads = threads, verbose = verbose + create_partial( + missing_args = list( + n_dimensions = missing(n_dimensions), + corr_cutoff = missing(corr_cutoff), + normalize = missing(normalize), + threads = missing(threads), + verbose = missing(verbose) + ) ) ) } @@ -308,7 +313,7 @@ LSI <- function( if (verbose) log_progress("Normalizing matrix") mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) - if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) # Save to prevent re-calculation of queued operations mat <- write_matrix_dir( diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 51756a83..c391c795 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -4,6 +4,7 @@ \alias{select_features_variance} \alias{select_features_dispersion} \alias{select_features_mean} +\alias{select_features_binned_dispersion} \title{Feature selection functions} \usage{ select_features_variance(mat, num_feats = 0.05, normalize = NULL, threads = 1L) @@ -17,7 +18,7 @@ select_features_dispersion( select_features_mean(mat, num_feats = 0.05, normalize = NULL, threads = 1L) -select_features_by_binned_dispersion( +select_features_binned_dispersion( mat, num_feats = 25000, n_bins = 20, @@ -62,7 +63,7 @@ Each different feature selection method will have a different scoring method: } \itemize{ -\item \code{select_features_by_binned_dispersion}: Score 
representing the bin normalized dispersion of each feature. +\item \code{select_features_binned_dispersion}: Score representing the bin normalized dispersion of each feature. } } \description{ @@ -88,7 +89,7 @@ Apply a feature selection method to a \verb{(features x cells)} matrix. \item Find \code{num_feats} features with the highest accessibility. } -\code{select_features_by_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): +\code{select_features_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satija et al. 2015): \enumerate{ \item Calculate the dispersion of each feature (variance / mean) \item Log normalize dispersion and mean diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 441ea851..33effb58 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -197,8 +197,8 @@ test_that("Pseudobulk aggregation works with multiple return types", { test_that("Feature selection by bin variance works", { mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat - res_table <- select_features_by_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) - res_table_t <- select_features_by_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_table <- select_features_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res <- mat[res_feats,] res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) @@ -218,7 +218,7 @@ test_that("LSI works", { lsi_res_obj <- LSI(mat, n_dimensions = 5) lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) # Also check partial args - lsi_res_obj_partial <- LSI(n_dimensions = 5)(mat) + lsi_res_obj_partial <- LSI(n_dimensions = 5, normalize = normalize_tfidf(scale_factor = 10000, threads = 4), threads = 4)(mat) lsi_res <- lsi_res_obj$cell_embeddings lsi_res_t <- lsi_res_t_obj$cell_embeddings # Check that projection results in the same output if used on the same input matrix @@ -228,7 +228,7 @@ test_that("LSI works", { expect_equal(ncol(lsi_res), ncol(mat)) expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) - expect_equal(lsi_res_obj, lsi_res_obj_partial) + expect_equal(lsi_res_obj$cell_embeddings, lsi_res_obj_partial$cell_embeddings) expect_equal(lsi_res, lsi_res_proj) })