diff --git a/r/NAMESPACE b/r/NAMESPACE index e9998384..2f78fe51 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -2,11 +2,16 @@ S3method(base::as.data.frame,IterableFragments) S3method(base::as.matrix,IterableMatrix) +S3method(project,DimReduction) +S3method(project,LSI) +S3method(project,default) S3method(svds,IterableMatrix) S3method(svds,default) export("all_matrix_inputs<-") export("cellNames<-") export("chrNames<-") +export(DimReduction) +export(LSI) export(add_cols) export(add_rows) export(all_matrix_inputs) @@ -85,6 +90,7 @@ export(plot_tf_footprint) export(plot_tss_profile) export(plot_tss_scatter) export(prefix_cell_names) +export(project) export(pseudobulk_matrix) export(qc_scATAC) export(range_distance_to_nearest) @@ -110,6 +116,7 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) +export(select_features_binned_dispersion) export(select_features_mean) export(select_features_variance) export(select_regions) diff --git a/r/NEWS.md b/r/NEWS.md index dc220204..deefee46 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -42,6 +42,8 @@ of the new features this release and will continue to help with maintenance and - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) - Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) +- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #181). +- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #181). 
## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 37f61d98..773c5fc6 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -145,6 +145,241 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread return(features_df) } +#' @rdname feature_selection +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @returns +#' - `select_features_binned_dispersion`: Score representing the bin normalized dispersion of each feature. +#' @details +#' `select_features_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satija et al. 2015): +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. 
Bin the features by their means, and normalize dispersion within each bin +#' @export +select_features_binned_dispersion <- function( + mat, num_feats = 25000, n_bins = 20, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + features_df <- tibble::tibble( + names = rownames(mat), + score = rep(0, nrow(mat)), + highly_variable = rep(TRUE, nrow(mat)) + ) + return(features_df) + } + # Calculate row information for dispersion + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] + # Calculate dispersion, and log normalize + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log1p(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 + feature_means <- log1p(feature_means) + features_df <- tibble::tibble( + names = names(feature_means), + var = feature_vars, + mean = feature_means, + dispersion = feature_dispersion + ) + # Bin by mean, and normalize dispersion with each bin + features_df <- features_df %>% + dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + score = (dispersion - mean(dispersion)) / sd(dispersion), + score = if (dplyr::n() == 1) {1} else {score} # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% + dplyr::ungroup() %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) %>% + dplyr::select(c("names", "score", "highly_variable")) + return(features_df) +} + + 
+################# +# DimReduction Class Definition +################# + +#' Barebones definition of a DimReduction class. +#' +#' Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +#' Child classes should implement a `project` method to allow for the projection of other matrices using +#' the fitted transformation object. +#' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) The projected data +#' @field fitted_params (list) A list of parameters used for the transformation of a matrix. +#' @export +DimReduction <- function(x, fitted_params = list(), ...) { + assert_is(x, c("IterableMatrix", "dgCMatrix", "matrix")) + assert_is(fitted_params, "list") + structure( + list( + cell_embeddings = x, + fitted_params = fitted_params, + ... + ), + class = "DimReduction" + ) +} + +#' Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +#' @param x DimReduction object. +#' @param mat IterableMatrix object. +#' @return IterableMatrix object of the projected data. +#' @details DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. +#' All required information to run a projection should be held in x$fitted_params, including pertinent parameters when construction the DimReduction subclass object. +#' If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. +#' If there are rownames, reorder the matrix to match the order of the original matrix +#' @export +project <- function(x, mat, ...) { + UseMethod("project") +} +#' @export +project.default <- function(x, mat, ...) { + rlang::abort("project method not implemented for BPCells objects.") +} +#' @export +project.DimReduction <- function(x, mat, ...) 
{ + rlang::abort("project method not implemented for base DimReduction object.") +} + +################# +# LSI Implementation +################# + + +#' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` matrix, perform LSI using tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. +#' @param mat (IterableMatrix) dimensions features x cells. +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater than or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. +#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. +#' @param threads (integer) Number of threads to use. +#' @returns An object of class `c("LSI", "DimReduction")` with the following attributes: +#' - `cell_embeddings`: The projected data +#' - `fitted_params`: A list of the parameters used for LSI. Elements include the following: +#' - `normalization_method`: The normalization method used +#' - `feature_means`: The means of the features used for normalization +#' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth +#' - `svd_params`: The SVD result object used for projection +#' - `feature_names`: The names of the features in the matrix +#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
+#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - 17.1 MB memory usage, 25.1 seconds runtime +#' @seealso `project()` `DimReduction()` `normalize_tfidf()` +#' @export +LSI <- function( + mat, n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf, + threads = 1L, verbose = FALSE +) { + if (rlang::is_missing(mat)) { + return( + create_partial( + missing_args = list( + n_dimensions = missing(n_dimensions), + corr_cutoff = missing(corr_cutoff), + normalize = missing(normalize), + threads = missing(threads), + verbose = missing(verbose) + ) + ) + ) + } + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) + assert_is_wholenumber(threads) + + if (verbose) log_progress("Starting LSI") + # Normalization + if (verbose) log_progress("Normalizing matrix") + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) + + # Save to prevent re-calculation of queued operations + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + tempfile("mat"), compress = TRUE + ) + # Run pca + if (verbose) log_progress("Calculating SVD") + svd_attr <- svds(mat, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr$u) %*% mat + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + if (verbose) log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) + pca_res <- 
pca_res[pca_feats_to_keep, ] + } + fitted_params <- list( + normalization_method = normalize, + feature_means = mat_stats$row_stats["mean", ], + pcs_to_keep = pca_feats_to_keep, + svd_params = svd_attr + ) + res <- DimReduction( + x = pca_res, + fitted_params = fitted_params, + feature_names = rownames(mat) + ) + class(res) <- c("LSI", class(res)) + return(res) +} + + +#' @export +project.LSI <- function(x, mat, threads = 1L, ...) { + assert_is(mat, "IterableMatrix") + assert_is(x, "LSI") + + fitted_params <- x$fitted_params + # Do a check to make sure that the number of rows in the matrix is the same as the number of rows in SVD$u + assert_true(nrow(mat) == nrow(fitted_params$svd_params$u)) + if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { + assert_true(all(x$feature_names %in% rownames(mat))) + mat <- mat[x$feature_names, ] + } + + if (!is.null(fitted_params$normalization_method)) { + mat <- fitted_params$normalization_method( + mat, + feature_means = fitted_params$feature_means, + threads = threads + ) + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + tempfile("mat"), compress = TRUE + ) + } + pca_attr <- fitted_params$svd_params + res <- t(pca_attr$u) %*% mat + if (length(fitted_params$pcs_to_keep) != nrow(res)) { + res <- res[fitted_params$pcs_to_keep, ] + } + return(res) +} + #' Test for marker features #' @@ -210,6 +445,8 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } + + #' Aggregate counts matrices by cell group or feature. #' #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each @@ -228,7 +465,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' extra calculation time, and when calculating `mean`, adding `nonzeros` will take no extra time. 
#' @inheritParams marker_features #' @export -pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { +pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { assert_is(mat, "IterableMatrix") assert_is(cell_groups, c("factor", "character", "numeric")) assert_true(length(cell_groups) == ncol(mat)) @@ -240,7 +477,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { rlang::abort(sprintf("method must be one of: %s", paste(methods, collapse = ", "))) } } - assert_is(threads, "integer") + assert_is_wholenumber(threads) it <- mat %>% convert_matrix_type("double") %>% diff --git a/r/man/DimReduction.Rd b/r/man/DimReduction.Rd new file mode 100644 index 00000000..2bb7486c --- /dev/null +++ b/r/man/DimReduction.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{DimReduction} +\alias{DimReduction} +\title{Barebones definition of a DimReduction class.} +\usage{ +DimReduction(x, fitted_params = list(), ...) +} +\description{ +Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +Child classes should implement a \code{project} method to allow for the projection of other matrices using +the fitted transformation object. 
+} +\section{Fields}{ + +\describe{ +\item{\code{cell_embeddings}}{(IterableMatrix, dgCMatrix, matrix) The projected data} + +\item{\code{fitted_params}}{(list) A list of parameters used for the transformation of a matrix.} +}} + diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index f83e2199..c391c795 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -4,6 +4,7 @@ \alias{select_features_variance} \alias{select_features_dispersion} \alias{select_features_mean} +\alias{select_features_binned_dispersion} \title{Feature selection functions} \usage{ select_features_variance(mat, num_feats = 0.05, normalize = NULL, threads = 1L) @@ -16,6 +17,13 @@ select_features_dispersion( ) select_features_mean(mat, num_feats = 0.05, normalize = NULL, threads = 1L) + +select_features_binned_dispersion( + mat, + num_feats = 25000, + n_bins = 20, + threads = 1L +) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} @@ -28,6 +36,10 @@ all features will be returned.} \item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} \item{threads}{(integer) Number of threads to use.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} } \value{ Return a dataframe with the following columns, sorted descending by score: @@ -49,6 +61,10 @@ Each different feature selection method will have a different scoring method: \itemize{ \item \code{select_features_by_mean}: Score representing the mean accessibility of each feature. } + +\itemize{ +\item \code{select_features_binned_dispersion}: Score representing the bin normalized dispersion of each feature. +} } \description{ Apply a feature selection method to a \verb{(features x cells)} matrix. 
@@ -72,4 +88,11 @@ Apply a feature selection method to a \verb{(features x cells)} matrix. \item Get the sum of each binarized feature. \item Find \code{num_feats} features with the highest accessibility. } + +\code{select_features_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} } diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd new file mode 100644 index 00000000..dff5c88e --- /dev/null +++ b/r/man/lsi.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{LSI} +\alias{LSI} +\title{Perform latent semantic indexing (LSI) on a matrix.} +\usage{ +LSI( + mat, + n_dimensions = 50L, + corr_cutoff = 1, + normalize = normalize_tfidf, + threads = 1L, + verbose = FALSE +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells.} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +the corr_cutoff, it will be excluded from the final PCA matrix.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +An object of class \code{c("LSI", "DimReduction")} with the following attributes: +\itemize{ +\item \code{cell_embeddings}: The projected data +\item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: +\itemize{ +\item \code{normalization_method}: The normalization method used +\item \code{feature_means}: The means of the features used for normalization +\item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth +\item \code{svd_params}: The matrix calculated for SVD +} +\item \code{feature_names}: The names of the features in the matrix +} +} +\description{ +Given a \verb{(features x cells)} matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. +} +\details{ +Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. + +Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +\itemize{ +\item 17.1 MB memory usage, 25.1 seconds runtime +} +} +\seealso{ +\code{project()} \code{DimReduction()} \code{normalize_tfidf()} +} diff --git a/r/man/project.Rd b/r/man/project.Rd new file mode 100644 index 00000000..dcab31b5 --- /dev/null +++ b/r/man/project.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{project} +\alias{project} +\title{Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object.} +\usage{ +project(x, mat, ...) +} +\arguments{ +\item{x}{DimReduction object.} + +\item{mat}{IterableMatrix object.} +} +\value{ +IterableMatrix object of the projected data. +} +\description{ +Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +} +\details{ +DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. +All required information to run a projection should be held in x$fitted_params, including pertinent parameters when construction the DimReduction subclass object. 
+If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. +If there are rownames, reorder the matrix to match the order of the original matrix +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 0d98b13d..7a5a3d13 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -132,6 +132,7 @@ reference: - checksum - apply_by_row - regress_out + - LSI - IterableMatrix-methods - pseudobulk_matrix @@ -147,6 +148,11 @@ reference: - knn_to_graph - cluster_membership_matrix +- title: "Dimensionality Reductions" +- contents: + - DimReduction + - project + - title: "Plots" diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R new file mode 100644 index 00000000..de775f4f --- /dev/null +++ b/r/tests/real_data/ArchR_LSI.R @@ -0,0 +1,69 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +library("BPCells") +library("ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} + +#' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. +#' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. +#' As the SVD implementation is not necessarily the same between the two packages, we take the SVD matrix +#' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation +#' we can compare between the two packages. +#' @param proj An archr project. 
+test_lsi_similarity_to_archr <- function(dir = NULL) { + dir <- create_temp_dir(dir) + setwd(dir) + # add iterative lsi for dim reduction + proj <- getTestProject() + proj <- addPeakMatrix(proj) + # Get the peak matrix + test_mat <- assay(getMatrixFromProject(proj, useMatrix = "PeakMatrix")) + # Calculate LSI on ArchR + # running LSI without binarizing, as we don't do this in the BPCells implementation + # we also don't filter quantile outliers. + lsi_archr <- ArchR:::.computeLSI( + mat = test_mat, + LSIMethod = 2, + nDimensions = 20, + binarize = FALSE, + outlierQuantiles = NULL + ) + svd_archr <- lsi_archr$svd + lsi_mat_archr <- t(lsi_archr$matSVD) + # set rownames to NA, as we don't have rownames in the BPCells implementation + rownames(lsi_mat_archr) <- NULL + # PCA Matrix = T(u) * Pre-SVD Matrix + # u * PCA Matrix = u * T(u) * Pre-SVD Matrix + # u * PCA Matrix = Pre-SVD Matrix + pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr + # Calculate LSI on BPCells + # Do not use z-score normalization, as this isn't done with ArchR + lsi_bpcells <- LSI( + test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), + n_dimensions = 20 + ) + pre_svd_mat_approx_bpcells <- lsi_bpcells$fitted_params$svd_params$u %*% lsi_bpcells$cell_embeddings + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4)) + # convert signs + lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$cell_embeddings[,1] > 0) - 1), `*`) + # Check for post-pca matrix similarity + testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$cell_embeddings, tolerance = 1e-4)) + # also check for correlation between the two matrices in PC space + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$cell_embeddings)) > 0.999) +} +test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 150939d4..8818d7e7 
100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -1,5 +1,23 @@ -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) diff --git a/r/tests/real_data/Scanpy_variable_feat_selection.py b/r/tests/real_data/Scanpy_variable_feat_selection.py new file mode 100644 index 00000000..6ae28c1b --- /dev/null +++ b/r/tests/real_data/Scanpy_variable_feat_selection.py @@ -0,0 +1,46 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +import sys, tempfile, os +import numpy as np +import pandas as pd +import scanpy as sc + + +def call_highly_var_genes_single_batch(temp_dir: str) -> None: + """ + Call highly_variable genes on a single batch of PBMC68k data using their interpreation of + the Seurat implementation. + Write the input anndata object csv at `/highly_var_genes_scanpy_input.csv` + Write the output as a csv, at `/highly_var_genes_scanpy_output.csv` + + Args: + temp_dir (str): Path to the temporary directory to write the input and output files. 
+ """ + # Dataset is only (765, 700) + adata = sc.datasets.pbmc68k_reduced() + adata.var_names_make_unique() + res = sc.pp._highly_variable_genes.highly_variable_genes(adata, + n_top_genes = 50, + n_bins = 20, + flavor = "seurat", + inplace = False, + check_values = False).drop(columns = 'mean_bin') + # remove mean_bin + adata.to_df().to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_input.csv")) + res.to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_output.csv")) + + +if __name__ == "__main__": + # We use the first argument as the temporary directory + if len(sys.argv) < 2: + # If no argument is provided, use the current directory + call_highly_var_genes_single_batch(".") + else: + call_highly_var_genes_single_batch(sys.argv[1]) + \ No newline at end of file diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R new file mode 100644 index 00000000..d2eb08b2 --- /dev/null +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -0,0 +1,58 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +library("BPCells") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} + +# Compare the feature selection output of BPCells to that of Scanpy. +# Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. +# This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. 
+# It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. +compare_feat_selection_to_scanpy <- function(dir = NULL) { + dir <- create_temp_dir(dir) + + # Call python script + system2("python3", c("Scanpy_variable_feat_selection.py", dir)) + + # read in input csv + input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) + output_mat_scanpy <- read.csv(file.path(dir, "highly_var_genes_scanpy_output.csv"), row.names = 1) + # filter output mat to only where "highly_variable" is true + output_mat_scanpy$highly_variable <- as.logical(output_mat_scanpy$highly_variable) + output_mat_scanpy <- output_mat_scanpy[output_mat_scanpy$highly_variable,] %>% + dplyr::arrange(desc(dispersions_norm)) %>% + dplyr::select(-highly_variable) %>% # convert rownames to a column + tibble::rownames_to_column("name") %>% + dplyr::as_tibble() + + # Scanpy undoes a log1p transformation on the input matrix, so we do the same here + input_mat_bpcells <- expm1(input_mat_scanpy) + + output_bpcells <- highly_variable_features( + input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), + num_feats = 50, + n_bins = 20, + save_feat_selection = TRUE + ) + output_mat_bpcells <- output_bpcells$feature_selection + expect_true(all.equal(output_mat_bpcells$name, output_mat_scanpy$name)) + expect_true(all.equal(output_mat_bpcells$mean, output_mat_scanpy$means, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$dispersion, output_mat_scanpy$dispersions, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$feature_dispersion_norm, output_mat_scanpy$dispersions_norm, tolerance = 1e-6)) +} + +compare_feat_selection_to_scanpy() diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 08e41b7c..33effb58 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ 
-13,6 +13,10 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = as(m, "dgCMatrix") } +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) +} + test_that("select_features works general case", { m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") for (fn in c("select_features_variance", "select_features_dispersion", "select_features_mean")) { @@ -37,6 +41,7 @@ test_that("select_features works general case", { } }) + test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -187,3 +192,43 @@ test_that("Pseudobulk aggregation works with multiple return types", { } }) + + +test_that("Feature selection by bin variance works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat + res_table <- select_features_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res <- mat[res_feats,] + res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res_t <- t(mat[,res_feats_t]) + + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) + expect_equal(nrow(res_t), 10) + expect_equal(ncol(res_t), 500) +}) + +test_that("LSI works", { + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR + lsi_res_obj <- LSI(mat, n_dimensions = 5) + lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) + # Also check partial args + lsi_res_obj_partial <- LSI(n_dimensions = 5, normalize = normalize_tfidf(scale_factor = 10000, threads = 4), threads = 4)(mat) + lsi_res <- lsi_res_obj$cell_embeddings + lsi_res_t <- lsi_res_t_obj$cell_embeddings + # Check that projection results in the same output if used on the same input matrix + lsi_res_proj <- project(lsi_res_obj, mat) + + expect_equal(nrow(lsi_res), 5) + expect_equal(ncol(lsi_res), ncol(mat)) + expect_equal(nrow(lsi_res_t), 5) + expect_equal(ncol(lsi_res_t), nrow(mat)) + expect_equal(lsi_res_obj$cell_embeddings, lsi_res_obj_partial$cell_embeddings) + expect_equal(lsi_res, lsi_res_proj) +}) +