From 13c37606f1952f14ea86b22a0399981118eab459 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 30 Oct 2024 19:00:33 -0700 Subject: [PATCH 01/22] [r] add lsi, var feature selection --- r/R/transforms.R | 72 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/r/R/transforms.R b/r/R/transforms.R index 8a2bd25b..7a110677 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -983,4 +983,76 @@ normalize_tfidf <- function( idf <- 1 / feature_means tf_idf_mat <- tf %>% multiply_rows(idf) return(log1p(tf_idf_mat * scale_factor)) +} +#' Compute LSI For a peak matrix +#' @param mat PeakMatrix +#' @param n_dimensions Number of dimensions to keep during PCA +#' @param scale_factor Scale factor for the tf-idf log transform +#' @param verbose Whether to print out progress +#' @param threads Number of threads to use +#' @return dgCMatrix of shape (n_dimensions, ncol(mat)) +#' @details Compute LSI through first doing a tf-idf transform, a z-score normalization, then PCA. +#' Tf-idf implementation is from Stuart & Butler et al. 2019. +#' @export +compute_lsi <- function(mat, n_dimensions = 50, scale_factor = 1e-4, verbose = FALSE, threads = 1) { + assert_is(mat, "IterableMatrix") # Should be a peak matrix, should we enforce this? 
+ assert_is(n_dimensions, "integer") + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + + # Signac implementation + npeaks <- colSums(mat) + tf <- mat %>% multiply_cols(1/npeaks) + idf_ <- ncol(mat) / rowSums(mat) + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + + # run z-score norm + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance")$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_lsi_norm <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + + # Run pca + svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions) + pca_res <- t(svd_attr$u) %*% mat_lsi_norm + return(pca_res) +} + +#' Get most variable features, given a non-log normalized matrix +highly_variable_features <- function(mat, num_feats, n_bins, verbose = FALSE) { + assert_is(mat, "IterableMatrix") + assert_is(num_feats, "integer") + assert_greater_than_zero(num_feats) + assert_len(num_feats, 1) + assert_is(n_bins, "integer") + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + if (verbose) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), + returning all features", nrow(mat), num_feats)) + return(mat) + } + # Calculate the mean and variance of each feature + # should we set entries that are 0 to NA? 
+ feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats['mean', ] + feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats['variance', ] + feature_dispersion <- feature_vars / feature_means + feature_dispersion <- log(feature_dispersion) + feature_means <- log1p(feature_means) + mean_bins <- cut(feature_means, n_bins, labels = FALSE) + + # Calculate the mean and variance of dispersion of each bin + bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x)) + bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x)) + # Set bin_sd value to bin_mean if bin_sd is NA, results in norm dispersion of 1 + bin_sd[is.na(bin_sd)] <- bin_mean[is.na(bin_sd)] + # map mean_bins indices to bin_stats + feature_dispersion_norm <- (feature_dispersion - bin_mean[mean_bins]) / bin_sd[mean_bins] + names(feature_dispersion_norm) <- names(feature_dispersion) + variable_features_ <- sort(feature_dispersion_norm)[nrow(mat)-num_feats:nrow(mat)] + return(mat[names(variable_features_), ]) } \ No newline at end of file From 36a8983a53de0bbe558743d9ca862413882b6d91 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 4 Nov 2024 15:14:26 -0800 Subject: [PATCH 02/22] [r] add lsi, variable feature selection --- r/NAMESPACE | 2 + r/NEWS.md | 3 + r/R/singlecell_utils.R | 106 +++++++++++++++++++++++ r/man/highly_variable_features.Rd | 36 ++++++++ r/man/lsi.Rd | 46 ++++++++++ r/pkgdown/_pkgdown.yml | 2 + r/tests/testthat/test-singlecell_utils.R | 49 +++++++++++ 7 files changed, 244 insertions(+) create mode 100644 r/man/highly_variable_features.Rd create mode 100644 r/man/lsi.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index c90e1a15..c7e4fd1c 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -48,6 +48,7 @@ export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) export(get_trackplot_height) +export(highly_variable_features) export(import_matrix_market) export(import_matrix_market_10x) export(knn_annoy) @@ -55,6 +56,7 @@ 
export(knn_hnsw) export(knn_to_geodesic_graph) export(knn_to_snn_graph) export(log1p_slow) +export(lsi) export(marker_features) export(match_gene_symbol) export(matrix_stats) diff --git a/r/NEWS.md b/r/NEWS.md index 73ca847d..24b6b0d9 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -24,6 +24,9 @@ Contributions welcome :) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) - Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) +- Add MACS2/3 input creation and peak calling through `call_macs_peaks()` (pull request #118) +- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #156). +- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #156). ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 32e45934..c4d7fbf3 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -233,4 +233,110 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { } } return(res) +} + +#' Perform latent semantic indexing (LSI) on a matrix. +#' @param mat (IterableMatrix) dimensions features x cells +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' @param save_in_memory (logical) If TRUE, save the log(tf-idf) matrix in memory. +#' If FALSE, save to a temporary location in disk. 
Saving in memory will result in faster downstream operations, +#' but will require in higher memory usage. Comparison of memory usage and speed is in the details section. +#' @param threads (integer) Number of threads to use. +#' @return dgCMatrix of shape (n_dimensions, ncol(mat)). +#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. +#' +#' ** Saving in memory vs disk: ** +#' Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. +#' This is done to prevent re-calculation of queued operations during PCA optimization. +#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - Saving in memory: 233 MB memory usage, 22.7 seconds runtime +#' - Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +#' +#' @export +lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FALSE, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_is_wholenumber(threads) + + # log(tf-idf) transform + npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + tf <- mat %>% multiply_cols(1 / npeaks) + idf_ <- ncol(mat) / rowSums(mat) + mat_tfidf <- tf %>% multiply_rows(idf_) + mat_log_tfidf <- log1p(scale_factor * mat_tfidf) + # Save to prevent re-calculation of queued operations + if (save_in_memory) { + mat_log_tfidf <- write_matrix_memory(mat_log_tfidf, compress = FALSE) + } else { + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + } + # Z-score normalization + cell_peak_stats <- matrix_stats(mat_log_tfidf, 
col_stats="variance", threads = threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_lsi_norm <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + # Run pca + svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_lsi_norm + return(pca_res) +} + +#' Get the most variable features within a matrix +#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' ll features will be returned. +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @returns IterableMatrix subset of the most variable features. +#' @inheritParams lsi +#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). +#' +#' Calculate using the following process: +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. 
Bin the features by their means, and normalize dispersion within each bin +#' @export +highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + return(mat) + } + + feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats["mean", ] + feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats["variance", ] + feature_means[feature_means == 0] <- 1e-12 + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log(feature_dispersion) + feature_means <- log1p(feature_means) + mean_bins <- cut(feature_means, n_bins, labels = FALSE) + + bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x, na.rm = TRUE)) + bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x, na.rm = TRUE)) + # Set feats that are in bins with only one feat to have a norm dispersion of 1 + one_gene_bin <- is.na(bin_sd) + bin_sd[one_gene_bin] <- bin_mean[one_gene_bin] + bin_mean[one_gene_bin] <- 0 + # map mean_bins indices to bin_stats + # Do a character search as bins without features mess up numeric indexing + feature_dispersion_norm <- (feature_dispersion - bin_mean[as.character(mean_bins)]) / bin_sd[as.character(mean_bins)] + names(feature_dispersion_norm) <- names(feature_dispersion) + feature_dispersion_norm <- sort(feature_dispersion_norm) # sorting automatically removes NA values + if (length(feature_dispersion_norm) < num_feats) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all non-zero features", length(feature_dispersion_norm), num_feats)) + 
variable_features_ <- feature_dispersion_norm[max(1, (length(feature_dispersion_norm) - num_feats + 1)):length(feature_dispersion_norm)] + return(mat[names(variable_features_), ]) } \ No newline at end of file diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd new file mode 100644 index 00000000..54e67599 --- /dev/null +++ b/r/man/highly_variable_features.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{highly_variable_features} +\alias{highly_variable_features} +\title{Get the most variable features within a matrix} +\usage{ +highly_variable_features(mat, num_feats, n_bins, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, +ll features will be returned.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +IterableMatrix subset of the most variable features. +} +\description{ +Get the most variable features within a matrix +} +\details{ +The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
+ +Calculate using the following process: +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} +} diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd new file mode 100644 index 00000000..38c0d73b --- /dev/null +++ b/r/man/lsi.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{lsi} +\alias{lsi} +\title{Perform latent semantic indexing (LSI) on a matrix.} +\usage{ +lsi( + mat, + n_dimensions = 50L, + scale_factor = 10000, + save_in_memory = FALSE, + threads = 1L +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} + +\item{save_in_memory}{(logical) If TRUE, save the log(tf-idf) matrix in memory. +If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, +but will require in higher memory usage. Comparison of memory usage and speed is in the details section.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +dgCMatrix of shape (n_dimensions, ncol(mat)). +} +\description{ +Perform latent semantic indexing (LSI) on a matrix. +} +\details{ +Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. + +** Saving in memory vs disk: ** +Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. +This is done to prevent re-calculation of queued operations during PCA optimization. 
+ +Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +\itemize{ +\item Saving in memory: 233 MB memory usage, 22.7 seconds runtime +\item Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +} +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 526018e4..3cf6509a 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -136,6 +136,8 @@ reference: - select_features_by_variance - select_features_by_dispersion - select_features_by_mean + - lsi + - highly_variable_features - IterableMatrix-methods - pseudobulk_matrix diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 111324b7..c699d67b 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -13,6 +13,10 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = as(m, "dgCMatrix") } +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) +} + test_that("select_features works general case", { m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { @@ -31,6 +35,26 @@ test_that("select_features works general case", { } }) + +test_that("select_features works general case", { + m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") + for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { + res <- do.call(fn, list(m1, num_feats = 10)) + expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting + expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable + expect_setequal(res$names, rownames(m1)) + res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows + res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) + 
expect_identical(res_more_feats_than_rows, res_feats_equal_rows) + if (fn != "select_features_by_mean") { + # Check that normalization actually does something + res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) + expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) + } + } +}) + + test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -180,3 +204,28 @@ test_that("Pseudobulk aggregation works with multiple return types", { } }) + + +test_that("Highly variable feature selection works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat + res <- highly_variable_features(mat, num_feats = 10, n_bins = 5, threads = 1) + res_t <- highly_variable_features(t(mat), num_feats = 10, n_bins = 5, threads = 1) + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) + expect_equal(nrow(res_t), 10) + expect_equal(ncol(res_t), 500) +}) + +test_that("LSI works", { + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR + lsi_res <- lsi(mat, n_dimensions = 5) + lsi_res_t <- lsi(t(mat), n_dimensions = 5) + expect_equal(nrow(lsi_res), 5) + expect_equal(ncol(lsi_res), ncol(mat)) + expect_equal(nrow(lsi_res_t), 5) + expect_equal(ncol(lsi_res_t), nrow(mat)) +}) \ No newline at end of file From 2be2efebe4f53f03f9d895bdecc33f762babef13 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 16:53:53 -0800 Subject: [PATCH 03/22] [r] parametrize z_score_norm, create temp option to return more info `save_lsi`, remove `save_in_memory` in `lsi()` --- r/R/singlecell_utils.R | 157 +++++++++++++++++++++++++++++------------ r/man/lsi.Rd | 24 ++++--- 2 files changed, 126 insertions(+), 55 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index c4d7fbf3..969acf35 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -237,25 +237,28 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' Perform latent semantic indexing (LSI) on a matrix. #' @param mat (IterableMatrix) dimensions features x cells +#' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' @param save_in_memory (logical) If TRUE, save the log(tf-idf) matrix in memory. -#' If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, -#' but will require in higher memory usage. Comparison of memory usage and speed is in the details section. #' @param threads (integer) Number of threads to use. -#' @return dgCMatrix of shape (n_dimensions, ncol(mat)). +#' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. 
+#' @return +#' - If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +#' - If save_lsi is TRUE, return a list with the following elements: +#' - **pca_res**: dgCMatrix of shape (n_dimensions, ncol(mat)) +#' - **svd_attr**: List of SVD attributes +#' - **idf**: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. #' -#' ** Saving in memory vs disk: ** -#' Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. -#' This is done to prevent re-calculation of queued operations during PCA optimization. -#' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: -#' - Saving in memory: 233 MB memory usage, 22.7 seconds runtime -#' - Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime -#' +#' - 17.1 MB memory usage, 25.1 seconds runtime #' @export -lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FALSE, threads = 1L) { +lsi <- function( + mat, + z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 1e4, + save_lsi = FALSE, + threads = 1L +) { assert_is(mat, "IterableMatrix") assert_is_wholenumber(n_dimensions) assert_len(n_dimensions, 1) @@ -264,27 +267,34 @@ lsi <- function(mat, n_dimensions = 50L, scale_factor = 1e4, save_in_memory = FA assert_is_wholenumber(threads) # log(tf-idf) transform + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / rowSums(mat) mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations - if 
(save_in_memory) { - mat_log_tfidf <- write_matrix_memory(mat_log_tfidf, compress = FALSE) - } else { - mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) - } + mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) # Z-score normalization - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance", threads = threads)$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_lsi_norm <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) + if (z_score_norm) { + cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats + cell_means <- cell_peak_stats["mean",] + cell_vars <- cell_peak_stats["variance",] + mat_log_tfidf <- mat_log_tfidf %>% + add_cols(-cell_means) %>% + multiply_cols(1 / cell_vars) + } # Run pca - svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions, threads = threads) - pca_res <- t(svd_attr_$u) %*% mat_lsi_norm + svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr_$u) %*% mat_log_tfidf + if(save_lsi) { + return(list( + pca_res = pca_res, + svd_attr = svd_attr_, + idf = idf_ + )) + } return(pca_res) } @@ -315,28 +325,87 @@ highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) return(mat) } - - feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats["mean", ] - feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats["variance", ] + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] feature_means[feature_means == 0] <- 1e-12 feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- 
NA feature_dispersion <- log(feature_dispersion) feature_means <- log1p(feature_means) - mean_bins <- cut(feature_means, n_bins, labels = FALSE) - - bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x, na.rm = TRUE)) - bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x, na.rm = TRUE)) - # Set feats that are in bins with only one feat to have a norm dispersion of 1 - one_gene_bin <- is.na(bin_sd) - bin_sd[one_gene_bin] <- bin_mean[one_gene_bin] - bin_mean[one_gene_bin] <- 0 - # map mean_bins indices to bin_stats - # Do a character search as bins without features mess up numeric indexing - feature_dispersion_norm <- (feature_dispersion - bin_mean[as.character(mean_bins)]) / bin_sd[as.character(mean_bins)] - names(feature_dispersion_norm) <- names(feature_dispersion) - feature_dispersion_norm <- sort(feature_dispersion_norm) # sorting automatically removes NA values - if (length(feature_dispersion_norm) < num_feats) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all non-zero features", length(feature_dispersion_norm), num_feats)) - variable_features_ <- feature_dispersion_norm[max(1, (length(feature_dispersion_norm) - num_feats + 1)):length(feature_dispersion_norm)] - return(mat[names(variable_features_), ]) + features_df <- data.frame( + name = names(feature_means), + vars = feature_vars, + means = feature_means, + dispersion = feature_dispersion + ) + features_df <- features_df %>% + dplyr::mutate(bin = cut(means, n_bins, labels=FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + bin_mean = mean(dispersion, na.rm = TRUE), + bin_sd = sd(dispersion, na.rm = TRUE), + bin_sd_is_na = is.na(bin_sd), + bin_sd = ifelse(bin_sd_is_na, bin_mean, bin_sd), # Set feats that are in bins with only one feat to have a norm dispersion of 1 + bin_mean = ifelse(bin_sd_is_na, 0, bin_mean), + feature_dispersion_norm = (dispersion - bin_mean) / bin_sd + ) %>% + dplyr::ungroup() %>% + dplyr::select(name, 
feature_dispersion_norm) %>% + dplyr::arrange(desc(feature_dispersion_norm)) %>% + dplyr::slice(1:min(num_feats, nrow(.))) + return(mat[features_df$name,]) +} + + +#' Aggregate counts matrices by cell group or feature. +#' +#' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each +#' feature. +#' @param cell_groups (Character/factor) Vector of group/cluster assignments for each cell. Length must be `ncol(mat)`. +#' @param method (Character vector) Method(s) to aggregate counts. If one method is provided, the output will be a matrix. If multiple methods are provided, the output will be a named list of matrices. +#' +#' Current options are: `nonzeros`, `sum`, `mean`, `variance`. +#' @param threads (integer) Number of threads to use. +#' @return +#' - If `method` is length `1`, returns a matrix of shape `(features x groups)`. +#' - If `method` is greater than length `1`, returns a list of matrices with each matrix representing a pseudobulk matrix with a different aggregation method. +#' Each matrix is of shape `(features x groups)`, and names are one of `nonzeros`, `sum`, `mean`, `variance`. +#' @details Some simpler stats are calculated in the process of calculating more complex +#' statistics. So when calculating `variance`, `nonzeros` and `mean` can be included with no +#' extra calculation time, and when calculating `mean`, adding `nonzeros` will take no extra time. 
+#' @inheritParams marker_features +#' @export +pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is(cell_groups, c("factor", "character", "numeric")) + assert_true(length(cell_groups) == ncol(mat)) + cell_groups <- as.factor(cell_groups) + assert_is(method, "character") + methods <- c("variance", "mean", "sum", "nonzeros") + for (m in method) { + if (!(m %in% methods)) { + rlang::abort(sprintf("method must be one of: %s", paste(methods, collapse = ", "))) + } + } + assert_is(threads, "integer") + # if multiple methods are provided, only need to pass in the top method as it will also calculate the less complex stats + iter <- iterate_matrix(parallel_split(mat, threads, threads*4)) + res <- pseudobulk_matrix_cpp(iter, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) + # if res is a single matrix, return with colnames and rownames + if (length(method) == 1) { + colnames(res[[method]]) <- levels(cell_groups) + rownames(res[[method]]) <- rownames(mat) + return(res[[method]]) + } + # give colnames and rownames for each matrix in res, which is a named list + for (res_slot in names(res)) { + if ((length(res[[res_slot]]) == 0) || !(res_slot %in% method)) { + res[[res_slot]] <- NULL + } else { + colnames(res[[res_slot]]) <- levels(cell_groups) + rownames(res[[res_slot]]) <- rownames(mat) + } + } + return(res) } \ No newline at end of file diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 38c0d73b..137e4437 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -6,27 +6,34 @@ \usage{ lsi( mat, + z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 10000, - save_in_memory = FALSE, + save_lsi = FALSE, threads = 1L ) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} +\item{z_score_norm}{(logical) If TRUE, z-score normalize the matrix before PCA.} + \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} \item{scale_factor}{(integer) Scale 
factor for the tf-idf log transform.} -\item{save_in_memory}{(logical) If TRUE, save the log(tf-idf) matrix in memory. -If FALSE, save to a temporary location in disk. Saving in memory will result in faster downstream operations, -but will require in higher memory usage. Comparison of memory usage and speed is in the details section.} +\item{save_lsi}{(logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector.} \item{threads}{(integer) Number of threads to use.} } \value{ -dgCMatrix of shape (n_dimensions, ncol(mat)). +\itemize{ +\item If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +\item If save_lsi is TRUE, return a list with the following elements: +\item \strong{pca_res}: dgCMatrix of shape (n_dimensions, ncol(mat)) +\item \strong{svd_attr}: List of SVD attributes +\item \strong{idf}: Inverse document frequency vector +} } \description{ Perform latent semantic indexing (LSI) on a matrix. @@ -34,13 +41,8 @@ Perform latent semantic indexing (LSI) on a matrix. \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. -** Saving in memory vs disk: ** -Following the log(tf-idf) transform, the matrix is stored into a temporary location, as the next step will break the sparsity pattern of the matrix. -This is done to prevent re-calculation of queued operations during PCA optimization. 
- Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: \itemize{ -\item Saving in memory: 233 MB memory usage, 22.7 seconds runtime -\item Saving in disk: 17.1 MB memory usage, 25.1 seconds runtime +\item 17.1 MB memory usage, 25.1 seconds runtime } } From dccc3a56e67f104aa4e3c9e46e65fa0ba7c8c9d8 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 16:54:18 -0800 Subject: [PATCH 04/22] [r] add test case for LSI comparing to archr --- r/tests/real_data/ArchR_LSI.R | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 r/tests/real_data/ArchR_LSI.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R new file mode 100644 index 00000000..b78d700f --- /dev/null +++ b/r/tests/real_data/ArchR_LSI.R @@ -0,0 +1,50 @@ +devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") +devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") + +#' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. +#' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. +#' As the SVD implementation is not necessarily the same between the two packages, we take the SVD matrix +#' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation +#' we can compare between the two packages. +#' @param proj An archr project. +test_lsi_similarity_to_archr <- function() { + # Set up temp dir + int_dir <- file.path(tempdir(), "insertion_test") + dir.create(int_dir) + setwd(int_dir) + # add iterative lsi for dim reduction + proj <- getTestProject() + proj <- addPeakMatrix(proj) + # Get the peak matrix + test_mat <- assay(getMatrixFromProject(proj, useMatrix = "PeakMatrix")) + # Calculate LSI on ArchR + # running LSI without binarizing, as we don't do this in the BPCells implementation + # we also don't filter quantile outliers. 
+ lsi_archr <- .computeLSI( + mat = test_mat, + LSIMethod = 2, + nDimensions = 2, + binarize = FALSE, + outlierQuantiles = NULL, + test_mat = test_mat + ) + svd_archr <- lsi_archr$svd + lsi_mat_archr <- t(lsi_archr$matSVD) + # set rownames to NA, as we don't have rownames in the BPCells implementation + rownames(lsi_mat_archr) <- NULL + # PCA Matrix = T(u) * Pre-SVD Matrix + # u * PCA Matrix = u * T(u) * Pre-SVD Matrix + # u * PCA Matrix = Pre-SVD Matrix + pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr + # Calculate LSI on BPCells + # Do not use z-score normalization, as this isn't done with ArchR + lsi_bpcells <- lsi( + test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), + z_score_norm = FALSE, + n_dimensions = 2, + save_lsi = TRUE + ) + pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) +} +test_lsi_similarity_to_archr() \ No newline at end of file From 183dd40a18cddae2d353a1a439d73cfb26af6e5b Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:29:32 -0800 Subject: [PATCH 05/22] [r] clean up var gene selection, lsi docstring --- r/R/singlecell_utils.R | 38 ++++++++++++++++++++++++------- r/man/highly_variable_features.Rd | 17 ++++++++++++-- r/man/lsi.Rd | 6 ++--- 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 969acf35..5a00462b 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -236,6 +236,8 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { } #' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. 
#' @param mat (IterableMatrix) dimensions features x cells #' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. @@ -243,9 +245,9 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' @param threads (integer) Number of threads to use. #' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @return -#' - If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +#' - If save_lsi is FALSE, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. #' - If save_lsi is TRUE, return a list with the following elements: -#' - **pca_res**: dgCMatrix of shape (n_dimensions, ncol(mat)) +#' - **pca_res**: dgCMatrix of shape `(n_dimensions, ncol(mat))`` #' - **svd_attr**: List of SVD attributes #' - **idf**: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. @@ -304,7 +306,12 @@ lsi <- function( #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @returns IterableMatrix subset of the most variable features. +#' @param save_feat_selection (logical) If TRUE, save the dispersions, means, and the features selected. +#' @returns +#' - If `save_feat_selection` is False, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - If `save_feat_selection` is True, return a list with the following elements: +#' - **mat**: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. 
+#' - **feature_selection**: Dataframe with the following columns: #' @inheritParams lsi #' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). #' @@ -313,7 +320,11 @@ lsi <- function( #' 2. Log normalize dispersion and mean #' 3. Bin the features by their means, and normalize dispersion within each bin #' @export -highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { +highly_variable_features <- function( + mat, num_feats, n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) { assert_is(mat, "IterableMatrix") assert_greater_than_zero(num_feats) assert_is_wholenumber(num_feats) @@ -325,22 +336,27 @@ highly_variable_features <- function(mat, num_feats, n_bins, threads = 1L) { log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) return(mat) } + # Calculate row information for dispersion mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) feature_means <- mat_stats$row_stats["mean", ] feature_vars <- mat_stats$row_stats["variance", ] + # Give a small value to features with 0 mean, helps with 0 division feature_means[feature_means == 0] <- 1e-12 + # Calculate dispersion, and log normalize feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA feature_dispersion <- log(feature_dispersion) feature_means <- log1p(feature_means) features_df <- data.frame( name = names(feature_means), - vars = feature_vars, - means = feature_means, + var = feature_vars, + mean = feature_means, dispersion = feature_dispersion ) + + # Bin by mean, and normalize dispersion with each bin features_df <- features_df %>% - dplyr::mutate(bin = cut(means, n_bins, labels=FALSE)) %>% + dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( bin_mean = mean(dispersion, na.rm = TRUE), @@ -351,9 +367,15 @@ highly_variable_features <- 
function(mat, num_feats, n_bins, threads = 1L) { feature_dispersion_norm = (dispersion - bin_mean) / bin_sd ) %>% dplyr::ungroup() %>% - dplyr::select(name, feature_dispersion_norm) %>% + dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% dplyr::arrange(desc(feature_dispersion_norm)) %>% dplyr::slice(1:min(num_feats, nrow(.))) + if (save_feat_selection) { + return(list( + mat = mat[features_df$name,], + feature_selection = features_df + )) + } return(mat[features_df$name,]) } diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd index 54e67599..943d986b 100644 --- a/r/man/highly_variable_features.Rd +++ b/r/man/highly_variable_features.Rd @@ -4,7 +4,13 @@ \alias{highly_variable_features} \title{Get the most variable features within a matrix} \usage{ -highly_variable_features(mat, num_feats, n_bins, threads = 1L) +highly_variable_features( + mat, + num_feats, + n_bins = 20, + save_feat_selection = FALSE, + threads = 1L +) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} @@ -16,10 +22,17 @@ ll features will be returned.} and if the number of features within a bin is less than 2, the dispersion is set to 1.} +\item{save_feat_selection}{(logical) If TRUE, save the dispersions, means, and the features selected.} + \item{threads}{(integer) Number of threads to use.} } \value{ -IterableMatrix subset of the most variable features. +\itemize{ +\item If \code{save_feat_selection} is False, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. +\item If \code{save_feat_selection} is True, return a list with the following elements: +\item \strong{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
+\item \strong{feature_selection}: Dataframe with the following columns: +} } \description{ Get the most variable features within a matrix diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 137e4437..299513df 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -28,15 +28,15 @@ lsi( } \value{ \itemize{ -\item If save_lsi is FALSE, return a dgCMatrix of shape (n_dimensions, ncol(mat)). +\item If save_lsi is FALSE, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. \item If save_lsi is TRUE, return a list with the following elements: -\item \strong{pca_res}: dgCMatrix of shape (n_dimensions, ncol(mat)) +\item \strong{pca_res}: dgCMatrix of shape `(n_dimensions, ncol(mat))`` \item \strong{svd_attr}: List of SVD attributes \item \strong{idf}: Inverse document frequency vector } } \description{ -Perform latent semantic indexing (LSI) on a matrix. +Given a \verb{(features x cells)} matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. } \details{ Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
From 4972f3430ab9da12bbde231699a9c52b92ad0cec Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:30:00 -0800 Subject: [PATCH 06/22] [r] add variable gene selection test --- r/tests/real_data/ArchR_LSI.R | 9 +++- .../Scanpy_variable_feat_selection.py | 46 ++++++++++++++++++ .../scanpy_variable_feat_selection.R | 47 +++++++++++++++++++ 3 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 r/tests/real_data/Scanpy_variable_feat_selection.py create mode 100644 r/tests/real_data/scanpy_variable_feat_selection.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index b78d700f..2ad74610 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -1,3 +1,11 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") @@ -47,4 +55,3 @@ test_lsi_similarity_to_archr <- function() { pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) } -test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/Scanpy_variable_feat_selection.py b/r/tests/real_data/Scanpy_variable_feat_selection.py new file mode 100644 index 00000000..6ae28c1b --- /dev/null +++ b/r/tests/real_data/Scanpy_variable_feat_selection.py @@ -0,0 +1,46 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+
+import sys, tempfile, os
+import numpy as np
+import pandas as pd
+import scanpy as sc
+
+
+def call_highly_var_genes_single_batch(temp_dir: str) -> None:
+    """
+    Call highly_variable genes on a single batch of PBMC68k data using their interpretation of
+    the Seurat implementation.
+    Write the input anndata object csv at `/highly_var_genes_scanpy_input.csv`
+    Write the output as a csv, at `/highly_var_genes_scanpy_output.csv`
+
+    Args:
+        temp_dir (str): Path to the temporary directory to write the input and output files.
+    """
+    # Dataset is only (765, 700)
+    adata = sc.datasets.pbmc68k_reduced()
+    adata.var_names_make_unique()
+    res = sc.pp._highly_variable_genes.highly_variable_genes(adata,
+                                                             n_top_genes = 50,
+                                                             n_bins = 20,
+                                                             flavor = "seurat",
+                                                             inplace = False,
+                                                             check_values = False).drop(columns = 'mean_bin')
+    # remove mean_bin
+    adata.to_df().to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_input.csv"))
+    res.to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_output.csv"))
+
+
+if __name__ == "__main__":
+    # We use the first argument as the temporary directory
+    if len(sys.argv) < 2:
+        # If no argument is provided, use the current directory
+        call_highly_var_genes_single_batch(".")
+    else:
+        call_highly_var_genes_single_batch(sys.argv[1])
+    
\ No newline at end of file
diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R
new file mode 100644
index 00000000..da873a7b
--- /dev/null
+++ b/r/tests/real_data/scanpy_variable_feat_selection.R
@@ -0,0 +1,47 @@
+# Copyright 2024 BPCells contributors
+#
+# Licensed under the Apache License, Version 2.0 or the MIT license
+# , at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+ +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") + +compare_feat_selection_to_scanpy <- function(dir = NULL) { + # Set up temp dir + if (is.null(dir)) { + dir <- file.path(tempdir(), "feat_selection_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + + # Call python script + system2("python", c("Scanpy_variable_feat_selection.py", dir)) + + # read in input csv + input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) + output_mat_scanpy <- read.csv(file.path(dir, "highly_var_genes_scanpy_output.csv"), row.names = 1) + # filter output mat to only where "highly_variable" is true + output_mat_scanpy$highly_variable <- as.logical(output_mat_scanpy$highly_variable) + output_mat_scanpy <- output_mat_scanpy[output_mat_scanpy$highly_variable,] %>% + dplyr::arrange(desc(dispersions_norm)) %>% + dplyr::select(-highly_variable) %>% # convert rownames to a column + tibble::rownames_to_column("name") %>% + dplyr::as_tibble() + # unlog the input_mat + input_mat_bpcells <- expm1(input_mat_scanpy) + output_bpcells <- highly_variable_features( + input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), + num_feats = 50, + n_bins = 20, + save_feat_selection = TRUE + ) + output_mat_bpcells <- output_bpcells$feature_selection + expect_true(all.equal(output_mat_bpcells$name, output_mat_scanpy$name)) + expect_true(all.equal(output_mat_bpcells$mean, output_mat_scanpy$means, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$dispersion, output_mat_scanpy$dispersions, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$feature_dispersion_norm, output_mat_scanpy$dispersions_norm, tolerance = 1e-6)) +} + +compare_feat_selection_to_scanpy() From e4d5cb01a67f7b2c1ea40fd489c825161bb78e83 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 6 Nov 2024 19:35:03 -0800 Subject: [PATCH 07/22] [r] provide more colour to scanpy feat selection test --- 
r/tests/real_data/scanpy_variable_feat_selection.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index da873a7b..3bac07ab 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -8,6 +8,11 @@ devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") + +# Compare the feature selection output of BPCells to that of Scanpy. +# Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. +# This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. +# It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. compare_feat_selection_to_scanpy <- function(dir = NULL) { # Set up temp dir if (is.null(dir)) { @@ -29,8 +34,10 @@ compare_feat_selection_to_scanpy <- function(dir = NULL) { dplyr::select(-highly_variable) %>% # convert rownames to a column tibble::rownames_to_column("name") %>% dplyr::as_tibble() - # unlog the input_mat + + # Scanpy undoes a log1p transformation on the input matrix, so we do the same here input_mat_bpcells <- expm1(input_mat_scanpy) + output_bpcells <- highly_variable_features( input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), num_feats = 50, From 99470e06b04bd269468e093688e67b25e0a47ffb Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 15:57:13 -0800 Subject: [PATCH 08/22] [r] cleanup real data tests --- r/tests/real_data/ArchR_LSI.R | 17 +++++++------- r/tests/real_data/ArchR_insertions.R | 13 +++++++++-- r/tests/real_data/config.csv | 3 +++ .../scanpy_variable_feat_selection.R | 13 ++++------- r/tests/real_data/test_helpers.R | 22 +++++++++++++++++++ 5 files changed, 48 
insertions(+), 20 deletions(-) create mode 100644 r/tests/real_data/config.csv create mode 100644 r/tests/real_data/test_helpers.R diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 2ad74610..e79ee98b 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -6,8 +6,9 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) +devtools::load_all(config[["path_archr"]]) #' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. #' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. @@ -15,11 +16,9 @@ devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") #' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation #' we can compare between the two packages. #' @param proj An archr project. 
-test_lsi_similarity_to_archr <- function() { - # Set up temp dir - int_dir <- file.path(tempdir(), "insertion_test") - dir.create(int_dir) - setwd(int_dir) +test_lsi_similarity_to_archr <- function(dir = NULL) { + dir <- create_temp_dir(dir) + setwd(dir) # add iterative lsi for dim reduction proj <- getTestProject() proj <- addPeakMatrix(proj) @@ -33,8 +32,7 @@ test_lsi_similarity_to_archr <- function() { LSIMethod = 2, nDimensions = 2, binarize = FALSE, - outlierQuantiles = NULL, - test_mat = test_mat + outlierQuantiles = NULL ) svd_archr <- lsi_archr$svd lsi_mat_archr <- t(lsi_archr$matSVD) @@ -55,3 +53,4 @@ test_lsi_similarity_to_archr <- function() { pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) } +test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 150939d4..810133c1 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -1,5 +1,14 @@ -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. 
+ +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) +devtools::load_all(config[["path_archr"]]) fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) diff --git a/r/tests/real_data/config.csv b/r/tests/real_data/config.csv new file mode 100644 index 00000000..915de4af --- /dev/null +++ b/r/tests/real_data/config.csv @@ -0,0 +1,3 @@ +key,value +path_archr,/mnt/c/users/Imman/PycharmProjects/ArchR/ +path_bpcells,/mnt/c/users/Imman/PycharmProjects/BPCells/r diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index 3bac07ab..8b608773 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -6,23 +6,18 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") - +source("test_helpers.R") +devtools::load_all(config[["path_bpcells"]]) # Compare the feature selection output of BPCells to that of Scanpy. # Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. # This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. # It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. 
compare_feat_selection_to_scanpy <- function(dir = NULL) { - # Set up temp dir - if (is.null(dir)) { - dir <- file.path(tempdir(), "feat_selection_test") - if (dir.exists(dir)) unlink(dir, recursive = TRUE) - dir.create(dir) - } + dir <- create_temp_dir(dir) # Call python script - system2("python", c("Scanpy_variable_feat_selection.py", dir)) + system2("python3", c("Scanpy_variable_feat_selection.py", dir)) # read in input csv input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) diff --git a/r/tests/real_data/test_helpers.R b/r/tests/real_data/test_helpers.R new file mode 100644 index 00000000..2da80072 --- /dev/null +++ b/r/tests/real_data/test_helpers.R @@ -0,0 +1,22 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +# Set up k-v pairs for other tests +config_csv <- read.csv("config.csv") +config <- as.list(config_csv)$value +names(config) <- as.list(config_csv)$key + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} \ No newline at end of file From 3bf8914f987f5d88d558f48539e6d65ef349b889 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 16:15:54 -0800 Subject: [PATCH 09/22] [r] clean up lsi, var features docstrings --- r/R/singlecell_utils.R | 28 ++++++++++++++-------------- r/man/highly_variable_features.Rd | 18 ++++++++++-------- r/man/lsi.Rd | 19 ++++++++++--------- 3 files changed, 34 insertions(+), 31 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 5a00462b..13c697af 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -239,17 +239,17 @@ pseudobulk_matrix <- function(mat, 
cell_groups, method = "sum", threads = 1L) { #' #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells -#' @param z_score_norm (logical) If TRUE, z-score normalize the matrix before PCA. +#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. +#' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. -#' @param save_lsi (logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @return -#' - If save_lsi is FALSE, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. -#' - If save_lsi is TRUE, return a list with the following elements: -#' - **pca_res**: dgCMatrix of shape `(n_dimensions, ncol(mat))`` -#' - **svd_attr**: List of SVD attributes -#' - **idf**: Inverse document frequency vector +#' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. +#' - If `save_lsi` is `TRUE`, return a list with the following elements: +#' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` +#' - `svd_attr`: List of SVD attributes +#' - `idf`: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. #' #' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: @@ -300,18 +300,18 @@ lsi <- function( return(pca_res) } -#' Get the most variable features within a matrix +#' Get the most variable features within a matrix. 
#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, -#' ll features will be returned. +#' all features will be returned. #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @param save_feat_selection (logical) If TRUE, save the dispersions, means, and the features selected. +#' @param save_feat_selection (logical) If `TRUE`, save the dispersions, means, and the features selected. #' @returns -#' - If `save_feat_selection` is False, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - If `save_feat_selection` is True, return a list with the following elements: -#' - **mat**: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - **feature_selection**: Dataframe with the following columns: +#' - If `save_feat_selection` is `FALSE`, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - If `save_feat_selection` is `TRUE`, return a list with the following elements: +#' - `mat`: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. +#' - `feature_selection`: Dataframe with columns `name`, `mean`, `dispersion`, `bin`, `feature_dispersion_norm`. #' @inheritParams lsi #' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
#' diff --git a/r/man/highly_variable_features.Rd b/r/man/highly_variable_features.Rd index 943d986b..005a4e37 100644 --- a/r/man/highly_variable_features.Rd +++ b/r/man/highly_variable_features.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/singlecell_utils.R \name{highly_variable_features} \alias{highly_variable_features} -\title{Get the most variable features within a matrix} +\title{Get the most variable features within a matrix.} \usage{ highly_variable_features( mat, @@ -16,26 +16,28 @@ highly_variable_features( \item{mat}{(IterableMatrix) dimensions features x cells} \item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -ll features will be returned.} +all features will be returned.} \item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, and if the number of features within a bin is less than 2, the dispersion is set to 1.} -\item{save_feat_selection}{(logical) If TRUE, save the dispersions, means, and the features selected.} +\item{save_feat_selection}{(logical) If \code{TRUE}, save the dispersions, means, and the features selected.} \item{threads}{(integer) Number of threads to use.} } \value{ \itemize{ -\item If \code{save_feat_selection} is False, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item If \code{save_feat_selection} is True, return a list with the following elements: -\item \strong{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item \strong{feature_selection}: Dataframe with the following columns: +\item If \code{save_feat_selection} is \code{FALSE}, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
+\item If \code{save_feat_selection} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. +\item \code{feature_selection}: Dataframe with columns \code{name}, \code{mean}, \code{dispersion}, \code{bin}, \code{feature_dispersion_norm}. +} } } \description{ -Get the most variable features within a matrix +Get the most variable features within a matrix. } \details{ The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 299513df..ea2687f9 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -16,23 +16,24 @@ lsi( \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} -\item{z_score_norm}{(logical) If TRUE, z-score normalize the matrix before PCA.} +\item{z_score_norm}{(logical) If \code{TRUE}, z-score normalize the matrix before PCA.} \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} -\item{scale_factor}{(integer) Scale factor for the tf-idf log transform.} - -\item{save_lsi}{(logical) If TRUE, save the SVD attributes for the matrix, as well as the idf normalization vector.} +\item{scale_factor}{(integer) Scale factor for the tf-idf log transform. +#' @param save_lsi (logical) If \code{TRUE}, save the SVD attributes for the matrix, as well as the idf normalization vector.} \item{threads}{(integer) Number of threads to use.} } \value{ \itemize{ -\item If save_lsi is FALSE, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. -\item If save_lsi is TRUE, return a list with the following elements: -\item \strong{pca_res}: dgCMatrix of shape `(n_dimensions, ncol(mat))`` -\item \strong{svd_attr}: List of SVD attributes -\item \strong{idf}: Inverse document frequency vector +\item If \code{save_lsi} is \code{FALSE}, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. 
+\item If \code{save_lsi} is \code{TRUE}, return a list with the following elements: +\itemize{ +\item \code{pca_res}: dgCMatrix of shape \verb{(n_dimensions, ncol(mat))} +\item \code{svd_attr}: List of SVD attributes +\item \code{idf}: Inverse document frequency vector +} } } \description{ From a7c617986ef86e6ce50a7db7af882f8d3e283f15 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 7 Nov 2024 18:45:32 -0800 Subject: [PATCH 10/22] [r] add in more lsi real data tests --- r/tests/real_data/ArchR_LSI.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index e79ee98b..09c04f03 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -30,7 +30,7 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { lsi_archr <- .computeLSI( mat = test_mat, LSIMethod = 2, - nDimensions = 2, + nDimensions = 20, binarize = FALSE, outlierQuantiles = NULL ) @@ -47,10 +47,16 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { lsi_bpcells <- lsi( test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), z_score_norm = FALSE, - n_dimensions = 2, + n_dimensions = 20, save_lsi = TRUE ) pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res - testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-6)) + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4)) + # convert signs + lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$pca_res[,1] > 0) - 1), `*`) + # Check for post-pca matrix similarity + testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$pca_res, tolerance = 1e-4)) + # also check for correlation between the two matrices in PC space + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$pca_res)) > 0.999) } test_lsi_similarity_to_archr() \ No newline at end of file From 
acf35b2520efc9aa70fe005565837bede6888e8d Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 18 Nov 2024 12:56:18 -0800 Subject: [PATCH 11/22] [r] remove unused variable from `lsi()` --- r/R/singlecell_utils.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 13c697af..a59d1d73 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -269,8 +269,6 @@ lsi <- function( assert_is_wholenumber(threads) # log(tf-idf) transform - mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / rowSums(mat) From 47256dbaf0407641c73a3c36a33479ee6fe40fc2 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 2 Dec 2024 02:12:19 -0800 Subject: [PATCH 12/22] [r] add requested changes --- r/R/singlecell_utils.R | 37 ++++++++++--------- r/tests/real_data/ArchR_LSI.R | 15 ++++++-- r/tests/real_data/config.csv | 3 -- .../scanpy_variable_feat_selection.R | 13 ++++++- r/tests/real_data/test_helpers.R | 22 ----------- 5 files changed, 42 insertions(+), 48 deletions(-) delete mode 100644 r/tests/real_data/config.csv delete mode 100644 r/tests/real_data/test_helpers.R diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index a59d1d73..9be501fc 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -239,8 +239,8 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells -#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. 
#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. #' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. @@ -257,7 +257,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' @export lsi <- function( mat, - z_score_norm = TRUE, n_dimensions = 50L, scale_factor = 1e4, + n_dimensions = 50L, z_score_norm = TRUE, scale_factor = 1e4, save_lsi = FALSE, threads = 1L ) { @@ -269,13 +269,17 @@ lsi <- function( assert_is_wholenumber(threads) # log(tf-idf) transform - npeaks <- colSums(mat) # Finding that sums are non-multithreaded and there's no interface to pass it in, but there is implementation in `ConcatenateMatrix.h` + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) - idf_ <- ncol(mat) / rowSums(mat) + idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations - mat_log_tfidf <- write_matrix_dir(mat_log_tfidf, tempfile("mat_log_tfidf"), compress = FALSE) + mat_log_tfidf <- write_matrix_dir( + convert_matrix_type(mat_log_tfidf, type = "float"), + tempfile("mat_log_tfidf"), compress = TRUE + ) # Z-score normalization if (z_score_norm) { cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats @@ -338,12 +342,11 @@ highly_variable_features <- function( mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) feature_means <- mat_stats$row_stats["mean", ] feature_vars <- mat_stats$row_stats["variance", ] - # Give 
a small value to features with 0 mean, helps with 0 division - feature_means[feature_means == 0] <- 1e-12 # Calculate dispersion, and log normalize feature_dispersion <- feature_vars / feature_means feature_dispersion[feature_dispersion == 0] <- NA feature_dispersion <- log(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 feature_means <- log1p(feature_means) features_df <- data.frame( name = names(feature_means), @@ -357,24 +360,22 @@ highly_variable_features <- function( dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( - bin_mean = mean(dispersion, na.rm = TRUE), - bin_sd = sd(dispersion, na.rm = TRUE), - bin_sd_is_na = is.na(bin_sd), - bin_sd = ifelse(bin_sd_is_na, bin_mean, bin_sd), # Set feats that are in bins with only one feat to have a norm dispersion of 1 - bin_mean = ifelse(bin_sd_is_na, 0, bin_mean), - feature_dispersion_norm = (dispersion - bin_mean) / bin_sd - ) %>% + feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), + feature_dispersion_norm = dplyr::if_else(n() == 1, 1, feature_dispersion_norm) # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% dplyr::ungroup() %>% dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% - dplyr::arrange(desc(feature_dispersion_norm)) %>% - dplyr::slice(1:min(num_feats, nrow(.))) + dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats) if (save_feat_selection) { + # get rownames that are in features_df$name + feats_of_interest <- which(rownames(mat) %in% features_df$name) return(list( - mat = mat[features_df$name,], + mat = mat[feats_of_interest,], feature_selection = features_df )) } - return(mat[features_df$name,]) + return(mat[feats_of_interest,]) + #return(mat[features_df$name,]) } diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 09c04f03..ff697b4c 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -6,9 +6,18 
@@ # option. This file may not be copied, modified, or distributed # except according to those terms. -source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) -devtools::load_all(config[["path_archr"]]) +library("BPCells") +library("ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} #' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. #' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. diff --git a/r/tests/real_data/config.csv b/r/tests/real_data/config.csv deleted file mode 100644 index 915de4af..00000000 --- a/r/tests/real_data/config.csv +++ /dev/null @@ -1,3 +0,0 @@ -key,value -path_archr,/mnt/c/users/Imman/PycharmProjects/ArchR/ -path_bpcells,/mnt/c/users/Imman/PycharmProjects/BPCells/r diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R index 8b608773..d2eb08b2 100644 --- a/r/tests/real_data/scanpy_variable_feat_selection.R +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -6,8 +6,17 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. -source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) +library("BPCells") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} # Compare the feature selection output of BPCells to that of Scanpy. # Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. 
diff --git a/r/tests/real_data/test_helpers.R b/r/tests/real_data/test_helpers.R deleted file mode 100644 index 2da80072..00000000 --- a/r/tests/real_data/test_helpers.R +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright 2024 BPCells contributors -# -# Licensed under the Apache License, Version 2.0 or the MIT license -# , at your -# option. This file may not be copied, modified, or distributed -# except according to those terms. - -# Set up k-v pairs for other tests -config_csv <- read.csv("config.csv") -config <- as.list(config_csv)$value -names(config) <- as.list(config_csv)$key - -# Set up temp dir in case it's not already set -create_temp_dir <- function(dir = NULL) { - if (is.null(dir)) { - dir <- file.path(tempdir(), "lsi_test") - if (dir.exists(dir)) unlink(dir, recursive = TRUE) - dir.create(dir) - } - return(dir) -} \ No newline at end of file From 004499a0a7119f05bb5fc5cf3fc46058ebb3f94c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 2 Dec 2024 13:54:25 -0800 Subject: [PATCH 13/22] [r] fix requested changes --- r/R/singlecell_utils.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 9be501fc..b3d873db 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -270,6 +270,7 @@ lsi <- function( # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) @@ -360,22 +361,20 @@ highly_variable_features <- function( dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% dplyr::group_by(bin) %>% dplyr::mutate( - feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), - feature_dispersion_norm = dplyr::if_else(n() == 1, 1, feature_dispersion_norm) # Set feats that are in bins with only one feat to have a norm dispersion of 1 + feature_dispersion_norm = 
(dispersion - mean(dispersion)) / sd(dispersion), + feature_dispersion_norm = if (dplyr::n() == 1) {1} else {feature_dispersion_norm} # Set feats that are in bins with only one feat to have a norm dispersion of 1 ) %>% dplyr::ungroup() %>% - dplyr::select(c(-bin_sd_is_na, -var, -bin_sd, -bin_mean)) %>% - dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats) + dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats, with_ties = FALSE) + feats_of_interest <- which(rownames(mat) %in% features_df$name) # get rownames to get original sorted order if (save_feat_selection) { # get rownames that are in features_df$name - feats_of_interest <- which(rownames(mat) %in% features_df$name) return(list( mat = mat[feats_of_interest,], feature_selection = features_df )) } return(mat[feats_of_interest,]) - #return(mat[features_df$name,]) } From dd801651258686fa71b357fdd5d25345ce4a5d5b Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Tue, 3 Dec 2024 15:56:25 -0800 Subject: [PATCH 14/22] [r] fix lsi docstring, idf_ logic --- r/R/singlecell_utils.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index b3d873db..cdee4537 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -242,7 +242,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' @param n_dimensions (integer) Number of dimensions to keep during PCA. #' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' #' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. +#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. 
#' @return #' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. @@ -270,10 +270,9 @@ lsi <- function( # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- mat_stats$col_stats["mean",] * nrow(mat) tf <- mat %>% multiply_cols(1 / npeaks) - idf_ <- ncol(mat) / (mat_stats$row_stats["mean",] * nrow(mat)) + idf_ <- 1 / mat_stats$row_stats["mean",] mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) # Save to prevent re-calculation of queued operations From 8891981744295f0687e92cf12c6d181a08349a8f Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 6 Dec 2024 16:32:58 -0800 Subject: [PATCH 15/22] [r] replace z-score norm with corr cutoffs --- r/R/singlecell_utils.R | 31 ++++++++++++++++------------ r/tests/real_data/ArchR_LSI.R | 1 - r/tests/real_data/ArchR_insertions.R | 15 +++++++++++--- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index cdee4537..28fa45b7 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -240,7 +240,8 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. #' @param mat (IterableMatrix) dimensions features x cells #' @param n_dimensions (integer) Number of dimensions to keep during PCA. -#' @param z_score_norm (logical) If `TRUE`, z-score normalize the matrix before PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. #' @param scale_factor (integer) Scale factor for the tf-idf log transform. 
#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. #' @param threads (integer) Number of threads to use. @@ -248,6 +249,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. #' - If `save_lsi` is `TRUE`, return a list with the following elements: #' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` +#' - `unused_pcs`: Integer vector of PCs that were filtered out due to high correlation with sequencing depth #' - `svd_attr`: List of SVD attributes #' - `idf`: Inverse document frequency vector #' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. @@ -257,7 +259,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { #' @export lsi <- function( mat, - n_dimensions = 50L, z_score_norm = TRUE, scale_factor = 1e4, + n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, save_lsi = FALSE, threads = 1L ) { @@ -266,12 +268,13 @@ lsi <- function( assert_len(n_dimensions, 1) assert_greater_than_zero(n_dimensions) assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) assert_is_wholenumber(threads) # log(tf-idf) transform mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - npeaks <- mat_stats$col_stats["mean",] * nrow(mat) - tf <- mat %>% multiply_cols(1 / npeaks) + read_depth <- mat_stats$col_stats["mean",] * nrow(mat) + tf <- mat %>% multiply_cols(1 / read_depth) idf_ <- 1 / mat_stats$row_stats["mean",] mat_tfidf <- tf %>% multiply_rows(idf_) mat_log_tfidf <- log1p(scale_factor * mat_tfidf) @@ -280,22 +283,24 @@ lsi <- function( convert_matrix_type(mat_log_tfidf, type = "float"), tempfile("mat_log_tfidf"), compress = TRUE ) - # Z-score normalization - if 
(z_score_norm) { - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats = "variance", threads = threads)$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_log_tfidf <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) - } # Run pca svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) pca_res <- t(svd_attr_$u) %*% mat_log_tfidf + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + # not sure if this is the route we want to take. Should we just leave the PCs in, + # and not use them in downstream analysis? + pca_res <- t(svd_attr_$u[, pca_feats_to_keep]) %*% mat_log_tfidf + } + if(save_lsi) { return(list( pca_res = pca_res, svd_attr = svd_attr_, + unused_pcs <- which(pca_corrs >= corr_cutoff), idf = idf_ )) } diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index ff697b4c..084645d3 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -55,7 +55,6 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { # Do not use z-score normalization, as this isn't done with ArchR lsi_bpcells <- lsi( test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), - z_score_norm = FALSE, n_dimensions = 20, save_lsi = TRUE ) diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 810133c1..8818d7e7 100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -6,9 +6,18 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. 
-source("test_helpers.R") -devtools::load_all(config[["path_bpcells"]]) -devtools::load_all(config[["path_archr"]]) +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) From 1e7c6d062dcc50c40a1bfcd5344cb0835cd7889f Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Wed, 8 Jan 2025 17:30:41 -0800 Subject: [PATCH 16/22] [r] update LSI to use norm, feature selection helpers --- r/NAMESPACE | 9 +- r/R/singlecell_utils.R | 426 ++++++++++-------- r/R/transforms.R | 72 --- r/man/DimReduction.Rd | 21 + r/man/lsi.Rd | 37 +- r/man/project.Rd | 25 + ...> select_features_by_binned_dispersion.Rd} | 21 +- r/pkgdown/_pkgdown.yml | 9 +- r/tests/real_data/ArchR_LSI.R | 17 +- r/tests/testthat/test-singlecell_utils.R | 40 +- 10 files changed, 340 insertions(+), 337 deletions(-) create mode 100644 r/man/DimReduction.Rd create mode 100644 r/man/project.Rd rename r/man/{highly_variable_features.Rd => select_features_by_binned_dispersion.Rd} (59%) diff --git a/r/NAMESPACE b/r/NAMESPACE index c7e4fd1c..b296a27f 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -2,11 +2,16 @@ S3method(base::as.data.frame,IterableFragments) S3method(base::as.matrix,IterableMatrix) +S3method(project,DimReduction) +S3method(project,LSI) +S3method(project,default) S3method(svds,IterableMatrix) S3method(svds,default) export("all_matrix_inputs<-") export("cellNames<-") export("chrNames<-") +export(DimReduction) +export(LSI) export(add_cols) export(add_rows) export(all_matrix_inputs) @@ -48,7 +53,6 @@ export(gene_score_archr) export(gene_score_tiles_archr) export(gene_score_weights_archr) 
export(get_trackplot_height) -export(highly_variable_features) export(import_matrix_market) export(import_matrix_market_10x) export(knn_annoy) @@ -56,7 +60,6 @@ export(knn_hnsw) export(knn_to_geodesic_graph) export(knn_to_snn_graph) export(log1p_slow) -export(lsi) export(marker_features) export(match_gene_symbol) export(matrix_stats) @@ -87,6 +90,7 @@ export(plot_tf_footprint) export(plot_tss_profile) export(plot_tss_scatter) export(prefix_cell_names) +export(project) export(pseudobulk_matrix) export(qc_scATAC) export(range_distance_to_nearest) @@ -112,6 +116,7 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) +export(select_features_by_binned_dispersion) export(select_features_by_mean) export(select_features_by_variance) export(select_regions) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 28fa45b7..c6baece6 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -118,6 +118,233 @@ select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { } +#' Get the most variable features within a matrix. +#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, +#' all features will be returned. +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @inheritParams select_features_by_variance +#' @returns +#' Return a dataframe with the following columns, sorted descending by bin-normalized dispersion: +#' - `names`: Feature name. +#' - `score`: Bin-normalized dispersion of the feature. +#' - `highly_variable`: Logical vector of whether the feature is highly variable. +#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
+#' +#' Calculate using the following process: +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. Bin the features by their means, and normalize dispersion within each bin +#' @export +select_features_by_binned_dispersion <- function( + mat, num_feats = 25000, n_bins = 20, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + features_df <- tibble::tibble( + names = rownames(mat), + score = rep(0, nrow(mat)), + highly_variable = rep(TRUE, nrow(mat)) + ) + return(mat) + } + # Calculate row information for dispersion + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] + # Calculate dispersion, and log normalize + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log1p(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 + feature_means <- log1p(feature_means) + features_df <- tibble::tibble( + names = names(feature_means), + var = feature_vars, + mean = feature_means, + dispersion = feature_dispersion + ) + # Bin by mean, and normalize dispersion with each bin + features_df <- features_df %>% + dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + score = (dispersion - mean(dispersion)) / sd(dispersion), + score = if (dplyr::n() == 1) {1} else {score} # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% + dplyr::ungroup() %>% + dplyr::arrange(desc(score)) %>% + 
dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) %>% + dplyr::select(c("names", "score", "highly_variable")) + return(features_df) +} + + +################# +# DimReduction Class Definition +################# + +#' Barebones definition of a DimReduction class. +#' +#' Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +#' Child classes should implement a `project` method to allow for the projection of other matrices using +#' the fitted transformation object. +#' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) The projected data +#' @field fitted_params (list) A list of parameters used for the transformation of a matrix. +#' @export +DimReduction <- function(x, fitted_params = list(), ...) { + assert_is(x, c("IterableMatrix", "dgCMatrix", "matrix")) + assert_is(fitted_params, "list") + structure( + list( + cell_embeddings = x, + fitted_params = fitted_params, + ... + ), + class = "DimReduction" + ) +} + +#' Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +#' @param x DimReduction object. +#' @param mat IterableMatrix object. +#' @return IterableMatrix object of the projected data. +#' @details DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. +#' All required information to run a projection should be held in x$fitted_params, including pertinent parameters when construction the DimReduction subclass object. +#' If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. +#' If there are rownames, reorder the matrix to match the order of the original matrix +#' @export +project <- function(x, mat, ...) { + UseMethod("project") +} +#' @export +project.default <- function(x, mat, ...) 
{ + rlang::abort("project method not implemented for BPCells objects.") +} +#' @export +project.DimReduction <- function(x, mat, ...) { + rlang::abort("project method not implemented for base DimReduction object.") +} + +################# +# LSI Implementation +################# + + +#' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. +#' @param mat (IterableMatrix) dimensions features x cells. +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. +#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. +#' @param threads (integer) Number of threads to use. +#' @returns An object of class `c("LSI", "DimReduction")` with the following attributes: +#' - `cell_embeddings`: The projected data +#' - `fitted_params`: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: +#' - `normalization_method`: The normalization method used +#' - `feature_means`: The means of the features used for normalization +#' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth +#' - `svd_params`: The matrix calculated for SVD +#' - `feature_names`: The names of the features in the matrix +#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
+#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - 17.1 MB memory usage, 25.1 seconds runtime +#' @seealso `project()` `DimReduction()` `normalize_tfidf()` +#' @export +LSI <- function( + mat, n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) + assert_is_wholenumber(threads) + + # Normalization + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + + # Save to prevent re-calculation of queued operations + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + tempfile("mat"), compress = TRUE + ) + # Run pca + svd_attr <- svds(mat, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr$u) %*% mat + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) + pca_res <- pca_res[pca_feats_to_keep, ] + } + fitted_params <- list( + normalization_method = normalize, + feature_means = mat_stats$row_stats["mean", ], + pcs_to_keep = pca_feats_to_keep, + svd_params = svd_attr + ) + res <- DimReduction( + x = pca_res, + fitted_params = fitted_params, + feature_names = rownames(mat) + ) + class(res) <- c("LSI", class(res)) + return(res) +} + + +#' @export +project.LSI <- function(x, mat, threads = 1L, ...) 
{ + assert_is(mat, "IterableMatrix") + assert_is(x, "LSI") + + fitted_params <- x$fitted_params + # Do a check to make sure that the number of rows in the matrix is the same as the number of rows in SVD$u + assert_true(nrow(mat) == nrow(fitted_params$svd_params$u)) + if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { + assert_true(all(x$feature_names %in% rownames(mat))) + mat <- mat[x$feature_names, ] + } + + if (!is.null(fitted_params$normalization_method)) { + mat <- fitted_params$normalization_method( + mat, + feature_means = fitted_params$feature_means, + threads = threads + ) + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + tempfile("mat"), compress = TRUE + ) + } + pca_attr <- fitted_params$svd_params + res <- t(pca_attr$u) %*% mat + if (length(fitted_params$pcs_to_keep) != nrow(res)) { + res <- res[fitted_params$pcs_to_keep, ] + } + return(res) +} + + #' Test for marker features #' #' Given a features x cells matrix, perform one-vs-all differential @@ -182,205 +409,6 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } - -#' Aggregate counts matrices by cell group or feature. -#' -#' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each -#' feature. -#' @param cell_groups (Character/factor) Vector of group/cluster assignments for each cell. Length must be `ncol(mat)`. -#' @param method (Character vector) Method(s) to aggregate counts. If one method is provided, the output will be a matrix. If multiple methods are provided, the output will be a named list of matrices. -#' -#' Current options are: `nonzeros`, `sum`, `mean`, `variance`. -#' @param threads (integer) Number of threads to use. -#' @return -#' - If `method` is length `1`, returns a matrix of shape `(features x groups)`. -#' - If `method` is greater than length `1`, returns a list of matrices with each matrix representing a pseudobulk matrix with a different aggregation method. 
-#' Each matrix is of shape `(features x groups)`, and names are one of `nonzeros`, `sum`, `mean`, `variance`. -#' @details Some simpler stats are calculated in the process of calculating more complex -#' statistics. So when calculating `variance`, `nonzeros` and `mean` can be included with no -#' extra calculation time, and when calculating `mean`, adding `nonzeros` will take no extra time. -#' @inheritParams marker_features -#' @export -pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { - assert_is(mat, "IterableMatrix") - assert_is(cell_groups, c("factor", "character", "numeric")) - assert_true(length(cell_groups) == ncol(mat)) - cell_groups <- as.factor(cell_groups) - assert_is(method, "character") - methods <- c("variance", "mean", "sum", "nonzeros") - for (m in method) { - if (!(m %in% methods)) { - rlang::abort(sprintf("method must be one of: %s", paste(methods, collapse = ", "))) - } - } - assert_is(threads, "integer") - # if multiple methods are provided, only need to pass in the top method as it will also calculate the less complex stats - iter <- iterate_matrix(parallel_split(mat, threads, threads*4)) - res <- pseudobulk_matrix_cpp(iter, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) - # if res is a single matrix, return with colnames and rownames - if (length(method) == 1) { - colnames(res[[method]]) <- levels(cell_groups) - rownames(res[[method]]) <- rownames(mat) - return(res[[method]]) - } - # give colnames and rownames for each matrix in res, which is a named list - for (res_slot in names(res)) { - if ((length(res[[res_slot]]) == 0) || !(res_slot %in% method)) { - res[[res_slot]] <- NULL - } else { - colnames(res[[res_slot]]) <- levels(cell_groups) - rownames(res[[res_slot]]) <- rownames(mat) - } - } - return(res) -} - -#' Perform latent semantic indexing (LSI) on a matrix. 
-#' -#' Given a `(features x cells)` matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. -#' @param mat (IterableMatrix) dimensions features x cells -#' @param n_dimensions (integer) Number of dimensions to keep during PCA. -#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to -#' the corr_cutoff, it will be excluded from the final PCA matrix. -#' @param scale_factor (integer) Scale factor for the tf-idf log transform. -#' @param save_lsi (logical) If `TRUE`, save the SVD attributes for the matrix, as well as the idf normalization vector. -#' @param threads (integer) Number of threads to use. -#' @return -#' - If `save_lsi` is `FALSE`, return a dgCMatrix of shape `(n_dimensions, ncol(mat))`. -#' - If `save_lsi` is `TRUE`, return a list with the following elements: -#' - `pca_res`: dgCMatrix of shape `(n_dimensions, ncol(mat))` -#' - `unused_pcs`: Integer vector of PCs that were filtered out due to high correlation with sequencing depth -#' - `svd_attr`: List of SVD attributes -#' - `idf`: Inverse document frequency vector -#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
-#' -#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: -#' - 17.1 MB memory usage, 25.1 seconds runtime -#' @export -lsi <- function( - mat, - n_dimensions = 50L, corr_cutoff = 1, scale_factor = 1e4, - save_lsi = FALSE, - threads = 1L -) { - assert_is(mat, "IterableMatrix") - assert_is_wholenumber(n_dimensions) - assert_len(n_dimensions, 1) - assert_greater_than_zero(n_dimensions) - assert_true(n_dimensions < min(ncol(mat), nrow(mat))) - assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) - assert_is_wholenumber(threads) - - # log(tf-idf) transform - mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) - read_depth <- mat_stats$col_stats["mean",] * nrow(mat) - tf <- mat %>% multiply_cols(1 / read_depth) - idf_ <- 1 / mat_stats$row_stats["mean",] - mat_tfidf <- tf %>% multiply_rows(idf_) - mat_log_tfidf <- log1p(scale_factor * mat_tfidf) - # Save to prevent re-calculation of queued operations - mat_log_tfidf <- write_matrix_dir( - convert_matrix_type(mat_log_tfidf, type = "float"), - tempfile("mat_log_tfidf"), compress = TRUE - ) - # Run pca - svd_attr_ <- svds(mat_log_tfidf, k = n_dimensions, threads = threads) - pca_res <- t(svd_attr_$u) %*% mat_log_tfidf - - # Filter out PCs that are highly correlated with sequencing depth - pca_corrs <- abs(cor(read_depth, t(pca_res))) - pca_feats_to_keep <- which(pca_corrs < corr_cutoff) - if (length(pca_feats_to_keep) != n_dimensions) { - # not sure if this is the route we want to take. Should we just leave the PCs in, - # and not use them in downstream analysis? - pca_res <- t(svd_attr_$u[, pca_feats_to_keep]) %*% mat_log_tfidf - } - - if(save_lsi) { - return(list( - pca_res = pca_res, - svd_attr = svd_attr_, - unused_pcs <- which(pca_corrs >= corr_cutoff), - idf = idf_ - )) - } - return(pca_res) -} - -#' Get the most variable features within a matrix. -#' @param num_feats (integer) Number of features to return. 
If the number is higher than the number of features in the matrix, -#' all features will be returned. -#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, -#' and if the number of features -#' within a bin is less than 2, the dispersion is set to 1. -#' @param save_feat_selection (logical) If `TRUE`, save the dispersions, means, and the features selected. -#' @returns -#' - If `save_feat_selection` is `FALSE`, return an IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - If `save_feat_selection` is `TRUE`, return a list with the following elements: -#' - `mat`: IterableMatrix subset of the most variable features of shape `(num_variable_features, ncol(mat))`. -#' - `feature_selection`: Dataframe with columns `name`, `mean`, `dispersion`, `bin`, `feature_dispersion_norm`. -#' @inheritParams lsi -#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). -#' -#' Calculate using the following process: -#' 1. Calculate the dispersion of each feature (variance / mean) -#' 2. Log normalize dispersion and mean -#' 3. 
Bin the features by their means, and normalize dispersion within each bin -#' @export -highly_variable_features <- function( - mat, num_feats, n_bins = 20, - save_feat_selection = FALSE, - threads = 1L -) { - assert_is(mat, "IterableMatrix") - assert_greater_than_zero(num_feats) - assert_is_wholenumber(num_feats) - assert_len(num_feats, 1) - assert_is_wholenumber(n_bins) - assert_len(n_bins, 1) - assert_greater_than_zero(n_bins) - if (nrow(mat) <= num_feats) { - log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) - return(mat) - } - # Calculate row information for dispersion - mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) - feature_means <- mat_stats$row_stats["mean", ] - feature_vars <- mat_stats$row_stats["variance", ] - # Calculate dispersion, and log normalize - feature_dispersion <- feature_vars / feature_means - feature_dispersion[feature_dispersion == 0] <- NA - feature_dispersion <- log(feature_dispersion) - feature_dispersion[feature_means == 0] <- 0 - feature_means <- log1p(feature_means) - features_df <- data.frame( - name = names(feature_means), - var = feature_vars, - mean = feature_means, - dispersion = feature_dispersion - ) - - # Bin by mean, and normalize dispersion with each bin - features_df <- features_df %>% - dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% - dplyr::group_by(bin) %>% - dplyr::mutate( - feature_dispersion_norm = (dispersion - mean(dispersion)) / sd(dispersion), - feature_dispersion_norm = if (dplyr::n() == 1) {1} else {feature_dispersion_norm} # Set feats that are in bins with only one feat to have a norm dispersion of 1 - ) %>% - dplyr::ungroup() %>% - dplyr::slice_max(order_by = feature_dispersion_norm, n = num_feats, with_ties = FALSE) - feats_of_interest <- which(rownames(mat) %in% features_df$name) # get rownames to get original sorted order - if (save_feat_selection) { - # get rownames that are in features_df$name 
- return(list( - mat = mat[feats_of_interest,], - feature_selection = features_df - )) - } - return(mat[feats_of_interest,]) -} - #' Aggregate counts matrices by cell group or feature. #' diff --git a/r/R/transforms.R b/r/R/transforms.R index 7a110677..8a2bd25b 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -983,76 +983,4 @@ normalize_tfidf <- function( idf <- 1 / feature_means tf_idf_mat <- tf %>% multiply_rows(idf) return(log1p(tf_idf_mat * scale_factor)) -} -#' Compute LSI For a peak matrix -#' @param mat PeakMatrix -#' @param n_dimensions Number of dimensions to keep during PCA -#' @param scale_factor Scale factor for the tf-idf log transform -#' @param verbose Whether to print out progress -#' @param threads Number of threads to use -#' @return dgCMatrix of shape (n_dimensions, ncol(mat)) -#' @details Compute LSI through first doing a tf-idf transform, a z-score normalization, then PCA. -#' Tf-idf implementation is from Stuart & Butler et al. 2019. -#' @export -compute_lsi <- function(mat, n_dimensions = 50, scale_factor = 1e-4, verbose = FALSE, threads = 1) { - assert_is(mat, "IterableMatrix") # Should be a peak matrix, should we enforce this? 
- assert_is(n_dimensions, "integer") - assert_len(n_dimensions, 1) - assert_greater_than_zero(n_dimensions) - assert_true(n_dimensions < min(ncol(mat), nrow(mat))) - - # Signac implementation - npeaks <- colSums(mat) - tf <- mat %>% multiply_cols(1/npeaks) - idf_ <- ncol(mat) / rowSums(mat) - mat_tfidf <- tf %>% multiply_rows(idf_) - mat_log_tfidf <- log1p(scale_factor * mat_tfidf) - - # run z-score norm - cell_peak_stats <- matrix_stats(mat_log_tfidf, col_stats="variance")$col_stats - cell_means <- cell_peak_stats["mean",] - cell_vars <- cell_peak_stats["variance",] - mat_lsi_norm <- mat_log_tfidf %>% - add_cols(-cell_means) %>% - multiply_cols(1 / cell_vars) - - # Run pca - svd_attr_ <- svds(mat_lsi_norm, k = n_dimensions) - pca_res <- t(svd_attr$u) %*% mat_lsi_norm - return(pca_res) -} - -#' Get most variable features, given a non-log normalized matrix -highly_variable_features <- function(mat, num_feats, n_bins, verbose = FALSE) { - assert_is(mat, "IterableMatrix") - assert_is(num_feats, "integer") - assert_greater_than_zero(num_feats) - assert_len(num_feats, 1) - assert_is(n_bins, "integer") - assert_len(n_bins, 1) - assert_greater_than_zero(n_bins) - if (nrow(mat) <= num_feats) { - if (verbose) log_progress(sprintf("Number of features (%s) is less than num_feats (%s), - returning all features", nrow(mat), num_feats)) - return(mat) - } - # Calculate the mean and variance of each feature - # should we set entries that are 0 to NA? 
- feature_means <- matrix_stats(mat, row_stats = c("mean"))$row_stats['mean', ] - feature_vars <- matrix_stats(mat, row_stats = c("variance"))$row_stats['variance', ] - feature_dispersion <- feature_vars / feature_means - feature_dispersion <- log(feature_dispersion) - feature_means <- log1p(feature_means) - mean_bins <- cut(feature_means, n_bins, labels = FALSE) - - # Calculate the mean and variance of dispersion of each bin - bin_mean <- tapply(feature_dispersion, mean_bins, function(x) mean(x)) - bin_sd <- tapply(feature_dispersion, mean_bins, function(x) sd(x)) - # Set bin_sd value to bin_mean if bin_sd is NA, results in norm dispersion of 1 - bin_sd[is.na(bin_sd)] <- bin_mean[is.na(bin_sd)] - # map mean_bins indices to bin_stats - feature_dispersion_norm <- (feature_dispersion - bin_mean[mean_bins]) / bin_sd[mean_bins] - names(feature_dispersion_norm) <- names(feature_dispersion) - variable_features_ <- sort(feature_dispersion_norm)[nrow(mat)-num_feats:nrow(mat)] - return(mat[names(variable_features_), ]) } \ No newline at end of file diff --git a/r/man/DimReduction.Rd b/r/man/DimReduction.Rd new file mode 100644 index 00000000..2bb7486c --- /dev/null +++ b/r/man/DimReduction.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{DimReduction} +\alias{DimReduction} +\title{Barebones definition of a DimReduction class.} +\usage{ +DimReduction(x, fitted_params = list(), ...) +} +\description{ +Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +Child classes should implement a \code{project} method to allow for the projection of other matrices using +the fitted transformation object. 
+} +\section{Fields}{ + +\describe{ +\item{\code{cell_embeddings}}{(IterableMatrix, dgCMatrix, matrix) The projected data} + +\item{\code{fitted_params}}{(list) A list of parameters used for the transformation of a matrix.} +}} + diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index ea2687f9..13348e23 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -1,39 +1,41 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/singlecell_utils.R -\name{lsi} -\alias{lsi} +\name{LSI} +\alias{LSI} \title{Perform latent semantic indexing (LSI) on a matrix.} \usage{ -lsi( +LSI( mat, - z_score_norm = TRUE, n_dimensions = 50L, - scale_factor = 10000, - save_lsi = FALSE, + corr_cutoff = 1, + normalize = normalize_tfidf, threads = 1L ) } \arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{z_score_norm}{(logical) If \code{TRUE}, z-score normalize the matrix before PCA.} +\item{mat}{(IterableMatrix) dimensions features x cells.} \item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} -\item{scale_factor}{(integer) Scale factor for the tf-idf log transform. -#' @param save_lsi (logical) If \code{TRUE}, save the SVD attributes for the matrix, as well as the idf normalization vector.} +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +the corr_cutoff, it will be excluded from the final PCA matrix.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} \item{threads}{(integer) Number of threads to use.} } \value{ +An object of class \code{c("LSI", "DimReduction")} with the following attributes: \itemize{ -\item If \code{save_lsi} is \code{FALSE}, return a dgCMatrix of shape \verb{(n_dimensions, ncol(mat))}. 
-\item If \code{save_lsi} is \code{TRUE}, return a list with the following elements: +\item \code{cell_embeddings}: The projected data +\item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. Columns include the following: \itemize{ -\item \code{pca_res}: dgCMatrix of shape \verb{(n_dimensions, ncol(mat))} -\item \code{svd_attr}: List of SVD attributes -\item \code{idf}: Inverse document frequency vector +\item \code{normalization_method}: The normalization method used +\item \code{feature_means}: The means of the features used for normalization +\item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth +\item \code{svd_params}: The matrix calculated for SVD } +\item \code{feature_names}: The names of the features in the matrix } } \description{ @@ -47,3 +49,6 @@ Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: \item 17.1 MB memory usage, 25.1 seconds runtime } } +\seealso{ +\code{project()} \code{DimReduction()} \code{normalize_tfidf()} +} diff --git a/r/man/project.Rd b/r/man/project.Rd new file mode 100644 index 00000000..dcab31b5 --- /dev/null +++ b/r/man/project.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{project} +\alias{project} +\title{Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object.} +\usage{ +project(x, mat, ...) +} +\arguments{ +\item{x}{DimReduction object.} + +\item{mat}{IterableMatrix object.} +} +\value{ +IterableMatrix object of the projected data. +} +\description{ +Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +} +\details{ +DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. 
+All required information to run a projection should be held in x$fitted_params, including pertinent parameters when constructing the DimReduction subclass object.
+If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features.
+If there are rownames, reorder the matrix to match the order of the original matrix.
+}
diff --git a/r/man/highly_variable_features.Rd b/r/man/select_features_by_binned_dispersion.Rd
similarity index 59%
rename from r/man/highly_variable_features.Rd
rename to r/man/select_features_by_binned_dispersion.Rd
index 005a4e37..b3c573ae 100644
--- a/r/man/highly_variable_features.Rd
+++ b/r/man/select_features_by_binned_dispersion.Rd
@@ -1,14 +1,13 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/singlecell_utils.R
-\name{highly_variable_features}
-\alias{highly_variable_features}
+\name{select_features_by_binned_dispersion}
+\alias{select_features_by_binned_dispersion}
 \title{Get the most variable features within a matrix.}
 \usage{
-highly_variable_features(
+select_features_by_binned_dispersion(
   mat,
-  num_feats,
+  num_feats = 25000,
   n_bins = 20,
-  save_feat_selection = FALSE,
   threads = 1L
 )
 }
@@ -22,18 +21,14 @@ all features will be returned.}
 and if the number of features
 within a bin is less than 2, the dispersion is set to 1.}
 
-\item{save_feat_selection}{(logical) If \code{TRUE}, save the dispersions, means, and the features selected.}
-
 \item{threads}{(integer) Number of threads to use.}
 }
 \value{
+Return a dataframe with the following columns, sorted descending by bin-normalized dispersion:
 \itemize{
-\item If \code{save_feat_selection} is \code{FALSE}, return an IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. 
-\item If \code{save_feat_selection} is \code{TRUE}, return a list with the following elements: -\itemize{ -\item \code{mat}: IterableMatrix subset of the most variable features of shape \verb{(num_variable_features, ncol(mat))}. -\item \code{feature_selection}: Dataframe with columns \code{name}, \code{mean}, \code{dispersion}, \code{bin}, \code{feature_dispersion_norm}. -} +\item \code{names}: Feature name. +\item \code{score}: Bin-normalized dispersion of the feature. +\item \code{highly_variable}: Logical vector of whether the feature is highly variable. } } \description{ diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 3cf6509a..64f1f119 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -135,9 +135,9 @@ reference: - normalize_tfidf - select_features_by_variance - select_features_by_dispersion + - select_features_by_binned_dispersion - select_features_by_mean - - lsi - - highly_variable_features + - LSI - IterableMatrix-methods - pseudobulk_matrix @@ -156,6 +156,11 @@ reference: - knn_to_graph - cluster_membership_matrix +- title: "Dimensionality Reductions" +- contents: + - DimReduction + - project + - title: "Plots" diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R index 084645d3..de775f4f 100644 --- a/r/tests/real_data/ArchR_LSI.R +++ b/r/tests/real_data/ArchR_LSI.R @@ -36,7 +36,7 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { # Calculate LSI on ArchR # running LSI without binarizing, as we don't do this in the BPCells implementation # we also don't filter quantile outliers. 
- lsi_archr <- .computeLSI( + lsi_archr <- ArchR:::.computeLSI( mat = test_mat, LSIMethod = 2, nDimensions = 20, @@ -53,18 +53,17 @@ test_lsi_similarity_to_archr <- function(dir = NULL) { pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr # Calculate LSI on BPCells # Do not use z-score normalization, as this isn't done with ArchR - lsi_bpcells <- lsi( - test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), - n_dimensions = 20, - save_lsi = TRUE + lsi_bpcells <- LSI( + test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), + n_dimensions = 20 ) - pre_svd_mat_approx_bpcells <- lsi_bpcells$svd_attr$u %*% lsi_bpcells$pca_res + pre_svd_mat_approx_bpcells <- lsi_bpcells$fitted_params$svd_params$u %*% lsi_bpcells$cell_embeddings testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4)) # convert signs - lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$pca_res[,1] > 0) - 1), `*`) + lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$cell_embeddings[,1] > 0) - 1), `*`) # Check for post-pca matrix similarity - testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$pca_res, tolerance = 1e-4)) + testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$cell_embeddings, tolerance = 1e-4)) # also check for correlation between the two matrices in PC space - testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$pca_res)) > 0.999) + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$cell_embeddings)) > 0.999) } test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index c699d67b..93c62a72 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -36,25 +36,6 @@ test_that("select_features works general case", { }) -test_that("select_features works general 
case", { - m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") - for (fn in c("select_features_by_variance", "select_features_by_dispersion", "select_features_by_mean")) { - res <- do.call(fn, list(m1, num_feats = 10)) - expect_equal(nrow(res), nrow(m1)) # Check that dataframe has correct features we're expecting - expect_equal(sum(res$highly_variable), 10) # Only 10 features marked as highly variable - expect_setequal(res$names, rownames(m1)) - res_more_feats_than_rows <- do.call(fn, list(m1, num_feats = 10000)) # more features than rows - res_feats_equal_rows <- do.call(fn, list(m1, num_feats = 100)) - expect_identical(res_more_feats_than_rows, res_feats_equal_rows) - if (fn != "select_features_by_mean") { - # Check that normalization actually does something - res_no_norm <- do.call(fn, list(m1, num_feats = 10, normalize = NULL)) - expect_true(!all((res %>% dplyr::arrange(names))$score == (res_no_norm %>% dplyr::arrange(names))$score)) - } - } -}) - - test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -206,11 +187,16 @@ test_that("Pseudobulk aggregation works with multiple return types", { -test_that("Highly variable feature selection works", { +test_that("Feature selection by bin variance works", { mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat - res <- highly_variable_features(mat, num_feats = 10, n_bins = 5, threads = 1) - res_t <- highly_variable_features(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_table <- select_features_by_bin_variance(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_by_bin_variance(t(mat), num_feats = 10, n_bins = 5, threads = 1) + browser() + res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res <- mat[res_feats,] + res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res_t <- t(mat[,res_feats_t]) expect_equal(nrow(res), 10) expect_equal(ncol(res), 26) expect_equal(nrow(res_t), 10) @@ -222,10 +208,16 @@ test_that("LSI works", { rownames(mat) <- paste0("feat", seq_len(nrow(mat))) colnames(mat) <- paste0("cell", seq_len(ncol(mat))) # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to ArchR - lsi_res <- lsi(mat, n_dimensions = 5) - lsi_res_t <- lsi(t(mat), n_dimensions = 5) + lsi_res_obj <- LSI(mat, n_dimensions = 5) + lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) + lsi_res <- lsi_res_obj$cell_embeddings + lsi_res_t <- lsi_res_t_obj$cell_embeddings + # Check that projection results in the same output if used on the same input matrix + lsi_res_proj <- project(lsi_res_obj, mat) + expect_equal(nrow(lsi_res), 5) expect_equal(ncol(lsi_res), ncol(mat)) expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) + expect_equal(lsi_res, lsi_res_proj) }) \ No newline at end of file From e9c302e5e9b601722659b34b566584e1ae8191e6 Mon Sep 17 00:00:00 2001 From: Immanuel Abdi Date: Thu, 9 Jan 2025 22:25:46 -0800 Subject: [PATCH 17/22] [r] update `NEWS.md` --- r/NEWS.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/r/NEWS.md b/r/NEWS.md index 24b6b0d9..86d75556 100644 --- a/r/NEWS.md +++ b/r/NEWS.md 
@@ -24,9 +24,8 @@ Contributions welcome :) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) - Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) -- Add MACS2/3 input creation and peak calling through `call_macs_peaks()` (pull request #118) -- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #156). -- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #156). +- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #181). +- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #181). ## Improvements - `trackplot_loop()` now accepts discrete color scales From 7ed6bd7d59074faaef6016a431acff31e3d5a2ba Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 10 Jan 2025 02:32:12 -0800 Subject: [PATCH 18/22] [r] remove test artifacts --- r/tests/testthat/test-singlecell_utils.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 93c62a72..85416f71 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -192,11 +192,11 @@ test_that("Feature selection by bin variance works", { # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat res_table <- select_features_by_bin_variance(mat, num_feats = 10, n_bins = 5, threads = 1) res_table_t <- select_features_by_bin_variance(t(mat), num_feats = 10, n_bins = 5, threads = 1) - browser() res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res <- mat[res_feats,] res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res_t <- t(mat[,res_feats_t]) + expect_equal(nrow(res), 10) expect_equal(ncol(res), 26) expect_equal(nrow(res_t), 10) @@ -220,4 +220,5 @@ test_that("LSI works", { expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) expect_equal(lsi_res, lsi_res_proj) -}) \ No newline at end of file +}) + From d67b7db51f7dba0eae8ecb1b02463042780a8542 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Mon, 13 Jan 2025 13:38:25 -0800 Subject: [PATCH 19/22] [r] add logging, partial args --- r/R/singlecell_utils.R | 17 ++++++++++++++--- r/tests/testthat/test-singlecell_utils.R | 7 +++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index c6baece6..6ec76e37 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -265,8 +265,16 @@ project.DimReduction <- function(x, mat, ...) 
{ #' @export LSI <- function( mat, n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf, - threads = 1L + threads = 1L, verbose = FALSE ) { + if (rlang::is_missing(mat)) { + return( + purrr::partial( + LSI, n_dimensions = n_dimensions, corr_cutoff = corr_cutoff, + normalize = normalize, threads = threads, verbose = verbose + ) + ) + } assert_is(mat, "IterableMatrix") assert_is_wholenumber(n_dimensions) assert_len(n_dimensions, 1) @@ -274,8 +282,10 @@ LSI <- function( assert_true(n_dimensions < min(ncol(mat), nrow(mat))) assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) assert_is_wholenumber(threads) - + + if (verbose) log_progress("Starting LSI") # Normalization + if (verbose) log_progress("Normalizing matrix") mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) if (!is.null(normalize)) mat <- normalize(mat, threads = threads) @@ -286,6 +296,7 @@ LSI <- function( tempfile("mat"), compress = TRUE ) # Run pca + if (verbose) log_progress("Calculating SVD") svd_attr <- svds(mat, k = n_dimensions, threads = threads) pca_res <- t(svd_attr$u) %*% mat @@ -293,7 +304,7 @@ LSI <- function( pca_corrs <- abs(cor(read_depth, t(pca_res))) pca_feats_to_keep <- which(pca_corrs < corr_cutoff) if (length(pca_feats_to_keep) != n_dimensions) { - log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) + if (verbose) log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) pca_res <- pca_res[pca_feats_to_keep, ] } fitted_params <- list( diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 85416f71..034d765a 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -190,8 +190,8 @@ 
test_that("Pseudobulk aggregation works with multiple return types", { test_that("Feature selection by bin variance works", { mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat - res_table <- select_features_by_bin_variance(mat, num_feats = 10, n_bins = 5, threads = 1) - res_table_t <- select_features_by_bin_variance(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_table <- select_features_by_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_by_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res <- mat[res_feats,] res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) @@ -210,6 +210,8 @@ test_that("LSI works", { # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR lsi_res_obj <- LSI(mat, n_dimensions = 5) lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) + # Also check partial args + lsi_res_obj_partial <- LSI(n_dimensions = 5)(mat) lsi_res <- lsi_res_obj$cell_embeddings lsi_res_t <- lsi_res_t_obj$cell_embeddings # Check that projection results in the same output if used on the same input matrix @@ -219,6 +221,7 @@ test_that("LSI works", { expect_equal(ncol(lsi_res), ncol(mat)) expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) + expect_equal(lsi_res_obj, lsi_res_obj_partial) expect_equal(lsi_res, lsi_res_proj) }) From 76f4c7d2955c24b463a58b00f2299f331441a09c Mon Sep 17 00:00:00 2001 From: Immanuel Abdi <56730419+immanuelazn@users.noreply.github.com> Date: Fri, 10 Jan 2025 12:58:07 -0800 Subject: [PATCH 20/22] [r] change check for `pseudobulk_matrix()` to use whole number instead of integer (#183) --- r/R/singlecell_utils.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 34ed7dd6..b222a931 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -456,10 +456,14 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { rlang::abort(sprintf("method must be one of: %s", paste(methods, collapse = ", "))) } } - assert_is(threads, "integer") - # if multiple methods are provided, only need to pass in the top method as it will also calculate the less complex stats - iter <- iterate_matrix(parallel_split(mat, threads, threads*4)) - res <- pseudobulk_matrix_cpp(iter, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = mat@transpose) + assert_is_wholenumber(threads) + + it <- mat %>% + convert_matrix_type("double") %>% + parallel_split(threads, threads*4) %>% + iterate_matrix() + + res <- pseudobulk_matrix_cpp(it, cell_groups = as.integer(cell_groups) - 1, method = method, transpose = 
mat@transpose) # if res is a single matrix, return with colnames and rownames if (length(method) == 1) { colnames(res[[method]]) <- levels(cell_groups) From 5e3a7fed33e786d7a550394043a615f9128da457 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 17 Jan 2025 21:34:51 -0800 Subject: [PATCH 21/22] [r] update feature selection documentation --- r/R/singlecell_utils.R | 16 ++----- r/man/feature_selection.Rd | 23 ++++++++++ r/man/lsi.Rd | 3 +- r/man/select_features_by_binned_dispersion.Rd | 46 ------------------- r/pkgdown/_pkgdown.yml | 1 - 5 files changed, 29 insertions(+), 60 deletions(-) delete mode 100644 r/man/select_features_by_binned_dispersion.Rd diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index b222a931..d2ce7bad 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -121,22 +121,14 @@ select_features_by_mean <- function(mat, num_feats = 25000, threads = 1L) { return(features_df) } - -#' Get the most variable features within a matrix. -#' @param num_feats (integer) Number of features to return. If the number is higher than the number of features in the matrix, -#' all features will be returned. +#' @rdname feature_selection #' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. -#' @inheritParams select_features_by_variance #' @returns -#' Return a dataframe with the following columns, sorted descending by bin-normalized dispersion: -#' - `names`: Feature name. -#' - `score`: Bin-normalized dispersion of the feature. -#' - `highly_variable`: Logical vector of whether the feature is highly variable. -#' @details The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). 
-#' -#' Calculate using the following process: +#' - `select_features_by_binned_dispersion`: Score representing the bin normalized dispersion of each feature. +#' @details +#' `select_features_by_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): #' 1. Calculate the dispersion of each feature (variance / mean) #' 2. Log normalize dispersion and mean #' 3. Bin the features by their means, and normalize dispersion within each bin diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index c57a8e40..007ba177 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -4,6 +4,7 @@ \alias{select_features_by_variance} \alias{select_features_by_dispersion} \alias{select_features_by_mean} +\alias{select_features_by_binned_dispersion} \title{Feature selection functions} \usage{ select_features_by_variance( @@ -21,6 +22,13 @@ select_features_by_dispersion( ) select_features_by_mean(mat, num_feats = 25000, threads = 1L) + +select_features_by_binned_dispersion( + mat, + num_feats = 25000, + n_bins = 20, + threads = 1L +) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} @@ -31,6 +39,10 @@ all features will be returned.} \item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} \item{threads}{(integer) Number of threads to use.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} } \value{ Return a dataframe with the following columns, sorted descending by score: @@ -52,6 +64,10 @@ Each different feature selection method will have a different scoring method: \itemize{ \item \code{select_features_by_mean}: Score representing the mean accessibility of each feature. 
} + +\itemize{ +\item \code{select_features_by_binned_dispersion}: Score representing the bin normalized dispersion of each feature. +} } \description{ Apply a feature selection method to a \verb{(features x cells)} matrix. @@ -75,4 +91,11 @@ Apply a feature selection method to a \verb{(features x cells)} matrix. \item Get the sum of each binarized feature. \item Find \code{num_feats} features with the highest accessibility. } + +\code{select_features_by_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} } diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd index 13348e23..dff5c88e 100644 --- a/r/man/lsi.Rd +++ b/r/man/lsi.Rd @@ -9,7 +9,8 @@ LSI( n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf, - threads = 1L + threads = 1L, + verbose = FALSE ) } \arguments{ diff --git a/r/man/select_features_by_binned_dispersion.Rd b/r/man/select_features_by_binned_dispersion.Rd deleted file mode 100644 index b3c573ae..00000000 --- a/r/man/select_features_by_binned_dispersion.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/singlecell_utils.R -\name{select_features_by_binned_dispersion} -\alias{select_features_by_binned_dispersion} -\title{Get the most variable features within a matrix.} -\usage{ -select_features_by_binned_dispersion( - mat, - num_feats = 25000, - n_bins = 20, - threads = 1L -) -} -\arguments{ -\item{mat}{(IterableMatrix) dimensions features x cells} - -\item{num_feats}{(integer) Number of features to return. If the number is higher than the number of features in the matrix, -all features will be returned.} - -\item{n_bins}{(integer) Number of bins for binning mean gene expression. 
Normalizing dispersion is done with respect to each bin, -and if the number of features -within a bin is less than 2, the dispersion is set to 1.} - -\item{threads}{(integer) Number of threads to use.} -} -\value{ -Return a dataframe with the following columns, sorted descending by bin-normalized dispersion: -\itemize{ -\item \code{names}: Feature name. -\item \code{score}: Bin-normalized dispersion of the feature. -\item \code{highly_variable}: Logical vector of whether the feature is highly variable. -} -} -\description{ -Get the most variable features within a matrix. -} -\details{ -The formula for calculating the most variable features is from the Seurat package (Satjia et al. 2015). - -Calculate using the following process: -\enumerate{ -\item Calculate the dispersion of each feature (variance / mean) -\item Log normalize dispersion and mean -\item Bin the features by their means, and normalize dispersion within each bin -} -} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index b34d528c..c83d52d4 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -132,7 +132,6 @@ reference: - checksum - apply_by_row - regress_out - - select_features_by_binned_dispersion - LSI - IterableMatrix-methods - pseudobulk_matrix From 6c4285b0647a27b7c0b43a5c6fffebfd07a6e872 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 24 Jan 2025 17:46:12 -0800 Subject: [PATCH 22/22] [r] fix binned dispersion naming --- r/NAMESPACE | 2 +- r/R/singlecell_utils.R | 19 ++++++++++++------- r/man/feature_selection.Rd | 7 ++++--- r/tests/testthat/test-singlecell_utils.R | 8 ++++---- 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index 314978df..2f78fe51 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -116,7 +116,7 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) -export(select_features_by_binned_dispersion) +export(select_features_binned_dispersion) 
export(select_features_mean) export(select_features_variance) export(select_regions) diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index e429a257..773c5fc6 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -150,14 +150,14 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread #' and if the number of features #' within a bin is less than 2, the dispersion is set to 1. #' @returns -#' - `select_features_by_binned_dispersion`: Score representing the bin normalized dispersion of each feature. +#' - `select_features_binned_dispersion`: Score representing the bin normalized dispersion of each feature. #' @details -#' `select_features_by_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): +#' `select_features_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satija et al. 2015): #' 1. Calculate the dispersion of each feature (variance / mean) #' 2. Log normalize dispersion and mean #' 3. 
Bin the features by their means, and normalize dispersion within each bin #' @export -select_features_by_binned_dispersion <- function( +select_features_binned_dispersion <- function( mat, num_feats = 25000, n_bins = 20, threads = 1L ) { @@ -289,9 +289,14 @@ LSI <- function( ) { if (rlang::is_missing(mat)) { return( - purrr::partial( - LSI, n_dimensions = n_dimensions, corr_cutoff = corr_cutoff, - normalize = normalize, threads = threads, verbose = verbose + create_partial( + missing_args = list( + n_dimensions = missing(n_dimensions), + corr_cutoff = missing(corr_cutoff), + normalize = missing(normalize), + threads = missing(threads), + verbose = missing(verbose) + ) ) ) } @@ -308,7 +313,7 @@ LSI <- function( if (verbose) log_progress("Normalizing matrix") mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) - if (!is.null(normalize)) mat <- normalize(mat, threads = threads) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) # Save to prevent re-calculation of queued operations mat <- write_matrix_dir( diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index 51756a83..c391c795 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -4,6 +4,7 @@ \alias{select_features_variance} \alias{select_features_dispersion} \alias{select_features_mean} +\alias{select_features_binned_dispersion} \title{Feature selection functions} \usage{ select_features_variance(mat, num_feats = 0.05, normalize = NULL, threads = 1L) @@ -17,7 +18,7 @@ select_features_dispersion( select_features_mean(mat, num_feats = 0.05, normalize = NULL, threads = 1L) -select_features_by_binned_dispersion( +select_features_binned_dispersion( mat, num_feats = 25000, n_bins = 20, @@ -62,7 +63,7 @@ Each different feature selection method will have a different scoring method: } \itemize{ -\item \code{select_features_by_binned_dispersion}: Score 
representing the bin normalized dispersion of each feature. +\item \code{select_features_binned_dispersion}: Score representing the bin normalized dispersion of each feature. } } \description{ @@ -88,7 +89,7 @@ Apply a feature selection method to a \verb{(features x cells)} matrix. \item Find \code{num_feats} features with the highest accessibility. } -\code{select_features_by_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): +\code{select_features_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satija et al. 2015): \enumerate{ \item Calculate the dispersion of each feature (variance / mean) \item Log normalize dispersion and mean diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 441ea851..33effb58 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ -197,8 +197,8 @@ test_that("Pseudobulk aggregation works with multiple return types", { test_that("Feature selection by bin variance works", { mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to Seurat - res_table <- select_features_by_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) - res_table_t <- select_features_by_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_table <- select_features_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) res <- mat[res_feats,] res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) @@ -218,7 +218,7 @@ test_that("LSI works", { lsi_res_obj <- LSI(mat, n_dimensions = 5) lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) # Also check partial args - lsi_res_obj_partial <- LSI(n_dimensions = 5)(mat) + lsi_res_obj_partial <- LSI(n_dimensions = 5, normalize = normalize_tfidf(scale_factor = 10000, threads = 4), threads = 4)(mat) lsi_res <- lsi_res_obj$cell_embeddings lsi_res_t <- lsi_res_t_obj$cell_embeddings # Check that projection results in the same output if used on the same input matrix @@ -228,7 +228,7 @@ test_that("LSI works", { expect_equal(ncol(lsi_res), ncol(mat)) expect_equal(nrow(lsi_res_t), 5) expect_equal(ncol(lsi_res_t), nrow(mat)) - expect_equal(lsi_res_obj, lsi_res_obj_partial) + expect_equal(lsi_res_obj$cell_embeddings, lsi_res_obj_partial$cell_embeddings) expect_equal(lsi_res, lsi_res_proj) })