diff --git a/r/NAMESPACE b/r/NAMESPACE index e9998384..2f78fe51 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -2,11 +2,16 @@ S3method(base::as.data.frame,IterableFragments) S3method(base::as.matrix,IterableMatrix) +S3method(project,DimReduction) +S3method(project,LSI) +S3method(project,default) S3method(svds,IterableMatrix) S3method(svds,default) export("all_matrix_inputs<-") export("cellNames<-") export("chrNames<-") +export(DimReduction) +export(LSI) export(add_cols) export(add_rows) export(all_matrix_inputs) @@ -85,6 +90,7 @@ export(plot_tf_footprint) export(plot_tss_profile) export(plot_tss_scatter) export(prefix_cell_names) +export(project) export(pseudobulk_matrix) export(qc_scATAC) export(range_distance_to_nearest) @@ -110,6 +116,7 @@ export(rowVars.default) export(sctransform_pearson) export(select_cells) export(select_chromosomes) +export(select_features_binned_dispersion) export(select_features_mean) export(select_features_variance) export(select_regions) diff --git a/r/NEWS.md b/r/NEWS.md index dc220204..deefee46 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -42,6 +42,8 @@ of the new features this release and will continue to help with maintenance and - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) - Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) - Add feature selection functions `select_features_by_{variance,dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169) +- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #181). +- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #181). 
## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/singlecell_utils.R b/r/R/singlecell_utils.R index 37f61d98..773c5fc6 100644 --- a/r/R/singlecell_utils.R +++ b/r/R/singlecell_utils.R @@ -145,6 +145,241 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread return(features_df) } +#' @rdname feature_selection +#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +#' and if the number of features +#' within a bin is less than 2, the dispersion is set to 1. +#' @returns +#' - `select_features_binned_dispersion`: Score representing the bin normalized dispersion of each feature. +#' @details +#' `select_features_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satija et al. 2015): +#' 1. Calculate the dispersion of each feature (variance / mean) +#' 2. Log normalize dispersion and mean +#' 3. 
Bin the features by their means, and normalize dispersion within each bin +#' @export +select_features_binned_dispersion <- function( + mat, num_feats = 25000, n_bins = 20, + threads = 1L +) { + assert_is(mat, "IterableMatrix") + assert_greater_than_zero(num_feats) + assert_is_wholenumber(num_feats) + assert_len(num_feats, 1) + assert_is_wholenumber(n_bins) + assert_len(n_bins, 1) + assert_greater_than_zero(n_bins) + if (nrow(mat) <= num_feats) { + log_progress(sprintf("Number of features (%s) is less than num_feats (%s), returning all features", nrow(mat), num_feats)) + features_df <- tibble::tibble( + names = rownames(mat), + score = rep(0, nrow(mat)), + highly_variable = rep(TRUE, nrow(mat)) + ) + return(features_df) + } + # Calculate row information for dispersion + mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads) + feature_means <- mat_stats$row_stats["mean", ] + feature_vars <- mat_stats$row_stats["variance", ] + # Calculate dispersion, and log normalize + feature_dispersion <- feature_vars / feature_means + feature_dispersion[feature_dispersion == 0] <- NA + feature_dispersion <- log1p(feature_dispersion) + feature_dispersion[feature_means == 0] <- 0 + feature_means <- log1p(feature_means) + features_df <- tibble::tibble( + names = names(feature_means), + var = feature_vars, + mean = feature_means, + dispersion = feature_dispersion + ) + # Bin by mean, and normalize dispersion with each bin + features_df <- features_df %>% + dplyr::mutate(bin = cut(mean, n_bins, labels=FALSE)) %>% + dplyr::group_by(bin) %>% + dplyr::mutate( + score = (dispersion - mean(dispersion)) / sd(dispersion), + score = if (dplyr::n() == 1) {1} else {score} # Set feats that are in bins with only one feat to have a norm dispersion of 1 + ) %>% + dplyr::ungroup() %>% + dplyr::arrange(desc(score)) %>% + dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) %>% + dplyr::select(c("names", "score", "highly_variable")) + return(features_df) +} + + 
+################# +# DimReduction Class Definition +################# + +#' Barebones definition of a DimReduction class. +#' +#' Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +#' Child classes should implement a `project` method to allow for the projection of other matrices using +#' the fitted transformation object. +#' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) The projected data +#' @field fitted_params (list) A list of parameters used for the transformation of a matrix. +#' @export +DimReduction <- function(x, fitted_params = list(), ...) { + assert_is(x, c("IterableMatrix", "dgCMatrix", "matrix")) + assert_is(fitted_params, "list") + structure( + list( + cell_embeddings = x, + fitted_params = fitted_params, + ... + ), + class = "DimReduction" + ) +} + +#' Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +#' @param x DimReduction object. +#' @param mat IterableMatrix object. +#' @return IterableMatrix object of the projected data. +#' @details DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. +#' All required information to run a projection should be held in x$fitted_params, including pertinent parameters when construction the DimReduction subclass object. +#' If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. +#' If there are rownames, reorder the matrix to match the order of the original matrix +#' @export +project <- function(x, mat, ...) { + UseMethod("project") +} +#' @export +project.default <- function(x, mat, ...) { + rlang::abort("project method not implemented for BPCells objects.") +} +#' @export +project.DimReduction <- function(x, mat, ...) 
{ + rlang::abort("project method not implemented for base DimReduction object.") +} + +################# +# LSI Implementation +################# + + +#' Perform latent semantic indexing (LSI) on a matrix. +#' +#' Given a `(features x cells)` matrix, perform LSI using tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape `(n_dimensions, ncol(mat))`. +#' @param mat (IterableMatrix) dimensions features x cells. +#' @param n_dimensions (integer) Number of dimensions to keep during PCA. +#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is greater than or equal to +#' the corr_cutoff, it will be excluded from the final PCA matrix. +#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed. +#' @param threads (integer) Number of threads to use. +#' @returns An object of class `c("LSI", "DimReduction")` with the following attributes: +#' - `cell_embeddings`: The projected data +#' - `fitted_params`: A list of the parameters used for LSI. Elements include the following: +#' - `normalization_method`: The normalization method used +#' - `feature_means`: The means of the features used for normalization +#' - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth +#' - `svd_params`: The SVD result object used for projection +#' - `feature_names`: The names of the features in the matrix +#' @details Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. 
+#' +#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +#' - 17.1 MB memory usage, 25.1 seconds runtime +#' @seealso `project()` `DimReduction()` `normalize_tfidf()` +#' @export +LSI <- function( + mat, n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf, + threads = 1L, verbose = FALSE +) { + if (rlang::is_missing(mat)) { + return( + create_partial( + missing_args = list( + n_dimensions = missing(n_dimensions), + corr_cutoff = missing(corr_cutoff), + normalize = missing(normalize), + threads = missing(threads), + verbose = missing(verbose) + ) + ) + ) + } + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(n_dimensions) + assert_len(n_dimensions, 1) + assert_greater_than_zero(n_dimensions) + assert_true(n_dimensions < min(ncol(mat), nrow(mat))) + assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1)) + assert_is_wholenumber(threads) + + if (verbose) log_progress("Starting LSI") + # Normalization + if (verbose) log_progress("Normalizing matrix") + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads) + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat) + + # Save to prevent re-calculation of queued operations + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + tempfile("mat"), compress = TRUE + ) + # Run pca + if (verbose) log_progress("Calculating SVD") + svd_attr <- svds(mat, k = n_dimensions, threads = threads) + pca_res <- t(svd_attr$u) %*% mat + + # Filter out PCs that are highly correlated with sequencing depth + pca_corrs <- abs(cor(read_depth, t(pca_res))) + pca_feats_to_keep <- which(pca_corrs < corr_cutoff) + if (length(pca_feats_to_keep) != n_dimensions) { + if (verbose) log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", "))) + pca_res <- 
pca_res[pca_feats_to_keep, ] + } + fitted_params <- list( + normalization_method = normalize, + feature_means = mat_stats$row_stats["mean", ], + pcs_to_keep = pca_feats_to_keep, + svd_params = svd_attr + ) + res <- DimReduction( + x = pca_res, + fitted_params = fitted_params, + feature_names = rownames(mat) + ) + class(res) <- c("LSI", class(res)) + return(res) +} + + +#' @export +project.LSI <- function(x, mat, threads = 1L, ...) { + assert_is(mat, "IterableMatrix") + assert_is(x, "LSI") + + fitted_params <- x$fitted_params + # Do a check to make sure that the number of rows in the matrix is the same as the number of rows in SVD$u + assert_true(nrow(mat) == nrow(fitted_params$svd_params$u)) + if (!is.null(rownames(mat)) && !is.null(x$feature_names)) { + assert_true(all(x$feature_names %in% rownames(mat))) + mat <- mat[x$feature_names, ] + } + + if (!is.null(fitted_params$normalization_method)) { + mat <- fitted_params$normalization_method( + mat, + feature_means = fitted_params$feature_means, + threads = threads + ) + mat <- write_matrix_dir( + convert_matrix_type(mat, type = "float"), + tempfile("mat"), compress = TRUE + ) + } + pca_attr <- fitted_params$svd_params + res <- t(pca_attr$u) %*% mat + if (length(fitted_params$pcs_to_keep) != nrow(res)) { + res <- res[fitted_params$pcs_to_keep, ] + } + return(res) +} + #' Test for marker features #' @@ -210,6 +445,8 @@ marker_features <- function(mat, groups, method="wilcoxon") { ) } + + #' Aggregate counts matrices by cell group or feature. #' #' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each @@ -228,7 +465,7 @@ marker_features <- function(mat, groups, method="wilcoxon") { #' extra calculation time, and when calculating `mean`, adding `nonzeros` will take no extra time. 
#' @inheritParams marker_features #' @export -pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { +pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) { assert_is(mat, "IterableMatrix") assert_is(cell_groups, c("factor", "character", "numeric")) assert_true(length(cell_groups) == ncol(mat)) @@ -240,7 +477,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) { rlang::abort(sprintf("method must be one of: %s", paste(methods, collapse = ", "))) } } - assert_is(threads, "integer") + assert_is_wholenumber(threads) it <- mat %>% convert_matrix_type("double") %>% diff --git a/r/man/DimReduction.Rd b/r/man/DimReduction.Rd new file mode 100644 index 00000000..2bb7486c --- /dev/null +++ b/r/man/DimReduction.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{DimReduction} +\alias{DimReduction} +\title{Barebones definition of a DimReduction class.} +\usage{ +DimReduction(x, fitted_params = list(), ...) +} +\description{ +Represents a latent space output of a matrix after a transformation function, with any required information to reproject other inputs using this object. +Child classes should implement a \code{project} method to allow for the projection of other matrices using +the fitted transformation object. 
+} +\section{Fields}{ + +\describe{ +\item{\code{cell_embeddings}}{(IterableMatrix, dgCMatrix, matrix) The projected data} + +\item{\code{fitted_params}}{(list) A list of parameters used for the transformation of a matrix.} +}} + diff --git a/r/man/feature_selection.Rd b/r/man/feature_selection.Rd index f83e2199..c391c795 100644 --- a/r/man/feature_selection.Rd +++ b/r/man/feature_selection.Rd @@ -4,6 +4,7 @@ \alias{select_features_variance} \alias{select_features_dispersion} \alias{select_features_mean} +\alias{select_features_binned_dispersion} \title{Feature selection functions} \usage{ select_features_variance(mat, num_feats = 0.05, normalize = NULL, threads = 1L) @@ -16,6 +17,13 @@ select_features_dispersion( ) select_features_mean(mat, num_feats = 0.05, normalize = NULL, threads = 1L) + +select_features_binned_dispersion( + mat, + num_feats = 25000, + n_bins = 20, + threads = 1L +) } \arguments{ \item{mat}{(IterableMatrix) dimensions features x cells} @@ -28,6 +36,10 @@ all features will be returned.} \item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} \item{threads}{(integer) Number of threads to use.} + +\item{n_bins}{(integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin, +and if the number of features +within a bin is less than 2, the dispersion is set to 1.} } \value{ Return a dataframe with the following columns, sorted descending by score: @@ -49,6 +61,10 @@ Each different feature selection method will have a different scoring method: \itemize{ \item \code{select_features_by_mean}: Score representing the mean accessibility of each feature. } + +\itemize{ +\item \code{select_features_binned_dispersion}: Score representing the bin normalized dispersion of each feature. +} } \description{ Apply a feature selection method to a \verb{(features x cells)} matrix. 
@@ -72,4 +88,11 @@ Apply a feature selection method to a \verb{(features x cells)} matrix. \item Get the sum of each binarized feature. \item Find \code{num_feats} features with the highest accessibility. } + +\code{select_features_binned_dispersion} calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satjia et al. 2015): +\enumerate{ +\item Calculate the dispersion of each feature (variance / mean) +\item Log normalize dispersion and mean +\item Bin the features by their means, and normalize dispersion within each bin +} } diff --git a/r/man/lsi.Rd b/r/man/lsi.Rd new file mode 100644 index 00000000..dff5c88e --- /dev/null +++ b/r/man/lsi.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{LSI} +\alias{LSI} +\title{Perform latent semantic indexing (LSI) on a matrix.} +\usage{ +LSI( + mat, + n_dimensions = 50L, + corr_cutoff = 1, + normalize = normalize_tfidf, + threads = 1L, + verbose = FALSE +) +} +\arguments{ +\item{mat}{(IterableMatrix) dimensions features x cells.} + +\item{n_dimensions}{(integer) Number of dimensions to keep during PCA.} + +\item{corr_cutoff}{(numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation that is great or equal to +the corr_cutoff, it will be excluded from the final PCA matrix.} + +\item{normalize}{(function) Normalize matrix using a given function. If \code{NULL}, no normalization is performed.} + +\item{threads}{(integer) Number of threads to use.} +} +\value{ +An object of class \code{c("LSI", "DimReduction")} with the following attributes: +\itemize{ +\item \code{cell_embeddings}: The projected data +\item \code{fitted_params}: A tibble of the parameters used for iterative LSI, with rows as iterations. 
Columns include the following: +\itemize{ +\item \code{normalization_method}: The normalization method used +\item \code{feature_means}: The means of the features used for normalization +\item \code{pcs_to_keep}: The PCs that were kept after filtering by correlation to sequencing depth +\item \code{svd_params}: The matrix calculated for SVD +} +\item \code{feature_names}: The names of the features in the matrix +} +} +\description{ +Given a \verb{(features x cells)} matrix, perform LSI to perform tf-idf, z-score normalization, and PCA to create a latent space representation of the matrix of shape \verb{(n_dimensions, ncol(mat))}. +} +\details{ +Compute LSI through first doing a log(tf-idf) transform, z-score normalization, then PCA. Tf-idf implementation is from Stuart & Butler et al. 2019. + +Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example: +\itemize{ +\item 17.1 MB memory usage, 25.1 seconds runtime +} +} +\seealso{ +\code{project()} \code{DimReduction()} \code{normalize_tfidf()} +} diff --git a/r/man/project.Rd b/r/man/project.Rd new file mode 100644 index 00000000..dcab31b5 --- /dev/null +++ b/r/man/project.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/singlecell_utils.R +\name{project} +\alias{project} +\title{Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object.} +\usage{ +project(x, mat, ...) +} +\arguments{ +\item{x}{DimReduction object.} + +\item{mat}{IterableMatrix object.} +} +\value{ +IterableMatrix object of the projected data. +} +\description{ +Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object. +} +\details{ +DimReduction subclasses should use this to project new data with the same features, to project into the same latent space. +All required information to run a projection should be held in x$fitted_params, including pertinent parameters when construction the DimReduction subclass object. 
+If there are no rownames, assume that the matrix is in the same order as the original matrix, assuming that they have the same number of features. +If there are rownames, reorder the matrix to match the order of the original matrix +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 0d98b13d..7a5a3d13 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -132,6 +132,7 @@ reference: - checksum - apply_by_row - regress_out + - LSI - IterableMatrix-methods - pseudobulk_matrix @@ -147,6 +148,11 @@ reference: - knn_to_graph - cluster_membership_matrix +- title: "Dimensionality Reductions" +- contents: + - DimReduction + - project + - title: "Plots" diff --git a/r/tests/real_data/ArchR_LSI.R b/r/tests/real_data/ArchR_LSI.R new file mode 100644 index 00000000..de775f4f --- /dev/null +++ b/r/tests/real_data/ArchR_LSI.R @@ -0,0 +1,69 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +library("BPCells") +library("ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} + +#' Perform a dimensionality reduction with tf-idf and SVD (LSI) on a matrix on ArchR and BPCells. +#' As LSI uses an iterative approach on ArchR, we compare by using a single-iteration private function on ArchR. +#' As the SVD implementation is not necessarily the same between the two packages, we take the SVD matrix +#' from both functions and compare the matrix multiplication of the U and SVD matrices, which should give an approximation +#' we can compare between the two packages. +#' @param proj An archr project. 
+test_lsi_similarity_to_archr <- function(dir = NULL) { + dir <- create_temp_dir(dir) + setwd(dir) + # add iterative lsi for dim reduction + proj <- getTestProject() + proj <- addPeakMatrix(proj) + # Get the peak matrix + test_mat <- assay(getMatrixFromProject(proj, useMatrix = "PeakMatrix")) + # Calculate LSI on ArchR + # running LSI without binarizing, as we don't do this in the BPCells implementation + # we also don't filter quantile outliers. + lsi_archr <- ArchR:::.computeLSI( + mat = test_mat, + LSIMethod = 2, + nDimensions = 20, + binarize = FALSE, + outlierQuantiles = NULL + ) + svd_archr <- lsi_archr$svd + lsi_mat_archr <- t(lsi_archr$matSVD) + # set rownames to NA, as we don't have rownames in the BPCells implementation + rownames(lsi_mat_archr) <- NULL + # PCA Matrix = T(u) * Pre-SVD Matrix + # u * PCA Matrix = u * T(u) * Pre-SVD Matrix + # u * PCA Matrix = Pre-SVD Matrix + pre_svd_mat_approx_archr <- lsi_archr$svd$u %*% lsi_mat_archr + # Calculate LSI on BPCells + # Do not use z-score normalization, as this isn't done with ArchR + lsi_bpcells <- LSI( + test_mat %>% as("dgCMatrix") %>% as("IterableMatrix"), + n_dimensions = 20 + ) + pre_svd_mat_approx_bpcells <- lsi_bpcells$fitted_params$svd_params$u %*% lsi_bpcells$cell_embeddings + testthat::expect_true(all.equal(pre_svd_mat_approx_archr, pre_svd_mat_approx_bpcells, tolerance = 1e-4)) + # convert signs + lsi_mat_archr <- sweep(lsi_mat_archr, MARGIN = 1, (2 * (lsi_mat_archr[,1] * lsi_bpcells$cell_embeddings[,1] > 0) - 1), `*`) + # Check for post-pca matrix similarity + testthat::expect_true(all.equal(lsi_mat_archr, lsi_bpcells$cell_embeddings, tolerance = 1e-4)) + # also check for correlation between the two matrices in PC space + testthat::expect_true(cor(as.vector(lsi_mat_archr), as.vector(lsi_bpcells$cell_embeddings)) > 0.999) +} +test_lsi_similarity_to_archr() \ No newline at end of file diff --git a/r/tests/real_data/ArchR_insertions.R b/r/tests/real_data/ArchR_insertions.R index 150939d4..8818d7e7 
100644 --- a/r/tests/real_data/ArchR_insertions.R +++ b/r/tests/real_data/ArchR_insertions.R @@ -1,5 +1,23 @@ -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/ArchR/") -devtools::load_all("/mnt/c/users/Immanuel/PycharmProjects/BPCells/r/") +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/BPCells/r") +devtools::load_all("/mnt/c/Users/Immanuel/PycharmProjects/ArchR") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} fix_granges_syntax_for_archr <- function(gr) { mcols(gr)$RG <- gsub("PBSmall#", "", mcols(gr)$RG) diff --git a/r/tests/real_data/Scanpy_variable_feat_selection.py b/r/tests/real_data/Scanpy_variable_feat_selection.py new file mode 100644 index 00000000..6ae28c1b --- /dev/null +++ b/r/tests/real_data/Scanpy_variable_feat_selection.py @@ -0,0 +1,46 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +import sys, tempfile, os +import numpy as np +import pandas as pd +import scanpy as sc + + +def call_highly_var_genes_single_batch(temp_dir: str) -> None: + """ + Call highly_variable genes on a single batch of PBMC68k data using their interpreation of + the Seurat implementation. + Write the input anndata object csv at `/highly_var_genes_scanpy_input.csv` + Write the output as a csv, at `/highly_var_genes_scanpy_output.csv` + + Args: + temp_dir (str): Path to the temporary directory to write the input and output files. 
+ """ + # Dataset is only (765, 700) + adata = sc.datasets.pbmc68k_reduced() + adata.var_names_make_unique() + res = sc.pp._highly_variable_genes.highly_variable_genes(adata, + n_top_genes = 50, + n_bins = 20, + flavor = "seurat", + inplace = False, + check_values = False).drop(columns = 'mean_bin') + # remove mean_bin + adata.to_df().to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_input.csv")) + res.to_csv(os.path.join(temp_dir, "highly_var_genes_scanpy_output.csv")) + + +if __name__ == "__main__": + # We use the first argument as the temporary directory + if len(sys.argv) < 2: + # If no argument is provided, use the current directory + call_highly_var_genes_single_batch(".") + else: + call_highly_var_genes_single_batch(sys.argv[1]) + \ No newline at end of file diff --git a/r/tests/real_data/scanpy_variable_feat_selection.R b/r/tests/real_data/scanpy_variable_feat_selection.R new file mode 100644 index 00000000..d2eb08b2 --- /dev/null +++ b/r/tests/real_data/scanpy_variable_feat_selection.R @@ -0,0 +1,58 @@ +# Copyright 2024 BPCells contributors +# +# Licensed under the Apache License, Version 2.0 or the MIT license +# , at your +# option. This file may not be copied, modified, or distributed +# except according to those terms. + +library("BPCells") + +# Set up temp dir in case it's not already set +create_temp_dir <- function(dir = NULL) { + if (is.null(dir)) { + dir <- file.path(tempdir(), "lsi_test") + if (dir.exists(dir)) unlink(dir, recursive = TRUE) + dir.create(dir) + } + return(dir) +} + +# Compare the feature selection output of BPCells to that of Scanpy. +# Scanpy technically utilizes the Seurat (Satija et al. 2015) method for feature selection, so we should expect similar results of either pkg. +# This function calls a python script that runs Scanpy feature selection on a test dataset, and writes both input/output to `dir`. 
+# It then reads in the input/output from the python script, calls the BPCells feature selection function, and compares the output to the Scanpy output. +compare_feat_selection_to_scanpy <- function(dir = NULL) { + dir <- create_temp_dir(dir) + + # Call python script + system2("python3", c("Scanpy_variable_feat_selection.py", dir)) + + # read in input csv + input_mat_scanpy <- t(read.csv(file.path(dir, "highly_var_genes_scanpy_input.csv"), row.names = 1)) + output_mat_scanpy <- read.csv(file.path(dir, "highly_var_genes_scanpy_output.csv"), row.names = 1) + # filter output mat to only where "highly_variable" is true + output_mat_scanpy$highly_variable <- as.logical(output_mat_scanpy$highly_variable) + output_mat_scanpy <- output_mat_scanpy[output_mat_scanpy$highly_variable,] %>% + dplyr::arrange(desc(dispersions_norm)) %>% + dplyr::select(-highly_variable) %>% # convert rownames to a column + tibble::rownames_to_column("name") %>% + dplyr::as_tibble() + + # Scanpy undoes a log1p transformation on the input matrix, so we do the same here + input_mat_bpcells <- expm1(input_mat_scanpy) + + output_bpcells <- highly_variable_features( + input_mat_bpcells %>% as("dgCMatrix") %>% as("IterableMatrix"), + num_feats = 50, + n_bins = 20, + save_feat_selection = TRUE + ) + output_mat_bpcells <- output_bpcells$feature_selection + expect_true(all.equal(output_mat_bpcells$name, output_mat_scanpy$name)) + expect_true(all.equal(output_mat_bpcells$mean, output_mat_scanpy$means, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$dispersion, output_mat_scanpy$dispersions, tolerance = 1e-6)) + expect_true(all.equal(output_mat_bpcells$feature_dispersion_norm, output_mat_scanpy$dispersions_norm, tolerance = 1e-6)) +} + +compare_feat_selection_to_scanpy() diff --git a/r/tests/testthat/test-singlecell_utils.R b/r/tests/testthat/test-singlecell_utils.R index 08e41b7c..33effb58 100644 --- a/r/tests/testthat/test-singlecell_utils.R +++ b/r/tests/testthat/test-singlecell_utils.R @@ 
-13,6 +13,10 @@ generate_sparse_matrix <- function(nrow, ncol, fraction_nonzero = 0.5, max_val = as(m, "dgCMatrix") } +generate_dense_matrix <- function(nrow, ncol) { + m <- matrix(runif(nrow * ncol), nrow = nrow) +} + test_that("select_features works general case", { m1 <- generate_sparse_matrix(100, 50) %>% as("IterableMatrix") for (fn in c("select_features_variance", "select_features_dispersion", "select_features_mean")) { @@ -37,6 +41,7 @@ test_that("select_features works general case", { } }) + test_that("Wilcoxon rank sum works (small)", { x <- c(0.80, 0.83, 1.89, 1.04, 1.45, 1.38, 1.91, 1.64, 0.73, 1.46) y <- c(1.15, 0.88, 0.90, 0.74, 1.21) @@ -187,3 +192,43 @@ test_that("Pseudobulk aggregation works with multiple return types", { } }) + + +test_that("Feature selection by bin variance works", { + mat <- generate_sparse_matrix(500, 26, fraction_nonzero = 0.1) %>% as("IterableMatrix") + # Test only that outputs are reasonable. There is a full comparison in `tests/real_data/` that compares implementation to Seurat + res_table <- select_features_binned_dispersion(mat, num_feats = 10, n_bins = 5, threads = 1) + res_table_t <- select_features_binned_dispersion(t(mat), num_feats = 10, n_bins = 5, threads = 1) + res_feats <- res_table %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res <- mat[res_feats,] + res_feats_t <- res_table_t %>% dplyr::filter(highly_variable) %>% dplyr::pull(names) + res_t <- t(mat[,res_feats_t]) + + expect_equal(nrow(res), 10) + expect_equal(ncol(res), 26) + expect_equal(nrow(res_t), 10) + expect_equal(ncol(res_t), 500) +}) + +test_that("LSI works", { + mat <- matrix(runif(240), nrow=10) %>% as("dgCMatrix") %>% as("IterableMatrix") + rownames(mat) <- paste0("feat", seq_len(nrow(mat))) + colnames(mat) <- paste0("cell", seq_len(ncol(mat))) + # Test only that outputs are reasonable. 
There is a full comparison in `tests/real_data/` that compares implementation to ArchR + lsi_res_obj <- LSI(mat, n_dimensions = 5) + lsi_res_t_obj <- LSI(t(mat), n_dimensions = 5) + # Also check partial args + lsi_res_obj_partial <- LSI(n_dimensions = 5, normalize = normalize_tfidf(scale_factor = 10000, threads = 4), threads = 4)(mat) + lsi_res <- lsi_res_obj$cell_embeddings + lsi_res_t <- lsi_res_t_obj$cell_embeddings + # Check that projection results in the same output if used on the same input matrix + lsi_res_proj <- project(lsi_res_obj, mat) + + expect_equal(nrow(lsi_res), 5) + expect_equal(ncol(lsi_res), ncol(mat)) + expect_equal(nrow(lsi_res_t), 5) + expect_equal(ncol(lsi_res_t), nrow(mat)) + expect_equal(lsi_res_obj$cell_embeddings, lsi_res_obj_partial$cell_embeddings) + expect_equal(lsi_res, lsi_res_proj) +}) +