Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
13c3760
[r] add lsi, var feature selection
immanuelazn Oct 31, 2024
36a8983
[r] add lsi, variable feature selection
immanuelazn Nov 4, 2024
2be2efe
[r] parametrize z_score_norm, create temp option to return more info …
immanuelazn Nov 7, 2024
dccc3a5
[r] add test case for LSI comparing to archr
immanuelazn Nov 7, 2024
183dd40
[r] clean up var gene selection, lsi docstring
immanuelazn Nov 7, 2024
4972f34
[r] add variable gene selection test
immanuelazn Nov 7, 2024
e4d5cb0
[r] provide more colour to scanpy feat selection test
immanuelazn Nov 7, 2024
99470e0
[r] cleanup real data tests
immanuelazn Nov 7, 2024
3bf8914
[r] clean up lsi, var features docstrings
immanuelazn Nov 8, 2024
a7c6179
[r] add in more lsi real data tests
immanuelazn Nov 8, 2024
acf35b2
[r] remove unused variable from `lsi()`
immanuelazn Nov 18, 2024
47256db
[r] add requested changes
immanuelazn Dec 2, 2024
004499a
[r] fix requested changes
immanuelazn Dec 2, 2024
dd80165
[r] fix lsi docstring, idf_ logic
immanuelazn Dec 3, 2024
8891981
[r] replace z-score norm with corr cutoffs
immanuelazn Dec 7, 2024
1e7c6d0
[r] update LSI to use norm, feature selection helpers
immanuelazn Jan 9, 2025
e9c302e
[r] update `NEWS.md`
immanuelazn Jan 10, 2025
7ed6bd7
[r] remove test artifacts
immanuelazn Jan 10, 2025
d67b7db
[r] add logging, partial args
immanuelazn Jan 13, 2025
1ae19b2
Merge branch 'ia/feature-selection' into ia/lsi
immanuelazn Jan 18, 2025
76f4c7d
[r] change check for `pseudobulk_matrix()` to use whole number instea…
immanuelazn Jan 10, 2025
5e3a7fe
[r] update feature selection documentation
immanuelazn Jan 18, 2025
613b0df
Merge branch 'ia/feature-selection' into ia/lsi
immanuelazn Jan 24, 2025
6c4285b
[r] fix binned dispersion naming
immanuelazn Jan 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@

S3method(base::as.data.frame,IterableFragments)
S3method(base::as.matrix,IterableMatrix)
S3method(project,DimReduction)
S3method(project,LSI)
S3method(project,default)
S3method(svds,IterableMatrix)
S3method(svds,default)
export("all_matrix_inputs<-")
export("cellNames<-")
export("chrNames<-")
export(DimReduction)
export(LSI)
export(add_cols)
export(add_rows)
export(all_matrix_inputs)
Expand Down Expand Up @@ -85,6 +90,7 @@ export(plot_tf_footprint)
export(plot_tss_profile)
export(plot_tss_scatter)
export(prefix_cell_names)
export(project)
export(pseudobulk_matrix)
export(qc_scATAC)
export(range_distance_to_nearest)
Expand All @@ -110,6 +116,7 @@ export(rowVars.default)
export(sctransform_pearson)
export(select_cells)
export(select_chromosomes)
export(select_features_binned_dispersion)
export(select_features_mean)
export(select_features_variance)
export(select_regions)
Expand Down
2 changes: 2 additions & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ of the new features this release and will continue to help with maintenance and
- Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128)
- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p (pull request #168)
- Add feature selection functions `select_features_{variance,binned_dispersion,mean}()`, with parameterization for normalization steps, and number of variable features (pull request #169)
- Add `lsi()` function to perform latent semantic indexing on a matrix (pull request #181).
- Add `highly_variable_features()` function to identify highly variable features in a matrix (pull request #181).

## Improvements
- `trackplot_loop()` now accepts discrete color scales
Expand Down
241 changes: 239 additions & 2 deletions r/R/singlecell_utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,241 @@ select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, thread
return(features_df)
}

#' @rdname feature_selection
#' @param n_bins (integer) Number of bins for binning mean gene expression. Normalizing dispersion is done with respect to each bin,
#'   and if the number of features within a bin is less than 2, the dispersion is set to 1.
#' @returns
#' - `select_features_binned_dispersion`: Score representing the bin normalized dispersion of each feature.
#' @details
#' `select_features_binned_dispersion` calculates the bin normalized dispersion of each feature using the following process, given by the Seurat package (Satija et al. 2015):
#' 1. Calculate the dispersion of each feature (variance / mean)
#' 2. Log normalize dispersion and mean
#' 3. Bin the features by their means, and normalize dispersion within each bin
#' @export
select_features_binned_dispersion <- function(
    mat, num_feats = 25000, n_bins = 20,
    threads = 1L
) {
  assert_is(mat, "IterableMatrix")
  assert_greater_than_zero(num_feats)
  assert_is_wholenumber(num_feats)
  assert_len(num_feats, 1)
  assert_is_wholenumber(n_bins)
  assert_len(n_bins, 1)
  assert_greater_than_zero(n_bins)
  # Fast path: if every feature will be kept anyway, skip the dispersion
  # calculation and mark all features as highly variable with a score of 0.
  if (nrow(mat) <= num_feats) {
    log_progress(sprintf("Number of features (%s) is less than or equal to num_feats (%s), returning all features", nrow(mat), num_feats))
    features_df <- tibble::tibble(
      names = rownames(mat),
      score = rep(0, nrow(mat)),
      highly_variable = rep(TRUE, nrow(mat))
    )
    # BUG FIX: previously returned `mat` here instead of the features tibble,
    # so small matrices violated the documented return type.
    return(features_df)
  }
  # Calculate row information for dispersion
  mat_stats <- matrix_stats(mat, row_stats = c("variance"), threads = threads)
  feature_means <- mat_stats$row_stats["mean", ]
  feature_vars <- mat_stats$row_stats["variance", ]
  # Calculate dispersion (variance / mean), then log normalize.
  # Zero dispersion is marked NA so log1p() propagates it, while features with
  # zero mean (division by zero above) are pinned to 0 dispersion.
  feature_dispersion <- feature_vars / feature_means
  feature_dispersion[feature_dispersion == 0] <- NA
  feature_dispersion <- log1p(feature_dispersion)
  feature_dispersion[feature_means == 0] <- 0
  feature_means <- log1p(feature_means)
  features_df <- tibble::tibble(
    names = names(feature_means),
    var = feature_vars,
    mean = feature_means,
    dispersion = feature_dispersion
  )
  # Bin by mean, and z-normalize dispersion within each bin. The top
  # `num_feats` features by normalized dispersion are flagged highly variable.
  features_df <- features_df %>%
    dplyr::mutate(bin = cut(mean, n_bins, labels = FALSE)) %>%
    dplyr::group_by(bin) %>%
    dplyr::mutate(
      score = (dispersion - mean(dispersion)) / sd(dispersion),
      score = if (dplyr::n() == 1) {1} else {score} # Set feats that are in bins with only one feat to have a norm dispersion of 1
    ) %>%
    dplyr::ungroup() %>%
    dplyr::arrange(dplyr::desc(score)) %>%
    dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats) %>%
    dplyr::select(c("names", "score", "highly_variable"))
  return(features_df)
}


#################
# DimReduction Class Definition
#################

#' Barebones definition of a DimReduction class.
#'
#' Represents the latent-space output of a matrix after a transformation
#' function, together with whatever information is needed to reproject other
#' inputs through the same fitted transformation. Child classes should
#' implement a `project` method so that new matrices can be projected using
#' the fitted transformation object.
#' @field cell_embeddings (IterableMatrix, dgCMatrix, matrix) The projected data
#' @field fitted_params (list) A list of parameters used for the transformation of a matrix.
#' @export
DimReduction <- function(x, fitted_params = list(), ...) {
  assert_is(x, c("IterableMatrix", "dgCMatrix", "matrix"))
  assert_is(fitted_params, "list")
  # Any extra named arguments (e.g. feature_names from subclasses) are kept
  # as additional list entries alongside the two standard fields.
  fields <- list(
    cell_embeddings = x,
    fitted_params = fitted_params,
    ...
  )
  structure(fields, class = "DimReduction")
}

#' Perform a dimensionality reduction on a matrix using a pre-fit DimReduction object.
#' @param x DimReduction object.
#' @param mat IterableMatrix object.
#' @param ... Additional arguments passed on to methods.
#' @return IterableMatrix object of the projected data.
#' @details DimReduction subclasses should use this to project new data with the same features into the same latent space.
#'   All required information to run a projection should be held in `x$fitted_params`, including pertinent parameters used when constructing the DimReduction subclass object.
#'   If there are no rownames, methods assume the matrix rows are in the same order as the original matrix (and that it has the same number of features).
#'   If there are rownames, methods reorder the matrix to match the order of the original matrix.
#' @export
project <- function(x, mat, ...) {
  UseMethod("project")
}
#' @export
# Fallback for objects with no project() method: fail loudly rather than
# silently returning the input.
project.default <- function(x, mat, ...) {
  rlang::abort("project method not implemented for BPCells objects.")
}
#' @export
# The base DimReduction class is abstract with respect to projection:
# subclasses (e.g. LSI) must provide their own project() method.
project.DimReduction <- function(x, mat, ...) {
  rlang::abort("project method not implemented for base DimReduction object.")
}

#################
# LSI Implementation
#################


#' Perform latent semantic indexing (LSI) on a matrix.
#'
#' Given a `(features x cells)` matrix, perform LSI: normalize the matrix (TF-IDF by default),
#' compute an SVD-based PCA, and drop PCs highly correlated with sequencing depth, producing a
#' latent space representation of shape `(n_dimensions, ncol(mat))` (fewer rows if PCs are dropped
#' by `corr_cutoff`).
#' @param mat (IterableMatrix) dimensions features x cells.
#' @param n_dimensions (integer) Number of dimensions to keep during PCA.
#' @param corr_cutoff (numeric) Numeric filter for the correlation of a PC to the sequencing depth. If the PC has a correlation greater than or equal to
#'   the corr_cutoff, it will be excluded from the final PCA matrix.
#' @param normalize (function) Normalize matrix using a given function. If `NULL`, no normalization is performed.
#' @param threads (integer) Number of threads to use.
#' @param verbose (logical) Whether to log progress messages while running.
#' @returns An object of class `c("LSI", "DimReduction")` with the following attributes:
#' - `cell_embeddings`: The projected data
#' - `fitted_params`: A list of the parameters used for the LSI fit, with the following entries:
#'   - `normalization_method`: The normalization function used
#'   - `feature_means`: The means of the features used for normalization
#'   - `pcs_to_keep`: The PCs that were kept after filtering by correlation to sequencing depth
#'   - `svd_params`: The SVD result used for projection
#' - `feature_names`: The names of the features in the matrix
#' @details Compute LSI through first applying the normalization function (log(tf-idf) by default), then PCA,
#' then filtering out PCs correlated with sequencing depth. Tf-idf implementation is from Stuart & Butler et al. 2019.
#'
#' Running on a 2600 cell dataset with 50000 peaks and 4 threads, as an example:
#' - 17.1 MB memory usage, 25.1 seconds runtime
#' @seealso `project()` `DimReduction()` `normalize_tfidf()`
#' @export
LSI <- function(
    mat, n_dimensions = 50L, corr_cutoff = 1, normalize = normalize_tfidf,
    threads = 1L, verbose = FALSE
) {
  # Partial-application support: calling LSI() without a matrix returns a
  # partial object recording which arguments were explicitly supplied.
  if (rlang::is_missing(mat)) {
    return(
      create_partial(
        missing_args = list(
          n_dimensions = missing(n_dimensions),
          corr_cutoff = missing(corr_cutoff),
          normalize = missing(normalize),
          threads = missing(threads),
          verbose = missing(verbose)
        )
      )
    )
  }
  assert_is(mat, "IterableMatrix")
  assert_is_wholenumber(n_dimensions)
  assert_len(n_dimensions, 1)
  assert_greater_than_zero(n_dimensions)
  # SVD rank must be strictly smaller than both matrix dimensions
  assert_true(n_dimensions < min(ncol(mat), nrow(mat)))
  assert_true((corr_cutoff >= 0) && (corr_cutoff <= 1))
  assert_is_wholenumber(threads)

  if (verbose) log_progress("Starting LSI")
  # Normalization. Stats are taken BEFORE normalization: col means * nrow gives
  # per-cell read depth, and row means feed the stored fitted_params so that
  # project() can re-normalize new data consistently.
  if (verbose) log_progress("Normalizing matrix")
  mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads)
  read_depth <- mat_stats$col_stats["mean", ] * nrow(mat)
  if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat)

  # Save to prevent re-calculation of queued operations
  mat <- write_matrix_dir(
    convert_matrix_type(mat, type = "float"),
    tempfile("mat"), compress = TRUE
  )
  # Run pca: project the (normalized) matrix onto the left singular vectors
  if (verbose) log_progress("Calculating SVD")
  svd_attr <- svds(mat, k = n_dimensions, threads = threads)
  pca_res <- t(svd_attr$u) %*% mat

  # Filter out PCs that are highly correlated with sequencing depth
  # (corr_cutoff = 1, the default, keeps everything since abs(cor) <= 1)
  pca_corrs <- abs(cor(read_depth, t(pca_res)))
  pca_feats_to_keep <- which(pca_corrs < corr_cutoff)
  if (length(pca_feats_to_keep) != n_dimensions) {
    if (verbose) log_progress(sprintf("Dropping PCs %s due to high correlation with sequencing depth", paste(setdiff(1:n_dimensions, pca_feats_to_keep), collapse = ", ")))
    pca_res <- pca_res[pca_feats_to_keep, ]
  }
  # Everything project.LSI needs to reproject new data lives here
  fitted_params <- list(
    normalization_method = normalize,
    feature_means = mat_stats$row_stats["mean", ],
    pcs_to_keep = pca_feats_to_keep,
    svd_params = svd_attr
  )
  res <- DimReduction(
    x = pca_res,
    fitted_params = fitted_params,
    feature_names = rownames(mat)
  )
  class(res) <- c("LSI", class(res))
  return(res)
}


#' @export
Copy link
Copy Markdown
Collaborator Author

@immanuelazn immanuelazn Jan 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should I put a docstring for this or is this intuitive enough for S3 dispatch?

# Project a new (features x cells) matrix into the latent space of a fitted
# LSI object. The matrix is re-normalized with the stored normalization
# function and feature means, multiplied by the stored left singular vectors,
# and filtered down to the PCs kept at fit time. Returns the projected
# (dimensions x cells) matrix.
project.LSI <- function(x, mat, threads = 1L, ...) {
  assert_is(mat, "IterableMatrix")
  assert_is(x, "LSI")

  fitted_params <- x$fitted_params
  # Do a check to make sure that the number of rows in the matrix is the same as the number of rows in SVD$u
  assert_true(nrow(mat) == nrow(fitted_params$svd_params$u))
  # If both sides carry feature names, reorder rows to match the fit;
  # otherwise assume rows are already in the original order.
  if (!is.null(rownames(mat)) && !is.null(x$feature_names)) {
    assert_true(all(x$feature_names %in% rownames(mat)))
    mat <- mat[x$feature_names, ]
  }

  if (!is.null(fitted_params$normalization_method)) {
    # Re-use the feature means from the fit so normalization matches training.
    mat <- fitted_params$normalization_method(
      mat,
      feature_means = fitted_params$feature_means,
      threads = threads
    )
    # Materialize to disk to avoid re-running the queued normalization ops
    mat <- write_matrix_dir(
      convert_matrix_type(mat, type = "float"),
      tempfile("mat"), compress = TRUE
    )
  }
  pca_attr <- fitted_params$svd_params
  res <- t(pca_attr$u) %*% mat
  # Drop the same depth-correlated PCs that were dropped during fitting
  if (length(fitted_params$pcs_to_keep) != nrow(res)) {
    res <- res[fitted_params$pcs_to_keep, ]
  }
  return(res)
}


#' Test for marker features
#'
Expand Down Expand Up @@ -210,6 +445,8 @@ marker_features <- function(mat, groups, method="wilcoxon") {
)
}



#' Aggregate counts matrices by cell group or feature.
#'
#' Given a `(features x cells)` matrix, group cells by `cell_groups` and aggregate counts by `method` for each
Expand All @@ -228,7 +465,7 @@ marker_features <- function(mat, groups, method="wilcoxon") {
#' extra calculation time, and when calculating `mean`, adding `nonzeros` will take no extra time.
#' @inheritParams marker_features
#' @export
pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) {
pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 1L) {
assert_is(mat, "IterableMatrix")
assert_is(cell_groups, c("factor", "character", "numeric"))
assert_true(length(cell_groups) == ncol(mat))
Expand All @@ -240,7 +477,7 @@ pseudobulk_matrix <- function(mat, cell_groups, method = "sum", threads = 0L) {
rlang::abort(sprintf("method must be one of: %s", paste(methods, collapse = ", ")))
}
}
assert_is(threads, "integer")
assert_is_wholenumber(threads)

it <- mat %>%
convert_matrix_type("double") %>%
Expand Down
21 changes: 21 additions & 0 deletions r/man/DimReduction.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions r/man/feature_selection.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading