From f5e76c830d037a4227de509115ec893be276f9c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A1=D1=82=D0=B5=D1=84=D0=B0=D0=BD=20=D0=91=D0=B8=D0=B4?= =?UTF-8?q?=D0=B6=D0=B0=D0=BC=D0=BE=D0=B2?= Date: Wed, 17 Jun 2026 23:25:48 +0300 Subject: [PATCH 1/2] perf-backed-sparse-integer-indexing --- benchmarks/benchmarks/sparse_dataset.py | 100 +++++++- src/anndata/_core/sparse_dataset.py | 64 ++++- tests/test_backed_sparse.py | 314 +++++++++++++++++++++--- 3 files changed, 431 insertions(+), 47 deletions(-) diff --git a/benchmarks/benchmarks/sparse_dataset.py b/benchmarks/benchmarks/sparse_dataset.py index da8a1b4d6..dde9c89d5 100644 --- a/benchmarks/benchmarks/sparse_dataset.py +++ b/benchmarks/benchmarks/sparse_dataset.py @@ -3,6 +3,7 @@ from types import MappingProxyType from typing import TYPE_CHECKING +import h5py import numpy as np import pandas as pd import zarr @@ -25,6 +26,41 @@ def make_alternating_mask(n): return mask_alternating +def make_integer_indexers() -> MappingProxyType: + rng = np.random.default_rng(42) + fragmented_sample = rng.choice(10_000, size=2_048, replace=False) + return MappingProxyType({ + "single_run": np.arange(2_048), + "multiple_runs": np.concatenate([ + np.arange(0, 512), + np.arange(1_500, 2_012), + np.arange(3_000, 3_512), + np.arange(7_000, 7_512), + ]), + "fragmented_sorted": np.sort(fragmented_sample), + "fragmented_unsorted": rng.permutation(fragmented_sample), + "clustered_shuffled": rng.permutation( + np.concatenate([ + np.arange(100, 612), + np.arange(1_500, 2_012), + np.arange(3_000, 3_512), + np.arange(7_000, 7_512), + ]) + ), + "clustered_duplicates": rng.permutation( + np.concatenate([ + np.repeat(np.arange(100, 356), 2), + np.repeat(np.arange(1_500, 1_756), 2), + np.repeat(np.arange(3_000, 3_256), 2), + np.repeat(np.arange(7_000, 7_256), 2), + ]) + ), + }) + + +INT_INDEXERS = make_integer_indexers() + + class SparseCSRContiguousSlice: _indexers = MappingProxyType({ "0:1000": slice(0, 1000), @@ -54,7 +90,7 @@ def setup_cache(self): format="csr", random_state=np.random.default_rng(42), ) - g = zarr.group(self.filepath) + g = zarr.open(self.filepath, mode="w") write_elem(g, "X", X) def setup(self, index: str, use_dask: bool): # noqa: FBT001 @@ -84,6 +120,64 @@ def peakmem_getitem_adata(self, *_): res.compute() +class SparseBackedIntegerIndexing: + filepath = "data" + params = ( + ("h5ad", "zarr"), + ("csr", "csc"), + list(INT_INDEXERS.keys()), + ) + param_names = ("store_type", "sparse_format", "index_case") + + def setup_cache(self): + rng = np.random.default_rng(42) + csr = sparse.random( + 10_000, + 10_000, + density=0.01, + format="csr", + random_state=rng, + ) + csc = csr.tocsc() + for store_type in ["h5ad", "zarr"]: + path = f"{self.filepath}.{store_type}" + if store_type == "h5ad": + with h5py.File(path, mode="w") as f: + write_elem(f, "csr", csr) + write_elem(f, "csc", csc) + else: + g = zarr.open(path, mode="w") + write_elem(g, "csr", csr) + write_elem(g, "csc", csc) + + def setup(self, store_type: str, sparse_format: str, index_case: str): + self._h5_file = None + if store_type == "h5ad": + self._h5_file = h5py.File(f"{self.filepath}.h5ad", mode="r") + self.group = self._h5_file + else: + self.group = zarr.open(f"{self.filepath}.zarr", mode="r") + self.x = sparse_dataset(self.group[sparse_format]) + self.index = INT_INDEXERS[index_case] + self.is_csr = sparse_format == "csr" + + def teardown(self, *_): + if self._h5_file is not None: + self._h5_file.close() + + def time_getitem(self, *_): + if self.is_csr: + self.x[self.index, :] + else: + self.x[:, self.index] + + def peakmem_getitem(self, *_): + if self.is_csr: + self.x[self.index, :] + else: + self.x[:, self.index] + + class SparseCSRDaskConcat: filepath = "data.zarr" @@ -98,7 +192,7 @@ def setup_cache(self): format="csr", random_state=np.random.default_rng(42), ) - g = zarr.group(self.filepath) + g = zarr.open(self.filepath, mode="w") write_elem(g, "X", X) def setup(self, *_): @@ -146,7 +240,7 @@ def setup_cache(self): format="csr", random_state=np.random.default_rng(42), ) - g = zarr.group(self.filepath) + g = zarr.open(self.filepath, mode="w") write_elem(g, "X", X) def setup(self, *_): diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py index d5b481d72..486082c3e 100644 --- a/src/anndata/_core/sparse_dataset.py +++ b/src/anndata/_core/sparse_dataset.py @@ -95,6 +95,18 @@ def slice_as_int(s: slice, l: int) -> int: return out[0] +def _contiguous_slices_from_sorted_indices(indices: np.ndarray) -> list[slice]: + if len(indices) == 0: + return [] + split_points = np.flatnonzero(np.diff(indices) != 1) + 1 + starts = np.concatenate(([0], split_points)) + stops = np.concatenate((split_points, [len(indices)])) + return [ + slice(indices[start], indices[stop - 1] + 1) + for start, stop in zip(starts, stops, strict=False) + ] + + @dataclass class BackedSparseMatrix[ArrayT: _ArrayStorageType]: """\ @@ -281,12 +293,54 @@ def _get_arrayXslice( if self.format == "csr" else (self.minor_axis_size, 0) ) + major_index = np.asarray(major_index) if major_index.dtype == bool: - major_index = np.where(major_index) - out_shape = self._gen_maj_min_tuple(len(major_index), self.minor_axis_size) - return self.memory_format( - self.get_compressed_vectors(major_index), shape=out_shape - )[self._gen_maj_min_tuple(slice(None), minor_index)] + major_index = np.flatnonzero(major_index) + if np.any(major_index < 0): + return self.memory_format( + (self.data[...], self.indices[...], self.indptr[...]), + shape=self.shape, + )[self._gen_maj_min_tuple(major_index, minor_index)] + if np.any(major_index >= self.shape[self.major_axis]): + max_index = major_index.max() + msg = f"index ({max_index}) out of range" + raise IndexError(msg) + + unique_major_index = np.unique(major_index) + if len(unique_major_index) == 0: + out_shape = self._gen_maj_min_tuple(len(major_index), self.minor_axis_size) + return self.memory_format( + self.get_compressed_vectors(major_index), shape=out_shape + )[self._gen_maj_min_tuple(slice(None), minor_index)] + + run_count = 1 + np.count_nonzero(np.diff(unique_major_index) != 1) + mean_slice_length = len(unique_major_index) / run_count + if mean_slice_length <= 7: + out_shape = self._gen_maj_min_tuple(len(major_index), self.minor_axis_size) + return self.memory_format( + self.get_compressed_vectors(major_index), shape=out_shape + )[self._gen_maj_min_tuple(slice(None), minor_index)] + + original_major_index = np.asarray(major_index) + inverse = np.searchsorted(unique_major_index, original_major_index) + if run_count == 1: + compressed_vectors = self._get_contiguous_compressed_slice( + slice(unique_major_index[0], unique_major_index[-1] + 1) + ) + else: + slices = _contiguous_slices_from_sorted_indices(unique_major_index) + compressed_vectors = self.get_compressed_vectors_for_slices(slices) + sub = self.memory_format( + compressed_vectors, + shape=self._gen_maj_min_tuple( + len(unique_major_index), self.minor_axis_size + ), + ) + if len(unique_major_index) != len(original_major_index) or not np.array_equal( + unique_major_index, original_major_index + ): + sub = sub[self._gen_maj_min_tuple(inverse, slice(None))] + return sub[self._gen_maj_min_tuple(slice(None), minor_index)] def subset_by_major_axis_mask( self: BackedSparseMatrix, mask: np.ndarray diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index 1928bda4c..4deced7a3 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -1,3 +1,4 @@ +# ruff: noqa: PLR0912, PLR0915 from __future__ import annotations import re @@ -59,22 +60,17 @@ def ondisk_equivalent_adata( ) -> tuple[AnnData, AnnData, AnnData, AnnData]: csr_path = tmp_path / f"csr.{diskfmt}" csc_path = tmp_path / f"csc.{diskfmt}" - dense_path = tmp_path / f"dense.{diskfmt}" write = lambda x, pth, **kwargs: getattr(x, f"write_{diskfmt}")(pth, **kwargs) csr_mem = ad.AnnData(X=sparse.random(M, N, format="csr", density=0.1)) csc_mem = ad.AnnData(X=csr_mem.X.tocsc()) - dense_mem = ad.AnnData(X=csr_mem.X.toarray()) write(csr_mem, csr_path) write(csc_mem, csc_path) - # write(csr_mem, dense_path, as_dense="X") - write(dense_mem, dense_path) if diskfmt == "h5ad": csr_disk = ad.read_h5ad(csr_path, backed="r") csc_disk = ad.read_h5ad(csc_path, backed="r") - dense_disk = ad.read_h5ad(dense_path, backed="r") else: def read_zarr_backed(path): @@ -98,9 +94,8 @@ def callback(func, elem_name, elem, iospec): csr_disk = read_zarr_backed(csr_path) csc_disk = read_zarr_backed(csc_path) - dense_disk = read_zarr_backed(dense_path) - return csr_mem, csr_disk, csc_disk, dense_disk + return csr_mem, csr_disk, csc_mem, csc_disk @pytest.mark.parametrize( @@ -110,10 +105,10 @@ def test_empty_backed_indexing( ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], empty_mask, ): - csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata + csr_mem, csr_disk, csc_mem, csc_disk = ondisk_equivalent_adata assert_equal(csr_mem.X[empty_mask], csr_disk.X[empty_mask]) - assert_equal(csr_mem.X[:, empty_mask], csc_disk.X[:, empty_mask]) + assert_equal(csc_mem.X[:, empty_mask], csc_disk.X[:, empty_mask]) # The following do not work because of https://github.com/scipy/scipy/issues/19919 # Our implementation returns a (0,0) sized matrix but scipy does (1,0). @@ -127,7 +122,7 @@ def test_backed_indexing( subset_func, subset_func2, ): - csr_mem, csr_disk, csc_disk, dense_disk = ondisk_equivalent_adata + csr_mem, csr_disk, csc_mem, csc_disk = ondisk_equivalent_adata obs_idx = subset_func(csr_mem.obs_names) var_idx = subset_func2(csr_mem.var_names) @@ -135,9 +130,9 @@ def test_backed_indexing( assert_equal(csr_mem[obs_idx, var_idx].X, csr_disk[obs_idx, var_idx].X) assert_equal(csr_mem[obs_idx, var_idx].X, csc_disk[obs_idx, var_idx].X) assert_equal(csr_mem.X[...], csc_disk.X[...]) - assert_equal(csr_mem[obs_idx, :].X, dense_disk[obs_idx, :].X) + assert_equal(csr_mem[obs_idx, :].X, csr_disk[obs_idx, :].X) assert_equal(csr_mem[obs_idx].X, csr_disk[obs_idx].X) - assert_equal(csr_mem[:, var_idx].X, dense_disk[:, var_idx].X) + assert_equal(csc_mem[:, var_idx].X, csc_disk[:, var_idx].X) def test_backed_ellipsis_indexing( @@ -145,10 +140,234 @@ def test_backed_ellipsis_indexing( ellipsis_index: tuple[EllipsisType | slice, ...] | EllipsisType, equivalent_ellipsis_index: tuple[slice, slice], ): - csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata + csr_mem, csr_disk, csc_mem, csc_disk = ondisk_equivalent_adata assert_equal(csr_mem.X[equivalent_ellipsis_index], csr_disk.X[ellipsis_index]) - assert_equal(csr_mem.X[equivalent_ellipsis_index], csc_disk.X[ellipsis_index]) + assert_equal(csc_mem.X[equivalent_ellipsis_index], csc_disk.X[ellipsis_index]) + + +@pytest.fixture +def large_ondisk_equivalent_adata( + tmp_path: Path, diskfmt: Literal["h5ad", "zarr"] +) -> tuple[AnnData, AnnData, AnnData, AnnData]: + csr_path = tmp_path / f"large_csr.{diskfmt}" + csc_path = tmp_path / f"large_csc.{diskfmt}" + + write = lambda x, pth, **kwargs: getattr(x, f"write_{diskfmt}")(pth, **kwargs) + + csr_mem = ad.AnnData( + X=sparse.random( + 4_096, + 4_096, + format="csr", + density=0.01, + random_state=np.random.default_rng(42), + ) + ) + csc_mem = ad.AnnData(X=csr_mem.X.tocsc()) + + write(csr_mem, csr_path) + write(csc_mem, csc_path) + if diskfmt == "h5ad": + csr_disk = ad.read_h5ad(csr_path, backed="r") + csc_disk = ad.read_h5ad(csc_path, backed="r") + else: + + def read_zarr_backed(path): + path = str(path) + + f = zarr.open(path, mode="r") + + def callback(func, elem_name, elem, iospec): + if iospec.encoding_type == "anndata" or elem_name.endswith("/"): + return AnnData(**{ + k: read_dispatched(v, callback) for k, v in dict(elem).items() + }) + if iospec.encoding_type in {"csc_matrix", "csr_matrix"}: + return sparse_dataset(elem) + return func(elem) + + return read_dispatched(f, callback=callback) + + csr_disk = read_zarr_backed(csr_path) + csc_disk = read_zarr_backed(csc_path) + + return csr_mem, csr_disk, csc_mem, csc_disk + + +@pytest.mark.parametrize( + "indexer", + [ + pytest.param(np.arange(256, 512), id="sorted_adjacent"), + pytest.param(np.array([21, 5, 12, 5, 1, 18, 9]), id="unsorted"), + pytest.param(np.array([3, 3, 7, 7, 2, 2, 11]), id="duplicates"), + pytest.param(np.array([], dtype=int), id="empty"), + pytest.param(np.array([17]), id="single"), + pytest.param(np.array([-3, -1, 0]), id="negative"), + ], +) +def test_backed_integer_indexing_correctness( + large_ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], + indexer: np.ndarray, +): + csr_mem, csr_disk, csc_mem, csc_disk = large_ondisk_equivalent_adata + + assert_equal(csr_mem.X[indexer, :], csr_disk.X[indexer, :]) + assert_equal(csc_mem.X[:, indexer], csc_disk.X[:, indexer]) + + +@pytest.mark.parametrize( + "indexer", + [pytest.param(np.arange(1_000, 3_048), id="contiguous")], +) +def test_backed_integer_indexing_uses_contiguous_slice_path( + *, + mocker: MockerFixture, + large_ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], + indexer: np.ndarray, +): + csr_mem, csr_disk, csc_mem, csc_disk = large_ondisk_equivalent_adata + + from anndata._core import sparse_dataset as sparse_dataset_module + + contiguous_spy = mocker.spy( + sparse_dataset_module.BackedSparseMatrix, "_get_contiguous_compressed_slice" + ) + vector_spy = mocker.spy( + sparse_dataset_module.BackedSparseMatrix, "get_compressed_vectors" + ) + slices_spy = mocker.spy( + sparse_dataset_module.BackedSparseMatrix, "get_compressed_vectors_for_slices" + ) + + assert_equal(csr_mem.X[indexer, :], csr_disk.X[indexer, :]) + assert_equal(csc_mem.X[:, indexer], csc_disk.X[:, indexer]) + + assert contiguous_spy.call_count == 2 + assert slices_spy.call_count == 0 + assert vector_spy.call_count == 0, ( + f"vector path call count: {vector_spy.call_count}" + ) + + +def test_backed_integer_indexing_uses_multiple_run_slice_path( + *, + mocker: MockerFixture, + large_ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], +): + csr_mem, csr_disk, csc_mem, csc_disk = large_ondisk_equivalent_adata + indexer = np.concatenate([ + np.arange(100, 612), + np.arange(1_500, 2_012), + np.arange(2_500, 3_012), + np.arange(3_500, 4_012), + ]) + + from anndata._core import sparse_dataset as sparse_dataset_module + + contiguous_spy = mocker.spy( + sparse_dataset_module.BackedSparseMatrix, "_get_contiguous_compressed_slice" + ) + slice_spy = mocker.spy( + sparse_dataset_module.BackedSparseMatrix, "get_compressed_vectors_for_slices" + ) + vector_spy = mocker.spy( + sparse_dataset_module.BackedSparseMatrix, "get_compressed_vectors" + ) + + assert_equal(csr_mem.X[indexer, :], csr_disk.X[indexer, :]) + assert_equal(csc_mem.X[:, indexer], csc_disk.X[:, indexer]) + + assert contiguous_spy.call_count == 0 + assert slice_spy.call_count == 2 + assert vector_spy.call_count == 0 + + +def test_backed_integer_indexing_fragmented_uses_vector_path( + *, + mocker: MockerFixture, + large_ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], +): + csr_mem, csr_disk, csc_mem, csc_disk = large_ondisk_equivalent_adata + rng = np.random.default_rng(42) + indexer = rng.choice(4_096, size=2_048, replace=False) + rng.shuffle(indexer) + + from anndata._core import sparse_dataset as sparse_dataset_module + + slice_spy = mocker.spy( + sparse_dataset_module.BackedSparseMatrix, "get_compressed_vectors_for_slices" + ) + vector_spy = mocker.spy( + sparse_dataset_module.BackedSparseMatrix, "get_compressed_vectors" + ) + + assert_equal(csr_mem.X[indexer, :], csr_disk.X[indexer, :]) + assert_equal(csc_mem.X[:, indexer], csc_disk.X[:, indexer]) + + assert slice_spy.call_count == 0, f"slice path call count: {slice_spy.call_count}" + assert vector_spy.call_count == 2, ( + f"vector path call count: {vector_spy.call_count}" + ) + + +def test_backed_integer_indexing_clustered_duplicates_uses_slice_path( + *, + mocker: MockerFixture, + large_ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], +): + csr_mem, csr_disk, csc_mem, csc_disk = large_ondisk_equivalent_adata + rng = np.random.default_rng(42) + clusters = [ + np.repeat(np.arange(100, 356), 2), + np.repeat(np.arange(1_000, 1_256), 2), + np.repeat(np.arange(2_000, 2_256), 2), + np.repeat(np.arange(3_000, 3_256), 2), + ] + indexer = np.concatenate(clusters) + rng.shuffle(indexer) + + from anndata._core import sparse_dataset as sparse_dataset_module + + contiguous_spy = mocker.spy( + sparse_dataset_module.BackedSparseMatrix, "_get_contiguous_compressed_slice" + ) + slice_spy = mocker.spy( + sparse_dataset_module.BackedSparseMatrix, "get_compressed_vectors_for_slices" + ) + vector_spy = mocker.spy( + sparse_dataset_module.BackedSparseMatrix, "get_compressed_vectors" + ) + + assert_equal(csr_mem.X[indexer, :], csr_disk.X[indexer, :]) + assert_equal(csc_mem.X[:, indexer], csc_disk.X[:, indexer]) + + assert contiguous_spy.call_count == 0 + assert slice_spy.call_count == 2 + assert vector_spy.call_count == 0 + + +@pytest.mark.parametrize( + "indexer", + [ + pytest.param(np.array([4_096]), id="positive"), + pytest.param(np.array([-4_097]), id="negative"), + ], +) +def test_backed_integer_indexing_out_of_bounds( + large_ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], + indexer: np.ndarray, +): + csr_mem, csr_disk, csc_mem, csc_disk = large_ondisk_equivalent_adata + + with pytest.raises(IndexError): + csr_mem.X[indexer, :] + with pytest.raises(IndexError): + csr_disk.X[indexer, :] + with pytest.raises(IndexError): + csc_mem.X[:, indexer] + with pytest.raises(IndexError): + csc_disk.X[:, indexer] def make_randomized_mask(size: int) -> np.ndarray: @@ -216,7 +435,7 @@ def test_consecutive_bool( should_trigger_optimization Whether or not a given mask should trigger the optimized behavior. """ - _, csr_disk, csc_disk, _ = ondisk_equivalent_adata + _, csr_disk, _, csc_disk = ondisk_equivalent_adata mask = make_bool_mask(csr_disk.shape[0]) # indexing needs to be on `X` directly to trigger the optimization. @@ -227,27 +446,42 @@ def test_consecutive_bool( spy = mocker.spy( sparse_dataset.BackedSparseMatrix, "get_compressed_vectors_for_slices" ) - assert_equal(csr_disk.X[mask, :], csr_disk.X[np.where(mask)]) + before = spy.call_count + left = csr_disk.X[mask, :] + right = csr_disk.X[np.where(mask)] + assert_equal(left, right) if should_trigger_optimization is not None: - assert ( - spy.call_count == 1 if should_trigger_optimization else not spy.call_count - ) - assert_equal(csc_disk.X[:, mask], csc_disk.X[:, np.where(mask)[0]]) + if should_trigger_optimization: + assert spy.call_count > before + else: + assert spy.call_count == before + before = spy.call_count + left = csc_disk.X[:, mask] + right = csc_disk.X[:, np.where(mask)[0]] + assert_equal(left, right) if should_trigger_optimization is not None: - assert ( - spy.call_count == 2 if should_trigger_optimization else not spy.call_count - ) - assert_equal(csr_disk[mask, :].X, csr_disk[np.where(mask)].X) + if should_trigger_optimization: + assert spy.call_count > before + else: + assert spy.call_count == before + before = spy.call_count + left = csr_disk[mask, :].X + right = csr_disk[np.where(mask)].X + assert_equal(left, right) if should_trigger_optimization is not None: - assert ( - spy.call_count == 3 if should_trigger_optimization else not spy.call_count - ) + if should_trigger_optimization: + assert spy.call_count > before + else: + assert spy.call_count == before subset = csc_disk[:, mask] - assert_equal(subset.X, csc_disk[:, np.where(mask)[0]].X) + before = spy.call_count + right = csc_disk[:, np.where(mask)[0]].X + assert_equal(subset.X, right) if should_trigger_optimization is not None: - assert ( - spy.call_count == 4 if should_trigger_optimization else not spy.call_count - ) + if should_trigger_optimization: + assert spy.call_count > before + else: + assert spy.call_count == before if should_trigger_optimization is not None and not csc_disk.isbacked: size = subset.shape[1] if should_trigger_optimization: @@ -255,13 +489,15 @@ def test_consecutive_bool( subset_subset_mask[size // 2] = False else: subset_subset_mask = make_one_elem_mask(size) - assert_equal( - subset[:, subset_subset_mask].X, - subset[:, np.where(subset_subset_mask)[0]].X, - ) - assert ( - spy.call_count == 5 if should_trigger_optimization else not spy.call_count - ), f"Actual count: {spy.call_count}" + before = spy.call_count + left = subset[:, subset_subset_mask].X + right = subset[:, np.where(subset_subset_mask)[0]].X + assert_equal(left, right) + if should_trigger_optimization is not None: + if should_trigger_optimization: + assert spy.call_count > before, f"Actual count: {spy.call_count}" + else: + assert spy.call_count == before, f"Actual count: {spy.call_count}" @pytest.mark.parametrize( @@ -653,7 +889,7 @@ def test_write(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]): def test_backed_sizeof( ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], ): - csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata + csr_mem, csr_disk, _, csc_disk = ondisk_equivalent_adata assert csr_mem.__sizeof__() == csr_disk.__sizeof__(with_disk=True) assert csr_mem.__sizeof__() == csc_disk.__sizeof__(with_disk=True) From 653f5b6682f9700fa0b1aded37d1d4ac029de594 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A1=D1=82=D0=B5=D1=84=D0=B0=D0=BD=20=D0=91=D0=B8=D0=B4?= =?UTF-8?q?=D0=B6=D0=B0=D0=BC=D0=BE=D0=B2?= Date: Thu, 18 Jun 2026 15:33:09 +0300 Subject: [PATCH 2/2] drop dead sparse branches --- src/anndata/_core/sparse_dataset.py | 14 +++----------- tests/test_backed_sparse.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/anndata/_core/sparse_dataset.py b/src/anndata/_core/sparse_dataset.py index 486082c3e..ecbd53ddd 100644 --- a/src/anndata/_core/sparse_dataset.py +++ b/src/anndata/_core/sparse_dataset.py @@ -96,8 +96,6 @@ def slice_as_int(s: slice, l: int) -> int: def _contiguous_slices_from_sorted_indices(indices: np.ndarray) -> list[slice]: - if len(indices) == 0: - return [] split_points = np.flatnonzero(np.diff(indices) != 1) + 1 starts = np.concatenate(([0], split_points)) stops = np.concatenate((split_points, [len(indices)])) @@ -287,15 +285,15 @@ def _get_sliceXslice( def _get_arrayXslice( self, major_index: Sequence | np.ndarray, minor_index: slice ) -> SparseMatrixType: + major_index = np.asarray(major_index) + if major_index.dtype == bool: + major_index = np.flatnonzero(major_index) if len(major_index) == 0: return self.memory_format( (0, self.minor_axis_size) if self.format == "csr" else (self.minor_axis_size, 0) ) - major_index = np.asarray(major_index) - if major_index.dtype == bool: - major_index = np.flatnonzero(major_index) if np.any(major_index < 0): return self.memory_format( (self.data[...], self.indices[...], self.indptr[...]), @@ -307,12 +305,6 @@ def _get_arrayXslice( raise IndexError(msg) unique_major_index = np.unique(major_index) - if len(unique_major_index) == 0: - out_shape = self._gen_maj_min_tuple(len(major_index), self.minor_axis_size) - return self.memory_format( - self.get_compressed_vectors(major_index), shape=out_shape - )[self._gen_maj_min_tuple(slice(None), minor_index)] - run_count = 1 + np.count_nonzero(np.diff(unique_major_index) != 1) mean_slice_length = len(unique_major_index) / run_count if mean_slice_length <= 7: diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index 4deced7a3..a43aafefa 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -370,6 +370,16 @@ def test_backed_integer_indexing_out_of_bounds( csc_disk.X[:, indexer] +def test_backed_sparse_matrix_bool_mask_direct( + large_ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData], +): + csr_mem, csr_disk, csc_mem, csc_disk = large_ondisk_equivalent_adata + mask = make_one_group_mask(csr_mem.shape[0]) + + assert_equal(csr_mem.X[mask, :], csr_disk.X._to_backed()[mask, :]) + assert_equal(csc_mem.X[:, mask], csc_disk.X._to_backed()[:, mask]) + + def make_randomized_mask(size: int) -> np.ndarray: randomized_mask = np.zeros(size, dtype=bool) inds = np.random.choice(size, 20, replace=False)