From f2d1e318a24142000fc323d97b4313cdf4766006 Mon Sep 17 00:00:00 2001 From: Shashank Katiyar <72993520+shashkat@users.noreply.github.com> Date: Wed, 24 Jun 2026 23:23:02 -0400 Subject: [PATCH] fixed bug of incorrect nhood_profile calculation in function _calculate_neighborhood_profile --- src/squidpy/gr/_niche.py | 62 ++++++++++++++++++++++++--------------- tests/conftest.py | 57 +++++++++++++++++++++++++++++++++++ tests/graph/test_niche.py | 26 ++++++++++++++++ 3 files changed, 121 insertions(+), 24 deletions(-) diff --git a/src/squidpy/gr/_niche.py b/src/squidpy/gr/_niche.py index 07db78246..5b0be4a60 100644 --- a/src/squidpy/gr/_niche.py +++ b/src/squidpy/gr/_niche.py @@ -1,6 +1,7 @@ from __future__ import annotations import contextlib +import warnings from typing import Any, Literal import anndata as ad @@ -10,7 +11,7 @@ import scipy.sparse as sps from anndata import AnnData from numpy.typing import NDArray -from scipy.sparse import coo_matrix, hstack, issparse, spdiags +from scipy.sparse import coo_matrix, hstack, issparse, lil_matrix, spdiags from scipy.spatial import distance from sklearn.metrics import f1_score from sklearn.mixture import GaussianMixture @@ -607,35 +608,48 @@ def _calculate_neighborhood_profile( Returns an obs x category matrix where each column is the absolute/relative frequency of a category in the neighborhood """ - nonzero_indices = np.split(matrix.col, matrix.row.searchsorted(np.arange(1, matrix.shape[0]))) - neighbor_matrix = pd.DataFrame(nonzero_indices) + # ensure that adata.obs[group] is of categorical type, as that makes it explicit, which cols of the returned profile_df + # correspond to which categories in group + if adata.obs[groups].dtype.name != "category": + warnings.warn( + "Since adata.obs[groups] does not already have categorical dtype, converting it into categorical type.", + stacklevel=2, + ) + adata.obs[groups] = adata.obs[groups].astype("category") - # get unique categories - unique_categories = np.unique(adata.obs[groups].values) + # ensure matrix is in csc format for efficient column slicing + if matrix.format != "csc": + matrix = matrix.tocsc() - # get obs x k matrix where each column is the category of the k-th neighbor - indices_with_nan = neighbor_matrix.to_numpy() - valid_indices = neighbor_matrix.fillna(-1).astype(int).to_numpy() - cat_by_id = adata.obs[groups].values[valid_indices] - cat_by_id[indices_with_nan == -1] = np.nan - # cat_by_id = np.take(category_arr, neighbor_matrix) + # get cell categories in order + categories_order = adata.obs[groups].cat.categories + n_categories = len(categories_order) - # in obs x k matrix convert categorical values to numerical values - cat_indices = {category: index for index, category in enumerate(unique_categories)} - cat_values = np.vectorize(cat_indices.get)(cat_by_id) + # map category to column index + category_to_idx = {ct: i for i, ct in enumerate(categories_order)} - # get obx x category matrix where each column is the absolute amount of a category in the neighborhood - m, k = cat_by_id.shape - abs_freq = np.zeros((m, len(unique_categories)), dtype=int) - np.add.at(abs_freq, (np.arange(m)[:, None], cat_values), 1) + # pre allocate sparse LIL matrix for efficient assignment (n_cells x n_categories) + profile_sparse = lil_matrix((matrix.shape[0], n_categories), dtype=np.float64) - # normalize by n_neighbors to get relative frequency of each category - rel_freq = abs_freq / k + # for each category, sum over cells of that category + for ct in categories_order: + ct_mask = adata.obs[groups] == ct # boolean mask for cells of this category + col_indices = np.where(ct_mask)[0] # indices of those cells + if len(col_indices) > 0: + col_slice = matrix[:, col_indices] # sparse submatrix + profile_sparse[:, category_to_idx[ct]] = col_slice.sum(axis=1).A1 - if abs_nhood: - return pd.DataFrame(abs_freq, index=adata.obs.index) - else: - return pd.DataFrame(rel_freq, index=adata.obs.index) + # convert to dataframe (csr for final storage, dense for pandas) + profile_df = pd.DataFrame(profile_sparse.tocsr().todense(), index=adata.obs[groups].index, columns=categories_order) + + # now according to parameter abs_nhood, make raw counts into proportions or not + if not abs_nhood: + total_neighs = profile_df.sum(axis=1) + profile_df = profile_df.div(total_neighs, axis=0) + # this may lead to some values being nan, as some cells might have had no neighbors. Make those values as 0 + profile_df = profile_df.fillna(0.0) + + return profile_df def _utag(adata: AnnData, normalize_adj: bool, spatial_connectivity_key: str) -> AnnData: diff --git a/tests/conftest.py b/tests/conftest.py index 83d405d8d..2dbb6e41d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -117,6 +117,63 @@ def dummy_adata() -> AnnData: return adata +@pytest.fixture() +def dummy_adata2() -> AnnData: + r = np.random.RandomState(100) + adata = AnnData(r.rand(10, 100), obs={"celltype": r.choice(["foo", "bar", "baz"], size=10)}) + adata.obs.index = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] + + # Spatial layout of the 10 data points in the grid: + # + # Y ▲ + # │ + # 5 │ a b c Points: (2,5), (3,5), (5,5) + # │ + # 4 │ d e Points: (1,4), (4,4) + # │ + # 3 │ f g Points: (3,3), (5,3) + # │ + # 2 │ h Points: (4,2) + # │ + # 1 │ i j Points: (1,1), (5,1) + # │ + # └─────────────────────────► + # 1 2 3 4 5 X + # + # + # celltypes- + # a bar + # b baz + # c foo + # d foo + # e foo + # f baz + # g baz + # h baz + # i bar + # j baz + + adata.obsm["spatial"] = np.array( + [ + [2, 5], + [3, 5], + [5, 5], + [1, 4], + [4, 4], + [3, 3], + [5, 3], + [4, 2], + [1, 1], + [5, 1], + ] + ) + + # using a radius of 1.5 will lead to, say e being a neighbor of b,c,f,g but not h. So all side-adjacent and diagonal-adjacent + # locations will be neighbors + sq.gr.spatial_neighbors_radius(adata, radius=1.5) + return adata + + @pytest.fixture() def adata_intmat() -> AnnData: graph = csr_matrix( diff --git a/tests/graph/test_niche.py b/tests/graph/test_niche.py index 7faad2712..c8e87130e 100644 --- a/tests/graph/test_niche.py +++ b/tests/graph/test_niche.py @@ -1,6 +1,7 @@ from __future__ import annotations from anndata import AnnData +from pandas import DataFrame from pandas.testing import assert_frame_equal from scipy.sparse import issparse @@ -12,6 +13,31 @@ GROUPS = "celltype_mapped_refined" +def test_calculate_neighborhood_profile(dummy_adata2: AnnData): + "calculate_neighborhood_profile function needs to be tested, as it is at the base of the functionality of neighborhood flavor" + matrix = dummy_adata2.obsp["spatial_connectivities"].tocoo() + nhood_profile = _calculate_neighborhood_profile(dummy_adata2, "celltype", matrix, True) + relative_nhood_profile = _calculate_neighborhood_profile(dummy_adata2, "celltype", matrix, False) + + # nhood_profile and relative_nhood_profile should have same entries as this manually determined version for dummy_adata2 + expected_nhood_profile = DataFrame( + { + 0: [0, 1, 0, 1, 0, 0, 0, 0, 0, 0], + 1: [1, 0, 0, 0, 3, 1, 1, 3, 0, 1], + 2: [1, 1, 1, 0, 1, 1, 1, 0, 0, 0], + }, + index=list("abcdefghij"), + dtype="float", + ) + total_neighs = expected_nhood_profile.sum(axis=1) + expected_relative_nhood_profile = expected_nhood_profile.div(total_neighs, axis=0) + expected_relative_nhood_profile = expected_relative_nhood_profile.fillna(0.0) + + # compare + assert (nhood_profile.values == expected_nhood_profile.values).all() + assert (relative_nhood_profile.values == expected_relative_nhood_profile.values).all() + + def test_niche_calc_nhood(adata_seqfish: AnnData): """Check whether niche calculation using neighborhood profile approach works as intended.""" spatial_neighbors(adata_seqfish, coord_type="generic", delaunay=False, n_neighs=N_NEIGHBORS)