From f2d1e318a24142000fc323d97b4313cdf4766006 Mon Sep 17 00:00:00 2001
From: Shashank Katiyar <72993520+shashkat@users.noreply.github.com>
Date: Wed, 24 Jun 2026 23:23:02 -0400
Subject: [PATCH] fixed bug of incorrect nhood_profile calculation in function
 _calculate_neighborhood_profile

---
 src/squidpy/gr/_niche.py  | 62 ++++++++++++++++++++++++---------------
 tests/conftest.py         | 57 +++++++++++++++++++++++++++++++++++
 tests/graph/test_niche.py | 26 ++++++++++++++++
 3 files changed, 121 insertions(+), 24 deletions(-)

diff --git a/src/squidpy/gr/_niche.py b/src/squidpy/gr/_niche.py
index 07db78246..5b0be4a60 100644
--- a/src/squidpy/gr/_niche.py
+++ b/src/squidpy/gr/_niche.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import contextlib
+import warnings
 from typing import Any, Literal
 
 import anndata as ad
@@ -10,7 +11,7 @@
 import scipy.sparse as sps
 from anndata import AnnData
 from numpy.typing import NDArray
-from scipy.sparse import coo_matrix, hstack, issparse, spdiags
+from scipy.sparse import coo_matrix, hstack, issparse, lil_matrix, spdiags
 from scipy.spatial import distance
 from sklearn.metrics import f1_score
 from sklearn.mixture import GaussianMixture
@@ -607,35 +608,48 @@ def _calculate_neighborhood_profile(
     Returns an obs x category matrix where each column is the absolute/relative frequency of a category in the neighborhood
     """
 
-    nonzero_indices = np.split(matrix.col, matrix.row.searchsorted(np.arange(1, matrix.shape[0])))
-    neighbor_matrix = pd.DataFrame(nonzero_indices)
+    # ensure that adata.obs[group] is of categorical type, as that makes it explicit, which cols of the returned profile_df
+    # correspond to which categories in group
+    if adata.obs[groups].dtype.name != "category":
+        warnings.warn(
+            "Since adata.obs[groups] does not already have categorical dtype, converting it into categorical type.",
+            stacklevel=2,
+        )
+        adata.obs[groups] = adata.obs[groups].astype("category")
 
-    # get unique categories
-    unique_categories = np.unique(adata.obs[groups].values)
+    # ensure matrix is in csc format for efficient column slicing
+    if matrix.format != "csc":
+        matrix = matrix.tocsc()
 
-    # get obs x k matrix where each column is the category of the k-th neighbor
-    indices_with_nan = neighbor_matrix.to_numpy()
-    valid_indices = neighbor_matrix.fillna(-1).astype(int).to_numpy()
-    cat_by_id = adata.obs[groups].values[valid_indices]
-    cat_by_id[indices_with_nan == -1] = np.nan
-    # cat_by_id = np.take(category_arr, neighbor_matrix)
+    # get cell categories in order
+    categories_order = adata.obs[groups].cat.categories
+    n_categories = len(categories_order)
 
-    # in obs x k matrix convert categorical values to numerical values
-    cat_indices = {category: index for index, category in enumerate(unique_categories)}
-    cat_values = np.vectorize(cat_indices.get)(cat_by_id)
+    # map category to column index
+    category_to_idx = {ct: i for i, ct in enumerate(categories_order)}
 
-    # get obx x category matrix where each column is the absolute amount of a category in the neighborhood
-    m, k = cat_by_id.shape
-    abs_freq = np.zeros((m, len(unique_categories)), dtype=int)
-    np.add.at(abs_freq, (np.arange(m)[:, None], cat_values), 1)
+    # pre allocate sparse LIL matrix for efficient assignment (n_cells x n_categories)
+    profile_sparse = lil_matrix((matrix.shape[0], n_categories), dtype=np.float64)
 
-    # normalize by n_neighbors to get relative frequency of each category
-    rel_freq = abs_freq / k
+    # for each category, sum over cells of that category
+    for ct in categories_order:
+        ct_mask = adata.obs[groups] == ct  # boolean mask for cells of this category
+        col_indices = np.where(ct_mask)[0]  # indices of those cells
+        if len(col_indices) > 0:
+            col_slice = matrix[:, col_indices]  # sparse submatrix
+            profile_sparse[:, category_to_idx[ct]] = col_slice.sum(axis=1).A1
 
-    if abs_nhood:
-        return pd.DataFrame(abs_freq, index=adata.obs.index)
-    else:
-        return pd.DataFrame(rel_freq, index=adata.obs.index)
+    # convert to dataframe (csr for final storage, dense for pandas)
+    profile_df = pd.DataFrame(profile_sparse.tocsr().todense(), index=adata.obs[groups].index, columns=categories_order)
+
+    # now according to parameter abs_nhood, make raw counts into proportions or not
+    if not abs_nhood:
+        total_neighs = profile_df.sum(axis=1)
+        profile_df = profile_df.div(total_neighs, axis=0)
+        # this may lead to some values being nan, as some cells might have had no neighbors. Make those values as 0
+        profile_df = profile_df.fillna(0.0)
+
+    return profile_df
 
 
 def _utag(adata: AnnData, normalize_adj: bool, spatial_connectivity_key: str) -> AnnData:
diff --git a/tests/conftest.py b/tests/conftest.py
index 83d405d8d..2dbb6e41d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -117,6 +117,63 @@ def dummy_adata() -> AnnData:
     return adata
 
 
+@pytest.fixture()
+def dummy_adata2() -> AnnData:
+    r = np.random.RandomState(100)
+    adata = AnnData(r.rand(10, 100), obs={"celltype": r.choice(["foo", "bar", "baz"], size=10)})
+    adata.obs.index = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]
+
+    # Spatial layout of the 10 data points in the grid:
+    #
+    #   Y ▲
+    #     │
+    #   5 │       a    b         c    Points: (2,5), (3,5), (5,5)
+    #     │
+    #   4 │  d              e         Points: (1,4), (4,4)
+    #     │
+    #   3 │            f         g    Points: (3,3), (5,3)
+    #     │
+    #   2 │                 h         Points: (4,2)
+    #     │
+    #   1 │  i                   j    Points: (1,1), (5,1)
+    #     │
+    #     └─────────────────────────►
+    #        1    2    3    4    5   X
+    #
+    #
+    # celltypes-
+    # a      bar
+    # b      baz
+    # c      foo
+    # d      foo
+    # e      foo
+    # f      baz
+    # g      baz
+    # h      baz
+    # i      bar
+    # j      baz
+
+    adata.obsm["spatial"] = np.array(
+        [
+            [2, 5],
+            [3, 5],
+            [5, 5],
+            [1, 4],
+            [4, 4],
+            [3, 3],
+            [5, 3],
+            [4, 2],
+            [1, 1],
+            [5, 1],
+        ]
+    )
+
+    # using a radius of 1.5 will lead to, say e being a neighbor of b,c,f,g but not h. So all side-adjacent and diagonal-adjacent
+    # locations will be neighbors
+    sq.gr.spatial_neighbors_radius(adata, radius=1.5)
+    return adata
+
+
 @pytest.fixture()
 def adata_intmat() -> AnnData:
     graph = csr_matrix(
diff --git a/tests/graph/test_niche.py b/tests/graph/test_niche.py
index 7faad2712..c8e87130e 100644
--- a/tests/graph/test_niche.py
+++ b/tests/graph/test_niche.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from anndata import AnnData
+from pandas import DataFrame
 from pandas.testing import assert_frame_equal
 from scipy.sparse import issparse
 
@@ -12,6 +13,31 @@
 GROUPS = "celltype_mapped_refined"
 
 
+def test_calculate_neighborhood_profile(dummy_adata2: AnnData):
+    "calculate_neighborhood_profile function needs to be tested, as it is at the base of the functionality of neighborhood flavor"
+    matrix = dummy_adata2.obsp["spatial_connectivities"].tocoo()
+    nhood_profile = _calculate_neighborhood_profile(dummy_adata2, "celltype", matrix, True)
+    relative_nhood_profile = _calculate_neighborhood_profile(dummy_adata2, "celltype", matrix, False)
+
+    # nhood_profile and relative_nhood_profile should have same entries as this manually determined version for dummy_adata2
+    expected_nhood_profile = DataFrame(
+        {
+            0: [0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
+            1: [1, 0, 0, 0, 3, 1, 1, 3, 0, 1],
+            2: [1, 1, 1, 0, 1, 1, 1, 0, 0, 0],
+        },
+        index=list("abcdefghij"),
+        dtype="float",
+    )
+    total_neighs = expected_nhood_profile.sum(axis=1)
+    expected_relative_nhood_profile = expected_nhood_profile.div(total_neighs, axis=0)
+    expected_relative_nhood_profile = expected_relative_nhood_profile.fillna(0.0)
+
+    # compare
+    assert (nhood_profile.values == expected_nhood_profile.values).all()
+    assert (relative_nhood_profile.values == expected_relative_nhood_profile.values).all()
+
+
 def test_niche_calc_nhood(adata_seqfish: AnnData):
     """Check whether niche calculation using neighborhood profile approach works as intended."""
     spatial_neighbors(adata_seqfish, coord_type="generic", delaunay=False, n_neighs=N_NEIGHBORS)