From eea36eb7ef3eaa8c44b06c95e168343b7f7a8ffd Mon Sep 17 00:00:00 2001 From: J Berg Date: Tue, 30 Jun 2026 13:46:19 -0700 Subject: [PATCH 1/3] Use KDTree for better ripley L mem usage --- src/squidpy/gr/_ripley.py | 24 +++++++++++++++--------- tests/graph/test_ripley.py | 5 +++++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/squidpy/gr/_ripley.py b/src/squidpy/gr/_ripley.py index 7c92ae357..e086fd326 100644 --- a/src/squidpy/gr/_ripley.py +++ b/src/squidpy/gr/_ripley.py @@ -10,8 +10,7 @@ from numpy.random import default_rng from scanpy import logging as logg from scipy.spatial import ConvexHull, Delaunay -from scipy.spatial.distance import pdist -from sklearn.neighbors import NearestNeighbors +from sklearn.neighbors import KDTree, NearestNeighbors from sklearn.preprocessing import LabelEncoder from spatialdata import SpatialData @@ -146,8 +145,7 @@ def ripley( distances, _ = tree_c.kneighbors(coordinates[cluster_idx != i, :], n_neighbors=n_neigh) bins, obs_stats = _f_g_function(distances.squeeze(), support) elif mode == RipleyStat.L: - distances = pdist(coord_c, metric=metric) - bins, obs_stats = _l_function(distances, support, N, area) + bins, obs_stats = _l_function(coord_c, support, N, area, metric) else: raise NotImplementedError(f"Mode `{mode.s!r}` is not yet implemented.") obs_arr[i] = obs_stats @@ -166,8 +164,7 @@ def ripley( distances_i, _ = tree_i.kneighbors(coordinates, n_neighbors=1) _, stats_i = _f_g_function(distances_i.squeeze(), support) elif mode == RipleyStat.L: - distances_i = pdist(random_i, metric=metric) - _, stats_i = _l_function(distances_i, support, N, area) + _, stats_i = _l_function(random_i, support, N, area, metric) else: raise NotImplementedError(f"Mode `{mode.s!r}` is not yet implemented.") @@ -208,10 +205,19 @@ def _f_g_function(distances: NDArrayA, support: NDArrayA) -> tuple[NDArrayA, NDA return bins, np.concatenate((np.zeros((1,), dtype=float), fracs)) -def _l_function(distances: NDArrayA, support: NDArrayA, n: int, area: float) -> tuple[NDArrayA, NDArrayA]: - n_pairs_less_than_d = (distances < support.reshape(-1, 1)).sum(axis=1) +def _l_function(points: NDArrayA, support: NDArrayA, n: int, area: float, metric: str) -> tuple[NDArrayA, NDArrayA]: + # Ripley's K(d) is the number of ordered point pairs within distance d. `two_point_correlation` + # computes exactly that (cumulatively over `support`, in a single tree pass) without + # materializing the O(m^2) pairwise distances. + # KDTree accepts the same metric names as the F/G modes (sklearn.metrics.DistanceMetric). + tree = KDTree(points, metric=metric) + # `two_point_correlation` counts ordered pairs incl. the `m` self-matches at distance 0; + # subtracting `m` gives ordered non-self pairs == the previous `n_pairs_less_than_d * 2`. + # `dualtree=True` has been observed to be roughly 2x faster than the single-tree default. + num_points = points.shape[0] + n_ordered_pairs_less_than_d = tree.two_point_correlation(points, support, dualtree=True) - num_points intensity = n / area - k_estimate = ((n_pairs_less_than_d * 2) / n) / intensity + k_estimate = (n_ordered_pairs_less_than_d / n) / intensity l_estimate = np.sqrt(k_estimate / np.pi) return support, l_estimate diff --git a/tests/graph/test_ripley.py b/tests/graph/test_ripley.py index 750acbff2..fdbff72e2 100644 --- a/tests/graph/test_ripley.py +++ b/tests/graph/test_ripley.py @@ -10,6 +10,11 @@ CLUSTER_KEY = "leiden" +def test_ripley_L_unsupported_metric_raises(adata_ripley: AnnData): + with pytest.raises(ValueError, match="metric"): + ripley(adata_ripley, cluster_key=CLUSTER_KEY, mode="L", metric="cosine") + + @pytest.mark.parametrize("mode", list(RipleyStat)) def test_ripley_modes(adata_ripley: AnnData, mode: RipleyStat): adata = adata_ripley From 61e74c5260388a59bfdbee1935b5f7353c8b23e9 Mon Sep 17 00:00:00 2001 From: J Berg Date: Tue, 30 Jun 2026 14:46:04 -0700 Subject: [PATCH 2/3] better docs and more helpful error messages --- src/squidpy/gr/_ripley.py | 4 ++++ tests/graph/test_ripley.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/squidpy/gr/_ripley.py b/src/squidpy/gr/_ripley.py index e086fd326..a87b7e463 100644 --- a/src/squidpy/gr/_ripley.py +++ b/src/squidpy/gr/_ripley.py @@ -83,6 +83,8 @@ def ripley( metric Which metric to use for computing distances. For available metrics, check out :class:`sklearn.metrics.DistanceMetric`. + For Ripley's L specifically, only metrics supported by :class:`sklearn.neighbors.KDTree` + are valid (see its ``valid_metrics`` attribute). n_neigh Number of neighbors to consider for the KNN graph. n_simulations @@ -206,6 +208,8 @@ def _f_g_function(distances: NDArrayA, support: NDArrayA) -> tuple[NDArrayA, NDA def _l_function(points: NDArrayA, support: NDArrayA, n: int, area: float, metric: str) -> tuple[NDArrayA, NDArrayA]: + if metric not in KDTree.valid_metrics: + raise ValueError(f"Unsupported metric '{metric}'. Ripley's L supports {KDTree.valid_metrics}") # Ripley's K(d) is the number of ordered point pairs within distance d. `two_point_correlation` # computes exactly that (cumulatively over `support`, in a single tree pass) without # materializing the O(m^2) pairwise distances. diff --git a/tests/graph/test_ripley.py b/tests/graph/test_ripley.py index fdbff72e2..fa50f2000 100644 --- a/tests/graph/test_ripley.py +++ b/tests/graph/test_ripley.py @@ -11,7 +11,7 @@ def test_ripley_L_unsupported_metric_raises(adata_ripley: AnnData): - with pytest.raises(ValueError, match="metric"): + with pytest.raises(ValueError, match="Unsupported metric"): ripley(adata_ripley, cluster_key=CLUSTER_KEY, mode="L", metric="cosine") From c76ef30534f769faf3436f2e019c29aa10984929 Mon Sep 17 00:00:00 2001 From: J Berg Date: Tue, 30 Jun 2026 16:20:50 -0700 Subject: [PATCH 3/3] cleanup --- src/squidpy/gr/_ripley.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/squidpy/gr/_ripley.py b/src/squidpy/gr/_ripley.py index a87b7e463..fd4cbfd7e 100644 --- a/src/squidpy/gr/_ripley.py +++ b/src/squidpy/gr/_ripley.py @@ -213,10 +213,9 @@ def _l_function(points: NDArrayA, support: NDArrayA, n: int, area: float, metric # Ripley's K(d) is the number of ordered point pairs within distance d. `two_point_correlation` # computes exactly that (cumulatively over `support`, in a single tree pass) without # materializing the O(m^2) pairwise distances. - # KDTree accepts the same metric names as the F/G modes (sklearn.metrics.DistanceMetric). tree = KDTree(points, metric=metric) # `two_point_correlation` counts ordered pairs incl. the `m` self-matches at distance 0; - # subtracting `m` gives ordered non-self pairs == the previous `n_pairs_less_than_d * 2`. + # subtracting `m` gives ordered non-self pairs. # `dualtree=True` has been observed to be roughly 2x faster than the single-tree default. num_points = points.shape[0] n_ordered_pairs_less_than_d = tree.two_point_correlation(points, support, dualtree=True) - num_points