diff --git a/src/squidpy/gr/_ripley.py b/src/squidpy/gr/_ripley.py index 7c92ae357..fd4cbfd7e 100644 --- a/src/squidpy/gr/_ripley.py +++ b/src/squidpy/gr/_ripley.py @@ -10,8 +10,7 @@ from numpy.random import default_rng from scanpy import logging as logg from scipy.spatial import ConvexHull, Delaunay -from scipy.spatial.distance import pdist -from sklearn.neighbors import NearestNeighbors +from sklearn.neighbors import KDTree, NearestNeighbors from sklearn.preprocessing import LabelEncoder from spatialdata import SpatialData @@ -84,6 +83,8 @@ def ripley( metric Which metric to use for computing distances. For available metrics, check out :class:`sklearn.metrics.DistanceMetric`. + For Ripley's L specifically, only metrics supported by :class:`sklearn.neighbors.KDTree` + are valid (see its ``valid_metrics`` attribute). n_neigh Number of neighbors to consider for the KNN graph. n_simulations @@ -146,8 +147,7 @@ def ripley( distances, _ = tree_c.kneighbors(coordinates[cluster_idx != i, :], n_neighbors=n_neigh) bins, obs_stats = _f_g_function(distances.squeeze(), support) elif mode == RipleyStat.L: - distances = pdist(coord_c, metric=metric) - bins, obs_stats = _l_function(distances, support, N, area) + bins, obs_stats = _l_function(coord_c, support, N, area, metric) else: raise NotImplementedError(f"Mode `{mode.s!r}` is not yet implemented.") obs_arr[i] = obs_stats @@ -166,8 +166,7 @@ def ripley( distances_i, _ = tree_i.kneighbors(coordinates, n_neighbors=1) _, stats_i = _f_g_function(distances_i.squeeze(), support) elif mode == RipleyStat.L: - distances_i = pdist(random_i, metric=metric) - _, stats_i = _l_function(distances_i, support, N, area) + _, stats_i = _l_function(random_i, support, N, area, metric) else: raise NotImplementedError(f"Mode `{mode.s!r}` is not yet implemented.") @@ -208,10 +207,20 @@ def _f_g_function(distances: NDArrayA, support: NDArrayA) -> tuple[NDArrayA, NDA return bins, np.concatenate((np.zeros((1,), dtype=float), fracs)) -def _l_function(distances: NDArrayA, support: NDArrayA, n: int, area: float) -> tuple[NDArrayA, NDArrayA]: - n_pairs_less_than_d = (distances < support.reshape(-1, 1)).sum(axis=1) +def _l_function(points: NDArrayA, support: NDArrayA, n: int, area: float, metric: str) -> tuple[NDArrayA, NDArrayA]: + if metric not in KDTree.valid_metrics: + raise ValueError(f"Unsupported metric '{metric}'. Ripley's L supports {KDTree.valid_metrics}") + # Ripley's K(d) is the number of ordered point pairs within distance d. `two_point_correlation` + # computes exactly that (cumulatively over `support`, in a single tree pass) without + # materializing the O(m^2) pairwise distances. + tree = KDTree(points, metric=metric) + # `two_point_correlation` counts ordered pairs incl. the `m` self-matches at distance 0; + # subtracting `m` gives ordered non-self pairs. + # `dualtree=True` has been observed to be roughly 2x faster than the single-tree default. + num_points = points.shape[0] + n_ordered_pairs_less_than_d = tree.two_point_correlation(points, support, dualtree=True) - num_points intensity = n / area - k_estimate = ((n_pairs_less_than_d * 2) / n) / intensity + k_estimate = (n_ordered_pairs_less_than_d / n) / intensity l_estimate = np.sqrt(k_estimate / np.pi) return support, l_estimate diff --git a/tests/graph/test_ripley.py b/tests/graph/test_ripley.py index 750acbff2..fa50f2000 100644 --- a/tests/graph/test_ripley.py +++ b/tests/graph/test_ripley.py @@ -10,6 +10,11 @@ CLUSTER_KEY = "leiden" +def test_ripley_L_unsupported_metric_raises(adata_ripley: AnnData): + with pytest.raises(ValueError, match="Unsupported metric"): + ripley(adata_ripley, cluster_key=CLUSTER_KEY, mode="L", metric="cosine") + + @pytest.mark.parametrize("mode", list(RipleyStat)) def test_ripley_modes(adata_ripley: AnnData, mode: RipleyStat): adata = adata_ripley