def constraint(g: Any, *, k: int = 10) -> list[tuple[str, float]]:
    """Top-k structural-hole brokers by Burt's constraint (ascending).

    Computed on the undirected projection. Burt's constraint is *low* for nodes
    that bridge otherwise-disconnected neighbours (a structural hole / broker)
    and *high* for nodes embedded in a dense, redundant clique. Results are
    sorted ascending, so the first entries are the strongest brokers.

    Nodes with no neighbours (constraint is NaN) are excluded.

    Args:
        g: Graph to analyse; a directed graph is projected to undirected first.
        k: Maximum number of entries to return. A non-positive value yields an
            empty list (rather than the "drop the last |k|" result a negative
            slice bound would give).

    Returns:
        Up to *k* ``(node, constraint)`` tuples, lowest constraint first. Nodes
        with equal constraint keep the order in which the graph reports them.
    """
    # Function-scope import so this block does not depend on an edit to the
    # module's import section.
    import heapq

    _require_networkx()
    if k <= 0 or g.number_of_nodes() == 0:
        return []
    undirected = g.to_undirected() if g.is_directed() else g
    raw = nx.constraint(undirected)
    scored = (
        (node, float(value))
        for node, value in raw.items()
        if value is not None and not math.isnan(value)
    )
    # nsmallest is documented as equivalent to sorted(scored, key=...)[:k]
    # (ties included) without sorting the entries that will be discarded.
    return heapq.nsmallest(k, scored, key=lambda kv: kv[1])
def link_prediction(
    g: Any, *, source: str | None = None, k: int = 10
) -> list[tuple[str, str, float]]:
    """Top-k predicted edges by the Adamic-Adar index (descending).

    Adamic-Adar scores a candidate (non-existent) edge by its shared
    neighbours, weighting each by ``1 / log(degree)`` so a connection through a
    niche shared node counts more than one through a hub. Computed on the
    undirected projection.

    When *source* is given, only candidate edges from that node to its
    non-neighbours are scored (emerging adjacencies for one entry); the source
    must be a node in the graph or an empty list is returned. When *source* is
    ``None``, all non-existent edges are scored — bound the graph first (e.g.
    via ``scope="ego"``) since this is quadratic in node count.

    Args:
        g: Graph to analyse; a directed graph is projected to undirected first.
        source: Optional node whose candidate edges should be scored.
        k: Maximum number of predictions to return. A non-positive value
            yields an empty list.

    Returns:
        A list of ``(source, target, score)`` tuples, highest score first.

    NOTE(review): candidates are not filtered by score, so pairs with no
    shared neighbours (presumably scored 0.0) can fill the tail when fewer
    than *k* positive candidates exist — confirm whether callers want that.
    """
    # Function-scope import so this block does not depend on an edit to the
    # module's import section.
    import heapq

    _require_networkx()
    if k <= 0 or g.number_of_nodes() == 0:
        return []
    undirected = g.to_undirected() if g.is_directed() else g

    # ``None`` tells networkx to score every non-existent edge.
    ebunch: list[tuple[str, str]] | None = None
    if source is not None:
        if source not in undirected:
            return []
        # Skip the source itself and everything it is already connected to.
        excluded = set(undirected[source])
        excluded.add(source)
        ebunch = [(source, target) for target in undirected.nodes if target not in excluded]
        if not ebunch:
            return []

    # Stream the scored pairs through a bounded heap instead of sorting them
    # all: in global mode the candidate set is quadratic in node count, and
    # nlargest is documented as equivalent to
    # sorted(..., key=..., reverse=True)[:k] while holding only k items.
    top = heapq.nlargest(k, nx.adamic_adar_index(undirected, ebunch), key=lambda t: t[2])
    return [(u, v, float(p)) for u, v, p in top]
+ ``bridges`` = entries by betweenness centrality; ``communities`` = K + largest communities; ``constraint`` = entries by lowest Burt constraint + (strongest structural-hole brokers); ``link_prediction`` = top predicted + edges by Adamic-Adar (pass ``entry_id`` to score adjacencies for one entry). - project / tags / date_from / date_to (optional, metrics global scope): restrict the entries whose relations participate in the graph. diff --git a/src/distillery/mcp/tools/relations.py b/src/distillery/mcp/tools/relations.py index 98466afe..9e19960d 100644 --- a/src/distillery/mcp/tools/relations.py +++ b/src/distillery/mcp/tools/relations.py @@ -36,7 +36,7 @@ _GRAPH_METRICS_PAGE_SIZE = 1000 _GRAPH_METRICS_MAX_IDS = 100_000 -_VALID_METRICS = {"bridges", "communities"} +_VALID_METRICS = {"bridges", "communities", "constraint", "link_prediction"} _VALID_SCOPES = {"global", "ego"} # --------------------------------------------------------------------------- @@ -582,7 +582,7 @@ async def _handle_metrics( # noqa: PLR0911, PLR0912 # ----- cache lookup ----- from distillery.graph.builders import build_relations_graph from distillery.graph.cache import default_cache - from distillery.graph.metrics import bridges, communities + from distillery.graph.metrics import bridges, communities, constraint, link_prediction cache = default_cache() cache_key = ( @@ -634,6 +634,14 @@ async def _handle_metrics( # noqa: PLR0911, PLR0912 results: list[dict[str, Any]] = [ {"id": node, "score": round(score, 6)} for node, score in ranked ] + elif metric == "constraint": + # Ascending: lowest Burt constraint = strongest structural-hole broker. + ranked = constraint(g, k=limit) + results = [{"id": node, "score": round(score, 6)} for node, score in ranked] + elif metric == "link_prediction": + # entry_id (when given) is the source node — emerging adjacencies for it. 
+ preds = link_prediction(g, source=entry_id_value, k=limit) + results = [{"source": u, "target": v, "score": round(p, 6)} for u, v, p in preds] else: # metric == "communities" comms = communities(g) comms_sorted = sorted(comms, key=lambda c: len(c), reverse=True)[:limit] diff --git a/tests/graph/test_metrics.py b/tests/graph/test_metrics.py index 8b05cecb..a452ecd9 100644 --- a/tests/graph/test_metrics.py +++ b/tests/graph/test_metrics.py @@ -11,11 +11,38 @@ pytest.importorskip("networkx") from distillery.graph.builders import build_relations_graph # noqa: E402 -from distillery.graph.metrics import bridges, communities # noqa: E402 +from distillery.graph.metrics import ( # noqa: E402 + bridges, + communities, + constraint, + link_prediction, +) pytestmark = pytest.mark.unit +def _bowtie() -> list[dict[str, str]]: + """M brokers two otherwise-disconnected pairs: {A,B} and {X,Y}.""" + return [ + {"from_id": "M", "to_id": "A", "relation_type": "link"}, + {"from_id": "M", "to_id": "B", "relation_type": "link"}, + {"from_id": "A", "to_id": "B", "relation_type": "link"}, + {"from_id": "M", "to_id": "X", "relation_type": "link"}, + {"from_id": "M", "to_id": "Y", "relation_type": "link"}, + {"from_id": "X", "to_id": "Y", "relation_type": "link"}, + ] + + +def _shared_neighbors() -> list[dict[str, str]]: + """A and B share neighbours C, D but are not directly connected.""" + return [ + {"from_id": "A", "to_id": "C", "relation_type": "link"}, + {"from_id": "A", "to_id": "D", "relation_type": "link"}, + {"from_id": "B", "to_id": "C", "relation_type": "link"}, + {"from_id": "B", "to_id": "D", "relation_type": "link"}, + ] + + def test_bridges_star_graph_center_first() -> None: """In a star A-{B,C,D}, the centre A has the highest betweenness.""" rels = [ @@ -54,3 +81,43 @@ def test_communities_two_clusters_with_bridge() -> None: assert len(comms) == 2 members = sorted([sorted(c) for c in comms], key=lambda x: x[0]) assert members == [["A", "B", "C"], ["X", "Y", "Z"]] + + +def 
def test_constraint_broker_has_lowest_score() -> None:
    """In a bowtie, the broker M sits in a structural hole -> lowest constraint."""
    g = build_relations_graph(_bowtie(), directed=True)
    ranked = constraint(g, k=10)
    assert ranked
    assert ranked[0][0] == "M"
    scores = dict(ranked)
    # The broker is less constrained than a node embedded in a dense triangle.
    assert scores["M"] < scores["A"]


def test_constraint_empty_graph_returns_empty() -> None:
    """A graph built from no relations yields no constraint rows."""
    g = build_relations_graph([], directed=True)
    assert constraint(g, k=10) == []


def test_link_prediction_source_predicts_shared_neighbour() -> None:
    """From A, the top Adamic-Adar candidate is B (they share C and D)."""
    g = build_relations_graph(_shared_neighbors(), directed=True)
    preds = link_prediction(g, source="A", k=5)
    assert preds
    src, tgt, score = preds[0]
    assert src == "A"
    assert tgt == "B"
    assert score > 0


def test_link_prediction_global_surfaces_shared_pair() -> None:
    """With no source, the A-B (and C-D) non-edges are scored across the graph."""
    g = build_relations_graph(_shared_neighbors(), directed=True)
    preds = link_prediction(g, k=10)
    pairs = {frozenset((u, v)) for u, v, _ in preds}
    assert frozenset(("A", "B")) in pairs
    # C and D are likewise unconnected and share the neighbours A and B, so the
    # global scan must surface that pair too, as the docstring states.
    assert frozenset(("C", "D")) in pairs


def test_link_prediction_unknown_source_returns_empty() -> None:
    """A source that is not a node in the graph produces no predictions."""
    g = build_relations_graph(_shared_neighbors(), directed=True)
    assert link_prediction(g, source="ZZZ", k=5) == []
async def _seed_bowtie(store):  # type: ignore[no-untyped-def]
    """Store entries M, A, B, X, Y linked as a bowtie; return a name -> id map.

    M is linked to all four other entries, and A-B and X-Y are linked, so M
    is the only connection between the pairs {A, B} and {X, Y}.
    """
    ids = {}
    for name in "MABXY":
        ids[name] = await _store_entry(store, content=f"entry {name}")

    edges = (("M", "A"), ("M", "B"), ("A", "B"), ("M", "X"), ("M", "Y"), ("X", "Y"))
    for src, dst in edges:
        await store.add_relation(ids[src], ids[dst], "link")
    return ids


async def test_metrics_constraint_broker_first(store) -> None:  # type: ignore[no-untyped-def]
    """The ``constraint`` metric returns id/score rows with broker M first."""
    pytest.importorskip("networkx")

    ids = await _seed_bowtie(store)
    request = {"action": "metrics", "metric": "constraint", "scope": "global"}
    data = _parse(await _handle_relations(store, request))

    assert data.get("error") is not True
    assert data["metric"] == "constraint"
    rows = data["results"]
    assert isinstance(rows, list) and rows
    assert all("id" in row and "score" in row for row in rows)
    # The broker M has the lowest Burt constraint -> ranked first.
    assert rows[0]["id"] == ids["M"]
async def test_metrics_link_prediction_with_source(store) -> None:  # type: ignore[no-untyped-def]
    """With ``entry_id`` set to A, the top predicted edge is A -> B."""
    pytest.importorskip("networkx")

    # A and B share neighbours C, D but are not directly connected.
    ids = {}
    for name in "ABCD":
        ids[name] = await _store_entry(store, content=f"entry {name}")
    for src, dst in (("A", "C"), ("A", "D"), ("B", "C"), ("B", "D")):
        await store.add_relation(ids[src], ids[dst], "link")

    request = {
        "action": "metrics",
        "metric": "link_prediction",
        "scope": "global",
        "entry_id": ids["A"],
    }
    data = _parse(await _handle_relations(store, request))

    assert data.get("error") is not True
    assert data["metric"] == "link_prediction"
    rows = data["results"]
    assert isinstance(rows, list) and rows
    best = rows[0]
    assert best["source"] == ids["A"]
    assert best["target"] == ids["B"]
    assert best["score"] > 0


async def test_metrics_invalid_metric_rejected(store) -> None:  # type: ignore[no-untyped-def]
    """An unknown metric name is rejected with INVALID_PARAMS."""
    pytest.importorskip("networkx")

    request = {"action": "metrics", "metric": "pagerank", "scope": "global"}
    data = _parse(await _handle_relations(store, request))

    assert data.get("error") is True
    assert data["code"] == "INVALID_PARAMS"
    # Error message should enumerate the now-expanded metric set.
    message = data["message"]
    assert "constraint" in message and "link_prediction" in message