From 5c363b2772e1582a19c4c107dd747e5193f4afc3 Mon Sep 17 00:00:00 2001 From: Andrew Maiorov Date: Mon, 11 May 2026 17:56:34 +0200 Subject: [PATCH 1/6] feat(autoheal): compute heal plan for overlapping partitions This commit reworks the heal plan computation algorithm to work in environments where overlapping partitions are permitted, while keeping it functionally equivalent otherwise. The algorithm is essentially: the smallest set of nodes directly connected to any partitioned node must be healed, i.e. these nodes must reboot and rejoin the cluster. --- src/mria_autoheal.erl | 187 ++++++++++++++++++++++++++++------- test/mria_autoheal_SUITE.erl | 2 +- 2 files changed, 154 insertions(+), 35 deletions(-) diff --git a/src/mria_autoheal.erl b/src/mria_autoheal.erl index b7c826d..8c405a8 100644 --- a/src/mria_autoheal.erl +++ b/src/mria_autoheal.erl @@ -25,12 +25,14 @@ -record(autoheal, {delay, role, proc, timer}). -type autoheal() :: #autoheal{}. +-type cluster_view() :: {node(), [node()], [node()]}. -export_type([autoheal/0]). -include_lib("snabbkaffe/include/trace.hrl"). -define(DEFAULT_DELAY, 15000). +-define(CLUSTER_RPC_TIMEOUT, 5000). -define(LOG(Level, Format, Args), logger:Level("Mria(Autoheal): " ++ Format, Args)). 
@@ -77,24 +79,12 @@ handle_msg({report_partition, Node}, Autoheal = #autoheal{delay = Delay, timer = handle_msg(Msg = {create_splitview, Node}, Autoheal = #autoheal{delay = Delay, timer = TRef}) when Node =:= node() -> ensure_cancel_timer(TRef), - case is_majority_alive() of + Nodes = mria_mnesia:db_nodes(), + ClusterViews = collect_cluster_views(Nodes), + HasMajority = length(ClusterViews) > length(Nodes) div 2, + case HasMajority of true -> - Nodes = mria_mnesia:db_nodes(), - RPCResult = erpc:multicall(Nodes, mria_mnesia, running_nodes, []), - SplitView = lists:foldl(fun({N, Result}, Acc) -> - case Result of - {ok, Peers} -> - Acc #{N => Peers}; - _ -> - %% Ignore unreachable nodes: - Acc - end - end, - #{}, - lists:zip(Nodes, RPCResult)), - Cliques = lists:sort(fun compare_cliques/2, - mria_lib:find_clusters(SplitView)), - mria_node_monitor:cast(coordinator(Cliques), {heal_partition, Cliques}), + apply_heal_plan(ClusterViews), Autoheal#autoheal{timer = undefined}; false -> Autoheal#autoheal{timer = mria_node_monitor:run_after(Delay, {autoheal, Msg})} @@ -118,38 +108,107 @@ handle_msg({heal_partition, Cliques}, Autoheal= #autoheal{proc = _Proc}) -> handle_msg({'EXIT', Pid, normal}, Autoheal = #autoheal{proc = Pid}) -> Autoheal#autoheal{proc = undefined}; -handle_msg({'EXIT', Pid, Reason}, Autoheal = #autoheal{proc = Pid}) -> - ?LOG(critical, "Autoheal process crashed: ~s", [Reason]), +handle_msg({'EXIT', Pid, Reason}, Autoheal = #autoheal{delay = Delay, proc = Pid}) -> + ?LOG(critical, "Autoheal process crashed: ~p", [Reason]), + mria_node_monitor:run_after(Delay, confirm_partition), Autoheal#autoheal{proc = undefined}; handle_msg(Msg, Autoheal) -> ?LOG(critical, "Unexpected msg: ~p", [Msg, Autoheal]), Autoheal. -compare_cliques(Running1, Running2) -> +-spec collect_cluster_views([node()]) -> [cluster_view()]. 
+collect_cluster_views(Nodes) -> + RPCResult = erpc:multicall(Nodes, mria_mnesia, cluster_view, [], ?CLUSTER_RPC_TIMEOUT), + [ {Node, Running, Stopped} + %% Ignore unreachable nodes: + || {Node, {ok, {Running, Stopped}}} <- lists:zip(Nodes, RPCResult)]. + +-spec apply_heal_plan([cluster_view()]) -> ok. +apply_heal_plan(ClusterViews) -> + {Survivors, Victims, SplitView} = find_split_view(ClusterViews), + Coordinator = case Survivors of + [_ | _] -> coordinator(Survivors); + [] -> node() + end, + case SplitView of + [] -> ok; + _ -> ?tp(info, mria_autoheal_plan, #{ survivors => Survivors + , victims => Victims + , split_view => SplitView + , coordinator => Coordinator + }) + end, + case Victims of + [_ | _] -> + mria_node_monitor:cast(Coordinator, + {heal_partition, [Survivors, Victims]}); + false -> + ok + end. + +find_split_view(ClusterViews) -> + ClusterViewsSorted = lists:sort(fun compare_cluster_view/2, ClusterViews), + SplitView = compute_split_view(ClusterViewsSorted), + {Survivors, Partitioned} = compute_heal_plan(SplitView), + Victims = [N || N <- Partitioned, lists:keymember(N, 1, ClusterViews)], + {Survivors, Victims, SplitView}. + +compare_cluster_view({_N1, Running1, _Partitioned1}, {_N2, Running2, _Partitioned2}) -> Len1 = length(Running1), Len2 = length(Running2), if - Len1 > Len2 -> true; - Len1 == Len2 -> lists:member(node(), Running1); - true -> false + %% Prefer partitions with higher number of surviving nodes. + Len1 > Len2 -> true; + Len1 < Len2 -> false; + %% If number of nodes is the same, sort by list of running nodes. + true -> Running1 < Running2 end. --spec coordinator([[node()]]) -> node(). -coordinator([Majority | _]) -> - mria_membership:coordinator(Majority). +compute_split_view([{_Node, _Running, []} | Views]) -> + %% Node observes no partitions, ignore. + compute_split_view(Views); +compute_split_view([{Node, Running, Partitioned} | Views]) -> + %% Node observes some nodes as partitioned from it. 
+ %% These nodes need to be rebooted, and as such they should not be part of the split view. + ViewsPartitioned = [PV || PV = {PN, _, _} <- Views, lists:member(PN, Partitioned)], + ViewsRest = Views -- ViewsPartitioned, + %% Taints are nodes connected to the partitioned nodes that should also be rebooted: + %% these nodes could have replicated writes from partitioned nodes that were not seen by + %% other nodes. + Taints = lists:append([PRunning || {_, PRunning, _} <- ViewsPartitioned]), + ViewTainted = {Node, Running -- Taints, lists:usort(Partitioned ++ Taints)}, + [ViewTainted | compute_split_view(ViewsRest)]; +compute_split_view([]) -> + []. + +compute_heal_plan(SplitView) -> + %% If we have more than one parition in split view, we need to reboot _all_ of the nodes + %% in each view's partition (i.e. ⋃(Partitioned)). Then we need to find candidates to do + %% it, as ⋃(Running) ∖ ⋃(Partitioned). + {_Nodes, Rs, Ps} = lists:unzip3(SplitView), + URunning = ordsets:union([ordsets:from_list(R) || R <- Rs]), + UPartitioned = ordsets:union([ordsets:from_list(P) || P <- Ps]), + {ordsets:subtract(URunning, UPartitioned), UPartitioned}. + +-spec coordinator([node()]) -> node(). +coordinator(Candidates) -> + case lists:member(node(), Candidates) of + true -> node(); + false -> mria_membership:coordinator(Candidates) + end. -spec heal_partition([[node()]]) -> ok. heal_partition([[_Majority]]) -> %% There are no partitions: ok; heal_partition([Majority|Minorities]) -> - Result = reboot_minority(lists:append(Minorities)), + Result = reboot_partitioned(lists:append(Minorities)), mria_lib:exec_callback(heal_partition, {Majority, Minorities}), Result. -reboot_minority(Minority) -> - ?tp(info, "Rebooting minority", #{nodes => Minority}), - lists:foreach(fun rejoin/1, Minority). +reboot_partitioned(Nodes) -> + ?tp(info, "Rebooting partitions", #{nodes => Nodes}), + lists:foreach(fun rejoin/1, Nodes). 
rejoin(Node) -> Ret = rpc:call(Node, mria, join, [node(), heal]), @@ -163,7 +222,67 @@ ensure_cancel_timer(undefined) -> ensure_cancel_timer(TRef) -> catch erlang:cancel_timer(TRef). -is_majority_alive() -> - All = mria_mnesia:cluster_nodes(all), - NotAliveLen = length(All -- [node() | nodes()]), - NotAliveLen < (length(All) div 2). +%%================================================================================ +%% Unit tests +%%================================================================================ + +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). +split_view_no_partition_test_() -> + ?_assertMatch({_, [], []}, + find_split_view([ {1, [1, 2, 3], []} + , {2, [1, 2, 3], []} + , {3, [1, 2, 3], []} + ])). + +split_view_symmetric_partition_test_() -> + [ ?_assertMatch({[2, 3], [1], _}, + find_split_view([ {1, [1, 2, 3], []} + , {2, [2, 3], [1]} + , {3, [2, 3], [1]} + ])) + , ?_assertMatch({[1, 2], [3, 4], _}, + find_split_view([ {1, [1, 2], [3, 4]} + , {2, [1, 2], [3, 4]} + , {3, [3, 4], [1, 2]} + , {4, [3, 4], [1, 2]} + ])) + , ?_assertMatch({[1, 2, 3], [4, 5, 6], _}, + find_split_view([ {1, [1, 2, 3], [4, 5, 6]} + , {2, [1, 2, 3], [4, 5, 6]} + , {3, [1, 2, 3], [4, 5, 6]} + , {4, [4, 5], [1, 2, 3, 6]} + , {5, [4, 5], [1, 2, 3, 6]} + , {6, [4, 5, 6], [1, 2, 3]} + ])) + ]. + +split_view_full_split_test_() -> + ?_assertMatch({[1], [2, 3, 4], _}, + find_split_view([ {1, [1], [2, 3, 4]} + , {2, [2], [1, 3, 4]} + , {3, [3], [1, 2, 4]} + , {4, [4], [1, 2, 3]} + ])). + +split_view_overlapping_partition_test_() -> + [ ?_assertMatch({[], [1, 2, 3, 4], _}, + find_split_view([ {1, [1, 4], [2, 3]} + , {2, [2, 3], [1, 4]} + , {3, [2, 3, 4], [1]} + , {4, [1, 3, 4], [2]}])) + , ?_assertMatch({[3], [1, 2, 4], _}, + find_split_view([ {1, [1, 2, 3, 4], []} + , {2, [1, 2, 3, 4], []} + , {3, [1, 2, 3], [4]} + , {4, [1, 2, 4], [3]}])) + ]. 
+ +split_view_unreachable_node_test_() -> + ?_assertMatch({_, [], _}, + find_split_view([ {1, [1, 2, 3, 4], [5]} + , {2, [1, 2, 3, 4], [5]} + , {3, [1, 2, 3, 4], [5]} + , {4, [1, 2, 3, 4], [5]}])). + +-endif. diff --git a/test/mria_autoheal_SUITE.erl b/test/mria_autoheal_SUITE.erl index f0221d6..571284b 100644 --- a/test/mria_autoheal_SUITE.erl +++ b/test/mria_autoheal_SUITE.erl @@ -200,7 +200,7 @@ assert_replicant_bootstrapped(R, C, Trace) -> %% Verify that mria callbacks have been executed during heal prop_callbacks(Trace0) -> {Trace, _} = ?split_trace_at(#{?snk_kind := teardown_cluster}, Trace0), - {_, [HealEvent|AfterHeal]} = ?split_trace_at(#{?snk_kind := "Rebooting minority"}, Trace), + {_, [HealEvent|AfterHeal]} = ?split_trace_at(#{?snk_kind := "Rebooting partitions"}, Trace), #{nodes := Minority} = HealEvent, %% Check that all minority nodes have been restarted: [?assert( From 80402a52c20659658b1b85927cb09d60a96a7ee0 Mon Sep 17 00:00:00 2001 From: Andrew Maiorov Date: Tue, 12 May 2026 13:49:43 +0200 Subject: [PATCH 2/6] test(autoheal): verify autoheal under overlapping partitions --- test/mria_autoheal_SUITE.erl | 38 ++++++++++++++++++++++++++++++++++++ test/mria_ct.erl | 6 ++++-- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/test/mria_autoheal_SUITE.erl b/test/mria_autoheal_SUITE.erl index 571284b..490f576 100644 --- a/test/mria_autoheal_SUITE.erl +++ b/test/mria_autoheal_SUITE.erl @@ -88,6 +88,44 @@ t_autoheal_with_replicants(Config) when is_list(Config) -> end, [fun ?MODULE:prop_callbacks/1]). 
+t_autoheal_overlapping_paritions(Config) when is_list(Config) -> + Cluster = mria_ct:cluster([core, core, core, core], + [{mria, cluster_autoheal, 200}], + [{beam_args, "-kernel prevent_overlapping_partitions false"}]), + ?check_trace( + #{timetrap => 25000}, + try + Nodes = [N1, N2, N3, N4] = mria_ct:start_cluster(mria, Cluster), + %% Simulate netsplit: + true = rpc:cast(N4, erlang, disconnect_node, [N3]), + ok = timer:sleep(1000), + %% Nodes report overlapping partitions: + ?assertMatch({[N1, N2, N3, N4], []}, view(N1)), + ?assertMatch({[N1, N2, N3, N4], []}, view(N2)), + ?assertMatch({[N1, N2, N3], [N4]}, view(N3)), + ?assertMatch({[N1, N2, N4], [N3]}, view(N4)), + %% Wait for autoheal, it should happen automatically: + ?retry(1000, 20, + begin + ?assertMatch({Nodes, []}, view(N1)), + ?assertMatch({Nodes, []}, view(N2)), + ?assertMatch({Nodes, []}, view(N3)), + ?assertMatch({Nodes, []}, view(N4)) + end), + Nodes + after + ok = mria_ct:teardown_cluster(Cluster) + end, + [ fun ?MODULE:prop_callbacks/1 + , fun([N1, N2, N3, N4], Trace) -> + %% Partitioned node N4 tainted N1 and N2, all of them should be restarted: + ?assertMatch( [#{survivors := [N3], victims := [N1, N2, N4]}] + , ?of_kind(mria_autoheal_plan, Trace)), + ?assertMatch( [#{nodes := [N1, N2, N4]}] + , ?of_kind("Rebooting partitions", Trace)) + end + ]). + t_autoheal_majority_reachable(Config) when is_list(Config) -> Cluster = mria_ct:cluster([core, core, core, core, core], [{mria, cluster_autoheal, 200}]), ?check_trace( diff --git a/test/mria_ct.erl b/test/mria_ct.erl index 0ff3e85..4feaaca 100644 --- a/test/mria_ct.erl +++ b/test/mria_ct.erl @@ -87,6 +87,7 @@ cluster(Specs0, CommonEnv, ClusterOpts) -> , number => Number , role => Role , code_paths => CodePaths + , beam_args => proplists:get_value(beam_args, ClusterOpts, "") , cover => Cover } || #{role := Role, name := Name, env := Env, code_paths := CodePaths, num := Number, cover := Cover} <- Specs]. 
@@ -103,10 +104,11 @@ start_cluster(mria_async, Specs) -> spawn(fun() -> [start_mria(I) || I <- Specs] end), Ret. -start_slave(node, #{name := Name, env := Env, code_paths := CodePaths, cover := Cover}) -> +start_slave(node, #{name := Name, env := Env, code_paths := CodePaths, cover := Cover} = Spec) -> CommonBeamOpts = "+S 1:1 " % We want VMs to only occupy a single core "-kernel inet_dist_listen_min 3000 " % Avoid collisions with gen_rpc ports - "-kernel inet_dist_listen_max 3050 ", + "-kernel inet_dist_listen_max 3050 " + ++ maps:get(beam_args, Spec, "") ++ " ", Node = do_start_slave(Name, CommonBeamOpts), Self = filename:dirname(code:which(?MODULE)), [rpc:call(Node, code, add_patha, [Path]) || Path <- [Self|CodePaths]], From 1281b19f2488fac2975194cd1931f952712df005 Mon Sep 17 00:00:00 2001 From: Andrew Maiorov Date: Tue, 12 May 2026 20:40:40 +0200 Subject: [PATCH 3/6] feat(autoheal): improve heal plan coverage This commit changes heal plan algorithm to instead consider fully connected sub-clusters in (potentially overlapping) cluster cliques computed from cluster partitions. This, for example, improves plans for situations where only a single link between 2 nodes is broken: only those 2 nodes will be asked to rejoin. --- src/mria_autoheal.erl | 139 ++++++++++++++++------------------- src/mria_lib.erl | 104 ++++++++++++++++++++++++++ test/mria_autoheal_SUITE.erl | 6 +- 3 files changed, 172 insertions(+), 77 deletions(-) diff --git a/src/mria_autoheal.erl b/src/mria_autoheal.erl index 8c405a8..5f5b7e2 100644 --- a/src/mria_autoheal.erl +++ b/src/mria_autoheal.erl @@ -126,69 +126,59 @@ collect_cluster_views(Nodes) -> -spec apply_heal_plan([cluster_view()]) -> ok. 
apply_heal_plan(ClusterViews) -> - {Survivors, Victims, SplitView} = find_split_view(ClusterViews), - Coordinator = case Survivors of - [_ | _] -> coordinator(Survivors); - [] -> node() - end, - case SplitView of - [] -> ok; - _ -> ?tp(info, mria_autoheal_plan, #{ survivors => Survivors - , victims => Victims - , split_view => SplitView - , coordinator => Coordinator - }) - end, - case Victims of - [_ | _] -> - mria_node_monitor:cast(Coordinator, - {heal_partition, [Survivors, Victims]}); - false -> + case find_split_view(ClusterViews) of + SplitView = [Survivors | Rest] -> + Victims = lists:usort(lists:append(Rest)), + Coordinator = coordinator(Survivors), + ?tp(info, mria_autoheal_plan, #{ survivors => Survivors + , victims => Victims + , split_view => SplitView + , coordinator => Coordinator + }), + case Victims of + [_ | _] -> + mria_node_monitor:cast(Coordinator, + {heal_partition, [Survivors, Victims]}); + [] -> + ok + end; + [] -> ok end. find_split_view(ClusterViews) -> - ClusterViewsSorted = lists:sort(fun compare_cluster_view/2, ClusterViews), - SplitView = compute_split_view(ClusterViewsSorted), - {Survivors, Partitioned} = compute_heal_plan(SplitView), - Victims = [N || N <- Partitioned, lists:keymember(N, 1, ClusterViews)], - {Survivors, Victims, SplitView}. - -compare_cluster_view({_N1, Running1, _Partitioned1}, {_N2, Running2, _Partitioned2}) -> - Len1 = length(Running1), Len2 = length(Running2), - if - %% Prefer partitions with higher number of surviving nodes. - Len1 > Len2 -> true; - Len1 < Len2 -> false; - %% If number of nodes is the same, sort by list of running nodes. - true -> Running1 < Running2 - end. + Cluster = maps:from_list([{N, Connected} || {N, Connected, _} <- ClusterViews]), + Cliques = mria_lib:find_cliques(Cluster), + compute_split_view(Cliques). -compute_split_view([{_Node, _Running, []} | Views]) -> - %% Node observes no partitions, ignore. 
- compute_split_view(Views); -compute_split_view([{Node, Running, Partitioned} | Views]) -> - %% Node observes some nodes as partitioned from it. - %% These nodes need to be rebooted, and as such they should not be part of the split view. - ViewsPartitioned = [PV || PV = {PN, _, _} <- Views, lists:member(PN, Partitioned)], - ViewsRest = Views -- ViewsPartitioned, - %% Taints are nodes connected to the partitioned nodes that should also be rebooted: - %% these nodes could have replicated writes from partitioned nodes that were not seen by - %% other nodes. - Taints = lists:append([PRunning || {_, PRunning, _} <- ViewsPartitioned]), - ViewTainted = {Node, Running -- Taints, lists:usort(Partitioned ++ Taints)}, - [ViewTainted | compute_split_view(ViewsRest)]; compute_split_view([]) -> - []. + []; +compute_split_view(Cliques0) -> + %% Find if there are overlaps involving largest clique. + %% If there is, split the overlap and repeat. + Cliques1 = [C0 | Rest] = lists:sort(fun compare_clique/2, Cliques0), + case isolate_overlaps(C0, Rest, []) of + no_overlaps -> Cliques1; + Cliques -> compute_split_view(Cliques) + end. -compute_heal_plan(SplitView) -> - %% If we have more than one parition in split view, we need to reboot _all_ of the nodes - %% in each view's partition (i.e. ⋃(Partitioned)). Then we need to find candidates to do - %% it, as ⋃(Running) ∖ ⋃(Partitioned). - {_Nodes, Rs, Ps} = lists:unzip3(SplitView), - URunning = ordsets:union([ordsets:from_list(R) || R <- Rs]), - UPartitioned = ordsets:union([ordsets:from_list(P) || P <- Ps]), - {ordsets:subtract(URunning, UPartitioned), UPartitioned}. +isolate_overlaps(C0, [C1 | Cs], Acc) -> + case ordsets:intersection(C0, C1) of + [] -> isolate_overlaps(C0, Cs, [C1 | Acc]); + CX -> + %% If C0 overlaps C1, replace them with [C0 ∩ C1, C0 \ C1, C1 \ C0]. 
+ CD0 = ordsets:subtract(C0, C1), + CD1 = ordsets:subtract(C1, C0), + [CX] ++ [CD0 || CD0 =/= []] ++ [CD1 || CD1 =/= []] ++ Acc ++ Cs + end; +isolate_overlaps(_C0, [], _Acc) -> + no_overlaps. + +compare_clique(C0, C1) -> + case length(C0) - length(C1) of + 0 -> C0 =< C1; + N -> N > 0 + end. -spec coordinator([node()]) -> node(). coordinator(Candidates) -> @@ -229,25 +219,25 @@ ensure_cancel_timer(TRef) -> -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). split_view_no_partition_test_() -> - ?_assertMatch({_, [], []}, + ?_assertMatch([[1, 2, 3]], find_split_view([ {1, [1, 2, 3], []} , {2, [1, 2, 3], []} , {3, [1, 2, 3], []} ])). split_view_symmetric_partition_test_() -> - [ ?_assertMatch({[2, 3], [1], _}, + [ ?_assertMatch([[2, 3], [1]], find_split_view([ {1, [1, 2, 3], []} , {2, [2, 3], [1]} , {3, [2, 3], [1]} ])) - , ?_assertMatch({[1, 2], [3, 4], _}, + , ?_assertMatch([[1, 2], [3, 4]], find_split_view([ {1, [1, 2], [3, 4]} , {2, [1, 2], [3, 4]} , {3, [3, 4], [1, 2]} , {4, [3, 4], [1, 2]} ])) - , ?_assertMatch({[1, 2, 3], [4, 5, 6], _}, + , ?_assertMatch([[1, 2, 3], [4, 5], [6]], find_split_view([ {1, [1, 2, 3], [4, 5, 6]} , {2, [1, 2, 3], [4, 5, 6]} , {3, [1, 2, 3], [4, 5, 6]} @@ -258,7 +248,7 @@ split_view_symmetric_partition_test_() -> ]. split_view_full_split_test_() -> - ?_assertMatch({[1], [2, 3, 4], _}, + ?_assertMatch([[1], [2], [3], [4]], find_split_view([ {1, [1], [2, 3, 4]} , {2, [2], [1, 3, 4]} , {3, [3], [1, 2, 4]} @@ -266,23 +256,24 @@ split_view_full_split_test_() -> ])). 
split_view_overlapping_partition_test_() -> - [ ?_assertMatch({[], [1, 2, 3, 4], _}, + [ ?_assertMatch([[1], [2], [3], [4]], find_split_view([ {1, [1, 4], [2, 3]} , {2, [2, 3], [1, 4]} , {3, [2, 3, 4], [1]} , {4, [1, 3, 4], [2]}])) - , ?_assertMatch({[3], [1, 2, 4], _}, - find_split_view([ {1, [1, 2, 3, 4], []} - , {2, [1, 2, 3, 4], []} - , {3, [1, 2, 3], [4]} - , {4, [1, 2, 4], [3]}])) + , ?_assertMatch([[1, 2, 3], [4], [5]], + find_split_view([ {1, [1, 2, 3, 4, 5], []} + , {2, [1, 2, 3, 4, 5], []} + , {3, [1, 2, 3, 4, 5], []} + , {4, [1, 2, 3, 4], [5]} + , {5, [1, 2, 3, 5], [4]}])) + , ?_assertMatch([[1, 2], [3, 4], [5], [6]], + find_split_view([ {1, [1, 2], [3, 4, 5]} + , {2, [1, 2], [3, 4, 5]} + , {3, [3, 4, 5, 6], [1, 2]} + , {4, [3, 4, 5, 6], [1, 2]} + , {5, [3, 4, 5], [1, 2, 6]} + , {6, [3, 4, 6], [1, 2, 5]}])) ]. -split_view_unreachable_node_test_() -> - ?_assertMatch({_, [], _}, - find_split_view([ {1, [1, 2, 3, 4], [5]} - , {2, [1, 2, 3, 4], [5]} - , {3, [1, 2, 3, 4], [5]} - , {4, [1, 2, 3, 4], [5]}])). - -endif. diff --git a/src/mria_lib.erl b/src/mria_lib.erl index 6fd7109..0e81fcc 100644 --- a/src/mria_lib.erl +++ b/src/mria_lib.erl @@ -44,6 +44,7 @@ , unwrap_exception/1 , find_clusters/1 + , find_cliques/1 ]). -export_type([ subscriber/0 @@ -157,6 +158,19 @@ rpc_cast(Destination, Module, Function, Args) -> find_clusters(ClusterView) -> find_clusters(maps:keys(ClusterView), ClusterView, []). +%% Enumerate cliques in the graph. +%% Graph is undirected, edge is considered to exist if 2 vertices have each other in +%% adjacency lists. +-spec find_cliques(#{V => [V]}) -> [[V]]. +find_cliques(G0) -> + G = maps:map(fun(V, _) -> mutuals(V, G0) end, G0), + Vs = ordsets:from_list(maps:keys(G)), + DegreeOrder = lists:sort( + fun(V1, V2) -> length(maps:get(V1, G)) >= length(maps:get(V2, G)) end, + Vs + ), + bron_kerbosch(G, DegreeOrder, _R = [], Vs, _X = [], []). 
+ %%================================================================================ %% Misc functions %%================================================================================ @@ -283,12 +297,50 @@ find_clusters([Node|Rest], NodeInfo, Acc) -> Cluster = lists:usort([Node|MutualConnections]), find_clusters(Rest -- MutualConnections, NodeInfo, [Cluster|Acc]). +%% Returns set of vertices in `G' mutually connected to `V'. +mutuals(V, G) -> + Ns = ordsets:from_list(maps:get(V, G) -- [V]), + ordsets:filter(fun(Vn) -> lists:member(V, maps:get(Vn, G)) end, Ns). + +%% Enumerates cliques in the given graph recursively. +%% Refer to Bron-Kerbosh algorithm for details. +bron_kerbosch(_G, _Order, R, [], [], Acc) -> + [R | Acc]; +bron_kerbosch(G, Order, R0, P0, X0, Acc0) -> + {value, VPivot} = lists:search( + fun(V) -> ordsets:is_element(V, P0) orelse ordsets:is_element(V, X0) end, + Order + ), + Vs = ordsets:subtract(P0, maps:get(VPivot, G)), + {_, _, Acc} = lists:foldl( + fun(V, {P1, X1, Acc1}) -> + Nv = maps:get(V, G), + Acc = bron_kerbosch( G + , Order + , ordsets:union(R0, [V]) + , ordsets:intersection(P1, Nv) + , ordsets:intersection(X1, Nv) + , Acc1), + P = ordsets:subtract(P1, [V]), + X = ordsets:union(X1, [V]), + {P, X, Acc} + end, + {P0, X0, Acc0}, + Vs + ), + Acc. + %%================================================================================ %% Unit tests %%================================================================================ -ifdef(TEST). + -include_lib("eunit/include/eunit.hrl"). +-undef(LET). + +-include_lib("proper/include/proper_common.hrl"). +-include_lib("snabbkaffe/include/test_macros.hrl"). find_clusters_test_() -> [ ?_assertMatch( [[1, 2, 3]] @@ -313,4 +365,56 @@ find_clusters_test_() -> })) ) ]. 
+ +find_cliques_test_() -> + [ + ?_assertMatch( [[1, 2, 3]] + , lists:sort(find_cliques(#{ 1 => [1, 2, 3] + , 2 => [2, 1, 3] + , 3 => [2, 3, 1] + })) + ) + , ?_assertMatch( [[1], [2, 3]] + , lists:sort(find_cliques(#{ 1 => [1, 2, 3] + , 2 => [2, 3] + , 3 => [3, 2] + })) + ) + , ?_assertMatch( [[1, 2, 3], [4, 5], [6]] + , lists:sort(find_cliques(#{ 1 => [1, 2, 3] + , 2 => [1, 2, 3] + , 3 => [3, 2, 1] + , 4 => [4, 5] + , 5 => [4, 5] + , 6 => [6, 4, 5] + })) + ) + %% Overlapping cliques: + , ?_assertMatch( [[1, 2, 3], [1, 2, 4]] + , lists:sort(find_cliques(#{ 1 => [1, 2, 3, 4] + , 2 => [1, 2, 3, 4] + , 3 => [1, 2, 3] + , 4 => [1, 2, 4] + })) + ) + , ?_assertMatch( [[1, 4], [2, 3], [3, 4]] + , lists:sort(find_cliques(#{1 => [1, 4] + , 2 => [2, 3] + , 3 => [2, 3, 4] + , 4 => [1, 3, 4] + })) + ) + ]. + +prop_test_() -> + Config = [{proper, #{numtests => 100, max_size => 300, timeout => 30000}}], + {timeout, 30, ?_test(?run_prop(Config, ?FORALL(G, t_graph(), is_list(find_cliques(G)))))}. + +t_graph() -> + ?LET(N, proper_types:non_neg_integer(), + ?LET(L, [ {I, ?LET(Vs, proper_types:list(proper_types:range(1, N)), lists:usort(Vs))} + || I <- lists:seq(1, N) + ], + maps:from_list(L))). + -endif. %% TEST diff --git a/test/mria_autoheal_SUITE.erl b/test/mria_autoheal_SUITE.erl index 490f576..23b68ca 100644 --- a/test/mria_autoheal_SUITE.erl +++ b/test/mria_autoheal_SUITE.erl @@ -118,10 +118,10 @@ t_autoheal_overlapping_paritions(Config) when is_list(Config) -> end, [ fun ?MODULE:prop_callbacks/1 , fun([N1, N2, N3, N4], Trace) -> - %% Partitioned node N4 tainted N1 and N2, all of them should be restarted: - ?assertMatch( [#{survivors := [N3], victims := [N1, N2, N4]}] + %% Both N3 and N4 are potentially inconsistent and should be restarted: + ?assertMatch( [#{survivors := [N1, N2], victims := [N3, N4]}] , ?of_kind(mria_autoheal_plan, Trace)), - ?assertMatch( [#{nodes := [N1, N2, N4]}] + ?assertMatch( [#{nodes := [N3, N4]}] , ?of_kind("Rebooting partitions", Trace)) end ]). 
From 3ef4d380c28f7d09cbe17f1b52ee649b80d83364 Mon Sep 17 00:00:00 2001 From: Andrew Maiorov Date: Tue, 12 May 2026 20:53:52 +0200 Subject: [PATCH 4/6] test(autoheal): add more complex testcase --- test/mria_autoheal_SUITE.erl | 42 +++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/test/mria_autoheal_SUITE.erl b/test/mria_autoheal_SUITE.erl index 23b68ca..cb73fd8 100644 --- a/test/mria_autoheal_SUITE.erl +++ b/test/mria_autoheal_SUITE.erl @@ -88,7 +88,7 @@ t_autoheal_with_replicants(Config) when is_list(Config) -> end, [fun ?MODULE:prop_callbacks/1]). -t_autoheal_overlapping_paritions(Config) when is_list(Config) -> +t_autoheal_overlapping_parition(Config) when is_list(Config) -> Cluster = mria_ct:cluster([core, core, core, core], [{mria, cluster_autoheal, 200}], [{beam_args, "-kernel prevent_overlapping_partitions false"}]), @@ -126,6 +126,46 @@ t_autoheal_overlapping_paritions(Config) when is_list(Config) -> end ]). +t_autoheal_complex_overlapping_paritions(Config) when is_list(Config) -> + Cluster = mria_ct:cluster([core, core, core, core], + [{mria, cluster_autoheal, 200}], + [{beam_args, "-kernel prevent_overlapping_partitions false"}]), + ?check_trace( + #{timetrap => 25000}, + try + Nodes = [N1, N2, N3, N4] = mria_ct:start_cluster(mria, Cluster), + %% Simulate netsplit: + true = rpc:cast(N1, erlang, disconnect_node, [N2]), + true = rpc:cast(N1, erlang, disconnect_node, [N3]), + true = rpc:cast(N2, erlang, disconnect_node, [N4]), + ok = timer:sleep(1000), + %% Nodes report overlapping partitions: + ?assertMatch({[N1, N4], [N2, N3]}, view(N1)), + ?assertMatch({[N2, N3], [N1, N4]}, view(N2)), + ?assertMatch({[N2, N3, N4], [N1]}, view(N3)), + ?assertMatch({[N1, N3, N4], [N2]}, view(N4)), + %% Wait for autoheal, it should happen automatically: + ?retry(1000, 20, + begin + ?assertMatch({Nodes, []}, view(N1)), + ?assertMatch({Nodes, []}, view(N2)), + ?assertMatch({Nodes, []}, view(N3)), + ?assertMatch({Nodes, []}, 
view(N4)) + end), + Nodes + after + ok = mria_ct:teardown_cluster(Cluster) + end, + [ fun ?MODULE:prop_callbacks/1 + , fun([N1, N2, N3, N4], Trace) -> + %% All but one node are potentially inconsistent and should be restarted: + ?assertMatch( [#{survivors := [N1], victims := [N2, N3, N4]}] + , ?of_kind(mria_autoheal_plan, Trace)), + ?assertMatch( [#{nodes := [N2, N3, N4]}] + , ?of_kind("Rebooting partitions", Trace)) + end + ]). + t_autoheal_majority_reachable(Config) when is_list(Config) -> Cluster = mria_ct:cluster([core, core, core, core, core], [{mria, cluster_autoheal, 200}]), ?check_trace( From 0b8ccee4c38a70175be2c46bdf657f8abe34923a Mon Sep 17 00:00:00 2001 From: Andrew Maiorov Date: Thu, 14 May 2026 15:47:28 +0200 Subject: [PATCH 5/6] refactor(autoheal): simplify computations This commit changes the core of the split view computation algorithm from "overlapping cliques" analysis to the "reachability matrix" approach. The primary observation is that the largest set of nodes that agree on their reachability (and therefore consistency) should have equal vectors in the cluster reachability matrix. This is likely to produce the same results as the "overlapping cliques" approach (at least as far as tests show) but is much cheaper. --- src/mria_autoheal.erl | 187 +++++++++++++++++++++++++++++++++++------- src/mria_lib.erl | 104 ----------------------- 2 files changed, 157 insertions(+), 134 deletions(-) diff --git a/src/mria_autoheal.erl b/src/mria_autoheal.erl index 5f5b7e2..b49a9dc 100644 --- a/src/mria_autoheal.erl +++ b/src/mria_autoheal.erl @@ -146,35 +146,61 @@ apply_heal_plan(ClusterViews) -> ok end. -find_split_view(ClusterViews) -> - Cluster = maps:from_list([{N, Connected} || {N, Connected, _} <- ClusterViews]), - Cliques = mria_lib:find_cliques(Cluster), - compute_split_view(Cliques). - -compute_split_view([]) -> - []; -compute_split_view(Cliques0) -> - %% Find if there are overlaps involving largest clique. - %% If there is, split the overlap and repeat. 
- Cliques1 = [C0 | Rest] = lists:sort(fun compare_clique/2, Cliques0), - case isolate_overlaps(C0, Rest, []) of - no_overlaps -> Cliques1; - Cliques -> compute_split_view(Cliques) - end. - -isolate_overlaps(C0, [C1 | Cs], Acc) -> - case ordsets:intersection(C0, C1) of - [] -> isolate_overlaps(C0, Cs, [C1 | Acc]); - CX -> - %% If C0 overlaps C1, replace them with [C0 ∩ C1, C0 \ C1, C1 \ C0]. - CD0 = ordsets:subtract(C0, C1), - CD1 = ordsets:subtract(C1, C0), - [CX] ++ [CD0 || CD0 =/= []] ++ [CD1 || CD1 =/= []] ++ Acc ++ Cs - end; -isolate_overlaps(_C0, [], _Acc) -> - no_overlaps. - -compare_clique(C0, C1) -> +%% Purpose of this function is to find the largest set of nodes to survive the +%% partition heal. As these nodes will seed all restarting nodes, they should +%% contain consistent set of Mria data, i.e. they should have replicated the +%% same set of transactions. +%% +%% These survivor nodes are chosen according to reachability matrix: +%% 1. Each node starts with a bit vector containing only itself. +%% 2. For every reported running node `RN' by node `N', RN's reachability +%% vector is updated. This means each final vector represents the set of +%% nodes that reported the corresponding node as running (reachable). +%% 3. The largest set of nodes that agrees on their reachability vectors is +%% chosen as survivors. All other sets of nodes are considered victims. +%% +%% If there are several equally large such sets, the one that compares lower is +%% preferred, according to Erlang term order. +%% +%% Set of survivors nodes is returned in the head of resulting list, while tail +%% contains sets of victim nodes, potentially separated into disagreeing +%% partitions. +-spec find_split_view([{node(), _Running :: [node()], _Partitioned :: [node()]}]) -> + [_Survivors :: [node()] | _Victims :: [[node()]]]. 
+find_split_view(ClusterViews = [_ | _]) -> + Cluster = lists:sort([N || {N, _, _} <- ClusterViews]), + Vectors0 = maps:from_list(lists:zipwith( + fun(N, Idx) -> {N, 1 bsl Idx} end, + Cluster, + lists:seq(0, length(Cluster) - 1) + )), + Vectors = lists:foldl( + fun({N, Running, _Stopped}, Vectors1) -> + Flag = maps:get(N, Vectors0), + lists:foldl( + fun(RN, Vectors) -> + case maps:is_key(RN, Vectors) of + true -> maps:update_with(RN, fun(V) -> V bor Flag end, Vectors); + false -> Vectors + end + end, + Vectors1, + Running) + end, + Vectors0, + ClusterViews), + Components = maps:values( + maps:groups_from_list( fun({_, V}) -> V end + , fun({N, _}) -> N end + , maps:to_list(Vectors))), + lists:sort( fun compare_components/2 + , [lists:sort(C) || C <- Components]); +find_split_view([]) -> + []. + +%% Compares connected components by size of set of universals. +%% Orders component with larger set of universals before smaller. +compare_components(C0, C1) -> case length(C0) - length(C1) of 0 -> C0 =< C1; N -> N > 0 @@ -217,7 +243,14 @@ ensure_cancel_timer(TRef) -> %%================================================================================ -ifdef(TEST). + +-include_lib("proper/include/proper_common.hrl"). -include_lib("eunit/include/eunit.hrl"). +-include_lib("snabbkaffe/include/test_macros.hrl"). + +split_view_empty_test_() -> + ?_assertMatch([], find_split_view([])). 
+ split_view_no_partition_test_() -> ?_assertMatch([[1, 2, 3]], find_split_view([ {1, [1, 2, 3], []} @@ -227,7 +260,7 @@ split_view_no_partition_test_() -> split_view_symmetric_partition_test_() -> [ ?_assertMatch([[2, 3], [1]], - find_split_view([ {1, [1, 2, 3], []} + find_split_view([ {1, [1], [2, 3]} , {2, [2, 3], [1]} , {3, [2, 3], [1]} ])) @@ -261,6 +294,11 @@ split_view_overlapping_partition_test_() -> , {2, [2, 3], [1, 4]} , {3, [2, 3, 4], [1]} , {4, [1, 3, 4], [2]}])) + , ?_assertMatch([[1], [2], [3], [4]], + find_split_view([ {1, [4, 1, 2], [3]} + , {2, [1, 2, 3], [4]} + , {3, [2, 3, 4], [1]} + , {4, [3, 4, 1], [2]}])) , ?_assertMatch([[1, 2, 3], [4], [5]], find_split_view([ {1, [1, 2, 3, 4, 5], []} , {2, [1, 2, 3, 4, 5], []} @@ -274,6 +312,95 @@ split_view_overlapping_partition_test_() -> , {4, [3, 4, 5, 6], [1, 2]} , {5, [3, 4, 5], [1, 2, 6]} , {6, [3, 4, 6], [1, 2, 5]}])) + + , ?_assertMatch([[1], [2], [3], [4], [5]], + find_split_view([ {1, [1, 2, 3, 4, 5], []} + , {2, [1, 2, 3, 4], [5]} + , {3, [1, 2, 3, 5], [4]} + , {4, [1, 2, 4, 5], [3]} + , {5, [1, 3, 4, 5], [2]}])) ]. +split_view_asymm_partition_test_() -> + ?_assertMatch([[1, 2], [3], [4]], + find_split_view([ {1, [1, 2, 4], [3]} + , {2, [1, 2, 4], [3]} + , {3, [3, 4], [1, 2]} + , {4, [1, 2, 4], [3]} + ])). + +split_view_single_component_overlapping_test_() -> + [ ?_assertMatch([[1, 2, 3], [6, 7], [4], [5]], + find_split_view([ {1, [1, 2, 3, 4, 5], [6, 7]} + , {2, [1, 2, 3, 4, 5], [6, 7]} + , {3, [1, 2, 3, 4, 5], [6, 7]} + , {4, [1, 2, 3, 4, 6, 7], [5]} + , {5, [1, 2, 3, 5, 6, 7], [4]} + , {6, [4, 5, 6, 7], [1, 2, 3]} + , {7, [4, 5, 6, 7], [1, 2, 3]}])) + , ?_assertMatch([[2, 3], [4, 5], [1], [6]], + find_split_view([ {1, [1, 6, 2, 3], [4, 5]} + , {2, [2, 1, 3], [4, 5, 6]} + , {3, [3, 1, 2], [4, 5, 6]} + , {4, [4, 5, 6], [1, 2, 3]} + , {5, [5, 4, 6], [1, 2, 3]} + , {6, [6, 1, 4, 5], [2, 3]} + ])) + ]. 
+ +prop_split_view_complete_test_() -> + Config = [{proper, #{numtests => 100, max_size => 300, timeout => 15000}}], + {timeout, 20, ?_test(?run_prop(Config, + ?FORALL(ClusterViews, t_cluster_views(), + case find_split_view(ClusterViews) of + [] -> true; + [Survivors | Rest] -> + ClusterNodes = lists:sort([Node || {Node, _, _} <- ClusterViews]), + Victims = lists:append(Rest), + {conjunction, [ + {survivors_victims_disjoint, + proper:equals(Survivors, Survivors -- Victims)}, + {no_missed_nodes, + proper:equals(lists:sort(Survivors), ClusterNodes -- Victims)} + ]} + end)))}. + +prop_split_view_nonempty_survivors_test_() -> + Config = [{proper, #{numtests => 100, max_size => 300, timeout => 15000}}], + {timeout, 20, ?_test(?run_prop(Config, + ?FORALL(ClusterViews, t_nonempty_cluster_views(), + case find_split_view(ClusterViews) of + [] -> false; + [Survivors | _] -> Survivors =/= [] + end)))}. + +t_nonempty_cluster_views() -> + ?SUCHTHAT(X, t_cluster_views(), X =/= []). + +t_cluster_views() -> + ?LET(NNodes, ?SIZED(S, S), + ?LET(NPartitions, proper_types:oneof([0, 0, 0, 0, 1, 2, 3, 4]), + ?LET(LBoundaries, [proper_types:range(1, NNodes) || _ <- lists:seq(1, NPartitions)], + ?LET(NBrokenLinks, proper_types:non_neg_integer(), + ?LET(LBrokenLinks, [ {proper_types:range(1, NNodes), proper_types:range(1, NNodes)} + || _ <- lists:seq(1, NBrokenLinks)], + begin + Cluster = lists:seq(1, NNodes), + Boundaries = lists:usort([1, NNodes + 1 | LBoundaries]), + Partitions = lists:zipwith( fun(N1, N2) -> lists:seq(N1, N2 - 1) end + , Boundaries + , tl(Boundaries) + , trim), + BrokenLinks = sets:from_list(LBrokenLinks, [{version, 2}]), + IsBrokenLink = fun + (N1, N1) -> false; + (N1, N2) -> sets:is_element({N1, N2}, BrokenLinks) orelse + sets:is_element({N2, N1}, BrokenLinks) + end, + lists:append([ [ {Node, [N || N <- Nodes, not IsBrokenLink(N, Node)], + [N || N <- Nodes, IsBrokenLink(N, Node)] ++ (Cluster -- Nodes)} + || Node <- Nodes] + || Nodes <- Partitions]) + end))))). 
+ -endif. diff --git a/src/mria_lib.erl b/src/mria_lib.erl index 0e81fcc..6fd7109 100644 --- a/src/mria_lib.erl +++ b/src/mria_lib.erl @@ -44,7 +44,6 @@ , unwrap_exception/1 , find_clusters/1 - , find_cliques/1 ]). -export_type([ subscriber/0 @@ -158,19 +157,6 @@ rpc_cast(Destination, Module, Function, Args) -> find_clusters(ClusterView) -> find_clusters(maps:keys(ClusterView), ClusterView, []). -%% Enumerate cliques in the graph. -%% Graph is undirected, edge is considered to exist if 2 vertices have each other in -%% adjacency lists. --spec find_cliques(#{V => [V]}) -> [[V]]. -find_cliques(G0) -> - G = maps:map(fun(V, _) -> mutuals(V, G0) end, G0), - Vs = ordsets:from_list(maps:keys(G)), - DegreeOrder = lists:sort( - fun(V1, V2) -> length(maps:get(V1, G)) >= length(maps:get(V2, G)) end, - Vs - ), - bron_kerbosch(G, DegreeOrder, _R = [], Vs, _X = [], []). - %%================================================================================ %% Misc functions %%================================================================================ @@ -297,50 +283,12 @@ find_clusters([Node|Rest], NodeInfo, Acc) -> Cluster = lists:usort([Node|MutualConnections]), find_clusters(Rest -- MutualConnections, NodeInfo, [Cluster|Acc]). -%% Returns set of vertices in `G' mutually connected to `V'. -mutuals(V, G) -> - Ns = ordsets:from_list(maps:get(V, G) -- [V]), - ordsets:filter(fun(Vn) -> lists:member(V, maps:get(Vn, G)) end, Ns). - -%% Enumerates cliques in the given graph recursively. -%% Refer to Bron-Kerbosh algorithm for details. 
-bron_kerbosch(_G, _Order, R, [], [], Acc) -> - [R | Acc]; -bron_kerbosch(G, Order, R0, P0, X0, Acc0) -> - {value, VPivot} = lists:search( - fun(V) -> ordsets:is_element(V, P0) orelse ordsets:is_element(V, X0) end, - Order - ), - Vs = ordsets:subtract(P0, maps:get(VPivot, G)), - {_, _, Acc} = lists:foldl( - fun(V, {P1, X1, Acc1}) -> - Nv = maps:get(V, G), - Acc = bron_kerbosch( G - , Order - , ordsets:union(R0, [V]) - , ordsets:intersection(P1, Nv) - , ordsets:intersection(X1, Nv) - , Acc1), - P = ordsets:subtract(P1, [V]), - X = ordsets:union(X1, [V]), - {P, X, Acc} - end, - {P0, X0, Acc0}, - Vs - ), - Acc. - %%================================================================================ %% Unit tests %%================================================================================ -ifdef(TEST). - -include_lib("eunit/include/eunit.hrl"). --undef(LET). - --include_lib("proper/include/proper_common.hrl"). --include_lib("snabbkaffe/include/test_macros.hrl"). find_clusters_test_() -> [ ?_assertMatch( [[1, 2, 3]] @@ -365,56 +313,4 @@ find_clusters_test_() -> })) ) ]. - -find_cliques_test_() -> - [ - ?_assertMatch( [[1, 2, 3]] - , lists:sort(find_cliques(#{ 1 => [1, 2, 3] - , 2 => [2, 1, 3] - , 3 => [2, 3, 1] - })) - ) - , ?_assertMatch( [[1], [2, 3]] - , lists:sort(find_cliques(#{ 1 => [1, 2, 3] - , 2 => [2, 3] - , 3 => [3, 2] - })) - ) - , ?_assertMatch( [[1, 2, 3], [4, 5], [6]] - , lists:sort(find_cliques(#{ 1 => [1, 2, 3] - , 2 => [1, 2, 3] - , 3 => [3, 2, 1] - , 4 => [4, 5] - , 5 => [4, 5] - , 6 => [6, 4, 5] - })) - ) - %% Overlapping cliques: - , ?_assertMatch( [[1, 2, 3], [1, 2, 4]] - , lists:sort(find_cliques(#{ 1 => [1, 2, 3, 4] - , 2 => [1, 2, 3, 4] - , 3 => [1, 2, 3] - , 4 => [1, 2, 4] - })) - ) - , ?_assertMatch( [[1, 4], [2, 3], [3, 4]] - , lists:sort(find_cliques(#{1 => [1, 4] - , 2 => [2, 3] - , 3 => [2, 3, 4] - , 4 => [1, 3, 4] - })) - ) - ]. 
- -prop_test_() -> - Config = [{proper, #{numtests => 100, max_size => 300, timeout => 30000}}], - {timeout, 30, ?_test(?run_prop(Config, ?FORALL(G, t_graph(), is_list(find_cliques(G)))))}. - -t_graph() -> - ?LET(N, proper_types:non_neg_integer(), - ?LET(L, [ {I, ?LET(Vs, proper_types:list(proper_types:range(1, N)), lists:usort(Vs))} - || I <- lists:seq(1, N) - ], - maps:from_list(L))). - -endif. %% TEST From efd299d1c0f3a1ccb147fbb311b4f7a7aa00d574 Mon Sep 17 00:00:00 2001 From: Andrew Maiorov Date: Thu, 14 May 2026 19:57:46 +0200 Subject: [PATCH 6/6] fix(autoheal): keep choosing coordinator deterministically --- src/mria_autoheal.erl | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/mria_autoheal.erl b/src/mria_autoheal.erl index b49a9dc..04f9a88 100644 --- a/src/mria_autoheal.erl +++ b/src/mria_autoheal.erl @@ -207,11 +207,8 @@ compare_components(C0, C1) -> end. -spec coordinator([node()]) -> node(). -coordinator(Candidates) -> - case lists:member(node(), Candidates) of - true -> node(); - false -> mria_membership:coordinator(Candidates) - end. +coordinator(Survivors) -> + mria_membership:coordinator(Survivors). -spec heal_partition([[node()]]) -> ok. heal_partition([[_Majority]]) ->