From 8436285141c5d8e8afaa57a2e6afe485bbfe3106 Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Tue, 30 Jun 2026 20:31:28 -0500 Subject: [PATCH 1/3] test: cover flagged edge cases + fix snapshot persistence-health bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Source-vs-tests audit of the four testing tiers. Adds coverage at the lowest honest tier (all tier 1) for branches with no covering test, and fixes one real bug found along the way. Bug fix: - storage_service.save_snapshot: a non-serializable snapshot (TypeError from json.dumps) logged the error but left db_healthy=True, so the #131 "DB write failing" badge never fired while snapshots silently stopped persisting. Route it through _db_error like every other write path. Tests added (tier 1): - pytest: unserializable snapshot flips db_healthy (regression guard for above). - pithead shell: firewall install-failure rollback (#270); wallet subaddress / integrated hard-fail (#250); remote-mode-without-host and non-/24 subnet guards (#180); ensure_owner conditional + whole-tree recursive chown (#255). - frontend: ProxyTotals high-reject styling; per-worker api/reject badges + XvB/Unknown pool badges; the #278/#313 Tari-✔ invariant (never on active-but-dead); Gauge done-vs-syncing. Docs: - test-inventory regenerated (764 -> 770 cases); testing-strategy Known-gaps gains a "Coverage-audit follow-ups" section listing the tier-3/4 backstops still needed (real-kernel firewall rollback, real mixed-ownership ensure_owner, real-container monerod failover, non-blocking-Tari ignore, busy/reorg, double-outage, doctor NTP check, ENOSPC, Tor-down, insecure+main row). make lint + make test green; dashboard 94.73% cov; patch coverage 100%. Co-Authored-By: Claude Opus 4.8 --- .../service/storage_service.py | 5 +- .../tests/frontend/components.test.mjs | 53 +++++++++++ .../tests/service/test_storage_service.py | 10 ++ docs/test-inventory.md | 26 ++++-- docs/testing-strategy.md | 43 +++++++++ tests/stack/run.sh | 93 +++++++++++++++++++ 6 files changed, 219 insertions(+), 11 deletions(-) diff --git a/build/dashboard/mining_dashboard/service/storage_service.py b/build/dashboard/mining_dashboard/service/storage_service.py index 6700f5f..c5e6599 100644 --- a/build/dashboard/mining_dashboard/service/storage_service.py +++ b/build/dashboard/mining_dashboard/service/storage_service.py @@ -475,7 +475,10 @@ def save_snapshot(self, data: dict[str, Any]): except sqlite3.Error as e: self._db_error("Snapshot Save Error", e) except TypeError as e: - self.logger.error(f"Snapshot serialization error: {e}") + # A non-serializable snapshot is a persistent write failure (data lost on restart), + # so flag persistence unhealthy like every other write path — otherwise the #131 + # badge stays green while snapshots silently never persist. + self._db_error("Snapshot Serialization Error", e) def load_snapshot(self) -> dict[str, Any] | None: """Loads the last persisted application state snapshot.""" diff --git a/build/dashboard/tests/frontend/components.test.mjs b/build/dashboard/tests/frontend/components.test.mjs index 1cef66b..d8caf4b 100644 --- a/build/dashboard/tests/frontend/components.test.mjs +++ b/build/dashboard/tests/frontend/components.test.mjs @@ -148,6 +148,59 @@ test('ProxyTotals footer is hidden until the proxy reports data', () => { assert.match(renderApp({ state: s }), /Proxy totals/); }); +test('ProxyTotals reddens the rejected figure only when reject_level is high', () => { + // The base fixture's workers are all clean, so nothing reaches the styled-rejects branch. + const s = clone(); + Object.assign(s.proxy_summary, { + has_data: true, accepted: '1200', rejected: '50', reject_pct: '4%', + reject_level: 'high', invalid: '0', best: '123', + }); + assert.match(renderApp({ state: s }), /status-bad">50/); // high -> rejected total is reddened + s.proxy_summary.reject_level = 'ok'; + assert.doesNotMatch(renderApp({ state: s }), /status-bad">50/); // ok -> plain, not reddened +}); + +test('WorkersTable surfaces the per-rig api-unreadable and reject badges, and the pool badge variants', () => { + // The single fixture pins both workers to pool=p2pool, api_ok=null, reject_flag=null, so these + // three problem-rig signals — the whole point of the pool/api/rejected columns — never render. + const s = clone(); + s.workers[0].api_ok = false; // xmrig API unreadable -> "api ⚠" + s.workers[0].reject_flag = { text: '90% rejected', title: 'high reject rate' }; + s.workers[0].pool = 'xvb'; // purple XvB badge + s.workers[1].pool = 'somethingelse'; // unrecognised -> Unknown (bad) badge + const html = renderApp({ state: s }); + assert.match(html, /api ⚠/); // api_ok===false badge (only UI signal a rig's API is unreadable) + assert.match(html, /90% rejected/); // per-row reject badge (how you spot a problem rig) + assert.match(html, /badge-purple">XvB/); + assert.match(html, /badge-bad">Unknown/); +}); + +test('Tari status gates the ✔ on a live gRPC channel, never on active-but-dead (#278/#313)', () => { + // The ✔ must mean the merge-mine channel is actually up. A dead channel that still reads "active" + // must show status-warn and NO check — otherwise a TRANSIENT_FAILURE reads as healthy (#278/#313). + const connected = clone(); + Object.assign(connected.tari, { connected: true, active: true, status: 'Merge mining' }); + const cHtml = renderApp({ state: connected }); + assert.match(cHtml, /status-ok">Merge mining/); + assert.match(cHtml, /check-inline/); // connected -> the ✔ shows + + const deadButActive = clone(); + Object.assign(deadButActive.tari, { connected: false, active: true, status: 'Merge mining' }); + const dHtml = renderApp({ state: deadButActive }); + assert.match(dHtml, /status-warn">Merge mining/); + assert.doesNotMatch(dHtml, /check-inline/); // active-but-dead -> NO ✔ (the invariant) +}); + +test('Sync gauge shows a ✔ for a done chain and a live percent while syncing', () => { + const s = clone(); + s.syncing = true; + s.sync.monero.state = 'syncing'; + s.sync.monero.percent = 42; + assert.match(renderApp({ state: s }), /42%/); // syncing chain shows its percent + s.sync.monero.state = 'done'; + assert.match(renderApp({ state: s }), /check-big/); // done chain shows the ✔, not a percent +}); + // --- Component Health & Egress (#170) --------------------------------------------------- test('ComponentHealth shows a Tor-only summary, the topology nodes, and the egress drawer', () => { diff --git a/build/dashboard/tests/service/test_storage_service.py b/build/dashboard/tests/service/test_storage_service.py index e59ca13..54dc62e 100644 --- a/build/dashboard/tests/service/test_storage_service.py +++ b/build/dashboard/tests/service/test_storage_service.py @@ -159,6 +159,16 @@ def test_empty_snapshot_not_saved(self, state_manager): def test_load_missing_snapshot_returns_none(self, state_manager): assert state_manager.load_snapshot() is None + def test_unserializable_snapshot_flags_persistence_unhealthy(self, state_manager): + # A snapshot json.dumps can't serialize (here a set) is a persistent write failure: the + # data is lost and will be lost on restart. Like every other write path it must flip + # db_healthy so /api/state raises the #131 badge — not log-and-look-green (regression guard + # for save_snapshot's TypeError branch that used to call logger.error directly). + assert state_manager.is_db_healthy() is True + state_manager.save_snapshot({"workers": {1, 2, 3}}) # set -> TypeError in json.dumps + assert state_manager.is_db_healthy() is False + assert state_manager.load_snapshot() is None # nothing was persisted + def test_share_stats_persist_across_instances(self, tmp_path): # Issue #82: the per-worker share counts and the proxy /summary totals ride along in the # latest_data snapshot, so they survive a dashboard restart (the snapshot is what diff --git a/docs/test-inventory.md b/docs/test-inventory.md index 4d32481..2b91884 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -4,8 +4,8 @@ _Generated by `make test-inventory` ([`tests/inventory.sh`](../tests/inventory.s edit by hand** — re-run the target to refresh. See [Testing Strategy](testing-strategy.md) for how the tiers fit together._ -**Totals:** 608 dashboard unit tests · 12 contract tests · 60 frontend -tests · 51 `pithead` shell sections · 17 harness self-test sections · +**Totals:** 609 dashboard unit tests · 12 contract tests · 64 frontend +tests · 52 `pithead` shell sections · 17 harness self-test sections · 9 live config scenarios (17 axis values) · 7 mini-stack scenarios. > Counts are **test functions / named cases** (parametrized pytest cases expand to more at @@ -14,9 +14,9 @@ tests · 51 `pithead` shell sections · 17 harness self-test sections · | Tier | Suite | Cases | |---|---|---| -| 1 — Unit | dashboard pytest | 608 | -| 1 — Unit | frontend (node --test) | 60 | -| 1 — Unit | `pithead` shell suite | 51 sections | +| 1 — Unit | dashboard pytest | 609 | +| 1 — Unit | frontend (node --test) | 64 | +| 1 — Unit | `pithead` shell suite | 52 sections | | 1 — Unit | compose interpolation + hardening (#90) | 1 | | 2 — Contract | fake-daemon clients | 12 | | 3 — Mini-stack | docker control-plane scenarios | 7 | @@ -27,7 +27,7 @@ tests · 51 `pithead` shell sections · 17 harness self-test sections · ## Tier 1 — Unit & component -### Dashboard (pytest) — 608 tests +### Dashboard (pytest) — 609 tests #### tests/client/test_docker_control.py — 6 - test_tcp_scheme_rewritten_to_http @@ -450,7 +450,7 @@ tests · 51 `pithead` shell sections · 17 harness self-test sections · - test_routed_fraction_in_unit_interval - test_max_donation_fraction_within_reserve_bounds -#### tests/service/test_storage_service.py — 30 +#### tests/service/test_storage_service.py — 31 - test_get_tiers - test_default_xvb_stats - test_partial_updates @@ -472,6 +472,7 @@ tests · 51 `pithead` shell sections · 17 harness self-test sections · - test_roundtrip - test_empty_snapshot_not_saved - test_load_missing_snapshot_returns_none +- test_unserializable_snapshot_flags_persistence_unhealthy - test_share_stats_persist_across_instances - test_state_persists_across_instances - test_legacy_kv_keys_migrated_on_load @@ -691,7 +692,7 @@ tests · 51 `pithead` shell sections · 17 harness self-test sections · - test_no_when_in_tier_but_no_share - test_na_when_xvb_off -### Frontend logic (node --test) — 60 tests +### Frontend logic (node --test) — 64 tests - withAlpha: appends an 8-bit alpha to a #rrggbb hex - withAlpha: non-#rrggbb values pass through opaque (a palette change cannot break fills) - padYAxis: pads the range and clamps the floor to zero @@ -710,6 +711,10 @@ tests · 51 `pithead` shell sections · 17 harness self-test sections · - WorkersTable renders headers and a row per worker with status classes - WorkersTable with no workers still renders the headers but no rows - ProxyTotals footer is hidden until the proxy reports data +- ProxyTotals reddens the rejected figure only when reject_level is high +- WorkersTable surfaces the per-rig api-unreadable and reject badges, and the pool badge variants +- Tari status gates the ✔ on a live gRPC channel, never on active-but-dead (#278/#313) +- Sync gauge shows a ✔ for a done chain and a live percent while syncing - ComponentHealth shows a Tor-only summary, the topology nodes, and the egress drawer - ComponentHealth flips to a warning summary when the posture leaks - ComponentHealth still renders the panel but omits the drawer when egress is absent @@ -753,7 +758,7 @@ tests · 51 `pithead` shell sections · 17 harness self-test sections · - edgePath: column-crossing edges route orthogonally through a clear lane - route palette + names cover every route the server can emit -### `pithead` shell suite (tests/stack/run.sh) — 51 sections +### `pithead` shell suite (tests/stack/run.sh) — 52 sections - unit: resolve_default - unit: assert_safe_dir - unit: is_public_ip classifier (#113) @@ -785,6 +790,7 @@ tests · 51 `pithead` shell sections · 17 harness self-test sections · - unit: node credential helpers - unit: randomx_boot_params (#176) - unit: grub heal + boot-param insert (#176) +- unit: ensure_owner conditional recursive chown (#255) - unit: disk_component_gib - unit: check_disk_grouped (mocked df) - node configs: no clearnet DNS egress (#161 monerod, #162 tari) @@ -961,5 +967,5 @@ tests · 51 `pithead` shell sections · 17 harness self-test sections · --- -_Grand total: **764** enumerated cases/sections across the four tiers (plus the live +_Grand total: **770** enumerated cases/sections across the four tiers (plus the live lifecycle and fault-injection phases, which are exercised on a real server)._ diff --git a/docs/testing-strategy.md b/docs/testing-strategy.md index 3504498..53d5cab 100644 --- a/docs/testing-strategy.md +++ b/docs/testing-strategy.md @@ -204,6 +204,49 @@ These are deliberately not yet covered and are the road to full production confi separate exercise (`SECURITY.md`). These tests pin the decisions we've already made; they don't find new ones. +### Coverage-audit follow-ups (2026-06) + +A source-vs-tests audit added Tier-1 coverage for a real bug (snapshot serialization failure left +the #131 persistence badge green), the firewall install-failure rollback (#270), the wallet +hard-fail guards (#250), remote-host/subnet validation (#180), `ensure_owner`'s whole-tree scan +(#255), and several dashboard render branches (per-worker api/reject badges, XvB/Unknown pool +badges, the #278/#313 Tari-✔ invariant, `Gauge` done vs syncing). The gaps it surfaced that are +**not yet covered at an automatable tier** — all needing Docker or the real box, so they land at +tier 3/4: + +- **Firewall rollback, real kernel.** The #270 install-failure rollback is proven at tier 1 with a + stubbed `iptables` (control flow only). Whether a real kernel actually strips a *partially + inserted* `DOCKER-USER` rule and ends fail-closed is only ever exercised by the clean-apply + `no clearnet egress` assertion — never with an induced mid-insert failure. Needs a + `--fault-injection` case that fails one `iptables -I` and re-asserts no clearnet path survives. +- **`ensure_owner` real mixed-ownership tree.** Tier 1 forces the foreign-uid branch with a stub; + the actual #255 scenario (user-owned dir, root-owned *contents*, migrated in one `chown -R`) can + only be built as root. Needs a `--lifecycle` step that root-owns a file under a data dir and + asserts the next `apply` makes it container-owned. +- **Real-container monerod failover in PR CI.** The primary-node reject/readmit cycle only runs on + the manual tier-4 box (`--fault-injection`); the mini-stack (tier 3) breaks Tari, not monerod. +- **Non-blocking-Tari "ignore" path with real containers.** Unit-tested only; the mini-stack proves + Tari-down-while-required (reject) but never Tari-down-while-optional (keep mining). This is the + path that silently kills yield if it regresses to a reject. +- **monerod busy / mid-reorg failover.** The contract test proves the client reads a busy node as + unreachable; no mini-stack or fault-injection scenario asserts the dashboard actually rejects + workers on a busy-but-alive node (a real reorg state, distinct from a clean stop). +- **Double outage, both-must-recover.** Unit-tested (monerod ∧ Tari down → readmit only when both + healthy); never driven with real containers, so the recovery ordering is unproven end-to-end. +- **Partial-start / stop-failure idempotency.** The control loop's "container fails to start/stop → + retry next cycle" is unit-only; no tier-3/4 scenario injects a docker start/stop error. +- **`pithead doctor` on a real box.** Only its exit code is unit-tested; its NTP/clock-drift check + (mining is time-sensitive) is never fault-injected or asserted at tier 4. +- **Disk-full / ENOSPC verdict.** Only a disk-headroom *warning* is checked; a real + container-unhealthy-on-ENOSPC verdict is never forced, though the disk badge + db-write-error + paths are unit-tested. +- **Tor-container-down partial start.** No Caddy/Tor services exist in the mini-stack compose, so + "what happens when the Tor container is down" (SOCKS unreachable) is exercised at no tier below + the manual real box; every all-Tor egress assertion is read-path only. +- **Insecure + main matrix row.** `dashboard.secure=false` only ever pairs with `p2pool.pool=nano`, + so the Caddy-scheme / bind assertions for insecure mode are entangled with the nano path; an + insecure+main regression has no row. + ## Adding a scenario - Logic (a new decision/branch) → a unit test (tier 1). Cheapest, fastest. diff --git a/tests/stack/run.sh b/tests/stack/run.sh index ff555c5..2ba692a 100755 --- a/tests/stack/run.sh +++ b/tests/stack/run.sh @@ -346,6 +346,28 @@ printf 'NETWORK_SUBNET=172.28.0.0/24\nNETWORK_PREFIX=172.28.0\nTOR_EGRESS_FIREWA : >"$FW/ipt.log" PATH="$FW/bin:$PATH" run_sourced "$FW" apply_tor_egress_firewall >/dev/null 2>&1 assert_eq "opt-out (network.tor_egress_firewall=false) installs no DROP" "$(grep -c 'DROP' "$FW/ipt.log" 2>/dev/null)" "0" +# install-failure rollback (#270): if an `iptables -I` insert fails partway, apply must NOT leave a +# half-open firewall it believes is fail-closed — it warns and rolls back via remove_tor_egress_firewall. +# Stub: -N/-D succeed but every -I insert fails (rc 1). remove runs once up-front (idempotent clear) +# and again on rollback, so iptables-save fires TWICE — that second call is the proof the rollback ran. +FF="$SANDBOX/fwfail" +mkdir -p "$FF/bin" +printf '#!/usr/bin/env bash\nexec "$@"\n' >"$FF/bin/sudo" +cat >"$FF/bin/iptables" <<'IPT' +#!/usr/bin/env bash +printf '%s\n' "$*" >>"$IPT_LOG" +case "$1" in -I) exit 1 ;; esac # every insert fails midway +exit 0 +IPT +printf '#!/usr/bin/env bash\nprintf "save\\n" >>"$IPT_LOG"\nexit 0\n' >"$FF/bin/iptables-save" +chmod +x "$FF/bin/sudo" "$FF/bin/iptables" "$FF/bin/iptables-save" +printf 'NETWORK_SUBNET=172.28.0.0/24\nNETWORK_PREFIX=172.28.0\nTOR_EGRESS_FIREWALL=true\n' >"$FF/.env" +: >"$FF/ipt.log" +fwfail_out="$(PATH="$FF/bin:$PATH" IPT_LOG="$FF/ipt.log" run_sourced "$FF" apply_tor_egress_firewall 2>&1)" +fwfail_rc=$? +assert_rc "insert failure degrades gracefully (stack still runs, rc 0)" "$fwfail_rc" "0" +assert_contains "insert failure warns clearnet is NOT fail-closed" "$fwfail_out" "NOT fail-closed" +assert_eq "insert failure rolls back the partial firewall (remove reruns -> save x2)" "$(grep -c '^save$' "$FF/ipt.log")" "2" # remove: `down` (and every re-apply) strips ONLY our tagged rules — this removal is the precondition # for the #291 down->upgrade/apply window, so prove it deletes the tags and spares foreign DOCKER-USER # rules. iptables-save replays two tagged rules + one foreign rule; remove must -D the tagged pair only. @@ -1076,6 +1098,40 @@ run_grub append_grub_boot_params "$g" assert_rc "insert: no active line -> rc 1" "$?" "1" assert_eq "insert: leaves file unchanged when no active line" "$(cat "$g")" "$before" +echo "== unit: ensure_owner conditional recursive chown (#255) ==" +# ensure_owner migrates a data tree to the container's uid ONLY when something in it is foreign-owned, +# and scans the WHOLE tree (not just the top dir) — an install upgraded from the root-container era has +# a user-owned dir but root-owned *contents*, and those are what the non-root container can't overwrite. +# MEMORY flags "must scan contents not just dir" as a past bug, so we guard both the decision and that +# the find scan is recursive (no -maxdepth). sudo is stubbed to record what it would chown. +EO="$SANDBOX/eo" +mkdir -p "$EO/bin" "$EO/tree/sub" +: >"$EO/tree/sub/file" +printf '#!/usr/bin/env bash\nprintf "%%s\\n" "$*" >>"%s/sudo.log"\n' "$EO" >"$EO/bin/sudo" +chmod +x "$EO/bin/sudo" +myuid="$(id -u)" +: >"$EO/sudo.log" +PATH="$EO/bin:$PATH" run_sourced "$EO" ensure_owner "$EO/tree" "$myuid" "$myuid" >/dev/null 2>&1 +assert_rc "clean tree (already owned) stays sudo-free" "$?" "0" +assert_eq "clean tree triggers no chown" "$(grep -c chown "$EO/sudo.log")" "0" +: >"$EO/sudo.log" +PATH="$EO/bin:$PATH" run_sourced "$EO" ensure_owner "$EO/tree" 424242 424242 >/dev/null 2>&1 +assert_contains "foreign ownership triggers a recursive chown" "$(cat "$EO/sudo.log")" "chown -R 424242:424242 $EO/tree" +: >"$EO/sudo.log" +PATH="$EO/bin:$PATH" run_sourced "$EO" ensure_owner "$EO/nonexistent" "$myuid" "$myuid" >/dev/null 2>&1 +assert_rc "missing dir is a no-op" "$?" "0" +assert_eq "missing dir triggers no chown" "$(grep -c chown "$EO/sudo.log")" "0" +# Regression guard for #255: the ownership scan must be whole-tree. Stub `find` to capture its args and +# assert ensure_owner never passes -maxdepth (which would re-introduce the top-dir-only bug). +mkdir -p "$EO/findbin" +printf '#!/usr/bin/env bash\nprintf "%%s\\n" "$*" >>"%s/find.log"\n' "$EO" >"$EO/findbin/find" +printf '#!/usr/bin/env bash\nexit 0\n' >"$EO/findbin/sudo" +chmod +x "$EO/findbin/find" "$EO/findbin/sudo" +: >"$EO/find.log" +PATH="$EO/findbin:$PATH" run_sourced "$EO" ensure_owner "$EO/tree" "$myuid" "$myuid" >/dev/null 2>&1 +assert_not_contains "the ownership scan is recursive (no -maxdepth)" "$(cat "$EO/find.log")" "-maxdepth" +assert_contains "the ownership scan keys off foreign uid" "$(cat "$EO/find.log")" "! -uid $myuid" + echo "== unit: disk_component_gib ==" assert_eq "monero pruned -> 120" "$(run_sourced "$SANDBOX" disk_component_gib monero 1)" "120" assert_eq "monero full -> 320" "$(run_sourced "$SANDBOX" disk_component_gib monero 0)" "320" @@ -1287,6 +1343,43 @@ rc=$? assert_rc "too-short dashboard.auth.password rejected" "$rc" "1" assert_contains "dashboard.auth.password message" "$out" "dashboard.auth.password" +# Wallet-type hard-fail (#250): p2pool pays via coinbase, which CANNOT reach a subaddress or an +# integrated address — a wrong type MINES but is NEVER paid, silently. monero_address_type is +# unit-tested in isolation; these prove parse_and_validate_config actually ABORTS apply on each, +# so the guardrail against losing every reward is wired, not just present. +SUBADDR="8$(printf 'A%.0s' $(seq 94))" # 95-char, starts with 8 -> subaddress +INTADDR="4$(printf 'A%.0s' $(seq 105))" # 106-char, starts with 4 -> integrated +seed_env +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"main"}, "dashboard":{"secure":true,"host":"box.lan"} }\n' "$SUBADDR" >"$V/config.json" +out="$(cd "$V" && PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" +rc=$? +assert_rc "subaddress payout rejected (would never be paid)" "$rc" "1" +assert_contains "subaddress message names the type" "$out" "SUBADDRESS" +seed_env +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"main"}, "dashboard":{"secure":true,"host":"box.lan"} }\n' "$INTADDR" >"$V/config.json" +out="$(cd "$V" && PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" +rc=$? +assert_rc "integrated payout rejected (would never be paid)" "$rc" "1" +assert_contains "integrated message names the type" "$out" "INTEGRATED" + +# Remote mode with no host (#*): renders an empty MONERO_NODE_HOST -> p2pool/dashboard dial nothing, +# mining can't start. Must abort at validation, not silently proceed. +seed_env +printf '{ "monero": {"mode":"remote","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"main"}, "dashboard":{"secure":true,"host":"box.lan"} }\n' "$WALLET" >"$V/config.json" +out="$(cd "$V" && PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" +rc=$? +assert_rc "remote mode without a host rejected" "$rc" "1" +assert_contains "remote-host message" "$out" "monero.remote.host" + +# A malformed network.subnet (#180): anything but an X.Y.Z.0/24 block renders a broken NETWORK_PREFIX +# into every service IP and the #270 firewall rules — reject before it can. +seed_env +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"main"}, "network":{"subnet":"172.28.0.0/16"}, "dashboard":{"secure":true,"host":"box.lan"} }\n' "$WALLET" >"$V/config.json" +out="$(cd "$V" && PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" +rc=$? +assert_rc "non-/24 network.subnet rejected" "$rc" "1" +assert_contains "network.subnet message" "$out" "network.subnet" + echo "== black-box: dashboard auth lifecycle (#8) ==" # The hashing reads the pinned Caddy image out of docker-compose.yml and shells out to the stubbed # `caddy hash-password`, so the whole enable → reuse → change → disable path runs offline. From eb3b336d5b1fd38d35973d8505de8037227ba267 Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Tue, 30 Jun 2026 20:43:32 -0500 Subject: [PATCH 2/3] test(integration): add tier-4 backstops for firewall rollback + ensure_owner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two divergence risks the audit flagged — paths PR #330 proves only with stubs — now have live counterparts, folded into the existing opt-in phases (no new flags), following the --auth-fail-closed "live counterpart to a tier-1 assertion" pattern. - --fault-injection: fault_firewall_rollback shadows iptables with a wrapper that fails every `-I`, re-runs apply_tor_egress_firewall, and asserts the box ends fail-closed (no pithead-tor-egress rule left half-installed), then reinstates the real firewall. Belt-and-braces reinstates unconditionally so a mid-phase abort can't leave clearnet egress open. Proves the real-kernel strip the tier-1 stub can't (#270). - --lifecycle: plants a root-owned file under the dashboard data dir and asserts the pool-flip apply (ensure_directories -> ensure_owner) chowns it to uid 1000 — the #255 "scan contents, not just the dir" regression, which needs root to set up so it can't live at tier 1. Both are DESTRUCTIVE-then-restored, local-box only, and run at the release gate (not CI). Help text, testing-strategy Known-gaps, and the test inventory updated. Verified: shellcheck + shfmt clean, bash -n parses, integration self-test 97/0, make lint green, inventory drift-check green. NOT executed against a real box — that happens at the release gate. Co-Authored-By: Claude Opus 4.8 --- docs/test-inventory.md | 4 +++ docs/testing-strategy.md | 18 +++++------ tests/integration/run.sh | 69 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 77 insertions(+), 14 deletions(-) diff --git a/docs/test-inventory.md b/docs/test-inventory.md index 2b91884..f250bf6 100644 --- a/docs/test-inventory.md +++ b/docs/test-inventory.md @@ -887,6 +887,7 @@ tests · 52 `pithead` shell sections · 17 harness self-test sections · - TARI_REQUIRED env matches config - XVB_ENABLED matches config - XvB stats + auto-register wired to the Tor SOCKS (#206/#163) +- apply migrates root-owned CONTENTS to the container uid (#255) - backup archive contains .env - backup archive contains config.json - backup/rollback prerequisites present (writable backups/, tar) @@ -899,6 +900,9 @@ tests · 52 `pithead` shell sections · 17 harness self-test sections · - default-off stratum: no --access-password live (#152) - disk headroom on the live chain FS (${avail} GiB free) - egress posture section present +- firewall apply degrades gracefully on an insert failure (rc 0) +- firewall reinstated after recovery +- insert failure leaves NO half-open firewall (rolled back) - memory ceiling live on $svc (#132) - monero auto-transitioned clearnet→Tor (#234) - monero display mode determinate ($dmode) diff --git a/docs/testing-strategy.md b/docs/testing-strategy.md index 53d5cab..c3111fb 100644 --- a/docs/testing-strategy.md +++ b/docs/testing-strategy.md @@ -214,15 +214,15 @@ badges, the #278/#313 Tari-✔ invariant, `Gauge` done vs syncing). The gaps it **not yet covered at an automatable tier** — all needing Docker or the real box, so they land at tier 3/4: -- **Firewall rollback, real kernel.** The #270 install-failure rollback is proven at tier 1 with a - stubbed `iptables` (control flow only). Whether a real kernel actually strips a *partially - inserted* `DOCKER-USER` rule and ends fail-closed is only ever exercised by the clean-apply - `no clearnet egress` assertion — never with an induced mid-insert failure. Needs a - `--fault-injection` case that fails one `iptables -I` and re-asserts no clearnet path survives. -- **`ensure_owner` real mixed-ownership tree.** Tier 1 forces the foreign-uid branch with a stub; - the actual #255 scenario (user-owned dir, root-owned *contents*, migrated in one `chown -R`) can - only be built as root. Needs a `--lifecycle` step that root-owns a file under a data dir and - asserts the next `apply` makes it container-owned. +- **Firewall rollback, real kernel.** ✅ Now a tier-4 `--fault-injection` case: it shadows `iptables` + with a wrapper that fails every `-I` insert, re-runs `apply_tor_egress_firewall`, and asserts the + box ends fail-closed (no `pithead-tor-egress` rule left half-installed), then reinstates the real + firewall. The tier-1 stubbed test proves the control flow; this proves the real-kernel strip. + Runs at the release gate only (destructive-then-restored, local box). +- **`ensure_owner` real mixed-ownership tree.** ✅ Now a tier-4 `--lifecycle` step: it plants a + root-owned file under the dashboard data dir and asserts the pool-flip `apply` (which runs + `ensure_directories` → `ensure_owner`) chowns it to uid 1000 — the #255 "scan contents, not just + the dir" regression. Runs at the release gate only (needs root to create a foreign-uid inode). - **Real-container monerod failover in PR CI.** The primary-node reject/readmit cycle only runs on the manual tier-4 box (`--fault-injection`); the mini-stack (tier 3) breaks Tari, not monerod. - **Non-blocking-Tari "ignore" path with real containers.** Unit-tested only; the mini-stack proves diff --git a/tests/integration/run.sh b/tests/integration/run.sh index e7981f2..a23dbb9 100755 --- a/tests/integration/run.sh +++ b/tests/integration/run.sh @@ -83,16 +83,19 @@ MATRIX: box's baseline is full) --full-data-dir synced FULL monero data dir (enables the full case when the box's baseline is pruned) - --lifecycle also run the lifecycle phase (restart, apply secret-preservation) + --lifecycle also run the lifecycle phase (restart, apply secret-preservation, + and the #255 ensure_owner migration: a root-owned file under a data + dir must be chowned to the container uid by apply) --safety-backup take a `pithead backup` BEFORE the destructive scenarios; if anything fails, automatically roll the box back to it (down → restore → up). The archive is removed on success. Recommended for the destructive matrix on a precious box. Also exercises backup/restore end-to-end. --fault-injection also run the fault-injection phase: deliberately break monerod (stop / SIGSTOP / remove) and assert pithead's status verdicts - (down / unhealthy / missing) and the failover→recovery cycle. - DESTRUCTIVE-then-restored; local mode only. Slow (healthcheck + - node-health debounce). + (down / unhealthy / missing) and the failover→recovery cycle. Also + forces a real `iptables -I` failure and asserts the #270 firewall + rolls back fail-closed (no half-open ruleset). DESTRUCTIVE-then- + restored; local mode only. Slow (healthcheck + node-health debounce). --auth-fail-closed also run the fail-closed auth phase (#153/#203): empty PROXY_AUTH_TOKEN in .env and assert `pithead up` REFUSES to start (the live counterpart to the tier-1 compose-config check), then restore the exact token and @@ -780,11 +783,31 @@ run_lifecycle() { local other [ "$cur_pool" = "mini" ] && other="main" || other="mini" fp_before="$(secret_fingerprint)" + # ensure_owner whole-tree migration (#255): plant a root-owned file UNDER a data dir (the + # root-container-era signature — user-owned dir, root-owned contents) and prove this apply chowns + # it to the container uid, the exact regression MEMORY flags ("scan contents, not just the dir"). + # Piggybacks the pool-flip apply below, which always runs ensure_directories -> ensure_owner. + # Local mode only (has data dirs); a stub can't create a foreign-uid inode, so this is tier-4. + local own_dir own_probe="" + if [ "$(env_on_box COMPOSE_PROFILES)" = "local_node" ]; then + own_dir="$(env_on_box DASHBOARD_DATA_DIR)" + if [ -n "$own_dir" ]; then + own_probe="$own_dir/.itest-owner-probe" + it_step "planting a root-owned file under $own_dir to exercise ensure_owner (#255)…" + rx "sudo touch $(quote_arg "$own_probe") && sudo chown 0:0 $(quote_arg "$own_probe")" >/dev/null 2>&1 + fi + fi push_config "$(render_scenario_config "$BASELINE_CONFIG" "p2pool.pool=$other")" it_step "apply pool $cur_pool -> $other…" pithead apply -y >/dev/null 2>&1 wait_status_ok 180 || true assert_eq "secrets preserved across pool change" "$(secret_fingerprint)" "$fp_before" + # APP_UID is 1000 in pithead; the migrated contents must now be owned by it, not root. + if [ -n "$own_probe" ]; then + assert_eq "apply migrates root-owned CONTENTS to the container uid (#255)" \ + "$(rx "stat -c %u $(quote_arg "$own_probe") 2>/dev/null")" "1000" + rx "sudo rm -f $(quote_arg "$own_probe")" >/dev/null 2>&1 || true + fi # .pool.type lags a sidechain switch until peers on the new chain connect — wait, don't assert cold (#54). wait_pool_ready 180 "$(pool_label "$other")" || true assert_eq "pool actually changed" "$(jq_get "$(api_state)" '.pool.type')" "$(pool_label "$other")" @@ -900,6 +923,38 @@ fault_missing() { wait_for 240 5 "monerod healthy" _pred_monerod_healthy || true } +# Live counterpart to the tier-1 stubbed rollback (tests/stack/run.sh #270): force a REAL +# `iptables -I` to fail mid-apply and prove the box ends fail-closed — the partial ruleset is +# rolled BACK, not left half-open (a stubbed iptables can't prove the real kernel strips a partial +# insert). DESTRUCTIVE-then-restored: apply_tor_egress_firewall clears the live rules before +# re-inserting, so on the sabotaged run the firewall is briefly down until the recover step (and +# run_fault_injection's belt-and-braces) reinstate it — hence opt-in, local-box only. +fault_firewall_rollback() { + if [ "$(env_on_box TOR_EGRESS_FIREWALL)" = "false" ]; then + it_warn "skipping firewall-rollback fault (network.tor_egress_firewall=false)" + return 0 + fi + it_step "fault: force an iptables -I failure during the firewall apply…" + # Shadow iptables with a wrapper that execs the real binary but exits 1 on any -I insert; on PATH + # only for the apply we invoke below, deleted on recover. $real is baked at write time; \$a/\$@ + # stay literal in the wrapper. + rx 'real=$(command -v iptables) && mkdir -p .itest-bin && printf "%s\n" "#!/usr/bin/env bash" "for a; do [ \"\$a\" = -I ] && exit 1; done" "exec $real \"\$@\"" > .itest-bin/iptables && chmod +x .itest-bin/iptables' >/dev/null 2>&1 + # apply_tor_egress_firewall is a pithead function (main is guarded when sourced), so sourcing + + # calling it with the sabotaged iptables hits the exact rollback branch. + local rc + rx 'PATH="$PWD/.itest-bin:$PATH" bash -c "source ./pithead && apply_tor_egress_firewall" >/dev/null 2>&1' + rc=$? + assert_rc "firewall apply degrades gracefully on an insert failure (rc 0)" "$rc" "0" + # No pithead-tagged rule may survive a failed insert — the rollback must strip the partial set. + assert_eq "insert failure leaves NO half-open firewall (rolled back)" \ + "$(rx 'sudo iptables-save 2>/dev/null | grep -c pithead-tor-egress')" "0" + it_step "recover: drop the sabotage and reinstall the real firewall…" + rx 'rm -rf .itest-bin' >/dev/null 2>&1 + rx 'bash -c "source ./pithead && apply_tor_egress_firewall" >/dev/null 2>&1' || true + assert_num_gt "firewall reinstated after recovery" \ + "$(rx 'sudo iptables-save 2>/dev/null | grep -c pithead-tor-egress')" 0 +} + run_fault_injection() { # shellcheck disable=SC2034 # read by lib.sh:it_fail to label captured failures IT_CURRENT_SCENARIO="fault-injection" @@ -914,10 +969,14 @@ run_fault_injection() { fault_node_down fault_unhealthy fault_missing + fault_firewall_rollback [ "$IT_FAIL" -gt "$fails_before" ] && capture_artifacts "fault-injection" "$OUT_DIR" - # Belt-and-braces: whatever happened above, leave monerod up and the stack healthy. + # Belt-and-braces: whatever happened above, leave monerod up, the firewall reinstated, and the + # stack healthy (the firewall reinstate is unconditional so a mid-phase abort can't leave the box + # with clearnet egress open). rx "docker compose up -d monerod" >/dev/null 2>&1 || true + rx 'bash -c "source ./pithead && apply_tor_egress_firewall" >/dev/null 2>&1' || true wait_for 240 5 "monerod healthy after fault phase" _pred_monerod_healthy || true wait_status_ok 240 || true } From 72fae383a04dad79db73e6e0fa68d4a4a6baca1a Mon Sep 17 00:00:00 2001 From: Vijit Singh Date: Tue, 30 Jun 2026 20:49:41 -0500 Subject: [PATCH 3/3] =?UTF-8?q?test(integration):=20fix=20firewall-rollbac?= =?UTF-8?q?k=20sabotage=20=E2=80=94=20shadow=20sudo,=20not=20iptables?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Validated the two tier-4 backstops live on a real box (gouda). The ensure_owner migration passed as written. The firewall-rollback check did NOT exercise the rollback: apply_tor_egress_firewall calls `sudo iptables -I`, and sudo's secure_path ignores a PATH-shadowed iptables, so the real insert succeeded and the rollback branch never ran (the rc-0 assertion passed for the wrong reason — a false green). Fix: shadow `sudo` instead of `iptables`. sudo is still resolved via PATH, so a wrapper that fails an `iptables … -I …` insert and execs real sudo for everything else (remove's -D, -N, iptables-save) makes the insert fail exactly as a real mid-insert error would. Re-validated live: with the real -I failing, the partial ruleset is rolled back (0 pithead-tor-egress rules), then reinstated (7). Box restored byte-identical to its pre-test iptables snapshot. shellcheck + shfmt clean, bash -n parses, self-test 97/0, inventory drift green. Co-Authored-By: Claude Opus 4.8 --- tests/integration/run.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/integration/run.sh b/tests/integration/run.sh index a23dbb9..63d8325 100755 --- a/tests/integration/run.sh +++ b/tests/integration/run.sh @@ -935,10 +935,13 @@ fault_firewall_rollback() { return 0 fi it_step "fault: force an iptables -I failure during the firewall apply…" - # Shadow iptables with a wrapper that execs the real binary but exits 1 on any -I insert; on PATH - # only for the apply we invoke below, deleted on recover. $real is baked at write time; \$a/\$@ - # stay literal in the wrapper. - rx 'real=$(command -v iptables) && mkdir -p .itest-bin && printf "%s\n" "#!/usr/bin/env bash" "for a; do [ \"\$a\" = -I ] && exit 1; done" "exec $real \"\$@\"" > .itest-bin/iptables && chmod +x .itest-bin/iptables' >/dev/null 2>&1 + # Shadow SUDO (not iptables): apply calls `sudo iptables -I`, and sudo's secure_path ignores a + # PATH-shadowed iptables, so the insert would really succeed. sudo itself is still found via PATH, + # so a wrapper that fails an `iptables … -I …` insert and execs real sudo for everything else + # (remove's -D, -N, iptables-save) makes the insert fail exactly as a real mid-insert error would. + # On PATH only for the apply below, deleted on recover. $realsudo baked at write time; \$1/\$a/\$@ + # stay literal. (Verified live on a real box — the iptables-shadow variant silently no-ops.) + rx 'realsudo=$(command -v sudo) && mkdir -p .itest-bin && printf "%s\n" "#!/usr/bin/env bash" "if [ \"\$1\" = iptables ]; then for a; do [ \"\$a\" = -I ] && exit 1; done; fi" "exec $realsudo \"\$@\"" > .itest-bin/sudo && chmod +x .itest-bin/sudo' >/dev/null 2>&1 # apply_tor_egress_firewall is a pithead function (main is guarded when sourced), so sourcing + # calling it with the sabotaged iptables hits the exact rollback branch. local rc