diff --git a/.claude/settings.local.json b/.claude/settings.local.json index abb6bc300..ac895510b 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -1,8 +1,16 @@ { "permissions": { "allow": [ - "Bash(date *)", "Bash(cp .claude/*)", + "Read(.claude/**)", + "Read(.claude/skills/run-tests/**)", + "Write(.claude/**/*commit_msg*)", + "Write(.claude/git_commit_msg_LATEST.md)", + "Skill(run-tests)", + "Skill(close-wkt)", + "Skill(open-wkt)", + "Skill(prompt-io)", + "Bash(date *)", "Bash(git diff *)", "Bash(git log *)", "Bash(git status)", @@ -23,14 +31,12 @@ "Bash(UV_PROJECT_ENVIRONMENT=py* uv sync:*)", "Bash(UV_PROJECT_ENVIRONMENT=py* uv run:*)", "Bash(echo EXIT:$?:*)", - "Write(.claude/*commit_msg*)", - "Write(.claude/git_commit_msg_LATEST.md)", - "Skill(run-tests)", - "Skill(close-wkt)", - "Skill(open-wkt)", - "Skill(prompt-io)" + "Bash(echo \"EXIT=$?\")", + "Read(//tmp/**)" ], "deny": [], "ask": [] - } + }, + "prefersReducedMotion": false, + "outputStyle": "default" } diff --git a/.claude/skills/conc-anal/SKILL.md b/.claude/skills/conc-anal/SKILL.md index 4f498b7c3..fa121bb25 100644 --- a/.claude/skills/conc-anal/SKILL.md +++ b/.claude/skills/conc-anal/SKILL.md @@ -229,3 +229,69 @@ Unlike asyncio, trio allows checkpoints in that does `await` can itself be cancelled (e.g. by nursery shutdown). Watch for cleanup code that assumes it will run to completion. + +### Unbounded waits in cleanup paths + +Any `await .wait()` in a teardown path is +a latent deadlock unless the event's setter is +GUARANTEED to fire. If the setter depends on +external state (peer disconnects, child process +exit, subsequent task completion) that itself +depends on the current task's progress, you have +a mutual wait. + +Rule: **bound every `await X.wait()` in cleanup +paths with `trio.move_on_after()`** unless you +can prove the setter is unconditionally reachable +from the state at the await site. Concrete recent +example: `ipc_server.wait_for_no_more_peers()` in +`async_main`'s finally (see +`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md` +"probe iteration 3") — it was unbounded, and when +one peer-handler was stuck the wait-for-no-more- +peers event never fired, deadlocking the whole +actor-tree teardown cascade. + +### The capture-pipe-fill hang pattern (grep this first) + +When investigating any hang in the test suite +**especially under fork-based backends**, first +check whether the hang reproduces under `pytest +-s` (`--capture=no`). If `-s` makes it go away +you're not looking at a trio concurrency bug — +you're looking at a Linux pipe-buffer fill. + +Mechanism: pytest replaces fds 1,2 with pipe +write-ends. Fork-child subactors inherit those +fds. High-volume error-log tracebacks (cancel +cascade spew) fill the 64KB pipe buffer. Child +`write()` blocks. Child can't exit. Parent's +`waitpid`/pidfd wait blocks. Deadlock cascades up +the tree. + +Pre-existing guards in `tests/conftest.py` encode +this knowledge — grep these BEFORE blaming +concurrency: + +```python +# tests/conftest.py:258 +if loglevel in ('trace', 'debug'): + # XXX: too much logging will lock up the subproc (smh) + loglevel: str = 'info' + +# tests/conftest.py:316 +# can lock up on the `_io.BufferedReader` and hang.. +stderr: str = proc.stderr.read().decode() +``` + +Full post-mortem + +`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md` +for the canonical reproduction. 
Cost several +investigation sessions before catching it — +because the capture-pipe symptom was masked by +deeper cascade-deadlocks. Once the cascades were +fixed, the tree tore down enough to generate +pipe-filling log volume → capture-pipe finally +surfaced. Grep-note for future-self: **if a +multi-subproc tractor test hangs, `pytest -s` +first, conc-anal second.** diff --git a/.claude/skills/run-tests/SKILL.md b/.claude/skills/run-tests/SKILL.md index 946e871e0..b2014201c 100644 --- a/.claude/skills/run-tests/SKILL.md +++ b/.claude/skills/run-tests/SKILL.md @@ -205,6 +205,101 @@ python -m pytest tests/ -x -q --co 2>&1 | tail -5 If either fails, fix the import error before running any actual tests. +### Step 4: zombie-actor / stale-registry check (MANDATORY) + +The tractor runtime's default registry address is +**`127.0.0.1:1616`** (TCP) / `/tmp/registry@1616.sock` +(UDS). Whenever any prior test run — especially one +using a fork-based backend like `subint_forkserver` — +leaks a child actor process, that zombie keeps the +registry port bound and **every subsequent test +session fails to bind**, often presenting as 50+ +unrelated failures ("all tests broken"!) across +backends. + +**This has to be checked before the first run AND +after any cancelled/SIGINT'd run** — signal failures +in the middle of a test can leave orphan children. + +```sh +# 1. TCP registry — any listener on :1616? (primary signal) +ss -tlnp 2>/dev/null | grep ':1616' || echo 'TCP :1616 free' + +# 2. leftover actor/forkserver procs — scoped to THIS +# repo's python path, so we don't false-flag legit +# long-running tractor-using apps (e.g. `piker`, +# downstream projects that embed tractor). +pgrep -af "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" \ + | grep -v 'grep\|pgrep' \ + || echo 'no leaked actor procs from this repo' + +# 3. stale UDS registry sockets +ls -la /tmp/registry@*.sock 2>/dev/null \ + || echo 'no leaked UDS registry sockets' +``` + +**Interpretation:** + +- **TCP :1616 free AND no stale sockets** → clean, + proceed. The actor-procs probe is secondary — false + positives are common (piker, any other tractor- + embedding app); only cleanup if `:1616` is bound or + sockets linger. +- **TCP :1616 bound OR stale sockets present** → + surface PIDs + cmdlines to the user, offer cleanup: + + ```sh + # 1. GRACEFUL FIRST (tractor is structured concurrent — it + # catches SIGINT as an OS-cancel in `_trio_main` and + # cascades Portal.cancel_actor via IPC to every descendant. + # So always try SIGINT first with a bounded timeout; only + # escalate to SIGKILL if graceful cleanup doesn't complete). + pkill -INT -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" + + # 2. bounded wait for graceful teardown (usually sub-second). + # Loop until the processes exit, or timeout. Keep the + # bound tight — hung/abrupt-killed descendants usually + # hang forever, so don't wait more than a few seconds. + for i in $(seq 1 10); do + pgrep -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" >/dev/null || break + sleep 0.3 + done + + # 3. ESCALATE TO SIGKILL only if graceful didn't finish. + if pgrep -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" >/dev/null; then + echo 'graceful teardown timed out — escalating to SIGKILL' + pkill -9 -f "$(pwd)/py[0-9]*/bin/python.*_actor_child_main|subint-forkserv" + fi + + # 4. 
if a test zombie holds :1616 specifically and doesn't + # match the above pattern, find its PID the hard way: + ss -tlnp 2>/dev/null | grep ':1616' # prints `users:(("",pid=NNNN,...))` + # then (same SIGINT-first ladder): + # kill -INT ; sleep 1; kill -9 2>/dev/null + + # 5. remove stale UDS sockets + rm -f /tmp/registry@*.sock + + # 6. re-verify + ss -tlnp 2>/dev/null | grep ':1616' || echo 'TCP :1616 now free' + ``` + +**Never ignore stale registry state.** If you see the +"all tests failing" pattern — especially +`trio.TooSlowError` / connection refused / address in +use on many unrelated tests — check registry **before** +spelunking into test code. The failure signature will +be identical across backends because they're all +fighting for the same port. + +**False-positive warning for step 2:** a plain +`pgrep -af '_actor_child_main'` will also match +legit long-running tractor-embedding apps (e.g. +`piker` at `~/repos/piker/py*/bin/python3 -m +tractor._child ...`). Always scope to the current +repo's python path, or only use step 1 (`:1616`) as +the authoritative signal. + ## 4. Run and report - Run the constructed command. @@ -356,3 +451,175 @@ by your changes — note them and move on. **Rule of thumb**: if a test fails with `TooSlowError`, `trio.TooSlowError`, or `pexpect.TIMEOUT` and you didn't touch the relevant code path, it's flaky — skip it. + +## 9. The pytest-capture hang pattern (CHECK THIS FIRST) + +**Symptom:** a tractor test hangs indefinitely under +default `pytest` but passes instantly when you add +`-s` (`--capture=no`). + +**Cause:** tractor subactors (especially under fork- +based backends) inherit pytest's stdout/stderr +capture pipes via fds 1,2. Under high-volume error +logging (e.g. multi-level cancel cascade, nested +`run_in_actor` failures, anything triggering +`RemoteActorError` + `ExceptionGroup` traceback +spew), the **64KB Linux pipe buffer fills** faster +than pytest drains it. Subactor writes block → can't +finish exit → parent's `waitpid`/pidfd wait blocks → +deadlock cascades up the tree. + +**Pre-existing guards in the tractor harness** that +encode this same knowledge — grep these FIRST +before spelunking: + +- `tests/conftest.py:258-260` (in the `daemon` + fixture): `# XXX: too much logging will lock up + the subproc (smh)` — downgrades `trace`/`debug` + loglevel to `info` to prevent the hang. +- `tests/conftest.py:316`: `# can lock up on the + _io.BufferedReader and hang..` — noted on the + `proc.stderr.read()` post-SIGINT. + +**Debug recipe (in priority order):** + +1. **Try `-s` first.** If the hang disappears with + `pytest -s`, you've confirmed it's capture-pipe + fill. Skip spelunking. +2. **Lower the loglevel.** Default `--ll=error` on + this project; if you've bumped it to `debug` / + `info`, try dropping back. Each log level + multiplies pipe-pressure under fault cascades. +3. **If you MUST use default capture + high log + volume**, redirect subactor stdout/stderr in the + child prelude (e.g. + `tractor.spawn._subint_forkserver._child_target` + post-`_close_inherited_fds`) to `/dev/null` or a + file. + +**Signature tells you it's THIS bug (vs. a real +code hang):** + +- Multi-actor test under fork-based backend + (`subint_forkserver`, eventually `trio_proc` too + under enough log volume). +- Multiple `RemoteActorError` / `ExceptionGroup` + tracebacks in the error path. +- Test passes with `-s` in the 5-10s range, hangs + past pytest-timeout (usually 30+ s) without `-s`. 
+- Subactor processes visible via `pgrep -af + subint-forkserv` or similar after the hang — + they're alive but blocked on `write()` to an + inherited stdout fd. + +**Historical reference:** this deadlock cost a +multi-session investigation (4 genuine cascade +fixes landed along the way) that only surfaced the +capture-pipe issue AFTER the deeper fixes let the +tree actually tear down enough to produce pipe- +filling log volume. Full post-mortem in +`ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md`. +Lesson codified here so future-me grep-finds the +workaround before digging. + +## 10. Reaping zombie subactors (`tractor-reap`) + +**Symptom:** after a `pytest` run crashes, times out, +or is `Ctrl+C`'d, subactor forks (esp. under +`subint_forkserver`) can be reparented to `init` +(PPid==1) and linger. They hold onto ports, inherit +pytest's capture-pipe fds, and flakify later +sessions. + +**Two layers of defense:** + +### a) Session-scoped auto-fixture (always on) + +`tractor/_testing/pytest.py::_reap_orphaned_subactors` +runs at pytest session teardown. It walks `/proc` for +direct descendants of the pytest pid, SIGINTs them, +waits up to 3s, then SIGKILLs survivors. SC-polite: +gives the subactor runtime a chance to run its trio +cancel shield + IPC teardown before escalation. + +This is *autouse* and session-scoped — you don't need +to do anything. It just runs. + +### b) `scripts/tractor-reap` CLI (manual reap) + +For the **pytest-died-mid-session** case (Ctrl+C, OOM +kill, hung process you had to `kill -9`), the fixture +never ran. Reach for the CLI: + +```sh +# default: orphans (PPid==1, cwd==repo, cmd contains python) +scripts/tractor-reap + +# descendant-mode: from a still-live supervisor +scripts/tractor-reap --parent + +# see what would be reaped, don't signal +scripts/tractor-reap -n + +# tune the SIGINT → SIGKILL grace window +scripts/tractor-reap --grace 5 +``` + +Exit code: `0` if everyone exited on SIGINT, `1` if +SIGKILL had to escalate — so you can chain it in CI +health-checks (`scripts/tractor-reap || `). + +**What it matches** (orphan-mode): +- `PPid == 1` (reparented to init → definitely + orphaned, not just a currently-running child) +- `cwd == ` (keeps the sweep scoped; won't + touch unrelated init-children elsewhere) +- `python` in cmdline + +**What it does not do:** kill anything whose PPid is +still a live tractor parent. If the parent is alive +it's not an orphan; use `--parent ` if you need +to force-reap under a still-live supervisor. + +**When NOT to run it:** while a pytest session is +active in another terminal. It's safe (won't touch +that session's live children in orphan-mode) but can +race if the target session is mid-teardown. + +### c) `--shm` / `--shm-only`: orphan-segment sweep + +Because `tractor.ipc._mp_bs.disable_mantracker()` +turns off `mp.resource_tracker` (see +`ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md`), +a hard-crashing actor can leave `/dev/shm/` +segments behind that nothing else GCs. 
+ +```sh +# process reap THEN shm sweep +scripts/tractor-reap --shm + +# shm sweep only (skip process phase) +scripts/tractor-reap --shm-only + +# dry-run: list candidates, don't unlink +scripts/tractor-reap --shm -n +``` + +**Match criteria** (very conservative — this is a +shared-system path, can't be wrong): +- segment is a regular file under `/dev/shm`, +- owned by the **current uid** (`stat.st_uid`), +- AND **no live process holds it open** — + enumerated by walking every readable + `/proc//maps` (post-mmap mappings) AND + `/proc//fd/*` (pre-mmap shm-opened fds). + +The "nobody has it open" check is the +kernel-canonical "is this leaked?" test — same +answer `lsof /dev/shm/` would give. No +reliance on tractor-specific naming, so it works +for any tractor app. Critically, it WILL NOT touch +segments held by other apps you have running +(e.g. `piker`, `lttng-ust-*`, `aja-shm-*` — +verified locally with 81 in-use segments correctly +preserved). diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ea5b98113..6eff3bcbe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -148,9 +148,13 @@ jobs: - name: Run tests run: > uv run - pytest tests/ -rsx + pytest + tests/ + -rsx --spawn-backend=${{ matrix.spawn_backend }} --tpt-proto=${{ matrix.tpt_proto }} + --capture=fd + # ^XXX^ can't work with --spawn-method=main_thread_forkserver # XXX legacy NOTE XXX # diff --git a/ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md b/ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md new file mode 100644 index 000000000..780cbb67c --- /dev/null +++ b/ai/conc-anal/cancel_cascade_too_slow_under_main_thread_forkserver_issue.md @@ -0,0 +1,202 @@ +# Cancel-cascade `trio.TooSlowError` flakes under `main_thread_forkserver` + +## Symptom + +Running the full test suite under + +```bash +./py313/bin/python -m pytest tests/ \ + --tpt-proto=tcp \ + --spawn-backend=main_thread_forkserver +``` + +surfaces a single, **rotating** `trio.TooSlowError` +failure each run. The failure isn't deterministic on +test identity — different test each run — but it +ALWAYS looks like: + +``` +FAILED tests/::test_ - trio.TooSlowError +==== 1 failed, 373 passed, 17 skipped, 11–12 xfailed, + 0–1 xpassed, ~550 warnings in ~6min ==== +``` + +Pass rate: **~99.7%** (373 of 374 non-skip tests). +Wall-clock per full run: 5–6 min. + +## Tests observed flaking so far + +Each row was the SOLE failure in a separate run: + +| run # | test | +|---|---| +| 1 | `tests/test_advanced_streaming.py::test_dynamic_pub_sub[KeyboardInterrupt]` | +| 2 | `tests/test_infected_asyncio.py::test_context_spawns_aio_task_that_errors[parent_actor_cancels_child=False]` | + +Both share the same shape: + +- **Cancel cascade** of N subactors back to a parent root actor. +- N ≥ `multiprocessing.cpu_count()` for `test_dynamic_pub_sub` + (it spawns `cpus - 1` consumers + publisher + dynamic-consumer). +- N ≈ 2 for `test_context_spawns_aio_task_that_errors` — + but each subactor is `infect_asyncio=True`, so each + cancel involves the trio↔asyncio guest-run unwind + which is structurally heavier than pure-trio. +- Test wraps the cascade in `trio.fail_after(N seconds)` + and the cap fires before the cascade completes. + +The exact failing test rotates because each test is +independently close to the cap; whichever happens to +be unlucky in scheduling/CPU-contention on a given run +is the one that times out. 
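The shared shape, distilled (a sketch only — `spawn_and_cancel_n_subactors` is a hypothetical stand-in; the real tests build their subactor trees via `tractor.open_nursery()`):

```python
import trio

FAIL_AFTER_S: int = 30  # current fork-backend cap

async def cascade_under_cap(spawn_and_cancel_n_subactors) -> None:
    # `spawn_and_cancel_n_subactors` stands in for the per-test body:
    # spawn N subactors, trigger the error/KBI, await the cancel cascade.
    with trio.fail_after(FAIL_AFTER_S):
        # if any subactor eats its full 1.6s graceful-cancel timeout,
        # N of those (plus fork-IPC teardown cost) can exceed the cap
        # -> trio.TooSlowError, i.e. the rotating flake above.
        await spawn_and_cancel_n_subactors()
```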
+ +## Root-cause family + +`hard_kill` (`tractor/spawn/_spawn.py:hard_kill`) runs +the SC-graceful teardown ladder per subactor: + +1. `Portal.cancel_actor()` — graceful IPC cancel-req. +2. Wait `terminate_after=1.6s` for sub to exit. +3. If still alive: `proc.kill()` (SIGKILL). +4. (NEW) `_unlink_uds_bind_addrs()` — post-mortem + sock-file cleanup for UDS leaks (issue #452 fix). + +For a cascade of N subactors, each pays steps 1–4. If +graceful-cancel doesn't complete within 1.6s for ANY +sub, that sub eats a full 1.6s of `move_on_after` plus +the `proc.wait()` post-SIGKILL. + +Worst case under fork backend with N=cpus subs: +- N × 1.6s = 16s+ on a 10-core box just for the + graceful timeout phase +- Plus per-spawn fork-IPC handshake cost compounds + during teardown (each sub's IPC cleanup goes through + the same forkserver coordinator) +- Plus the new autouse fixtures + (`_track_orphaned_uds_per_test`, + `_detect_runaway_subactors_per_test`, + `_reap_orphaned_subactors`) all run at test + teardown, adding small (10s of ms) but cumulative + overhead + +Current cap: 30s (`fail_after_s = 30 if +is_forking_spawner else 12`). Empirically fits the +median run but the tail breaks ~0.3% of the time. + +## NOT regressing + +To confirm this is a flake and not a regression: + +- Pre-`WakeupSocketpair`-patch baseline: tests + HUNG INDEFINITELY (busy-loop never released). +- Post-patch: pass-or-fail-fast, ~99.7% pass, the + occasional cap-hit fails in bounded time (<60s for + the offending test). +- Same test PASSES under `--spawn-backend=trio` + (no fork, no hard-kill compounding). + +So the suite is dramatically better than before; the +remaining flake is a known-tolerable steady-state. + +## Possible mitigations (ranked) + +### A. Bump the cap further + +Cheapest. Change the per-test `fail_after_s` from 30 +to e.g. 60 for fork backends. Pros: trivial. Cons: +masks any genuine slowness regression we'd want to +catch. + +### B. CPU-count-aware cap + +For tests whose N scales with `cpu_count()`, scale +the cap too: + +```python +fail_after_s = ( + max(30, cpu_count() * 3) # 3s/actor floor + if is_forking_spawner + else 12 +) +``` + +Pros: scales with the actual cancel-cascade work. +Cons: still arbitrary multiplier. + +### C. `pytest-rerunfailures` for these tests only + +Mark the known-flaky tests with +`@pytest.mark.flaky(reruns=1)` (needs +`pytest-rerunfailures` dep). Single retry hides +genuine ~0.3% transient flakes. + +Pros: no cap change, surfaces persistent failures +loudly. Cons: adds a dep, retries can mask real bugs +if used widely. + +### D. Reduce `hard_kill`'s `terminate_after` + +Drop from 1.6s → 0.8s. Cuts the worst-case cascade +time roughly in half. Risks: fewer subs get a chance +to run their cleanup before SIGKILL → more orphaned +state for the autouse reapers to handle (ironically, +adds back overhead elsewhere). + +### E. Profile + targeted fix + +Add `log.devx()` markers in `hard_kill` to time each +phase. Identify if any subactor is consistently +hitting the 1.6s cap (vs. exiting in <0.1s). If so, +that sub has a teardown bug worth fixing at source. +Pros: actually fixes the underlying slowness. Cons: +real investigation work, deferred from this round. + +## Recommendation + +Land this issue-doc as the tracker. Apply **(B)** as +a small follow-up — cheap and proportional. If it +still flakes, escalate to **(E)** with a `log.devx()` +profile-pass. + +`(C)` is a backstop if `(B)` doesn't quite get there +and we need green CI faster than (E) can deliver. 
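For reference, applying (C) is a one-line mark per affected test (sketch; the `flaky` marker comes from `pytest-rerunfailures`, which would be a new dev dependency — test parameters elided):

```python
import pytest

# single bounded retry, applied ONLY to the known cap-sensitive
# cascade tests — never suite-wide, so a persistent failure still
# surfaces loudly on the second attempt.
@pytest.mark.flaky(reruns=1)
def test_dynamic_pub_sub():
    ...
```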
+ +## Verification protocol + +After applying any mitigation: + +```bash +# Run the suite N times back-to-back, count failures. +# A persistent failure on the SAME test == real bug. +# Failures rotating across tests == still cap-related. + +for i in $(seq 1 5); do + ./py313/bin/python -m pytest tests/ \ + --tpt-proto=tcp \ + --spawn-backend=main_thread_forkserver \ + -q 2>&1 | tail -2 +done +``` + +Target: 0 failures across 5 runs ⇒ ship. 1–2 failures +still rotating ⇒ apply (C). Same test failing twice +⇒ escalate to (E). + +## See also + +- [#452](https://github.com/goodboy/tractor/issues/452) — + UDS sock-file leak (related — `hard_kill`'s + cleanup phase contributes to cascade time) +- `ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md` + — the upstream-trio fix that turned this from a + 100% hang into a 0.3% flake +- `ai/conc-anal/infected_asyncio_under_main_thread_forkserver_hang_issue.md` + — the asyncio variant which contributes to one of + the rotating failures +- `tractor/spawn/_spawn.py::hard_kill` — the SIGKILL + cascade source +- `tractor/_testing/_reap.py::_track_orphaned_uds_per_test`, + `_detect_runaway_subactors_per_test`, + `_reap_orphaned_subactors` — autouse cleanup + fixtures whose cumulative teardown overhead + contributes to the cascade time diff --git a/ai/conc-anal/fork_thread_semantics_execution_vs_memory.md b/ai/conc-anal/fork_thread_semantics_execution_vs_memory.md new file mode 100644 index 000000000..c07ad81d3 --- /dev/null +++ b/ai/conc-anal/fork_thread_semantics_execution_vs_memory.md @@ -0,0 +1,281 @@ +# `fork()` in a multi-threaded program — execution-side vs. memory-side of the same coin + +A reference doc for readers who've encountered one of two +opposite-sounding framings of POSIX `fork()` semantics in a +multi-threaded program and are confused by the other. + +This is a sibling to +`subint_fork_blocked_by_cpython_post_fork_issue.md` — that +doc covers a CPython-level refusal of fork-from-subint; +this one covers the more general POSIX layer, since +tractor's main-thread forkserver design rests on it. + +## TL;DR + +POSIX `fork()` only preserves the *calling* thread as a +runnable thread in the child — every other thread in the +parent simply never executes another instruction in the +child. trio's docs call this "leaked"; tractor's +`_main_thread_forkserver.py` docstring calls it "gone". +Both are correct: "gone" is the *execution* side (no +scheduler entry, no instructions retired), "leaked" is the +*memory* side (the dead threads' stacks and per-thread +heap structures still ride into the child's address space +as orphaned COW pages with no owner and no cleanup hook). +Same POSIX reality, two halves of the same coin. + +## The two framings + +[python-trio/trio#1614][trio-1614] (the canonical "trio + +fork" hazards thread) puts it this way: + +> If you use `fork()` in a process with multiple threads, +> all the other thread stacks are just leaked: there's +> nothing else you can reasonably do with them. + +`tractor.spawn._main_thread_forkserver`'s module docstring +(specifically the "What survives the fork? — POSIX +semantics" section) puts it this way: + +> POSIX `fork()` only preserves the *calling* thread as a +> runnable thread in the child. Every other thread in the +> parent — trio's runner thread, any `to_thread` cache +> threads, anything else — never executes another +> instruction post-fork. + +A reader bouncing between the two can be forgiven for +asking: well, *which* is it — leaked or gone? + +The answer is "yes". 
They're describing the same POSIX +behavior from two different angles: + +- trio is talking about the **bytes** the dead threads + leave behind — stacks, TLS slots, per-thread arena + metadata — and the fact that nothing in the child can + drive them forward, free them, or even safely walk + them. That's a memory leak in the strict sense: held + but unreachable. +- tractor is talking about the **execution** side + relevant to the forkserver design: which threads + retire instructions in the child? Exactly one — the + one that called `fork()`. Everything else, regardless + of the bytes left behind, is dead in a scheduler + sense. + +Neither framing is wrong; they're just answering +different questions. + +## POSIX `fork()` in a multi-threaded program — what actually happens + +Per POSIX (and concretely on Linux glibc), the contract +of `fork()` in a multi-threaded process is: + +1. The kernel creates a new process whose virtual + address space is a COW copy of the parent's. *All* + pages map across — code, heap, every thread's stack, + every malloc arena, every mmap region. +2. Of the parent's N threads, exactly **one** is + reified in the child as a runnable kernel task: the + thread that called `fork()`. The other N-1 threads + have *no* corresponding task in the child kernel. They + were never scheduled, never `clone()`d for the child, + never exist as runnable entities. +3. Their **memory artifacts** — pthread stacks, TLS, + `pthread_t` structures, glibc per-thread arena + bookkeeping — are still mapped in the child's address + space, because (1) duplicates *everything* page-wise. + They sit there as inert COW bytes. +4. The kernel does not clean those bytes up. There is no + "phantom-thread cleanup" pass post-fork. The kernel + doesn't know which mapped pages "belonged to" which + thread — at the kernel level mappings are + process-scoped, not thread-scoped. +5. The surviving thread (the caller of `fork()`) cannot + safely access those leaked bytes either. Any state + they encoded — held mutexes, in-flight syscalls, + half-updated invariants — is frozen at whatever + instant the parent's fork-syscall observed it. Some + of those mutexes may even still be locked from the + child's POV (the canonical "fork-in-multithreaded- + program-deadlocks" hazard; see `man pthread_atfork`). + +So: from the kernel's PoV, the child has one thread. +From the address-space's PoV, the child has all the +parent's bytes — including the corpses of the N-1 dead +threads' stacks. Both true simultaneously. + +## Why trio says "leaked" + +trio's framing makes sense from the parent's +PoV, looking at *what those threads were doing*. In a +running `trio.run()` process you typically have: + +- The trio runner thread itself — owns the `selectors` + epoll fd, the signal-wakeup-fd, the run-queue. +- Threadpool worker threads (`trio.to_thread`'s cache) + — blocked in `wait()` on the threadpool's work + condvar. +- Whatever other ad-hoc threads the application + started. + +Each of those threads owns *real work-state*: epoll +registrations, file descriptors held in +soon-to-be-completed reads, half-released locks, posted +but unconsumed wakeups. After fork, that state is still +encoded in the child's memory. None of it is invalid in +a well-formed-bytes sense. It's just that: + +- The thread that was driving it is gone. +- Nothing else in the child knows the layout well + enough to take over. 
+- Even if it did, the kernel objects backing the work + (epoll fd, signalfd) have separate post-fork + semantics that don't compose with userland trio + state. + +So the bytes are *held* (they're in the child's +address space, they count against RSS, they survive +until something clobbers them), and they're +*unreachable* in any meaningful sense — no thread can +safely drive them forward. That is the textbook +definition of a leak. + +trio's quote is reminding the user that `fork()` from a +multi-threaded process is a one-way memory hazard: +whatever those threads were doing, that work-state is +now garbage you happen to still be carrying. + +## Why tractor says "gone" + +tractor's `_main_thread_forkserver` framing is concerned +with a different question: *which thread executes in the +child, and is it safe?* + +The forkserver design rests on POSIX's "calling thread +is the sole survivor" guarantee. We pick that calling +thread very deliberately: a dedicated worker that has +provably never entered trio. So the thread that *does* +run in the child is one whose locals, TLS, and stack +contain nothing trio-related. Trio's runner thread — +the one that owned the epoll fd and the run-queue — is +*gone* from the child in the execution sense. It will +never run another instruction. The fact that its stack +bytes still exist in the child's address space (the +"leaked" view) is irrelevant to the forkserver, because +nothing in the child reads or writes those pages. + +So when the docstring says "Every other thread … is +gone the instant `fork()` returns in the child", it's +being precise about the surface that matters for the +backend: scheduler-level liveness. Nothing schedules +those threads ever again. Whether their bytes are +hanging around is a separate (and, for the design, +non-load-bearing) fact. + +## Cross-table + +The same tabular layout the `_main_thread_forkserver` +docstring uses, expanded with a fourth "what handles +it" column: + +| thread | parent | child (executing) | child (memory) | what handles it | +|---------------------|-----------|-------------------|------------------------------|-----------------------------| +| forkserver worker | continues | sole survivor | live stack | runs the child's bootstrap | +| `trio.run()` thread | continues | not running | leaked stack (zombie bytes) | overwritten by child's fresh `trio.run()` | +| any other thread | continues | not running | leaked stack (zombie bytes) | overwritten / GC'd / clobbered by `exec()` if used | + +The "child (executing)" column is the *execution* side +of the coin — what tractor cares about. The "child +(memory)" column is the *memory* side — what trio +cares about. + +The "what handles it" column is the deliberate punchline +of the design: nothing has to handle the leaked bytes +*explicitly*. They get clobbered by ordinary forward +progress in the child: + +- The fresh `trio.run()` the child boots up allocates + its own stack, scheduler, and run-queue, which over + time overlaps and overwrites the inherited zombie + pages. +- Python's GC walks live objects only; the dead-thread + Python frames aren't reachable from any + `PyThreadState`, so they get freed at the next + collection cycle. +- If the child eventually `exec()`s, the entire address + space is replaced and the leak vanishes. + +## What this means for the forkserver design + +The crucial point is that **the design doesn't and +*can't* prevent the leak**. There is no userland fix +for COW thread stacks. 
The kernel hands the child a +duplicated address space; that's what `fork()` *is*. No +amount of pre-fork hookery, `pthread_atfork()` +gymnastics, or post-fork cleanup can un-COW the dead +threads' pages without unmapping them, and unmapping +arbitrary regions of a duplicated address space is +neither portable nor safe. + +What the design *does* ensure is the orthogonal +property: the survivor thread is one that doesn't need +any of that leaked state to function. Concretely: + +- Survivor is the forkserver worker thread. +- That worker has provably never imported, called into, + or held any reference to `trio`. (Enforced by keeping + the worker's lifecycle entirely in + `_main_thread_forkserver.py` and never letting trio + task-state cross into it.) +- So the leaked pages — trio runner stack, threadpool + caches, etc. — are inert relative to the survivor. + No code path in the child references them. +- The child then boots its own fresh `trio.run()`, + which allocates new state in new pages. Over the + child's lifetime the COW'd zombie pages get + overwritten, GC'd, or (if the child eventually + `exec()`s) discarded wholesale. + +The "leak" is real but inert. It costs RSS until +clobbered; it doesn't cost correctness. That's exactly +the property the forkserver pattern is built on, and +it's also why the design needs the "calling thread is +trio-free" precondition to be airtight: if the survivor +were a trio thread, it *would* try to drive the leaked +trio state, and the leak would no longer be inert. + +## See also + +- `tractor/spawn/_main_thread_forkserver.py` — module + docstring's "What survives the fork? — POSIX + semantics" section is the in-tree, code-adjacent + prose this doc expands on. The cross-table here is a + fourth-column expansion of the table there. + +- [python-trio/trio#1614][trio-1614] — the trio issue + with the "leaked" framing, and the canonical thread + for trio + `fork()` hazards more broadly. + +- [`subint_fork_blocked_by_cpython_post_fork_issue.md`](./subint_fork_blocked_by_cpython_post_fork_issue.md) + — sibling analysis covering CPython's *post-fork* + hooks (`PyOS_AfterFork_Child`, + `_PyInterpreterState_DeleteExceptMain`) and why + fork-from-non-main-subint is a CPython-level hard + refusal. Complementary axis: this doc is about POSIX + semantics; that doc is about the CPython runtime + layer that runs *after* POSIX `fork()` returns in + the child. + +- `man pthread_atfork(3)` — canonical "fork in a + multithreaded process is dangerous" reference. + Especially the rationale section, which is the + closest thing to a normative statement of "the + surviving thread cannot safely use anything the dead + threads were touching." + +- `man fork(2)` (Linux) — "Other than [the calling + thread], … no other threads are replicated …" + paragraph is the kernel-side statement of the + execution-side framing this doc opens with. 
+ +[trio-1614]: https://github.com/python-trio/trio/issues/1614 diff --git a/ai/conc-anal/infected_asyncio_under_main_thread_forkserver_hang_issue.md b/ai/conc-anal/infected_asyncio_under_main_thread_forkserver_hang_issue.md new file mode 100644 index 000000000..0a04d253c --- /dev/null +++ b/ai/conc-anal/infected_asyncio_under_main_thread_forkserver_hang_issue.md @@ -0,0 +1,378 @@ +# `infect_asyncio` × `main_thread_forkserver` Mode-A deadlock + +## Reproducer + +```bash +./py313/bin/python -m pytest \ + tests/test_infected_asyncio.py::test_aio_simple_error \ + --tpt-proto=tcp \ + --spawn-backend=main_thread_forkserver \ + -v --capture=sys +``` + +Hangs indefinitely. Mode-A signature — both processes +parked in `epoll_wait`, **neither burning CPU**. + +## Empirical observations (caught alive) + +### Outer pytest (parent) + +`py-spy dump` on the test runner pid shows the trio +event loop parked at the bottom of `trio.run()`: + +``` +Thread (idle): "MainThread" + get_events (trio/_core/_io_epoll.py:245) + self: + timeout: 86400 + run (trio/_core/_run.py:2415) + next_send: [] + timeout: 86400 + test_aio_simple_error (tests/test_infected_asyncio.py:175) +``` + +`timeout: 86400` is trio's "no scheduled work, just wait +for I/O forever" sentinel. `next_send: []` confirms +nothing is queued. The parent is stuck inside +`tractor.open_nursery(...).run_in_actor(...)` waiting +for `ipc_server.wait_for_peer(uid)` to fire — i.e. +waiting for the spawned subactor to connect back. + +### Subactor (forked child) + +`/proc//stack`: + +``` +do_epoll_wait+0x4c0/0x500 +__x64_sys_epoll_wait+0x70/0x120 +do_syscall_64+0xef/0x1540 +entry_SYSCALL_64_after_hwframe+0x77/0x7f +``` + +`strace -p -f`: + +``` +[pid ] epoll_wait(6 +[pid ] epoll_wait(3 +``` + +**Two threads**, both parked in `epoll_wait` on +distinct epoll fds. Both blocked, neither making +progress. + +### Subactor file-descriptor table + +``` +fd=0,1,2 stdio +fd=3 eventpoll [watches fd 4] +fd=4 ↔ fd=5 unix STREAM (CONNECTED) — internal pair +fd=6 eventpoll [watches fds 7, 9] +fd=7 ↔ fd=8 unix STREAM (CONNECTED) — internal pair +fd=9 ↔ fd=10 unix STREAM (CONNECTED) — internal pair +``` + +Confirmed via `ss -xp` peer-inode lookup: **all 6 unix +sockets are internal socketpairs** (peer in same pid). + +**Critical**: zero TCP/IPv4/IPv6 sockets, despite +`--tpt-proto=tcp`: + +``` +$ sudo lsof -p | grep -iE 'TCP|IPv' +(empty) +$ sudo ss -tnp | grep +(empty) +``` + +**The subactor never opened a TCP connection back to +the parent.** + +## Diagnosis + +The subactor reaches `_actor_child_main` → +`_trio_main(actor)` → +`run_as_asyncio_guest(trio_main)`. Code path +(`tractor.spawn._entry`): + +```python +if infect_asyncio: + actor._infected_aio = True + run_as_asyncio_guest(trio_main) # ← this branch +else: + trio.run(trio_main) +``` + +`run_as_asyncio_guest` (`tractor.to_asyncio`): + +```python +def run_as_asyncio_guest(trio_main, ...): + async def aio_main(trio_main): + loop = asyncio.get_running_loop() + trio_done_fute = asyncio.Future() + ... + trio.lowlevel.start_guest_run( + trio_main, + run_sync_soon_threadsafe=loop.call_soon_threadsafe, + done_callback=trio_done_callback, + ) + out = await asyncio.shield(trio_done_fute) + return out.unwrap() + ... + return asyncio.run(aio_main(trio_main)) +``` + +Expected flow: +1. `asyncio.run(aio_main(...))` — boots fresh asyncio + loop in calling thread. +2. `aio_main` calls `trio.lowlevel.start_guest_run(...)` + — initializes trio's I/O manager, schedules first + trio slice via `loop.call_soon_threadsafe`. +3. 
asyncio loop dispatches the callback → trio runs a + slice → yields back via `call_soon_threadsafe`. +4. Trio's `async_main` (the user function) runs → + `Channel.from_addr(parent_addr)` → TCP connect to + parent. + +What we observe instead: +- 2 threads in `epoll_wait` (one trio epoll, one + asyncio epoll, both inactive) +- 6 unix-socket fds (3 socketpairs: trio + wakeup-fd-pair, asyncio wakeup-fd-pair, trio kicker + socketpair) +- ZERO TCP — `Channel.from_addr` never ran + +Most likely cause: **trio's guest-run scheduling +callback didn't get dispatched by asyncio's loop in +the forked child**, so trio's `async_main` never +executes past trio bootstrap, and the +parent-IPC-connect step is never reached. + +## Fork-survival risk surface (hypothesis) + +`trio.lowlevel.start_guest_run` builds Python-level +closures + signal handlers + wakeup-fd registrations +that depend on: + +- The asyncio event loop's `call_soon_threadsafe` + thread-id matching the loop owner thread. +- Process-wide signal-wakeup-fd state + (`signal.set_wakeup_fd`). +- Trio's `KIManager` SIGINT handler. + +Under `main_thread_forkserver`, the fork happens from +a worker thread that has **never entered trio** +(intentional — trio-free launchpad). But the FORKED +child then tries to bring up BOTH asyncio AND +trio-as-guest fresh from this trio-free thread. The +asyncio loop boots fine; trio's `start_guest_run` +initializes BUT the cross-loop dispatch (asyncio +queue → trio slice) appears to silently fail to wire +up. + +Two more hypotheses worth probing: + +1. **Wakeup-fd contention**: asyncio installs + `signal.set_wakeup_fd()`. trio's + guest-run also wants a wakeup-fd. Whoever installs + second wins; the loser's `epoll_wait` no longer + wakes on signals. Combined with the `asyncio.shield( + trio_done_fute)` + `asyncio.CancelledError` + handling in `run_as_asyncio_guest`, a missed signal + delivery could explain the indefinite park. + +2. **Trio kicker socketpair race**: trio's I/O manager + uses an internal `socket.socketpair()` to "kick" + itself out of `epoll_wait` when a non-IO task needs + scheduling. In guest mode, the kicker is still + present but is supposed to be triggered via the + asyncio dispatch. If the kicker write never gets + issued by asyncio's callback, trio's epoll never + wakes. + +## Confirmed via py-spy (live capture) + +After detaching `strace` (ptrace is exclusive — that's +why `py-spy` returns EPERM if strace is attached): + +``` +Thread (idle): "main-thread-forkserver[asyncio_actor]" + select (selectors.py:452) # asyncio epoll + _run_once (asyncio/base_events.py:2012) + run_forever (asyncio/base_events.py:683) + run_until_complete (asyncio/base_events.py:712) + run (asyncio/runners.py:118) + run (asyncio/runners.py:195) + run_as_asyncio_guest (tractor/to_asyncio.py:1770) + _trio_main (tractor/spawn/_entry.py:160) + _actor_child_main (tractor/_child.py:72) + _child_target (tractor/spawn/_main_thread_forkserver.py:910) + _worker (tractor/spawn/_main_thread_forkserver.py:605) + [thread bootstrap] + +Thread (idle): "Trio thread 14" + get_events (trio/_core/_io_epoll.py:245) # trio epoll + get_events (trio/_core/_run.py:1678) + capture (outcome/_impl.py:67) + _handle_job (trio/_core/_thread_cache.py:173) + _work (trio/_core/_thread_cache.py:196) + [thread bootstrap] +``` + +This data **rewrites the diagnosis**: trio guest-run +isn't broken across the fork — it's working as designed. +The two threads ARE the canonical guest-run architecture: + +1. **Asyncio main loop** runs in the lead thread. 
Parked + in `selectors.EpollSelector.select(timeout=-1)` — + waiting indefinitely for ANY callback to be queued. +2. **Trio's I/O manager** offloads `get_events` + (`epoll_wait`) onto a `trio._core._thread_cache` + worker thread. The worker calls + `outcome.capture(get_events)` and parks in + `epoll_wait(timeout=86400)`. +3. When trio I/O fires (or its kicker socketpair gets a + write), the worker returns from `epoll_wait`, + delivers the result via `_handle_job`'s `deliver` + callback, which schedules the next trio slice on + asyncio via `loop.call_soon_threadsafe`. + +The fact that the trio thread is *already* in +`_thread_cache._handle_job` doing `capture(get_events)` +means **trio's scheduler HAS started** — the bridge +asyncio↔trio is wired correctly post-fork. + +So `async_main` DID run far enough to register some +trio task that's now awaiting I/O. The question +becomes: **what is `async_main` waiting on?** + +Process state confirms it's NOT waiting on the TCP +connect to parent: + +``` +$ sudo lsof -p | grep -iE 'TCP|IPv' +(empty) +$ sudo ss -tnp | grep +(empty) +``` + +`Channel.from_addr(parent_addr)` — the very first +thing `async_main` does — was never reached, OR was +reached but errored before `socket()` was called. The +parent (running `ipc_server.wait_for_peer`) waits +forever for the connection; it never comes. + +## Refined hypothesis + +`async_main` is stalled in some PRE-`Channel.from_addr` +checkpoint. Candidates: + +1. **`get_console_log` / logger init** — called early in + `_trio_main` if `actor.loglevel is not None`. Logging + setup involves file/handler init that could block on + something fork-inherited (e.g. a stale lock). +2. **`debug.maybe_init_greenback`** — `start_guest_run` + includes a check (`if debug_mode(): assert 0` — + currently asserts unsupported). For non-debug mode + this is bypassed but related machinery may run. +3. **Stackscope SIGUSR1 handler install** — gated on + `_debug_mode` OR `TRACTOR_ENABLE_STACKSCOPE` env-var. + The `enable_stack_on_sig()` path captures a trio + token via `trio.lowlevel.current_trio_token()` — + could block under guest mode. +4. **Initial `await trio.sleep(0)` / first checkpoint** + in `async_main` before reaching the + `Channel.from_addr` line. Under guest mode, if the + FIRST `call_soon_threadsafe` callback never gets + processed by asyncio, trio's first slice never + completes — but the worker thread WOULD still be in + `epoll_wait` having been started by trio's I/O + manager init. + +## Confirming `async_main`'s parked location + +Add temporary logging at the top of `Actor.async_main`: + +```python +# tractor/runtime/_runtime.py around line 855 +async def async_main(self, parent_addr=None): + log.devx('async_main: ENTERED') # marker A + try: + log.devx('async_main: pre-Channel.from_addr') # marker B + chan = await Channel.from_addr( + addr=wrap_address(parent_addr) + ) + log.devx('async_main: post-Channel.from_addr') # marker C + ... +``` + +Re-run the test with `--ll=devx`. The last marker logged +tells us exactly where `async_main` parked. If only A +fires, the issue is between A and B (logger init, +stackscope, etc.). If A and B fire but not C, it's in +`Channel.from_addr` (DNS, socket creation, connect). 
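If marker B fires but C never does, the same call site can be bounded so the silent park surfaces as a log line instead of an indefinite hang (a sketch extending the snippet above, same names and assumptions):

```python
# bound the suspect await; a park now becomes a visible timeout
with trio.move_on_after(3) as cs:
    chan = await Channel.from_addr(
        addr=wrap_address(parent_addr)
    )
if cs.cancelled_caught:
    log.devx('async_main: Channel.from_addr did not complete in 3s')
```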
+ +## Related sibling bug + +`tests/test_multi_program.py::test_register_duplicate_name` +hangs under the same backend with a DIFFERENT +fingerprint: + +- Subactor at 100% CPU (busy-loop), not parked +- `recvfrom(6, "", 65536, 0, NULL, NULL) = 0` repeating + with no `epoll_wait` in between +- fd=6 is one of trio's internal AF_UNIX + socketpair fds (the kicker mechanism) + +Distinct root cause — possibly trio's kicker socketpair +inheriting a half-closed state across the fork — but +shares the broader theme: **trio internal-state +initialization isn't fully fork-safe under +`main_thread_forkserver`** for the more exotic +dispatch paths. + +## Workarounds (until fix lands) + +1. **Skip-mark on the fork backend** — temporarily mark + `tests/test_infected_asyncio.py` with + `pytest.mark.skipon_spawn_backend('main_thread_forkserver', + reason='infect_asyncio + fork interaction broken, + see ai/conc-anal/infected_asyncio_under_main_thread_forkserver_hang_issue.md')`. + Lets the rest of the test suite run green while + this is being fixed properly. + +2. **Run infected-asyncio tests under the `trio` + backend only** — they don't exercise fork + semantics, so they won't hit this bug. + +## Investigation next steps + +In rough priority: + +1. Catch the hang alive again, **detach strace**, + `py-spy --locals` the subactor — confirm trio + thread is NOT yet at `async_main`. +2. Diff `start_guest_run` setup pre-fork vs post-fork + by adding `log.devx()` markers in + `tractor.to_asyncio.run_as_asyncio_guest::aio_main` + at: + - asyncio loop bringup + - immediately before `start_guest_run` + - immediately after `start_guest_run` + - inside the `trio_done_callback` registration +3. Check whether the asyncio loop dispatches ANY + callbacks in the forked child — instrument + `loop.call_soon_threadsafe` (e.g. monkey-patch + `loop._call_soon` to log). +4. If steps 1–3 confirm that asyncio's queue is + stuck, look at whether the asyncio event-loop + policy or selector is being inherited from a + pre-fork (parent-process) state in a way that + breaks the new loop. + +## See also + +- [#379](https://github.com/goodboy/tractor/issues/379) — subint umbrella +- [#451](https://github.com/goodboy/tractor/issues/451) — Mode-A cancel-cascade hang +- `ai/conc-anal/fork_thread_semantics_execution_vs_memory.md` +- `ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md` +- python-trio/trio#1614 — trio + fork hazards diff --git a/ai/conc-anal/subint_fork_from_main_thread_smoketest.py b/ai/conc-anal/subint_fork_from_main_thread_smoketest.py new file mode 100644 index 000000000..08166eac8 --- /dev/null +++ b/ai/conc-anal/subint_fork_from_main_thread_smoketest.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python3 +''' +Standalone CPython-level feasibility check for the "main-interp +worker-thread forkserver + subint-hosted trio" architecture +proposed as a workaround to the CPython-level refusal +documented in +`ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md`. + +Purpose +------- +Deliberately NOT a `tractor` test. Zero `tractor` imports. +Uses `_interpreters` (private stdlib) + `os.fork()` directly so +the signal is unambiguous — pass/fail here is a property of +CPython alone, independent of our runtime. + +Run each scenario in isolation; the child's fate is observable +only via `os.waitpid()` of the parent and the scenario's own +status prints. 
+ +Scenarios (pick one with `--scenario `) +--------------------------------------------- + +- `control_subint_thread_fork` — the KNOWN-BROKEN case we + documented in `subint_fork_blocked_by_cpython_post_fork_issue.md`: + drive a subint from a thread, call `os.fork()` inside its + `_interpreters.exec()`, watch the child abort. **Included as + a control** — if this scenario DOESN'T abort the child, our + analysis is wrong and we should re-check everything. + +- `main_thread_fork` — baseline sanity. Call `os.fork()` from + the process's main thread. Must always succeed; if this + fails something much bigger is broken. + +- `worker_thread_fork` — the architectural assertion. Spawn a + regular `threading.Thread` (attached to main interp, NOT a + subint), have IT call `os.fork()`. Child should survive + post-fork cleanup. + +- `full_architecture` — end-to-end: main-interp worker thread + forks. In the child, fork-thread (still main-interp) creates + a subint, drives a second worker thread inside it that runs + a trivial `trio.run()`. Validates the "root runtime lives in + a subint in the child" piece of the proposed arch. + +All scenarios print a self-contained pass/fail banner. Exit +code 0 on expected outcome (which for `control_*` means "child +aborted", not "child succeeded"!). + +Requires Python 3.14+. + +Usage +----- +:: + + python subint_fork_from_main_thread_smoketest.py \\ + --scenario main_thread_fork + + python subint_fork_from_main_thread_smoketest.py \\ + --scenario full_architecture + +''' +from __future__ import annotations +import argparse +import os +import sys +import threading +import time + + +# Hard-require py3.14 for the public `concurrent.interpreters` +# API (we still drop to `_interpreters` internally, same as +# `tractor.spawn._subint`). +try: + from concurrent import interpreters as _public_interpreters # noqa: F401 + import _interpreters # type: ignore +except ImportError: + print( + 'FAIL (setup): requires Python 3.14+ ' + '(missing `concurrent.interpreters`)', + file=sys.stderr, + ) + sys.exit(2) + + +# The actual primitives this script exercises live in +# `tractor.spawn._subint_forkserver` — we re-import them here +# rather than inlining so the module and the validation stay +# in sync. (Early versions of this file had them inline for +# the "zero tractor imports" isolation guarantee; now that +# CPython-level feasibility is confirmed, the validated +# primitives have moved into tractor proper.) 
+from tractor.spawn._main_thread_forkserver import ( + fork_from_worker_thread, + wait_child, +) +from tractor.spawn._subint_forkserver import ( + run_subint_in_worker_thread, +) + + +# ---------------------------------------------------------------- +# small observability helpers (test-harness only) +# ---------------------------------------------------------------- + + +def _banner(title: str) -> None: + line = '=' * 60 + print(f'\n{line}\n{title}\n{line}', flush=True) + + +def _report( + label: str, + *, + ok: bool, + status_str: str, + expect_exit_ok: bool, +) -> None: + verdict: str = 'PASS' if ok else 'FAIL' + expected_str: str = ( + 'normal exit (rc=0)' + if expect_exit_ok + else 'abnormal death (signal or nonzero exit)' + ) + print( + f'[{verdict}] {label}: ' + f'expected {expected_str}; observed {status_str}', + flush=True, + ) + + +# ---------------------------------------------------------------- +# scenario: `control_subint_thread_fork` (known-broken) +# ---------------------------------------------------------------- + + +def scenario_control_subint_thread_fork() -> int: + _banner( + '[control] fork from INSIDE a subint (expected: child aborts)' + ) + interp_id = _interpreters.create('legacy') + print(f' created subint {interp_id}', flush=True) + + # Shared flag: child writes a sentinel file we can detect from + # the parent. If the child manages to write this, CPython's + # post-fork refusal is NOT happening → analysis is wrong. + sentinel = '/tmp/subint_fork_smoketest_control_child_ran' + try: + os.unlink(sentinel) + except FileNotFoundError: + pass + + bootstrap = ( + 'import os\n' + 'pid = os.fork()\n' + 'if pid == 0:\n' + # child — if CPython's refusal fires this code never runs + f' with open({sentinel!r}, "w") as f:\n' + ' f.write("ran")\n' + ' os._exit(0)\n' + 'else:\n' + # parent side (inside the launchpad subint) — stash the + # forked PID on a shareable dict so we can waitpid() + # from the outer main interp. We can't just return it; + # _interpreters.exec() returns nothing useful. + ' import builtins\n' + ' builtins._forked_child_pid = pid\n' + ) + + # NOTE, we can't easily pull state back from the subint. + # For the CONTROL scenario we just time-bound the fork + + # check the sentinel. If sentinel exists → child ran → + # analysis wrong. If not → child aborted → analysis + # confirmed. + done = threading.Event() + + def _drive() -> None: + try: + _interpreters.exec(interp_id, bootstrap) + except Exception as err: + print( + f' subint bootstrap raised (expected on some ' + f'CPython versions): {type(err).__name__}: {err}', + flush=True, + ) + finally: + done.set() + + t = threading.Thread( + target=_drive, + name='control-subint-fork-launchpad', + daemon=True, + ) + t.start() + done.wait(timeout=5.0) + t.join(timeout=2.0) + + # Give the (possibly-aborted) child a moment to die. + time.sleep(0.5) + + sentinel_present = os.path.exists(sentinel) + verdict = ( + # "PASS" for our analysis means sentinel NOT present. 
+ 'PASS' if not sentinel_present else 'FAIL (UNEXPECTED)' + ) + print( + f'[{verdict}] control: sentinel present={sentinel_present} ' + f'(analysis predicts False — child should abort before ' + f'writing)', + flush=True, + ) + if sentinel_present: + os.unlink(sentinel) + + try: + _interpreters.destroy(interp_id) + except _interpreters.InterpreterError: + pass + + return 0 if not sentinel_present else 1 + + +# ---------------------------------------------------------------- +# scenario: `main_thread_fork` (baseline sanity) +# ---------------------------------------------------------------- + + +def scenario_main_thread_fork() -> int: + _banner( + '[baseline] fork from MAIN thread (expected: child exits normally)' + ) + + pid = os.fork() + if pid == 0: + os._exit(0) + + return 0 if _wait_child( + pid, + label='main_thread_fork', + expect_exit_ok=True, + ) else 1 + + +# ---------------------------------------------------------------- +# scenario: `worker_thread_fork` (architectural assertion) +# ---------------------------------------------------------------- + + +def _run_worker_thread_fork_scenario( + label: str, + *, + child_target=None, +) -> int: + ''' + Thin wrapper: delegate the actual fork to the + `tractor.spawn._subint_forkserver` primitive, then wait + on the child and render a pass/fail banner. + + ''' + try: + pid: int = fork_from_worker_thread( + child_target=child_target, + thread_name=f'worker-fork-thread[{label}]', + ) + except RuntimeError as err: + print(f'[FAIL] {label}: {err}', flush=True) + return 1 + print(f' forked child pid={pid}', flush=True) + ok, status_str = wait_child(pid, expect_exit_ok=True) + _report( + label, + ok=ok, + status_str=status_str, + expect_exit_ok=True, + ) + return 0 if ok else 1 + + +def scenario_worker_thread_fork() -> int: + _banner( + '[arch] fork from MAIN-INTERP WORKER thread ' + '(expected: child exits normally — this is the one ' + 'that matters)' + ) + return _run_worker_thread_fork_scenario( + 'worker_thread_fork', + ) + + +# ---------------------------------------------------------------- +# scenario: `full_architecture` +# ---------------------------------------------------------------- + + +_CHILD_TRIO_BOOTSTRAP: str = ( + 'import trio\n' + 'async def _main():\n' + ' await trio.sleep(0.05)\n' + ' return 42\n' + 'result = trio.run(_main)\n' + 'assert result == 42, f"trio.run returned {result}"\n' + 'print(" CHILD subint: trio.run OK, result=42", ' + 'flush=True)\n' +) + + +def _child_trio_in_subint() -> int: + ''' + CHILD-side `child_target`: drive a trivial `trio.run()` + inside a fresh legacy-config subint on a worker thread, + using the `tractor.spawn._subint_forkserver.run_subint_in_worker_thread` + primitive. Returns 0 on success. 
+ + ''' + try: + run_subint_in_worker_thread( + _CHILD_TRIO_BOOTSTRAP, + thread_name='child-subint-trio-thread', + ) + except RuntimeError as err: + print( + f' CHILD: run_subint_in_worker_thread timed out / thread ' + f'never returned: {err}', + flush=True, + ) + return 3 + except BaseException as err: + print( + f' CHILD: subint bootstrap raised: ' + f'{type(err).__name__}: {err}', + flush=True, + ) + return 4 + return 0 + + +def scenario_full_architecture() -> int: + _banner( + '[arch-full] worker-thread fork + child runs trio in a ' + 'subint (end-to-end proposed arch)' + ) + return _run_worker_thread_fork_scenario( + 'full_architecture', + child_target=_child_trio_in_subint, + ) + + +# ---------------------------------------------------------------- +# main +# ---------------------------------------------------------------- + + +SCENARIOS: dict[str, Callable[[], int]] = { + 'control_subint_thread_fork': scenario_control_subint_thread_fork, + 'main_thread_fork': scenario_main_thread_fork, + 'worker_thread_fork': scenario_worker_thread_fork, + 'full_architecture': scenario_full_architecture, +} + + +def main() -> int: + ap = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + ap.add_argument( + '--scenario', + choices=sorted(SCENARIOS.keys()), + required=True, + ) + args = ap.parse_args() + return SCENARIOS[args.scenario]() + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md b/ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md new file mode 100644 index 000000000..07214dad0 --- /dev/null +++ b/ai/conc-anal/subint_forkserver_mp_shared_memory_issue.md @@ -0,0 +1,187 @@ +# `subint_forkserver` × `multiprocessing.SharedMemory`: fork-inherited `resource_tracker` fd + +Surfaced by `tests/test_shm.py` under +`--spawn-backend=subint_forkserver`. Two distinct +failure modes, one root cause: +**`multiprocessing.resource_tracker` is fork-without-exec +unsafe** (canonical CPython class — bpo-38119, bpo-45209). + +**Status: resolved by `tractor/ipc/_mp_bs.py` + +`tractor/ipc/_shm.py` changes (see "Resolution" below). +This doc kept as the +post-mortem / decision record.** + +## TL;DR + +`mp.shared_memory.SharedMemory` registers each shm +allocation with the per-process +`multiprocessing.resource_tracker` singleton. The +tracker is a daemon process started lazily; the +parent owns a unix-pipe-fd to it. When the parent +forks-without-execing into a `subint_forkserver` +child, the child inherits that fd — but it refers to +the *parent's* tracker, which the child has no +business writing to. + +Two manifestations under the original (pre-fix) code: + +1. **`test_child_attaches_alot`** — child loops 1000× + `attach_shm_list()`. First `mp.SharedMemory` call + in the child triggers + `resource_tracker._ensure_running_and_write` → + `_teardown_dead_process` → `os.close(self._fd)` on + an fd the child should never have touched. Surfaces + as `OSError: [Errno 9] Bad file descriptor` + wrapped in `tractor.RemoteActorError`. + +2. **`test_parent_writer_child_reader[*]`** — first + parametrize variant "passes" (with + `resource_tracker: leaked shared_memory` warning) + because nobody ever cleans up `/shm_list`. + Subsequent variants then fail with + `FileExistsError: '/shm_list'` because the leak + persists across the parametrize loop and forkserver + children can't `shm_open(create=True)` an existing + key. 
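Manifestation 2's collision shape reproduces with the stdlib alone (illustrative only, outside tractor; the segment name here is made up):

```python
from multiprocessing.shared_memory import ShareableList

# first "variant" creates the segment and never unlinks it ...
first = ShareableList([0] * 8, name='demo_shm_list')
first.shm.close()  # close() unmaps but does NOT remove /dev/shm/demo_shm_list

# ... so the next create-with-the-same-key fails, just like the later
# parametrize variants do once the first one leaks `/shm_list`.
try:
    ShareableList([0] * 8, name='demo_shm_list')
except FileExistsError:
    print('stale segment from the previous variant still present')

first.shm.unlink()  # manual cleanup — the point is nothing does this for you
```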
+ +Trio backend (`mp_spawn`-style) doesn't surface this: +each subactor `exec`s a fresh interpreter → +independent resource tracker per subactor → no +inherited-fd issue, and the test's pre-existing leak +gets masked by the per-process tracker reset. + +Under `subint_forkserver`, the child is `os.fork()`'d +from a worker thread (no `exec`) → inherits parent's +`mp.resource_tracker._resource_tracker._fd` → EBADF +/ cross-talk on first `mp.SharedMemory` op. + +## Resolution + +We side-step the broken upstream machinery entirely +rather than try to make it fork-safe. Two-part fix +landed (commits to follow this doc): + +### 1. `tractor/ipc/_mp_bs.py::disable_mantracker()` + — unconditional disable + +The previous "3.13+ short-circuit" path used +`partial(SharedMemory, track=False)` to opt-out of +registration on 3.13+. The `track=False` switch is +necessary but not sufficient under fork: the +inherited tracker fd can still be touched indirectly +(e.g. through `_ensure_running_and_write`'s +self-check path). + +The fix takes both belts AND suspenders: + +- **Always** monkey-patch + `mp.resource_tracker._resource_tracker` to a + no-op `ManTracker` subclass whose + `register`/`unregister`/`ensure_running` are all + empty. +- **Always** wrap `SharedMemory` with + `track=False`. + +Result: the inherited tracker fd in the fork child +is still inherited (fd is a kernel object; we can't +un-inherit it across fork) but **nothing in the +shm code path will ever try to use it** — both the +tracker singleton and the per-allocation registration +are short-circuited. + +### 2. `tractor/ipc/_shm.py::open_shm_list()` + — own the cleanup + +Without `mp.resource_tracker`, nobody else will +unlink leaked segments at process exit. tractor +already controls actor lifecycle, so we register +unlink on the actor's lifetime stack: + +```python +def try_unlink(): + try: + shml.shm.unlink() + except FileNotFoundError as fne: + log.exception(...) # benign sibling-already-cleaned race + +actor.lifetime_stack.callback(try_unlink) +``` + +The `FileNotFoundError` swallow handles the case +where a sibling actor already unlinked the same +segment (legitimate race in shared-key setups). + +## Why this is the right call + +- **mp's tracker is widely criticized.** The + in-tree comment "non-SC madness" predates this + fix and matches CPython upstream's own discomfort + (e.g. the per-context tracker design rework + discussions in bpo-43475). +- **tractor already owns process lifecycle.** We + have `actor.lifetime_stack`, `Portal.cancel_actor`, + and the IPC cancel cascade. Adding mp's tracker + on top buys nothing we can't do better ourselves. +- **Backend-uniform.** No special-casing per spawn + backend. trio (`mp_spawn`-style), `subint_forkserver`, + and the future `subint` all behave identically + — register-time no-op, exit-time unlink-via- + lifetime-stack. + +## Trade-offs / known gaps + +- **Crash-leaked segments.** If an actor segfaults + or is `SIGKILL`'d before its lifetime stack runs, + `/dev/shm/` will leak. Mitigation: + `scripts/tractor-reap --shm` walks `/dev/shm`, + filters to segments owned by the current uid that + no live process is mapping or holding open (via + `/proc/*/maps` + `/proc/*/fd/*`), and unlinks + them. The "nobody-has-it-open" filter is + kernel-canonical so it never touches in-flight + segments held by sibling apps (verified locally + against 81 piker/lttng/aja-held segments — all + preserved). 
+ - Higher-level apps using shm should still pin a + UUID into the key (the `'shml_'` pattern + in `test_child_attaches_alot`) so concurrent + sessions don't collide on the same key. +- **Cross-actor unlink races.** Two actors holding + the same shm key racing on `unlink()` — handled + by the `FileNotFoundError` swallow. +- **Crashes won't show up in mp's leak warning.** + We've turned off `resource_tracker`, so the usual + `resource_tracker: There appear to be N leaked + shared_memory objects to clean up at shutdown` + warning is gone too. If we ever want it back as + a crash-detection signal, we'd need our own + equivalent (walk the actor's `_shm_list_keys` set + at root teardown, log any unfreed). + +## Verification + +```sh +# fixed under both backends: +./py314/bin/python -m pytest tests/test_shm.py \ + --spawn-backend=subint_forkserver +# 7 passed + +./py314/bin/python -m pytest tests/test_shm.py \ + --spawn-backend=trio +# 7 passed (regression check) +``` + +## References + +- CPython upstream issues: + - https://bugs.python.org/issue38119 (fork + + resource_tracker fd inheritance) + - https://bugs.python.org/issue45209 + (SharedMemory + resource_tracker) + - https://bugs.python.org/issue43475 + (per-context tracker rework discussion) +- Long-term alternative: migrate off + `multiprocessing.shared_memory` entirely to + `posix_ipc` (no tracker) or finish the + `hotbaud`-based ringbuf transport. Not blocked on + this fix — both are independently tracked. diff --git a/ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md b/ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md new file mode 100644 index 000000000..50c8a4c65 --- /dev/null +++ b/ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md @@ -0,0 +1,385 @@ +# `subint_forkserver` backend: orphaned-subactor SIGINT wedged in `epoll_wait` + +Follow-up to the Phase C `subint_forkserver` spawn-backend +PR (see `tractor.spawn._subint_forkserver`, issue #379). +Surfaced by the xfail'd +`tests/spawn/test_subint_forkserver.py::test_orphaned_subactor_sigint_cleanup_DRAFT`. + +Related-but-distinct from +`subint_cancel_delivery_hang_issue.md` (orphaned-channel +park AFTER subint teardown) and +`subint_sigint_starvation_issue.md` (GIL-starvation, +SIGINT never delivered): here the SIGINT IS delivered, +trio's handler IS installed, but trio's event loop never +wakes — so the KBI-at-checkpoint → `_trio_main` catch path +(which is the runtime's *intentional* OS-cancel design) +never fires. + +## TL;DR + +When a `subint_forkserver`-spawned subactor is orphaned +(parent `SIGKILL`'d, no IPC cancel path available) and then +externally `SIGINT`'d, the subactor hangs in +`trio/_core/_io_epoll.py::get_events` (epoll_wait) +indefinitely — even though: + +1. `threading.current_thread() is threading.main_thread()` + post-fork (CPython 3.14 re-designates correctly). +2. Trio's SIGINT handler IS installed in the subactor + (`signal.getsignal(SIGINT)` returns + `.handler at 0x...>`). +3. The kernel does deliver SIGINT — the signal arrives at + the only thread in the process (the fork-inherited + worker which IS now "main" per Python). + +Yet `epoll_wait` does not return. Trio's wakeup-fd mechanism +— the machinery that turns SIGINT into an epoll-wake — is +somehow not firing the wakeup. Until that's fixed, the +intentional "KBI-as-OS-cancel" path in +`tractor/spawn/_entry.py::_trio_main:164` is unreachable +for forkserver-spawned subactors whose parent dies. 
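+For reference, the "wakeup-fd mechanism" named above can
+be demonstrated without trio at all. A minimal sketch
+(Linux-only, plain CPython — no trio/tractor) of the
+signal → wakeup-fd → epoll-wake pipeline whose final hop
+is the one that isn't firing in the stuck subactor:
+
+```python
+import os
+import select
+import signal
+
+# a Python-level handler must be installed for the pipeline to engage
+signal.signal(signal.SIGINT, lambda signum, frame: None)
+
+r, w = os.pipe()
+os.set_blocking(w, False)           # set_wakeup_fd requires non-blocking
+signal.set_wakeup_fd(w)             # signal delivery writes one byte here
+
+ep = select.epoll()
+ep.register(r, select.EPOLLIN)
+
+signal.raise_signal(signal.SIGINT)  # stand-in for the external `kill -INT`
+events = ep.poll(5.0)               # wakes immediately: the byte is readable
+assert events, 'wakeup-fd never woke the poller'
+print('epoll woke on SIGINT via wakeup-fd:', os.read(r, 1))
+```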
+ +## Symptom + +Test: `tests/spawn/test_subint_forkserver.py::test_orphaned_subactor_sigint_cleanup_DRAFT` +(currently marked `@pytest.mark.xfail(strict=True)`). + +1. Harness subprocess brings up a tractor root actor + + one `run_in_actor(_sleep_forever)` subactor via + `try_set_start_method('subint_forkserver')`. +2. Harness prints `CHILD_PID` (subactor) and + `PARENT_READY` (root actor) markers to stdout. +3. Test `os.kill(parent_pid, SIGKILL)` + `proc.wait()` + to fully reap the root-actor harness. +4. Child (now reparented to pid 1) is still alive. +5. Test `os.kill(child_pid, SIGINT)` and polls + `os.kill(child_pid, 0)` for up to 10s. +6. **Observed**: the child is still alive at deadline — + SIGINT did not unwedge the trio loop. + +## What the "intentional" cancel path IS + +`tractor/spawn/_entry.py::_trio_main:157-186` — + +```python +try: + if infect_asyncio: + actor._infected_aio = True + run_as_asyncio_guest(trio_main) + else: + trio.run(trio_main) + +except KeyboardInterrupt: + logmeth = log.cancel + exit_status: str = ( + 'Actor received KBI (aka an OS-cancel)\n' + ... + ) +``` + +The "KBI == OS-cancel" mapping IS the runtime's +deliberate, documented design. An OS-level SIGINT should +flow as: kernel → trio handler → KBI at trio checkpoint +→ unwinds `async_main` → surfaces at `_trio_main`'s +`except KeyboardInterrupt:` → `log.cancel` + clean `rc=0`. + +**So fixing this hang is not "add a new SIGINT behavior" — +it's "make the existing designed behavior actually fire in +this backend config".** That's why option (B) ("fix root +cause") is aligned with existing design intent, not a +scope expansion. + +## Evidence + +### Positive control: standalone fork-from-worker + `trio.run(sleep_forever)` + SIGINT WORKS + +```python +import os, signal, time, trio +from tractor.spawn._subint_forkserver import ( + fork_from_worker_thread, wait_child, +) + +def child_target() -> int: + async def _main(): + try: + await trio.sleep_forever() + except KeyboardInterrupt: + print('CHILD: caught KBI — trio SIGINT works!') + return + trio.run(_main) + return 0 + +pid = fork_from_worker_thread(child_target, thread_name='trio-sigint-test') +time.sleep(1.0) +os.kill(pid, signal.SIGINT) +wait_child(pid) +``` + +Result: `CHILD: caught KBI — trio SIGINT works!` + clean +exit. So the fork-child + trio signal plumbing IS healthy +in isolation. The hang appears only with the full tractor +subactor runtime on top. + +### Negative test: full tractor subactor + orphan-SIGINT + +Equivalent to the xfail test. Traceback dump via +`faulthandler.register(SIGUSR1, all_threads=True)` at the +stuck moment: + +``` +Current thread 0x00007... [subint-forkserv] (most recent call first): + File ".../trio/_core/_io_epoll.py", line 245 in get_events + File ".../trio/_core/_run.py", line 2415 in run + File "tractor/spawn/_entry.py", line 162 in _trio_main + File "tractor/_child.py", line 72 in _actor_child_main + File "tractor/spawn/_subint_forkserver.py", line 650 in _child_target + File "tractor/spawn/_subint_forkserver.py", line 308 in _worker + File ".../threading.py", line 1024 in run +``` + +### Thread + signal-mask inventory of the stuck subactor + +Single thread (`tid == pid`, comm `'subint-forkserv'`, +which IS `threading.main_thread()` post-fork): + +``` +SigBlk: 0000000000000000 # nothing blocked +SigIgn: 0000000001001000 # SIGPIPE etc (Python defaults) +SigCgt: 0000000108000202 # bit 1 = SIGINT caught +``` + +Bit 1 set in `SigCgt` → SIGINT handler IS installed. 
So +trio's handler IS in place at the kernel level — not a +"handler missing" situation. + +### Handler identity + +Inside the subactor's RPC body, `signal.getsignal(SIGINT)` +returns `.handler at +0x...>` — trio's own `KIManager` handler. tractor's only +SIGINT touches are `signal.getsignal()` *reads* (to stash +into `debug.DebugStatus._trio_handler`); nothing writes +over trio's handler outside the debug-REPL shielding path +(`devx/debug/_tty_lock.py::shield_sigint`) which isn't +engaged here (no debug_mode). + +## Ruled out + +- **GIL starvation / signal-pipe-full** (class A, + `subint_sigint_starvation_issue.md`): subactor runs on + its own GIL (separate OS process), not sharing with the + parent → no cross-process GIL contention. And `strace`- + equivalent in the signal mask shows SIGINT IS caught, + not queued. +- **Orphaned channel park** (`subint_cancel_delivery_hang_issue.md`): + different failure mode — that one has trio iterating + normally and getting wedged on an orphaned + `chan.recv()` AFTER teardown. Here trio's event loop + itself never wakes. +- **Tractor explicitly catching + swallowing KBI**: + greppable — the one `except KeyboardInterrupt:` in the + runtime is the INTENTIONAL cancel-path catch at + `_trio_main:164`. `async_main` uses `except Exception` + (not BaseException), so KBI should propagate through + cleanly if it ever fires. +- **Missing `signal.set_wakeup_fd` (main-thread + restriction)**: post-fork, the fork-worker thread IS + `threading.main_thread()`, so trio's main-thread check + passes and its wakeup-fd install should succeed. + +## Root cause hypothesis (unverified) + +The SIGINT handler fires but trio's wakeup-fd write does +not wake `epoll_wait`. Candidate causes, ranked by +plausibility: + +1. **Wakeup-fd lifecycle race around tractor IPC setup.** + `async_main` spins up an IPC server + `process_messages` + loops early. Somewhere in that path the wakeup-fd that + trio registered with its epoll instance may be + closed/replaced/clobbered, so subsequent SIGINT writes + land on an fd that's no longer in the epoll set. + Evidence needed: compare + `signal.set_wakeup_fd(-1)` return value inside a + post-tractor-bringup RPC body vs. a pre-bringup + equivalent. If they differ, that's it. +2. **Shielded cancel scope around `process_messages`.** + The RPC message loop is likely wrapped in a trio cancel + scope; if that scope is `shield=True` at any outer + layer, KBI scheduled at a checkpoint could be absorbed + by the shield and never bubble out to `_trio_main`. +3. **Pre-fork wakeup-fd inheritance.** trio in the PARENT + process registered a wakeup-fd with its own epoll. The + child inherits the fd number but not the parent's + epoll instance — if tractor/trio re-uses the parent's + stale fd number anywhere, writes would go to a no-op + fd. (This is the least likely — `trio.run()` on the + child calls `KIManager.install` which should install a + fresh wakeup-fd from scratch.) + +## Cross-backend scope question + +**Untested**: does the same orphan-SIGINT hang reproduce +against the `trio_proc` backend (stock subprocess + exec)? +If yes → pre-existing tractor bug, independent of +`subint_forkserver`. If no → something specific to the +fork-from-worker path (e.g. inherited fds, mid-epoll-setup +interference). 
+ +**Quick repro for trio_proc**: + +```python +# save as /tmp/trio_proc_orphan_sigint_repro.py +import os, sys, signal, time, glob +import subprocess as sp + +SCRIPT = ''' +import os, sys, trio, tractor +async def _sleep_forever(): + print(f"CHILD_PID={os.getpid()}", flush=True) + await trio.sleep_forever() + +async def _main(): + async with ( + tractor.open_root_actor(registry_addrs=[("127.0.0.1", 12350)]), + tractor.open_nursery() as an, + ): + await an.run_in_actor(_sleep_forever, name="sf-child") + print(f"PARENT_READY={os.getpid()}", flush=True) + await trio.sleep_forever() + +trio.run(_main) +''' + +proc = sp.Popen( + [sys.executable, '-c', SCRIPT], + stdout=sp.PIPE, stderr=sp.STDOUT, +) +# parse CHILD_PID + PARENT_READY off proc.stdout ... +# SIGKILL parent, SIGINT child, poll. +``` + +If that hangs too, open a broader issue; if not, this is +`subint_forkserver`-specific (likely fd-inheritance-related). + +## Why this is ours to fix (not CPython's) + +- Signal IS delivered (`SigCgt` bitmask confirms). +- Handler IS installed (trio's `KIManager`). +- Thread identity is correct post-fork. +- `_trio_main` already has the intentional KBI→clean-exit + path waiting to fire. + +Every CPython-level precondition is met. Something in +tractor's runtime or trio's integration with it is +breaking the SIGINT→wakeup→event-loop-wake pipeline. + +## Possible fix directions + +1. **Audit the wakeup-fd across tractor's IPC bringup.** + Add a trio startup hook that captures + `signal.set_wakeup_fd(-1)` at `_trio_main` entry, + after `async_main` enters, and periodically — assert + it's unchanged. If it moves, track down the writer. +2. **Explicit `signal.set_wakeup_fd` reset after IPC + setup.** Brute force: re-install a fresh wakeup-fd + mid-bringup. Band-aid, but fast to try. +3. **Ensure no `shield=True` cancel scope envelopes the + RPC-message-loop / IPC-server task.** If one does, + KBI-at-checkpoint never escapes. +4. **Once fixed, the `child_sigint='trio'` mode on + `subint_forkserver_proc`** becomes effectively a no-op + or a doc-only mode — trio's natural handler already + does the right thing. Might end up removing the flag + entirely if there's no behavioral difference between + modes. + +## Current workaround + +None; `child_sigint` defaults to `'ipc'` (IPC cancel is +the only reliable cancel path today), and the xfail test +documents the gap. Operators hitting orphan-SIGINT get a +hung process that needs `SIGKILL`. 
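+A minimal probe sketch for fix direction (1) above
+(`assert_wakeup_fd_unchanged` is a hypothetical helper,
+not in-tree). Since `signal.set_wakeup_fd()` returns the
+previously installed fd, a read-then-restore snapshot can
+be dropped at `_trio_main` entry, after IPC bringup, and
+periodically in between (main thread only):
+
+```python
+import signal
+
+_baseline: int | None = None
+
+def _current_wakeup_fd() -> int:
+    # read (momentarily clearing) the installed wakeup-fd, then restore.
+    # NB: the restore uses the default warn_on_full_buffer flag — fine
+    # for a throwaway debug probe, not for production.
+    fd = signal.set_wakeup_fd(-1)
+    if fd != -1:
+        signal.set_wakeup_fd(fd)
+    return fd
+
+def assert_wakeup_fd_unchanged(where: str) -> None:
+    global _baseline
+    fd = _current_wakeup_fd()
+    if _baseline is None:
+        _baseline = fd
+        print(f'[wakeup-fd probe] baseline fd={fd} at {where}', flush=True)
+    elif fd != _baseline:
+        raise AssertionError(
+            f'wakeup-fd changed {_baseline} -> {fd} at {where}'
+        )
+```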
+ +## Reproducer + +Inline, standalone (no pytest): + +```python +# save as /tmp/orphan_sigint_repro.py (py3.14+) +import os, sys, signal, time, glob, trio +import tractor +from tractor.spawn._subint_forkserver import ( + fork_from_worker_thread, +) + +async def _sleep_forever(): + print(f'SUBACTOR[{os.getpid()}]', flush=True) + await trio.sleep_forever() + +async def _main(): + async with ( + tractor.open_root_actor( + registry_addrs=[('127.0.0.1', 12349)], + ), + tractor.open_nursery() as an, + ): + await an.run_in_actor(_sleep_forever, name='sf-child') + await trio.sleep_forever() + +def child_target() -> int: + from tractor.spawn._spawn import try_set_start_method + try_set_start_method('subint_forkserver') + trio.run(_main) + return 0 + +pid = fork_from_worker_thread(child_target, thread_name='repro') +time.sleep(3.0) + +# find the subactor pid via /proc +children = [] +for path in glob.glob(f'/proc/{pid}/task/*/children'): + with open(path) as f: + children.extend(int(x) for x in f.read().split() if x) +subactor_pid = children[0] + +# SIGKILL root → orphan the subactor +os.kill(pid, signal.SIGKILL) +os.waitpid(pid, 0) +time.sleep(0.3) + +# SIGINT the orphan — should cause clean trio exit +os.kill(subactor_pid, signal.SIGINT) + +# poll for exit +for _ in range(100): + try: + os.kill(subactor_pid, 0) + time.sleep(0.1) + except ProcessLookupError: + print('HARNESS: subactor exited cleanly ✔') + sys.exit(0) +os.kill(subactor_pid, signal.SIGKILL) +print('HARNESS: subactor hung — reproduced') +sys.exit(1) +``` + +Expected (current): `HARNESS: subactor hung — reproduced`. + +After fix: `HARNESS: subactor exited cleanly ✔`. + +## References + +- `tractor/spawn/_entry.py::_trio_main:157-186` — the + intentional KBI→clean-exit path this bug makes + unreachable. +- `tractor/spawn/_subint_forkserver` — the backend whose + orphan cancel-robustness this blocks. +- `tests/spawn/test_subint_forkserver.py::test_orphaned_subactor_sigint_cleanup_DRAFT` + — the xfail'd reproducer in the test suite. +- `ai/conc-anal/subint_cancel_delivery_hang_issue.md` — + sibling "orphaned channel park" hang (different class). +- `ai/conc-anal/subint_sigint_starvation_issue.md` — + sibling "GIL starvation SIGINT drop" hang (different + class). +- tractor issue #379 — subint backend tracking. diff --git a/ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md b/ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md new file mode 100644 index 000000000..a685f14ff --- /dev/null +++ b/ai/conc-anal/subint_forkserver_test_cancellation_leak_issue.md @@ -0,0 +1,851 @@ +# `subint_forkserver` backend: `test_cancellation.py` multi-level cancel cascade hang + +> **Tracked at:** [#449](https://github.com/goodboy/tractor/issues/449) + +Follow-up tracker: surfaced while wiring the new +`subint_forkserver` spawn backend into the full tractor +test matrix (step 2 of the post-backend-lands plan). +See also +`ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md` +— sibling tracker for a different forkserver-teardown +class which probably shares the same fundamental root +cause (fork-FD-inheritance across nested spawns). + +## TL;DR + +`tests/test_cancellation.py::test_nested_multierrors[subint_forkserver]` +hangs indefinitely under our new backend. The hang is +**inside the graceful IPC cancel cascade** — every actor +in the multi-level tree parks in `epoll_wait` waiting +for IPC messages that never arrive. Not a hard-kill / +tree-reap issue (we don't reach the hard-kill fallback +path at all). 
+ +Working hypothesis (unverified): **`os.fork()` from a +subactor inherits the root parent's IPC listener socket +FDs**. When a first-level subactor forkserver-spawns a +grandchild, that grandchild inherits both its direct +spawner's FDs AND the root's FDs — IPC message routing +becomes ambiguous (or silently sends to the wrong +channel), so the cancel cascade can't reach its target. + +## Corrected diagnosis vs. earlier draft + +An earlier version of this doc claimed the root cause +was **"forkserver teardown doesn't tree-kill +descendants"** (SIGKILL only reaches the direct child, +grandchildren survive and hold TCP `:1616`). That +diagnosis was **wrong**, caused by conflating two +observations: + +1. *5-zombie leak holding :1616* — happened in my own + workflow when I aborted a bg pytest task with + `pkill` (SIGTERM/SIGKILL, not SIGINT). The abrupt + kill skipped the graceful `ActorNursery.__aexit__` + cancel cascade entirely, orphaning descendants to + init. **This was my cleanup bug, not a forkserver + teardown bug.** Codified the fix (SIGINT-first + + bounded wait before SIGKILL) in + `feedback_sc_graceful_cancel_first.md` + + `.claude/skills/run-tests/SKILL.md`. +2. *`test_nested_multierrors` hangs indefinitely* — + the real, separate, forkserver-specific bug + captured by this doc. + +The two symptoms are unrelated. The tree-kill / setpgrp +fix direction proposed earlier would not help (1) (SC- +graceful-cleanup is the right answer there) and would +not help (2) (the hang is in the cancel cascade, not +in the hard-kill fallback). + +## Symptom + +Reproducer (py3.14, clean env): + +```sh +# preflight: ensure clean env +ss -tlnp 2>/dev/null | grep ':1616' && echo 'FOUL — cleanup first!' || echo 'clean' + +./py314/bin/python -m pytest --spawn-backend=subint_forkserver \ + 'tests/test_cancellation.py::test_nested_multierrors[subint_forkserver]' \ + --timeout=30 --timeout-method=thread --tb=short -v +``` + +Expected: `pytest-timeout` fires at 30s with a thread- +dump banner, but the process itself **remains alive +after timeout** and doesn't unwedge on subsequent +SIGINT. Requires SIGKILL to reap. + +## Evidence (tree structure at hang point) + +All 5 processes are kernel-level `S` (sleeping) in +`do_epoll_wait` (trio's event loop waiting on I/O): + +``` +PID PPID THREADS NAME ROLE +333986 1 2 subint-forkserv pytest main (the test body) +333993 333986 3 subint-forkserv "child 1" spawner subactor + 334003 333993 1 subint-forkserv grandchild errorer under child-1 + 334014 333993 1 subint-forkserv grandchild errorer under child-1 +333999 333986 1 subint-forkserv "child 2" spawner subactor (NO grandchildren!) +``` + +### Asymmetric tree depth + +The test's `spawn_and_error(breadth=2, depth=3)` should +have BOTH direct children spawning 2 grandchildren +each, going 3 levels deep. Reality: + +- Child 1 (333993, 3 threads) DID spawn its two + grandchildren as expected — fully booted trio + runtime. +- Child 2 (333999, 1 thread) did NOT spawn any + grandchildren — clearly never completed its + nursery's first `run_in_actor`. Its 1-thread state + suggests the runtime never fully booted (no trio + worker threads for `waitpid`/IPC). + +This asymmetry is the key clue: the two direct +children started identically but diverged. Probably a +race around fork-inherited state (listener FDs, +subactor-nursery channel state) that happens to land +differently depending on spawn ordering. 
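+One way to regenerate the table above against a live
+hang (a sketch; `333986` is the hypothetical pytest-main
+pid from this capture — substitute the real one):
+
+```python
+# walk the process tree via /proc and print each pid's thread count +
+# kernel wait channel (wchan), i.e. the columns in the table above.
+import os
+
+def walk_tree(root_pid: int) -> None:
+    stack: list[int] = [root_pid]
+    while stack:
+        pid = stack.pop()
+        task_dir = f'/proc/{pid}/task'
+        tids = os.listdir(task_dir)
+        comm = open(f'/proc/{pid}/comm').read().strip()
+        wchan = open(f'/proc/{pid}/wchan').read().strip()
+        print(f'{pid:<8} threads={len(tids):<3} {comm:<18} wchan={wchan}')
+        for tid in tids:
+            with open(f'{task_dir}/{tid}/children') as f:
+                stack.extend(int(c) for c in f.read().split())
+
+walk_tree(333986)
+```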
+ +### Parent-side state + +Thread-dump of pytest main (333986) at the hang: + +- Main trio thread — parked in + `trio._core._io_epoll.get_events` (epoll_wait on + its event loop). Waiting for IPC from children. +- Two trio-cache worker threads — each parked in + `outcome.capture(sync_fn)` calling + `os.waitpid(child_pid, 0)`. These are our + `_ForkedProc.wait()` off-loads. They're waiting for + the direct children to exit — but children are + stuck in their own epoll_wait waiting for IPC from + the parent. + +**It's a deadlock, not a leak:** the parent is +correctly running `soft_kill(proc, _ForkedProc.wait, +portal)` (graceful IPC cancel via +`Portal.cancel_actor()`), but the children never +acknowledge the cancel message (or the message never +reaches them through the tangled post-fork IPC). + +## What's NOT the cause (ruled out) + +- **`_ForkedProc.kill()` only SIGKILLs direct pid / + missing tree-kill**: doesn't apply — we never reach + the hard-kill path. The deadlock is in the graceful + cancel cascade. +- **Port `:1616` contention**: ruled out after the + `reg_addr` fixture-wiring fix; each test session + gets a unique port now. +- **GIL starvation / SIGINT pipe filling** (class-A, + `subint_sigint_starvation_issue.md`): doesn't apply + — each subactor is its own OS process with its own + GIL (not legacy-config subint). +- **Child-side `_trio_main` absorbing KBI**: grep + confirmed; `_trio_main` only catches KBI at the + `trio.run()` callsite, which is reached only if the + trio loop exits normally. The children here never + exit trio.run() — they're wedged inside. + +## Hypothesis: FD inheritance across nested forks + +`subint_forkserver_proc` calls +`fork_from_worker_thread()` which ultimately does +`os.fork()` from a dedicated worker thread. Standard +Linux/POSIX fork semantics: **the child inherits ALL +open FDs from the parent**, including listener +sockets, epoll fds, trio wakeup pipes, and the +parent's IPC channel sockets. + +At root-actor fork-spawn time, the root's IPC server +listener FDs are open in the parent. Those get +inherited by child 1. Child 1 then forkserver-spawns +its OWN subactor (grandchild). The grandchild +inherits FDs from child 1 — but child 1's address +space still contains **the root's IPC listener FDs +too** (inherited at first fork). So the grandchild +has THREE sets of FDs: + +1. Its own (created after becoming a subactor). +2. Its direct parent child-1's. +3. The ROOT's (grandparent's) — inherited transitively. + +IPC message routing may be ambiguous in this tangled +state. Or a listener socket that the root thinks it +owns is actually open in multiple processes, and +messages sent to it go to an arbitrary one. That +would exactly match the observed "graceful cancel +never propagates". + +This hypothesis predicts the bug **scales with fork +depth**: single-level forkserver spawn +(`test_subint_forkserver_spawn_basic`) works +perfectly, but any test that spawns a second level +deadlocks. Matches observations so far. + +## Fix directions (to validate) + +### 1. `close_fds=True` equivalent in `fork_from_worker_thread()` + +`subprocess.Popen` / `trio.lowlevel.open_process` have +`close_fds=True` by default on POSIX — they +enumerate open FDs in the child post-fork and close +everything except stdio + any explicitly-passed FDs. +Our raw `os.fork()` doesn't. Adding the equivalent to +our `_worker` prelude would isolate each fork +generation's FD set. 
+ +Implementation sketch in +`tractor.spawn._subint_forkserver.fork_from_worker_thread._worker`: + +```python +def _worker() -> None: + pid: int = os.fork() + if pid == 0: + # CHILD: close inherited FDs except stdio + the + # pid-pipe we just opened. + keep: set[int] = {0, 1, 2, rfd, wfd} + import resource + soft, _ = resource.getrlimit(resource.RLIMIT_NOFILE) + os.closerange(3, soft) # blunt; or enumerate /proc/self/fd + # ... then child_target() as before +``` + +Problem: overly aggressive — closes FDs the +grandchild might legitimately need (e.g. its parent's +IPC channel for the spawn-spec handshake, if we rely +on that). Needs thought about which FDs are +"inheritable and safe" vs. "inherited by accident". + +### 2. Cloexec on tractor's own FDs + +Set `FD_CLOEXEC` on tractor-created sockets (listener +sockets, IPC channel sockets, pipes). This flag +causes automatic close on `execve`, but since we +`fork()` without `exec()`, this alone doesn't help. +BUT — combined with a child-side explicit close- +non-cloexec loop, it gives us a way to mark "my +private FDs" vs. "safe to inherit". Most robust, but +requires tractor-wide audit. + +### 3. Explicit FD cleanup in `_ForkedProc`/`_child_target` + +Have `subint_forkserver_proc`'s `_child_target` +closure explicitly close the parent-side IPC listener +FDs before calling `_actor_child_main`. Requires +being able to enumerate "the parent's listener FDs +that the child shouldn't keep" — plausible via +`Actor.ipc_server`'s socket objects. + +### 4. Use `os.posix_spawn` with explicit `file_actions` + +Instead of raw `os.fork()`, use `os.posix_spawn()` +which supports explicit file-action specifications +(close this FD, dup2 that FD). Cleaner semantics, but +probably incompatible with our "no exec" requirement +(subint_forkserver is a fork-without-exec design). + +**Likely correct answer: (3) — targeted FD cleanup +via `actor.ipc_server` handle.** (1) is too blunt, +(2) is too wide-ranging, (4) changes the spawn +mechanism. + +## Reproducer (standalone, no pytest) + +```python +# save as /tmp/forkserver_nested_hang_repro.py (py3.14+) +import trio, tractor + +async def assert_err(): + assert 0 + +async def spawn_and_error(breadth: int = 2, depth: int = 1): + async with tractor.open_nursery() as n: + for i in range(breadth): + if depth > 0: + await n.run_in_actor( + spawn_and_error, + breadth=breadth, + depth=depth - 1, + name=f'spawner_{i}_{depth}', + ) + else: + await n.run_in_actor( + assert_err, + name=f'errorer_{i}', + ) + +async def _main(): + async with tractor.open_nursery() as n: + for i in range(2): + await n.run_in_actor( + spawn_and_error, + name=f'top_{i}', + breadth=2, + depth=1, + ) + +if __name__ == '__main__': + from tractor.spawn._spawn import try_set_start_method + try_set_start_method('subint_forkserver') + with trio.fail_after(20): + trio.run(_main) +``` + +Expected (current): hangs on `trio.fail_after(20)` +— children never ack the error-propagation cancel +cascade. Pattern: top 2 direct children, 4 +grandchildren, 1 errorer deadlocks while trying to +unwind through its parent chain. + +After fix: `trio.TooSlowError`-free completion; the +root's `open_nursery` receives the +`BaseExceptionGroup` containing the `AssertionError` +from the errorer and unwinds cleanly. + +## Update — 2026-04-23: partial fix landed, deeper layer surfaced + +Three improvements landed as separate commits in the +`subint_forkserver_backend` branch (see `git log`): + +1. 
**`_close_inherited_fds()` in fork-child prelude** + (`tractor/spawn/_subint_forkserver.py`). POSIX + close-fds-equivalent enumeration via + `/proc/self/fd` (or `RLIMIT_NOFILE` fallback), keep + only stdio. This is fix-direction (1) from the list + above — went with the blunt form rather than the + targeted enum-via-`actor.ipc_server` form, turns + out the aggressive close is safe because every + inheritable resource the fresh child needs + (IPC-channel socket, etc.) is opened AFTER the + fork anyway. +2. **`_ForkedProc.wait()` via `os.pidfd_open()` + + `trio.lowlevel.wait_readable()`** — matches the + `trio.Process.wait` / `mp.Process.sentinel` pattern + used by `trio_proc` and `proc_waiter`. Gives us + fully trio-cancellable child-wait (prior impl + blocked a cache thread on a sync `os.waitpid` that + was NOT trio-cancellable due to + `abandon_on_cancel=False`). +3. **`_parent_chan_cs` wiring** in + `tractor/runtime/_runtime.py`: capture the shielded + `loop_cs` for the parent-channel `process_messages` + task in `async_main`; explicitly cancel it in + `Actor.cancel()` teardown. This breaks the shield + during teardown so the parent-chan loop exits when + cancel is issued, instead of parking on a parent- + socket EOF that might never arrive under fork + semantics. + +**Concrete wins from (1):** the sibling +`subint_forkserver_orphan_sigint_hang_issue.md` class +is **now fixed** — `test_orphaned_subactor_sigint_cleanup_DRAFT` +went from strict-xfail to pass. The xfail mark was +removed; the test remains as a regression guard. + +**test_nested_multierrors STILL hangs** though. + +### Updated diagnosis (narrowed) + +DIAGDEBUG instrumentation of `process_messages` ENTER/ +EXIT pairs + `_parent_chan_cs.cancel()` call sites +showed (captured during a 20s-timeout repro): + +- 80 `process_messages` ENTERs, 75 EXITs → 5 stuck. +- **All 40 `shield=True` ENTERs matched EXIT** — every + shielded parent-chan loop exits cleanly. The + `_parent_chan_cs` wiring works as intended. +- **The 5 stuck loops are all `shield=False`** — peer- + channel handlers (inbound connections handled by + `handle_stream_from_peer` in stream_handler_tn). +- After our `_parent_chan_cs.cancel()` fires, NEW + shielded process_messages loops start (on the + session reg_addr port — probably discovery-layer + reconnection attempts). These don't block teardown + (they all exit) but indicate the cancel cascade has + more moving parts than expected. + +### Remaining unknown + +Why don't the 5 peer-channel loops exit when +`service_tn.cancel_scope.cancel()` fires? They're in +`stream_handler_tn` which IS `service_tn` in the +current configuration (`open_ipc_server(parent_tn= +service_tn, stream_handler_tn=service_tn)`). A +standard nursery-scope-cancel should propagate through +them — no shield, no special handler. Something +specific to the fork-spawned configuration keeps them +alive. + +Candidate follow-up experiments: + +- Dump the trio task tree at the hang point (via + `stackscope` or direct trio introspection) to see + what each stuck loop is awaiting. `chan.__anext__` + on a socket recv? An inner lock? A shielded sub-task? +- Compare peer-channel handler lifecycle under + `trio_proc` vs `subint_forkserver` with equivalent + logging to spot the divergence. +- Investigate whether the peer handler is caught in + the `except trio.Cancelled:` path at + `tractor/ipc/_server.py:448` that re-raises — but + re-raise means it should still exit. Unless + something higher up swallows it. 
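+For the first probe in that list, a sketch using only
+public `trio.lowlevel` introspection (no `stackscope`
+dependency) — spawn `dump_task_tree` into any long-lived
+nursery of the actor under investigation and read off the
+periodic dumps at the hang point:
+
+```python
+import trio
+
+def _walk(task: trio.lowlevel.Task, indent: str = '') -> None:
+    # where is this task's coroutine currently suspended?
+    frame = task.coro.cr_frame
+    where = (
+        f'{frame.f_code.co_filename}:{frame.f_lineno}'
+        if frame is not None else '<finished>'
+    )
+    print(f'{indent}{task.name} @ {where}', flush=True)
+    for nursery in task.child_nurseries:
+        for child in nursery.child_tasks:
+            _walk(child, indent + '  ')
+
+async def dump_task_tree(period: float = 5.0) -> None:
+    while True:
+        await trio.sleep(period)
+        print('--- trio task tree ---', flush=True)
+        _walk(trio.lowlevel.current_root_task())
+```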
+ +### Attempted fix (DID NOT work) — hypothesis (3) + +Tried: in `_serve_ipc_eps` finally, after closing +listeners, also iterate `server._peers` and +sync-close each peer channel's underlying stream +socket fd: + +```python +for _uid, _chans in list(server._peers.items()): + for _chan in _chans: + try: + _stream = _chan._transport.stream if _chan._transport else None + if _stream is not None: + _stream.socket.close() # sync fd close + except (AttributeError, OSError): + pass +``` + +Theory: closing the socket fd from outside the stuck +recv task would make the recv see EBADF / +ClosedResourceError and unblock. + +Result: `test_nested_multierrors[subint_forkserver]` +still hangs identically. Either: +- The sync `socket.close()` doesn't propagate into + trio's in-flight `recv_some()` the way I expected + (trio may hold an internal reference that keeps the + fd open even after an external close), or +- The stuck recv isn't even the root blocker and the + peer handlers never reach the finally for some + reason I haven't understood yet. + +Either way, the sync-close hypothesis is **ruled +out**. Reverted the experiment, restored the skip- +mark on the test. + +### Aside: `-s` flag does NOT change `test_nested_multierrors` behavior + +Tested explicitly: both with and without `-s`, the +test hangs identically. So the capture-pipe-fill +hypothesis is **ruled out** for this test. + +The earlier `test_context_stream_semantics.py` `-s` +observation was most likely caused by a competing +pytest run in my session (confirmed via process list +— my leftover pytest was alive at that time and +could have been holding state on the default +registry port). + +## Update — 2026-04-23 (late): cancel delivery ruled in, nursery-wait ruled BLOCKER + +**New diagnostic run** instrumented +`handle_stream_from_peer` at ENTER / `except +trio.Cancelled:` / finally, plus `Actor.cancel()` +just before `self._parent_chan_cs.cancel()`. Result: + +- **40 `handle_stream_from_peer` ENTERs**. +- **0 `except trio.Cancelled:` hits** — cancel + never fires on any peer-handler. +- **35 finally hits** — those handlers exit via + peer-initiated EOF (normal return), NOT cancel. +- **5 handlers never reach finally** — stuck forever. +- **`Actor.cancel()` fired in 12 PIDs** — but the + PIDs with peer handlers that DIDN'T fire + Actor.cancel are exactly **root + 2 direct + spawners**. These 3 actors have peer handlers + (for their own subactors) that stay stuck because + **`Actor.cancel()` at these levels never runs**. + +### The actual deadlock shape + +`Actor.cancel()` lives in +`open_root_actor.__aexit__` / `async_main` teardown. +That only runs when the enclosing `async with +tractor.open_nursery()` exits. The nursery's +`__aexit__` calls the backend `*_proc` spawn target's +teardown, which does `soft_kill() → +_ForkedProc.wait()` on its child PID. That wait is +trio-cancellable via pidfd now (good) — but nothing +CANCELS it because the outer scope only cancels when +`Actor.cancel()` runs, which only runs when the +nursery completes, which waits on the child. + +It's a **multi-level mutual wait**: + +``` +root blocks on spawner.wait() + spawner blocks on grandchild.wait() + grandchild blocks on errorer.wait() + errorer Actor.cancel() ran, but process + may not have fully exited yet + (something in root_tn holding on?) +``` + +Each level waits for the level below. 
The bottom +level (errorer) reaches Actor.cancel(), but its +process may not fully exit — meaning its pidfd +doesn't go readable, meaning the grandchild's +waitpid doesn't return, meaning the grandchild's +nursery doesn't unwind, etc. all the way up. + +### Refined question + +**Why does an errorer process not exit after its +`Actor.cancel()` completes?** + +Possibilities: +1. `_parent_chan_cs.cancel()` fires (shielded + parent-chan loop unshielded), but the task is + stuck INSIDE the shielded loop's recv in a way + that cancel still can't break. +2. After `Actor.cancel()` returns, `async_main` + still has other tasks in `root_tn` waiting for + something that never arrives (e.g. outbound + IPC reply delivery). +3. The `os._exit(rc)` in `_worker` (at + `_subint_forkserver.py`) doesn't run because + `_child_target` never returns. + +Next-session candidate probes (in priority order): + +1. **Instrument `_worker`'s fork-child branch** to + confirm whether `child_target()` returns (and + thus `os._exit(rc)` is reached) for errorer + PIDs. If yes → process should die; if no → + trace back into `_actor_child_main` / + `_trio_main` / `async_main` to find the stuck + spot. +2. **Instrument `async_main`'s final unwind** to + see which await in the teardown doesn't + complete. +3. **Compare under `trio_proc` backend** at the + same `_worker`-equivalent level to see where + the flows diverge. + +### Rule-out: NOT a stuck peer-chan recv + +Earlier hypothesis was that the 5 stuck peer-chan +loops were blocked on a socket recv that cancel +couldn't interrupt. This pass revealed the real +cause: cancel **never reaches those tasks** because +their owning actor's `Actor.cancel()` never runs. +The recvs are fine — they're just parked because +nothing is telling them to stop. + +## Update — 2026-04-23 (very late): leaves exit, middle actors stuck in `trio.run` + +Yet another instrumentation pass — this time +printing at: + +- `_worker` child branch: `pre child_target()` / + `child_target RETURNED rc=N` / `about to + os._exit(rc)` +- `_trio_main`: `about to trio.run` / + `trio.run RETURNED NORMALLY` / `FINALLY` + +**Fresh-run results** (`test_nested_multierrors[ +subint_forkserver]`, depth=1/breadth=2, 1 root + 14 +forked = 15 actors total): + +- **9 processes completed the full flow** — + `trio.run RETURNED NORMALLY` → `child_target + RETURNED rc=0` → `about to os._exit(0)`. These + are the LEAVES of the tree (errorer actors) plus + their direct parents (depth-0 spawners). They + actually exit their processes. +- **5 processes are stuck INSIDE `trio.run(trio_main)`** + — they hit "about to trio.run" but NEVER see + "trio.run RETURNED NORMALLY". These are root + + top-level spawners + one intermediate. + +**What this means:** `async_main` itself is the +deadlock holder, not the peer-channel loops. +Specifically, the outer `async with root_tn:` in +`async_main` never exits for the 5 stuck actors. +Their `trio.run` never returns → `_trio_main` +catch/finally never runs → `_worker` never reaches +`os._exit(rc)` → the PROCESS never dies → its +parent's `_ForkedProc.wait()` blocks → parent's +nursery hangs → parent's `async_main` hangs → ... + +### The new precise question + +**What task in the 5 stuck actors' `async_main` +never completes?** Candidates: + +1. The shielded parent-chan `process_messages` + task in `root_tn` — but we explicitly cancel it + via `_parent_chan_cs.cancel()` in `Actor.cancel()`. 
+ However, `Actor.cancel()` only runs during + `open_root_actor.__aexit__`, which itself runs + only after `async_main`'s outer unwind — which + doesn't happen. So the shield isn't broken. + +2. `await actor_nursery._join_procs.wait()` or + similar in the inline backend `*_proc` flow. + +3. `_ForkedProc.wait()` on a grandchild that + actually DID exit — but the pidfd_open watch + didn't fire for some reason (race between + pidfd_open and the child exiting?). + +The most specific next probe: **add DIAG around +`_ForkedProc.wait()` enter/exit** to see whether +the pidfd-based wait returns for every grandchild +exit. If a stuck parent's `_ForkedProc.wait()` +NEVER returns despite its child exiting, the +pidfd mechanism has a race bug under nested +forkserver. + +Alternative probe: instrument `async_main`'s outer +nursery exits to find which nursery's `__aexit__` +is stuck, drilling down from `trio.run` to the +specific `async with` that never completes. + +### Cascade summary (updated tree view) + +``` +ROOT (pytest) STUCK in trio.run +├── top_0 (spawner, d=1) STUCK in trio.run +│ ├── spawner_0_d1_0 (d=0) exited (os._exit 0) +│ │ ├── errorer_0_0 exited (os._exit 0) +│ │ └── errorer_0_1 exited (os._exit 0) +│ └── spawner_0_d1_1 (d=0) exited (os._exit 0) +│ ├── errorer_0_2 exited (os._exit 0) +│ └── errorer_0_3 exited (os._exit 0) +└── top_1 (spawner, d=1) STUCK in trio.run + ├── spawner_1_d1_0 (d=0) STUCK in trio.run (sibling race?) + │ ├── errorer_1_0 exited + │ └── errorer_1_1 exited + └── spawner_1_d1_1 (d=0) STUCK in trio.run + ├── errorer_1_2 exited + └── errorer_1_3 exited +``` + +Grandchildren (d=0 spawners) exit OR stick — +asymmetric. Not purely depth-determined. Some race +condition in nursery teardown when multiple +siblings error simultaneously. + +## Update — 2026-04-23 (late, probe iteration 3): hang pinpointed to `wait_for_no_more_peers()` + +Further DIAGDEBUG at every milestone in `async_main` +(runtime UP / EXITED service_tn / EXITED root_tn / +FINALLY ENTER / RETURNING) plus `_ForkedProc.wait` +ENTER/RETURNED per-pidfd. Result: + +**Every stuck actor reaches `async_main: FINALLY +ENTER` but NOT `async_main: RETURNING`.** + +That isolates the hang to a specific await in +`async_main`'s finally block at +`tractor/runtime/_runtime.py:1837+`. The suspect: + +```python +# Ensure all peers (actors connected to us as clients) are finished +if ipc_server := actor.ipc_server and ipc_server.has_peers(check_chans=True): + ... + await ipc_server.wait_for_no_more_peers() # ← UNBOUNDED, blocks forever +``` + +`_no_more_peers` is an `Event` set only when +`server._peers` empties (see +`ipc/_server.py:526-530`). If ANY peer-handler is +stuck (the 5 unclosed loops from the earlier pass), +it keeps its channel in `server._peers`, so the +event never fires, so the wait hangs. + +### Applied fix (partial, landed as defensive-in-depth) + +`tractor/runtime/_runtime.py:1981` — +`wait_for_no_more_peers()` call now wrapped in +`trio.move_on_after(3.0)` + a warning log when the +timeout fires. Commented with the full rationale. + +**Verified:** with this fix, ALL 15 actors reach +`async_main: RETURNING` cleanly (up from 10/15 +reaching end before). + +**Unfortunately:** the test still hangs past 45s +total — meaning there's YET ANOTHER unbounded wait +downstream of `async_main`. The bounded +`wait_for_no_more_peers` unblocks one level, but +the cascade has another level above it. + +### Candidates for the remaining hang + +1. 
`open_root_actor`'s own finally / post- + `async_main` flow in `_root.py` — specifically + `await actor.cancel(None)` which has its own + internal waits. +2. The `trio.run()` itself doesn't return even + after the root task completes because trio's + nursery still has background tasks running. +3. Maybe `_serve_ipc_eps`'s finally has an await + that blocks when peers aren't clearing. + +### Current stance + +- Defensive `wait_for_no_more_peers` bound landed + (good hygiene regardless). Revealing a real + deadlock-avoidance gap in tractor's cleanup. +- Test still hangs → skip-mark restored on + `test_nested_multierrors[subint_forkserver]`. +- The full chain of unbounded waits needs another + session of drilling, probably at + `open_root_actor` / `actor.cancel` level. + +### Summary of this investigation's wins + +1. **FD hygiene fix** (`_close_inherited_fds`) — + correct, closed orphan-SIGINT sibling issue. +2. **pidfd-based `_ForkedProc.wait`** — cancellable, + matches trio_proc pattern. +3. **`_parent_chan_cs` wiring** — + `Actor.cancel()` now breaks the shielded parent- + chan `process_messages` loop. +4. **`wait_for_no_more_peers` bounded** — + prevents the actor-level finally hang. +5. **Ruled-out hypotheses:** tree-kill missing + (wrong), stuck socket recv (wrong). +6. **Pinpointed remaining unknown:** at least one + more unbounded wait in the teardown cascade + above `async_main`. Concrete candidates + enumerated above. + +## Update — 2026-04-23 (VERY late): pytest capture pipe IS the final gate + +After landing fixes 1-4 and instrumenting every +layer down to `tractor_test`'s `trio.run(_main)`: + +**Empirical result: with `pytest -s` the test PASSES +in 6.20s.** Without `-s` (default `--capture=fd`) it +hangs forever. + +DIAG timeline for the root pytest PID (with `-s` +implied from later verification): + +``` +tractor_test: about to trio.run(_main) +open_root_actor: async_main task started, yielding to test body +_main: about to await wrapped test fn +_main: wrapped RETURNED cleanly ← test body completed! +open_root_actor: about to actor.cancel(None) +Actor.cancel ENTER req_chan=False +Actor.cancel RETURN +open_root_actor: actor.cancel RETURNED +open_root_actor: outer FINALLY +open_root_actor: finally END (returning from ctxmgr) +tractor_test: trio.run FINALLY (returned or raised) ← trio.run fully returned! +``` + +`trio.run()` fully returns. The test body itself +completes successfully (pytest.raises absorbed the +expected `BaseExceptionGroup`). What blocks is +**pytest's own stdout/stderr capture** — under +`--capture=fd` default, pytest replaces the parent +process's fd 1,2 with pipe write-ends it's reading +from. Fork children inherit those pipe fds +(because `_close_inherited_fds` correctly preserves +stdio). High-volume subactor error-log tracebacks +(7+ actors each logging multiple +`RemoteActorError`/`ExceptionGroup` tracebacks on +the error-propagation cascade) fill the 64KB Linux +pipe buffer. Subactor writes block. Subactor can't +progress. Process doesn't exit. Parent's +`_ForkedProc.wait` (now pidfd-based and +cancellable, but nothing's cancelling here since +the test body already completed) keeps the pipe +reader alive... but pytest isn't draining its end +fast enough because test-teardown/fixture-cleanup +is in progress. + +**Actually** the exact mechanism is slightly +different: pytest's capture fixture MIGHT be +actively reading, but faster-than-writer subactors +overflow its internal buffer. Or pytest might be +blocked itself on the finalization step. 
+ +Either way, `-s` conclusively fixes it. + +### Why I ruled this out earlier (and shouldn't have) + +Earlier in this investigation I tested +`test_nested_multierrors` with/without `-s` and +both hung. That's because AT THAT TIME, fixes 1-4 +weren't all in place yet. The test was hanging at +multiple deeper levels long before reaching the +"generate lots of error-log output" phase. Once +the cascade actually tore down cleanly, enough +output was produced to hit the capture-pipe limit. + +**Classic order-of-operations mistake in +debugging:** ruling something out too early based +on a test that was actually failing for a +different reason. + +### Fix direction (next session) + +Redirect subactor stdout/stderr to `/dev/null` (or +a session-scoped log file) in the fork-child +prelude, right after `_close_inherited_fds()`. This +severs the inherited pytest-capture pipes and lets +subactor output flow elsewhere. Under normal +production use (non-pytest), stdout/stderr would +be the TTY — we'd want to keep that. So the +redirect should be conditional or opt-in via the +`child_sigint`/proc_kwargs flag family. + +Alternative: document as a gotcha and recommend +`pytest -s` for any tests using the +`subint_forkserver` backend with multi-level actor +trees. Simpler, user-visible, no code change. + +### Current state + +- Skip-mark on `test_nested_multierrors[subint_forkserver]` + restored with reason pointing here. +- Test confirmed passing with `-s` after all 4 + cascade fixes applied. +- The 4 cascade fixes are NOT wasted — they're + correct hardening regardless of the capture-pipe + issue, AND without them we'd never reach the + "actually produces enough output to fill the + pipe" state. + +## Stopgap (landed) + +`test_nested_multierrors` skip-marked under +`subint_forkserver` via +`@pytest.mark.skipon_spawn_backend('subint_forkserver', +reason='...')`, cross-referenced to this doc. Mark +should be dropped once the peer-channel-loop exit +issue is fixed. + +## References + +- `tractor/spawn/_subint_forkserver.py::fork_from_worker_thread` + — the primitive whose post-fork FD hygiene is + probably the culprit. +- `tractor/spawn/_subint_forkserver.py::subint_forkserver_proc` + — the backend function that orchestrates the + graceful cancel path hitting this bug. +- `tractor/spawn/_subint_forkserver.py::_ForkedProc` + — the `trio.Process`-compatible shim; NOT the + failing component (confirmed via thread-dump). +- `tests/test_cancellation.py::test_nested_multierrors` + — the test that surfaced the hang. +- `ai/conc-anal/subint_forkserver_orphan_sigint_hang_issue.md` + — sibling hang class; probably same underlying + fork-FD-inheritance root cause. +- tractor issue #379 — subint backend tracking. diff --git a/ai/conc-anal/subint_forkserver_thread_constraints_on_pep684_issue.md b/ai/conc-anal/subint_forkserver_thread_constraints_on_pep684_issue.md new file mode 100644 index 000000000..b3c4563d3 --- /dev/null +++ b/ai/conc-anal/subint_forkserver_thread_constraints_on_pep684_issue.md @@ -0,0 +1,186 @@ +# Revisit `subint_forkserver` thread-cache constraints once msgspec PEP 684 support lands + +> **Tracked at:** [#450](https://github.com/goodboy/tractor/issues/450) + +Follow-up tracker for cleanup work gated on the msgspec +PEP 684 adoption upstream ([jcrist/msgspec#563](https://github.com/jcrist/msgspec/issues/563)). 
+ +Context — why this exists +------------------------- + +The `tractor.spawn._subint_forkserver` submodule currently +carries two "non-trio" thread-hygiene constraints whose +necessity is tangled with issues that *should* dissolve +under PEP 684 isolated-mode subinterpreters: + +1. `fork_from_worker_thread()` / `run_subint_in_worker_thread()` + internally allocate a **dedicated `threading.Thread`** + rather than using `trio.to_thread.run_sync()`. +2. The test helper is named + `run_fork_in_non_trio_thread()` — the + `non_trio` qualifier is load-bearing today. + +This doc catalogs *why* those constraints exist, which of +them isolated-mode would fix, and what the +audit-and-cleanup path looks like once msgspec #563 is +resolved. + +The three reasons the constraints exist +--------------------------------------- + +### 1. GIL-starvation class → fixed by PEP 684 isolated mode + +The class-A hang documented in +`subint_sigint_starvation_issue.md` is entirely about +legacy-config subints **sharing the main GIL**. Once +msgspec #563 lands and tractor flips +`tractor.spawn._subint` to +`concurrent.interpreters.create()` (isolated config), each +subint gets its own GIL. Abandoned subint threads can't +contend for main's GIL → can't starve the main trio loop +→ signal-wakeup-pipe drains normally → no SIGINT-drop. + +This class of hazard **dissolves entirely**. The +non-trio-thread requirement for *this reason* disappears. + +### 2. Destroy race / tstate-recycling → orthogonal; unclear + +The `subint_proc` dedicated-thread fix (commit `26fb8206`) +addressed a different issue: `_interpreters.destroy(interp_id)` +was blocking on a trio-cache worker that had run an +earlier `interp.exec()` for that subint. Working +hypothesis at the time was "the cached thread retains the +subint's tstate". + +But tstate-handling is **not specific to GIL mode** — +`_PyXI_Enter` / `_PyXI_Exit` (the C-level machinery both +configs use to enter/leave a subint from a thread) should +restore the caller's tstate regardless of GIL config. So +isolated mode **doesn't obviously fix this**. It might be: + +- A py3.13 bug fixed in later versions — we saw the race + first on 3.13 and never re-tested on 3.14 after moving + to dedicated threads. +- A genuine CPython quirk around cached threads that + exec'd into a subint, persisting across GIL modes. +- Something else we misdiagnosed — the empirical fix + (dedicated thread) worked but the analysis may have + been incomplete. + +Only way to know: once we're on isolated mode, empirically +retry `trio.to_thread.run_sync(interp.exec, ...)` and see +if `destroy()` still blocks. If it does, keep the +dedicated thread; if not, one constraint relaxed. + +### 3. Fork-from-main-interp-tstate (the constraint in this module's helper names) + +The fork-from-main-interp-tstate invariant — CPython's +`PyOS_AfterFork_Child` → +`_PyInterpreterState_DeleteExceptMain` gate documented in +`subint_fork_blocked_by_cpython_post_fork_issue.md` — is +about the calling thread's **current** tstate at the +moment `os.fork()` runs. If trio's cache threads never +enter subints at all, their tstate is plain main-interp, +and fork from them would be fine. + +The reason the smoke test + +`run_fork_in_non_trio_thread` test helper +currently use a dedicated `threading.Thread` is narrow: +**we don't want to risk a trio cache thread that has +previously been used as a subint driver being the one that +picks up the fork job**. 
If cached tstate doesn't get +cleared (back to reason #2), the fork's child-side +post-init would see the wrong interp and abort. + +In an isolated-mode world where msgspec works: + +- `subint_proc` would use the public + `concurrent.interpreters.create()` + `Interpreter.exec()` + / `Interpreter.close()` — which *should* handle tstate + cleanly (they're the "blessed" API). +- If so, trio's cache threads are safe to fork from + regardless of whether they've previously driven subints. +- → the `non_trio` qualifier in + `run_fork_in_non_trio_thread` becomes + *overcautious* rather than load-bearing, and the + dedicated-thread primitives in `_subint_forkserver.py` + can likely be replaced with straight + `trio.to_thread.run_sync()` wrappers. + +TL;DR +----- + +| constraint | fixed by isolated mode? | +|---|---| +| GIL-starvation (class A) | **yes** | +| destroy race on cached worker | unclear — empirical test on py3.14 + isolated API required | +| fork-from-main-tstate requirement on worker | **probably yes, conditional on the destroy-race question above** | + +If #2 also resolves on py3.14+ with isolated mode, +tractor could drop the `non_trio` qualifier from the fork +helper's name and just use `trio.to_thread.run_sync(...)` +for everything. But **we shouldn't do that preemptively** +— the current cautious design is cheap (one dedicated +thread per fork / per subint-exec) and correct. + +Audit plan when msgspec #563 lands +---------------------------------- + +Assuming msgspec grows `Py_mod_multiple_interpreters` +support: + +1. **Flip `tractor.spawn._subint` to isolated mode.** Drop + the `_interpreters.create('legacy')` call in favor of + the public API (`concurrent.interpreters.create()` + + `Interpreter.exec()` / `Interpreter.close()`). Run the + three `ai/conc-anal/subint_*_issue.md` reproducers — + class-A (`test_stale_entry_is_deleted` etc.) should + pass without the `skipon_spawn_backend('subint')` marks + (revisit the marker inventory). + +2. **Empirical destroy-race retest.** In `subint_proc`, + swap the dedicated `threading.Thread` back to + `trio.to_thread.run_sync(Interpreter.exec, ..., + abandon_on_cancel=False)` and run the full subint test + suite. If `Interpreter.close()` (or the backing + destroy) blocks the same way as the legacy version + did, revert and keep the dedicated thread. + +3. **If #2 clean**, audit `_subint_forkserver.py`: + - Rename `run_fork_in_non_trio_thread` → drop the + `_non_trio_` qualifier (e.g. `run_fork_in_thread`) or + inline the two-line `trio.to_thread.run_sync` call at + the call sites and drop the helper entirely. + - Consider whether `fork_from_worker_thread` + + `run_subint_in_worker_thread` still warrant being + separate module-level primitives or whether they + collapse into a compound + `trio.to_thread.run_sync`-driven pattern inside the + (future) `subint_forkserver_proc` backend. + +4. **Doc fallout.** `subint_sigint_starvation_issue.md` + and `subint_cancel_delivery_hang_issue.md` both cite + the legacy-GIL-sharing architecture as the root cause. + Close them with commit-refs to the isolated-mode + migration. This doc itself should get a closing + post-mortem section noting which of #1/#2/#3 actually + resolved vs persisted. + +References +---------- + +- `tractor.spawn._subint_forkserver` — the in-tree module + whose constraints this doc catalogs. +- `ai/conc-anal/subint_sigint_starvation_issue.md` — the + GIL-starvation class. +- `ai/conc-anal/subint_cancel_delivery_hang_issue.md` — + sibling Ctrl-C-able hang class. 
+- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md` + — why fork-from-subint is blocked (this drives the + forkserver-via-non-subint-thread workaround). +- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py` + — empirical validation for the workaround. +- [PEP 684 — per-interpreter GIL](https://peps.python.org/pep-0684/) +- [PEP 734 — `concurrent.interpreters` public API](https://peps.python.org/pep-0734/) +- [jcrist/msgspec#563 — PEP 684 support tracker](https://github.com/jcrist/msgspec/issues/563) +- tractor issue #379 — subint backend tracking. diff --git a/ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md b/ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md new file mode 100644 index 000000000..67d754710 --- /dev/null +++ b/ai/conc-anal/test_register_duplicate_name_daemon_connect_race_issue.md @@ -0,0 +1,273 @@ +# `test_register_duplicate_name` racy connect-failure on `daemon` fixture readiness + +## Symptom + +`tests/test_multi_program.py::test_register_duplicate_name` +fails intermittently under BOTH transports + ALL spawn +backends with connect-refused errors: + +``` +# under --tpt-proto=uds +FAILED tests/test_multi_program.py::test_register_duplicate_name +- ConnectionRefusedError: [Errno 111] Connection refused +( ^^^ this exc was collapsed from a group ^^^ ) + +# under --tpt-proto=tcp +FAILED tests/test_multi_program.py::test_register_duplicate_name +- OSError: all attempts to connect to 127.0.0.1:36003 failed +( ^^^ this exc was collapsed from a group ^^^ ) +``` + +Distinct from the cancel-cascade `TooSlowError` flake +class — see +`cancel_cascade_too_slow_under_main_thread_forkserver_issue.md`. +This is a **connect-time race** before the daemon is +fully ready to `accept()`, not a teardown-cascade +slowness. + +## Root cause: blind `time.sleep()` in `daemon` fixture + +`tests/conftest.py::daemon` boots a sub-py-process via +`subprocess.Popen([python, '-c', 'tractor.run_daemon(...)'])`, +then **blindly sleeps** a fixed delay before yielding +`proc` to the test: + +```python +# excerpt from tests/conftest.py::daemon +proc = subprocess.Popen([ + sys.executable, '-c', code, +]) + +bg_daemon_spawn_delay: float = _PROC_SPAWN_WAIT # 0.6 +if tpt_proto == 'uds': + bg_daemon_spawn_delay += 1.6 +if _non_linux and ci_env: + bg_daemon_spawn_delay += 1 + +# XXX, allow time for the sub-py-proc to boot up. +# !TODO, see ping-polling ideas above! +time.sleep(bg_daemon_spawn_delay) + +assert not proc.returncode +yield proc +``` + +Inherent fragility: the delay is "long enough on dev +boxes most of the time" but has no actual +synchronization with the daemon's `bind()` + `listen()` +completion. Under any of: + +- Loaded box (CI parallelism, big rebuild in + background, low-cpu-freq) +- Cold first-run (`importlib` cache miss, JIT warmup) +- Higher-than-expected `tractor` import cost +- Filesystem latency (UDS sockfile create, slow + tmpfs) + +...the sleep finishes BEFORE the daemon has bound its +listen socket → first test client call to +`tractor.find_actor()` / `wait_for_actor()` / +`open_nursery(registry_addrs=[reg_addr])`'s implicit +connect → `ConnectionRefusedError` (TCP) or +`FileNotFoundError`/`ConnectionRefusedError` (UDS). + +## Reproducer + +Easiest: run the suite under load. 
+ +```bash +# create CPU pressure on another core in parallel +stress-ng --cpu 2 --timeout 600s & + +./py313/bin/python -m pytest \ + tests/test_multi_program.py::test_register_duplicate_name \ + --spawn-backend=main_thread_forkserver \ + --tpt-proto=tcp -v +``` + +Reproduces ~30-50% of the time on a dev laptop. On a +quiet idle box, may need 5-10 runs to hit. + +## Why the existing `_PROC_SPAWN_WAIT` tuning is +inadequate + +Recent `bg_daemon_spawn_delay` rename +(de-monotonic-grow fix) just-shipped removed the +*accumulation* bug where each invocation made the +NEXT test's wait longer too. Net effect: every +invocation now uses the SAME `0.6 + 1.6` (UDS) or +`0.6` (TCP) sleep, no growth. Good — but does +NOTHING for the underlying race. Each individual +test still relies on a blind sleep that may or may +not be sufficient. + +Bumping the constant higher pushes flake rate down +but never to zero AND adds dead time to every +non-flaking run. Not a fix, just a knob. + +## Side effects + +- **Inter-test cascade**: a single failure can cascade + via leaked subprocesses (the `daemon` fixture's + cleanup may not fully tear down a daemon that never + reached "ready"). The `_reap_orphaned_subactors` + session-end + `_track_orphaned_uds_per_test` + per-test fixtures handle most of this now, but the + affected test itself still fails. +- **Worsens under fork-spawn backends**: the daemon + has more init work + (`_main_thread_forkserver`-coordinator-thread + startup, etc.) so the sleep has to cover MORE. + +## Fix design — replace blind sleep with active poll + +The right primitive is **poll the daemon's bind +address until it accepts a connection or we time +out**, with the timeout being a hard ceiling rather +than a baseline. Two implementation paths: + +### Path A — TCP/UDS connect-poll loop + +Try `socket.connect(reg_addr)` in a tight loop with +short backoff (~50ms), succeed on the first non-error +return, fail-loud on a hard cap (e.g. 10s). Same +primitive works for both transports because both use +`socket.connect()` semantics. + +Rough shape: + +```python +def _wait_for_daemon_ready( + reg_addr, + tpt_proto: str, + timeout: float = 10.0, + poll_interval: float = 0.05, +) -> None: + deadline = time.monotonic() + timeout + while True: + if tpt_proto == 'tcp': + sock = socket.socket(socket.AF_INET) + target = reg_addr # (host, port) + else: # uds + sock = socket.socket(socket.AF_UNIX) + target = os.path.join(*reg_addr) + try: + sock.settimeout(poll_interval) + sock.connect(target) + except ( + ConnectionRefusedError, + FileNotFoundError, + socket.timeout, + ) as exc: + if time.monotonic() >= deadline: + raise TimeoutError( + f'Daemon never accepted on {target!r} ' + f'within {timeout}s' + ) from exc + time.sleep(poll_interval) + else: + sock.close() + return +``` + +Pros: trivial primitive, no tractor-runtime +dependency, works pre-yield in the fixture body, +fail-fast on truly-broken daemon. +Cons: doesn't actually do an IPC handshake, just +proves listen-side is up. A daemon that bound but +hasn't initialized its registrar table yet would +still race. 
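For concreteness, here is roughly how the fixture
call-site would change under Path A — a minimal
sketch only, assuming the `daemon` fixture keeps its
current `Popen` shape and that the helper lands under
the `_wait_for_daemon_ready()` name used above:

```python
# sketch: tests/conftest.py::daemon with the blind sleep replaced
proc = subprocess.Popen([
    sys.executable, '-c', code,
])

# active poll: the timeout is a hard ceiling, not a
# baseline — a healthy daemon normally satisfies this
# within one poll interval.
_wait_for_daemon_ready(reg_addr, tpt_proto, timeout=10.0)

# `poll()` refreshes `returncode`; `None` means the
# daemon proc is still up.
assert proc.poll() is None
yield proc
```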
+ +### Path B — `tractor.find_actor()` poll + +Use the actual discovery API the test would call: + +```python +async def _wait_for_daemon_ready_via_discovery( + reg_addr, + timeout: float = 10.0, + poll_interval: float = 0.05, +): + deadline = trio.current_time() + timeout + async with tractor.open_root_actor( + registry_addrs=[reg_addr], + # ephemeral root just for the probe + ): + while True: + try: + async with tractor.find_actor( + 'registrar', # daemon's own name + registry_addrs=[reg_addr], + ) as portal: + if portal is not None: + return + except Exception: + pass + if trio.current_time() >= deadline: + raise TimeoutError(...) + await trio.sleep(poll_interval) +``` + +Pros: actually proves the discovery path works, +handles the "bound but not ready" case naturally. +Cons: requires booting an ephemeral root actor JUST +for the probe (overhead), more code, and runs in trio +which complicates the sync-fixture context. Need a +`trio.run()` wrapper. + +### Recommended: Path A with optional handshake check + +Path A is much simpler + handles 95% of the bug +class. If "bound-but-not-ready" turns out to still +race (it shouldn't — `tractor.run_daemon` doesn't +return from `bind()` until the registrar is +fully populated), escalate to Path B as a focused +follow-up. + +## Workarounds (until fix lands) + +1. **Bump `_PROC_SPAWN_WAIT`** higher (current: 0.6). + 2.0–3.0 hides most flakes at the cost of adding + dead time to every test. Not a fix but reduces + blast radius while the proper poll lands. +2. **`pytest-rerunfailures`** with `reruns=1` on the + `daemon` fixture's tests specifically. Hides the + flake but doesn't address it. +3. **Mark known-affected tests as `xfail(strict=False)`** + under `--ci`. Lets CI go green at the cost of + silently hiding regressions. + +(Recommend skipping all three — implement the active +poll instead.) + +## Investigation next steps + +1. Implement Path A as a `_wait_for_daemon_ready()` + helper in `tests/conftest.py`. Replace the + `time.sleep(bg_daemon_spawn_delay)` call with it. +2. Drop the `_PROC_SPAWN_WAIT` constant entirely + (active poll obsoletes blind sleep). +3. Run the suite 5-10 times to validate flake rate + drops to 0. +4. If flakes persist, profile whether the daemon + process exits with non-zero before the poll's + deadline hits — that'd be a different bug + (daemon startup crash) that the blind sleep was + masking. +5. Cross-check `tests/test_multi_program.py::test_*` + — multiple tests use the `daemon` fixture; all + should benefit from the same poll primitive. + +## Related + +- `tests/conftest.py::daemon` — the fixture under + fix +- `tests/conftest.py::_PROC_SPAWN_WAIT` — the + constant to drop +- `cancel_cascade_too_slow_under_main_thread_forkserver_issue.md` + — distinct flake class (cancel-cascade + `TooSlowError` at teardown, not connect-time race) +- `trio_wakeup_socketpair_busy_loop_under_fork_issue.md` + — different bug entirely; this race was masked + pre-WakeupSocketpair-patch by the busy-loop + hangs. 
diff --git a/ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md b/ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md new file mode 100644 index 000000000..213841e99 --- /dev/null +++ b/ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md @@ -0,0 +1,221 @@ +# trio `WakeupSocketpair.drain()` busy-loop in forked child (peer-closed missed-EOF) + +## Reproducer + +```bash +./py313/bin/python -m pytest \ + tests/test_multi_program.py::test_register_duplicate_name \ + --tpt-proto=tcp \ + --spawn-backend=main_thread_forkserver \ + -v --capture=sys +``` + +Subactor pegs a CPU core indefinitely; parent test +hangs waiting for the subactor. + +## Empirical evidence (caught alive) + +``` +$ sudo strace -p +recvfrom(6, "", 65536, 0, NULL, NULL) = 0 +recvfrom(6, "", 65536, 0, NULL, NULL) = 0 +recvfrom(6, "", 65536, 0, NULL, NULL) = 0 +... (no `epoll_wait`, no other syscalls, just this back-to-back) +``` + +Pattern: tight C-level `recvfrom` loop returning 0 +each call. No `epoll_wait` between iterations → +**not trio's task scheduler**. Pure synchronous C +loop. + +``` +$ sudo readlink /proc//fd/6 +socket:[] + +$ sudo lsof -p | grep ' 6u' + goodboy 6u unix 0xffff... 0t0 type=STREAM (CONNECTED) +``` + +fd=6 is an **AF_UNIX socket** in CONNECTED state. +Even though the test uses `--tpt-proto=tcp`, this fd +is NOT a tractor IPC channel — it's an internal +trio socketpair. + +## Root-cause: `WakeupSocketpair.drain()` + +`/site-packages/trio/_core/_wakeup_socketpair.py`: + +```python +class WakeupSocketpair: + def __init__(self) -> None: + self.wakeup_sock, self.write_sock = socket.socketpair() + self.wakeup_sock.setblocking(False) + self.write_sock.setblocking(False) + ... + + def drain(self) -> None: + try: + while True: + self.wakeup_sock.recv(2**16) + except BlockingIOError: + pass +``` + +`socket.socketpair()` on Linux defaults to AF_UNIX +SOCK_STREAM. Both ends non-blocking. Normal flow: + +1. Signal/wake event → `write_sock.send(b'\x00')` + queues a byte. +2. `wakeup_sock` becomes readable → trio's epoll + triggers. +3. Trio calls `drain()` to flush the buffer. +4. drain loops on `wakeup_sock.recv(64KB)`. +5. Eventually buffer empty → non-blocking socket + raises `BlockingIOError` → except → break. + +**Bug surface — peer-closed missed-EOF**: + +Non-blocking socket semantics: +- buffer has data → `recv` returns N>0 bytes (loop continues) +- buffer empty → `recv` raises `BlockingIOError` +- **peer FIN'd → `recv` returns 0 bytes (NEITHER exception NOR + break — infinite tight loop)** + +`drain()` does not handle the `b''` return-value +(EOF) case. If `write_sock` has been closed (or the +process holding it is gone), every iteration returns +0 → infinite loop → 100% CPU on a single core. + +## Why this triggers under `main_thread_forkserver` + +Under `os.fork()` from the forkserver-worker thread: + +1. Parent has a `WakeupSocketpair` instance with + `wakeup_sock=fdN`, `write_sock=fdM`. Both fds + open in parent. +2. Fork → child inherits BOTH fds (kernel-level fd + table dup). +3. `_close_inherited_fds()` runs in child → + closes everything except stdio. `wakeup_sock` and + `write_sock` of the parent's `WakeupSocketpair` + ARE closed in child. +4. Child's trio (running fresh) creates its OWN + `WakeupSocketpair` → NEW fd numbers (e.g. fd 6, 7). +5. **In `infect_asyncio` mode** the asyncio loop is + the host; trio runs as guest via + `start_guest_run`. trio still creates its + `WakeupSocketpair` in the I/O manager but its + role is different. 
+ +The race window: somewhere between (3) and (5), if a +`WakeupSocketpair` Python object reference inherited +via COW (from parent's pre-fork heap) survives long +enough that `drain()` is called on it AFTER its fds +were closed but BEFORE the child's NEW socketpair +takes over the recycled fd numbers — the recycled fd +will be one of the child's NEW socketpair ends, whose +peer might be FIN-flagged (e.g. parent-process +peer-end is closed). + +Or simpler: the `wait_for_actor`/`find_actor` discovery +flow in `test_register_duplicate_name` triggers an +unusual code path where a stale `WakeupSocketpair` +gets `drain()`-called on a fd whose peer has already +closed. + +## Why `drain()` shouldn't loop indefinitely on EOF +(upstream trio bug) + +Even WITHOUT fork, `drain()` should treat `b''` as +EOF and break. The current code is correct for the +"buffer drained on a healthy socketpair" scenario but +incorrect for the "peer is gone" scenario. It's a +defensive-programming gap in trio. + +A one-line patch upstream: + +```python +def drain(self) -> None: + try: + while True: + data = self.wakeup_sock.recv(2**16) + if not data: + break # peer-closed; nothing more to drain + except BlockingIOError: + pass +``` + +## Workarounds (until the underlying issue lands) + +1. **Skip-mark on the fork backend**: + `tests/test_multi_program.py` → + `pytest.mark.skipon_spawn_backend('main_thread_forkserver', + reason='trio WakeupSocketpair.drain busy-loop, see ai/conc-anal/trio_wakeup_socketpair_busy_loop_under_fork_issue.md')`. + +2. **Defensive monkey-patch in tractor's + forkserver-child prelude** — wrap + `WakeupSocketpair.drain` to handle `b''`: + + ```python + # in `_actor_child_main` or `_close_inherited_fds`'s + # post-fork prelude: + from trio._core._wakeup_socketpair import WakeupSocketpair + _orig_drain = WakeupSocketpair.drain + def _safe_drain(self): + try: + while True: + data = self.wakeup_sock.recv(2**16) + if not data: + return # peer closed + except BlockingIOError: + pass + WakeupSocketpair.drain = _safe_drain + ``` + + Tracks upstream — remove once trio fixes. + +3. **Upstream the fix**: 1-line PR to `python-trio/trio` + adding `if not data: break` to `drain()`. + +## Investigation next steps + +1. **Confirm via py-spy**: when caught alive, detach + strace first then + `sudo py-spy dump --pid --locals`. The + busy thread should show `drain` from `WakeupSocketpair` + in the call chain. +2. **Identify which write-end peer is closed**: from + the inode of fd 6, look up the matching peer + inode via `ss -xp` and see whose process it + was/is. +3. **Verify the missed-EOF hypothesis**: hand-craft a + minimal `WakeupSocketpair` repro: + + ```python + from trio._core._wakeup_socketpair import WakeupSocketpair + ws = WakeupSocketpair() + ws.write_sock.close() # simulate peer-gone + ws.drain() # should hang forever + ``` + +## Sibling bug + +`tests/test_infected_asyncio.py::test_aio_simple_error` +hangs under the same backend with a DIFFERENT +fingerprint (Mode-A deadlock, both parties in +`epoll_wait`, no busy-loop). Distinct root cause — +see `infected_asyncio_under_main_thread_forkserver_hang_issue.md`. + +Both share the broader theme: **trio internal-state +initialization isn't fully fork-safe under +`main_thread_forkserver`** for the more exotic +dispatch paths. 
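Returning to the `drain()` claim itself: the three
non-blocking `recv()` outcomes described above can be
reproduced with a bare stdlib `socketpair()`,
independent of trio or fork — pure standard-library
behavior, nothing below is project code:

```python
import socket

a, b = socket.socketpair()   # AF_UNIX SOCK_STREAM on Linux
a.setblocking(False)

b.send(b'\x00')
print(a.recv(4096))          # b'\x00' -> data queued; drain loop continues

try:
    a.recv(4096)             # buffer empty, peer still open
except BlockingIOError:
    print('empty buffer raises -> drain() exits here')

b.close()                    # peer FIN
print(a.recv(4096))          # b'' -> EOF: no data AND no exception —
                             # the return value drain() currently spins on
```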
+ +## See also + +- [#379](https://github.com/goodboy/tractor/issues/379) — subint umbrella +- python-trio/trio#1614 — trio + fork hazards +- `trio._core._wakeup_socketpair.WakeupSocketpair` + source (the smoking gun) +- `ai/conc-anal/fork_thread_semantics_execution_vs_memory.md` +- `ai/conc-anal/infected_asyncio_under_main_thread_forkserver_hang_issue.md` diff --git a/ai/prompt-io/claude/20260422T200723Z_797f57c_prompt_io.md b/ai/prompt-io/claude/20260422T200723Z_797f57c_prompt_io.md new file mode 100644 index 000000000..e606db8ff --- /dev/null +++ b/ai/prompt-io/claude/20260422T200723Z_797f57c_prompt_io.md @@ -0,0 +1,155 @@ +--- +model: claude-opus-4-7[1m] +service: claude +session: subints-phase-b-hardening-and-fork-block +timestamp: 2026-04-22T20:07:23Z +git_ref: 797f57c +scope: code +substantive: true +raw_file: 20260422T200723Z_797f57c_prompt_io.raw.md +--- + +## Prompt + +Session-spanning work on the Phase B `subint` spawn-backend. +Three distinct sub-phases in one log: + +1. **Py3.13 gate tightening** — diagnose a reproducible hang + of subint spawn flow under py3.13 (works on py3.14), trace + to a private `_interpreters` module vintage issue, tighten + our feature gate from "`_interpreters` present" to "public + `concurrent.interpreters` present" (i.e. py3.14+). +2. **Test-harness hardening** — add `pytest-timeout` dep, put + `@pytest.mark.timeout(30, method='thread')` on the + three known-hanging subint tests cataloged in + `ai/conc-anal/subint_sigint_starvation_issue.md`. Separately, + code-review the user's in-flight `skipon_spawn_backend` + marker implementation; find four bugs; refactor to use + `item.iter_markers()`. +3. **`subint_fork` prototype → CPython-block finding** — draft + a WIP `subint_fork_proc` backend using a sub-interpreter as + a launchpad for `os.fork()` (to sidestep trio#1614). User + tests on py3.14, hits + `Fatal Python error: _PyInterpreterState_DeleteExceptMain: + not main interpreter`. Walk CPython sources (local clone at + `~/repos/cpython/`) to pinpoint the refusal + (`Modules/posixmodule.c:728` → `Python/pystate.c:1040`). + Revert implementation to a `NotImplementedError` stub in a + new `_subint_fork.py` submodule, document the finding in a + third `conc-anal/` doc with an upstream-report draft for + the CPython issue tracker. Finally, discuss user's proposed + workaround architecture (main-interp worker-thread + forkserver) and draft a standalone smoke-test script for + feasibility validation. + +## Response summary + +All three sub-phases landed concrete artifacts: + +**Sub-phase 1** — `_subint.py` + `_spawn.py` gates + error +messages updated to require py3.14+ via the public +`concurrent.interpreters` module presence check. Module +docstring revised to explain the empirical reason +(py3.13's private `_interpreters` vintage wedges under +multi-trio-task usage even though minimal standalone +reproducers work fine there). Test-module +`pytest.importorskip` likewise switched. + +**Sub-phase 2** — `pytest-timeout>=2.3` added to `testing` +dep group. 
`@pytest.mark.timeout(30, method='thread')` +applied on: +- `tests/discovery/test_registrar.py::test_stale_entry_is_deleted` +- `tests/test_cancellation.py::test_cancel_while_childs_child_in_sync_sleep` +- `tests/test_cancellation.py::test_multierror_fast_nursery` +- `tests/test_subint_cancellation.py::test_subint_non_checkpointing_child` + +`method='thread'` documented inline as load-bearing — the +GIL-starvation path that drops `SIGINT` would equally drop +`SIGALRM`, so only a watchdog-thread timeout can reliably +escape. + +`skipon_spawn_backend` plugin refactored into a single +`iter_markers`-driven loop in `pytest_collection_modifyitems` +(~30 LOC replacing ~30 LOC of nested conditionals). Four +bugs dissolved: wrong `.get()` key, module-level `pytestmark` +suppressing per-test marks, unhandled `pytestmark = [list]` +form, `pytest.Makr` typo. Marker help text updated to +document the variadic backend-list + `reason=` kwarg +surface. + +**Sub-phase 3** — Prototype drafted (then reverted): + +- `tractor/spawn/_subint_fork.py` — new dedicated submodule + housing the `subint_fork_proc` stub. Module docstring + + fn docstring explain the attempt, the CPython-level + block, and the reason for keeping the stub in-tree + (documentation of the attempt + starting point if CPython + ever lifts the restriction). +- `tractor/spawn/_spawn.py` — `'subint_fork'` registered as a + `SpawnMethodKey` literal + in `_methods`, so + `--spawn-backend=subint_fork` routes to a clean + `NotImplementedError` pointing at the analysis doc rather + than an "invalid backend" error. +- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md` — + third sibling conc-anal doc. Full annotated CPython + source walkthrough from user-visible + `Fatal Python error` → `Modules/posixmodule.c:728 + PyOS_AfterFork_Child()` → `Python/pystate.c:1040 + _PyInterpreterState_DeleteExceptMain()` gate. Includes a + copy-paste-ready upstream-report draft for the CPython + issue tracker with a two-tier ask (ideally "make it work", + minimally "cleaner error than `Fatal Python error` + aborting the child"). +- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py` — + standalone zero-tractor-import CPython-level smoke test + for the user's proposed workaround architecture + (forkserver on a main-interp worker thread). Four + argparse-driven scenarios: `control_subint_thread_fork` + (reproduces the known-broken case as a test-harness + sanity), `main_thread_fork` (baseline), `worker_thread_fork` + (architectural assertion), `full_architecture` + (end-to-end trio-in-subint in forked child). User will + run on py3.14 next. + +## Files changed + +See `git log 26fb820..HEAD --stat` for the canonical list. 
+New files this session: +- `tractor/spawn/_subint_fork.py` +- `ai/conc-anal/subint_fork_blocked_by_cpython_post_fork_issue.md` +- `ai/conc-anal/subint_fork_from_main_thread_smoketest.py` + +Modified (diff pointers in raw log): +- `tractor/spawn/_subint.py` (py3.14 gate) +- `tractor/spawn/_spawn.py` (`subint_fork` registration) +- `tractor/_testing/pytest.py` (`skipon_spawn_backend` refactor) +- `pyproject.toml` (`pytest-timeout` dep) +- `tests/discovery/test_registrar.py`, + `tests/test_cancellation.py`, + `tests/test_subint_cancellation.py` (timeout marks, + cross-refs to conc-anal docs) + +## Human edits + +Several back-and-forth iterations with user-driven +adjustments during the session: + +- User corrected my initial mis-classification of + `test_cancel_while_childs_child_in_sync_sleep[subint-False]` + as Ctrl-C-able — second strace showed `EAGAIN`, putting + it squarely in class A (GIL-starvation). Re-analysis + preserved in the raw log. +- User independently fixed the `.get(reason)` → `.get('reason', reason)` + bug in the marker plugin before my review; preserved their + fix. +- User suggested moving the `subint_fork_proc` stub from + the bottom of `_subint.py` into its own + `_subint_fork.py` submodule — applied. +- User asked to keep the forkserver-architecture + discussion as background for the smoke-test rather than + committing to a tractor-side refactor until the smoke + test validates the CPython-level assumptions. + +Commit messages in this range (b025c982 … 797f57c) were +drafted via `/commit-msg` + `rewrap.py --width 67`; user +landed them with the usual review. diff --git a/ai/prompt-io/claude/20260422T200723Z_797f57c_prompt_io.raw.md b/ai/prompt-io/claude/20260422T200723Z_797f57c_prompt_io.raw.md new file mode 100644 index 000000000..395523fef --- /dev/null +++ b/ai/prompt-io/claude/20260422T200723Z_797f57c_prompt_io.raw.md @@ -0,0 +1,343 @@ +--- +model: claude-opus-4-7[1m] +service: claude +timestamp: 2026-04-22T20:07:23Z +git_ref: 797f57c +diff_cmd: git log 26fb820..HEAD # all session commits since the destroy-race fix log +--- + +Session-spanning conversation covering the Phase B hardening +of the `subint` spawn-backend and an investigation into a +proposed `subint_fork` follow-up which turned out to be +blocked at the CPython level. This log is a narrative capture +of the substantive turns (not every message) and references +the concrete code + docs the session produced. Per diff-ref +mode the actual code diffs are pointed at via `git log` on +each ref rather than duplicated inline. + +## Narrative of the substantive turns + +### Py3.13 hang / gate tightening + +Diagnosed a reproducible hang of the `subint` backend under +py3.13 (test_spawning tests wedge after root-actor bringup). +Root cause: py3.13's vintage of the private `_interpreters` C +module has a latent thread/subint-interaction issue that +`_interpreters.exec()` silently fails to progress under +tractor's multi-trio usage pattern — even though a minimal +standalone `threading.Thread` + `_interpreters.exec()` +reproducer works fine on the same Python. Empirically +py3.14 fixes it. + +Fix (from this session): tighten the `_has_subints` gate in +`tractor.spawn._subint` from "private module importable" to +"public `concurrent.interpreters` present" — which is 3.14+ +only. This leaves `subint_proc()` unchanged in behavior (we +still call the *private* `_interpreters.create('legacy')` +etc. under the hood) but refuses to engage on 3.13. 
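(Illustrative sketch only — not the in-tree diff; the
`_has_subints` name matches the narrative above but the
exact shape and wording are assumptions:)

```python
# sketch: presence gate on the public PEP 734 module (3.14+ only),
# replacing the old "private `_interpreters` importable" check
try:
    import concurrent.interpreters  # noqa: F401
    _has_subints: bool = True
except ImportError:
    _has_subints = False
```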
+ +Also tightened the matching gate in +`tractor.spawn._spawn.try_set_start_method('subint')` and +rev'd the corresponding error messages from "3.13+" to +"3.14+" with a sentence explaining why. Test-module +`pytest.importorskip` switched from `_interpreters` → +`concurrent.interpreters` to match. + +### `pytest-timeout` dep + `skipon_spawn_backend` marker plumbing + +Added `pytest-timeout>=2.3` to the `testing` dep group with +an inline comment pointing at the `ai/conc-anal/*.md` docs. +Applied `@pytest.mark.timeout(30, method='thread')` (the +`method='thread'` is load-bearing — `signal`-method +`SIGALRM` suffers the same GIL-starvation path that drops +`SIGINT` in the class-A hang pattern) to the three known- +hanging subint tests cataloged in +`subint_sigint_starvation_issue.md`. + +Separately code-reviewed the user's newly-staged +`skipon_spawn_backend` pytest marker implementation in +`tractor/_testing/pytest.py`. Found four bugs: + +1. `modmark.kwargs.get(reason)` called `.get()` with the + *variable* `reason` as the dict key instead of the string + `'reason'` — user-supplied `reason=` was never picked up. + (User had already fixed this locally via `.get('reason', + reason)` by the time my review happened — preserved that + fix.) +2. The module-level `pytestmark` branch suppressed per-test + marker handling (the `else:` was an `else:` rather than + independent iteration). +3. `mod_pytestmark.mark` assumed a single + `MarkDecorator` — broke on the valid-pytest `pytestmark = + [mark, mark]` list form. +4. Typo: `pytest.Makr` → `pytest.Mark`. + +Refactored the hook to use `item.iter_markers(name=...)` +which walks function + class + module scopes uniformly and +handles both `pytestmark` forms natively. ~30 LOC replaced +the original ~30 LOC of nested conditionals, all four bugs +dissolved. Also updated the marker help string to reflect +the variadic `*start_methods` + `reason=` surface. + +### `subint_fork_proc` prototype attempt + +User's hypothesis: the known trio+`fork()` issues +(python-trio/trio#1614) could be sidestepped by using a +sub-interpreter purely as a launchpad — `os.fork()` from a +subint that has never imported trio → child is in a +trio-free context. In the child `execv()` back into +`python -m tractor._child` and the downstream handshake +matches `trio_proc()` identically. + +Drafted the prototype at `tractor/spawn/_subint.py`'s bottom +(originally — later moved to its own submod, see below): +launchpad-subint creation, bootstrap code-string with +`os.fork()` + `execv()`, driver-thread orchestration, +parent-side `ipc_server.wait_for_peer()` dance. Registered +`'subint_fork'` as a new `SpawnMethodKey` literal, added +`case 'subint' | 'subint_fork':` feature-gate arm in +`try_set_start_method()`, added entry in `_methods` dict. + +### CPython-level block discovered + +User tested on py3.14 and saw: + +``` +Fatal Python error: _PyInterpreterState_DeleteExceptMain: not main interpreter +Python runtime state: initialized + +Current thread 0x00007f6b71a456c0 [subint-fork-lau] (most recent call first): + File "