From 232422dabb00deee8fe54d6fe0edce0d46692e56 Mon Sep 17 00:00:00 2001
From: You Yan <yy2900@columbia.edu>
Date: Thu, 16 Apr 2026 18:02:12 -0700
Subject: [PATCH 1/5] fix: detect JobRunner subprocess death and stop
 memory-profiler ESRCH spam

A JobRunner subprocess dying mid-acquisition (e.g. segfault in a
fork()-unsafe native library) was previously undetected: the parent kept
dispatching save jobs into a queue nothing was consuming, and the
acquisition appeared to complete normally while hundreds of timepoints
were lost. The only visible symptom was a flood of memory-profiler
tracebacks about /proc/<pid>/smaps_rollup.

- JobRunner: spawn a daemon watchdog thread in start() that blocks on
  self.sentinel. When the subprocess exits, it distinguishes expected
  shutdown (via _shutdown_event) from unexpected death and invokes a
  registered handler with the exitcode. kill() now also sets
  _shutdown_event so intentional termination is not flagged.
- MultiPointWorker: registers a handler that logs the failure and calls
  request_abort_fn(), so the acquisition loop exits on the next abort
  check instead of silently rotting for hours.
- memory_profiler._get_linux_pss_mb: catch ProcessLookupError in
  addition to FileNotFoundError/PermissionError/ValueError. Reading
  /proc/<zombie>/smaps_rollup fails with ESRCH, which Python raises as
  ProcessLookupError. That is a sibling of FileNotFoundError under
  OSError, so the existing except clause did not catch it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 software/control/core/job_processing.py     | 55 ++++++++++++++++++++-
 software/control/core/memory_profiler.py    |  4 +-
 software/control/core/multi_point_worker.py | 10 ++++
 3 files changed, 67 insertions(+), 2 deletions(-)
diff --git a/software/control/core/job_processing.py b/software/control/core/job_processing.py
index bcf183df0..6288b3c10 100644
--- a/software/control/core/job_processing.py
+++ b/software/control/core/job_processing.py
@@ -1,12 +1,14 @@
 import abc
 import multiprocessing
+import multiprocessing.connection
 import queue
 import os
+import threading
 import time
 import json
 from datetime import datetime
 from contextlib import contextmanager
-from typing import ClassVar, Dict, Generic, List, Optional, Set, Tuple, TypeVar, Union
+from typing import Callable, ClassVar, Dict, Generic, List, Optional, Set, Tuple, TypeVar, Union
 from uuid import uuid4
 
 from dataclasses import dataclass, field
@@ -1064,6 +1066,12 @@ def __init__(
         self._bp_pending_bytes = bp_pending_bytes
         self._bp_capacity_event = bp_capacity_event
 
+        # Watchdog for detecting unexpected subprocess death (segfault, OOM kill, etc.).
+        # Without this, a dead JobRunner silently rots the acquisition: the parent keeps
+        # queuing save jobs that no one consumes.
+        self._on_unexpected_exit: Optional[Callable[[Optional[int]], None]] = None
+        self._watchdog: Optional[threading.Thread] = None
+
         # Clean up stale metadata files from previous crashed acquisitions
         # Only run when explicitly requested (i.e., when OME-TIFF saving is being used)
         if cleanup_stale_ome_files:
@@ -1071,6 +1079,51 @@ def __init__(
             if removed:
                 self._log.info(f"Cleaned up {len(removed)} stale OME-TIFF metadata files")
 
+    def set_unexpected_exit_handler(self, handler: Optional[Callable[[Optional[int]], None]]) -> None:
+        """Register a callback to invoke if the subprocess dies without a clean shutdown.
+
+        The handler is called from the watchdog thread with the subprocess exitcode
+        (which may be None, a non-negative int, or -N for death by signal N on POSIX).
+        """
+        self._on_unexpected_exit = handler
+
+    def start(self):
+        super().start()
+        # Watchdog must start after super().start() so self.pid and self.sentinel are set.
+        self._watchdog = threading.Thread(
+            target=self._watch_subprocess,
+            daemon=True,
+            name=f"JobRunner-watchdog[{self.pid}]",
+        )
+        self._watchdog.start()
+
+    def kill(self):
+        # Mark as expected so the watchdog treats the exit as intentional.
+        if self._shutdown_event is not None:
+            self._shutdown_event.set()
+        super().kill()
+
+    def _watch_subprocess(self) -> None:
+        """Block until the subprocess exits, then distinguish expected vs. unexpected death."""
+        # Capture references; shutdown() clears self._shutdown_event after join().
+        shutdown_event = self._shutdown_event
+        pid = self.pid
+        multiprocessing.connection.wait([self.sentinel])
+        exitcode = self.exitcode
+        if shutdown_event is not None and shutdown_event.is_set():
+            self._log.info(f"JobRunner PID={pid} exited cleanly (exitcode={exitcode})")
+            return
+        self._log.error(
+            f"JobRunner PID={pid} died UNEXPECTEDLY (exitcode={exitcode}). "
+            f"Pending save jobs will not complete."
+        )
+        handler = self._on_unexpected_exit
+        if handler is not None:
+            try:
+                handler(exitcode)
+            except Exception:
+                self._log.exception("JobRunner unexpected-exit handler raised")
+
     def dispatch(self, job: Job):
         # Inject acquisition_info into SaveOMETiffJob instances before serialization.
         # The job object is pickled when placed in the queue, so injection must happen here.
diff --git a/software/control/core/memory_profiler.py b/software/control/core/memory_profiler.py
index 8d6ab5421..92dc20212 100644
--- a/software/control/core/memory_profiler.py
+++ b/software/control/core/memory_profiler.py
@@ -259,7 +259,9 @@ def _get_linux_pss_mb(pid: int) -> float:
                     pss_total_kb += int(parts[1])
 
         return pss_total_kb / 1024
-    except (FileNotFoundError, PermissionError, ValueError):
+    except (FileNotFoundError, PermissionError, ValueError, ProcessLookupError):
+        # ProcessLookupError (errno ESRCH) can occur when the process exited between
+        # PID enumeration and the smaps_rollup read, or when the PID is a zombie.
         pass
     return 0.0
 
diff --git a/software/control/core/multi_point_worker.py b/software/control/core/multi_point_worker.py
index fa5fabb71..30c75faca 100644
--- a/software/control/core/multi_point_worker.py
+++ b/software/control/core/multi_point_worker.py
@@ -370,9 +370,19 @@ def __init__(
                     # Subprocess starts warming up in background - don't block here
 
             self._job_runners.append((job_class, job_runner))
+            if job_runner is not None:
+                job_runner.set_unexpected_exit_handler(self._on_job_runner_died)
         self._abort_on_failed_job = abort_on_failed_jobs
         self._first_job_dispatched = False  # Track if we've waited for subprocess warmup
 
+    def _on_job_runner_died(self, exitcode: Optional[int]) -> None:
+        """Invoked by JobRunner's watchdog when a subprocess dies unexpectedly."""
+        self._log.error(
+            f"JobRunner subprocess died unexpectedly (exitcode={exitcode}); aborting acquisition."
+        )
+        self._acquisition_error_count += 1
+        self.request_abort_fn()
+
     def update_use_piezo(self, value):
         self.use_piezo = value
         self._log.info(f"MultiPointWorker: updated use_piezo to {value}")

From 3d8058d240290c61c29824667dc9fb2821158742 Mon Sep 17 00:00:00 2001
From: You Yan <yy2900@columbia.edu>
Date: Thu, 16 Apr 2026 18:14:12 -0700
Subject: [PATCH 2/5] style: apply Black formatting

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 software/control/core/job_processing.py     | 3 +--
 software/control/core/multi_point_worker.py | 4 +---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/software/control/core/job_processing.py b/software/control/core/job_processing.py
index 6288b3c10..64007f65d 100644
--- a/software/control/core/job_processing.py
+++ b/software/control/core/job_processing.py
@@ -1114,8 +1114,7 @@ def _watch_subprocess(self) -> None:
             self._log.info(f"JobRunner PID={pid} exited cleanly (exitcode={exitcode})")
             return
         self._log.error(
-            f"JobRunner PID={pid} died UNEXPECTEDLY (exitcode={exitcode}). "
-            f"Pending save jobs will not complete."
+            f"JobRunner PID={pid} died UNEXPECTEDLY (exitcode={exitcode}). " f"Pending save jobs will not complete."
         )
         handler = self._on_unexpected_exit
         if handler is not None:
diff --git a/software/control/core/multi_point_worker.py b/software/control/core/multi_point_worker.py
index 30c75faca..3afb47325 100644
--- a/software/control/core/multi_point_worker.py
+++ b/software/control/core/multi_point_worker.py
@@ -377,9 +377,7 @@ def __init__(
 
     def _on_job_runner_died(self, exitcode: Optional[int]) -> None:
         """Invoked by JobRunner's watchdog when a subprocess dies unexpectedly."""
-        self._log.error(
-            f"JobRunner subprocess died unexpectedly (exitcode={exitcode}); aborting acquisition."
-        )
+        self._log.error(f"JobRunner subprocess died unexpectedly (exitcode={exitcode}); aborting acquisition.")
         self._acquisition_error_count += 1
         self.request_abort_fn()
 

From 5267dd4caed3d019479fcfdbc712192c10b50419 Mon Sep 17 00:00:00 2001
From: You Yan <yy2900@columbia.edu>
Date: Fri, 17 Apr 2026 11:07:27 -0700
Subject: [PATCH 3/5] =?UTF-8?q?fix:=20address=20Copilot=20review=20?=
 =?UTF-8?q?=E2=80=94=20terminate()=20+=20early=20handler=20registration?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Override JobRunner.terminate() to set _shutdown_event so an intentional
  terminate() (e.g., from MultiPointController.close) is not reported as
  "died UNEXPECTEDLY" by the watchdog. Matches the kill() override.
- Register the unexpected-exit handler on freshly created JobRunners
  before start(), eliminating the race where the subprocess could die
  during warmup before the handler was installed.
- Register the handler on pre-warmed runners the moment MultiPointWorker
  adopts them (before set_acquisition_info), narrowing the uncovered
  window to the pre-handoff phase only.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 software/control/core/job_processing.py     |  6 ++++++
 software/control/core/multi_point_worker.py | 10 ++++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/software/control/core/job_processing.py b/software/control/core/job_processing.py
index 64007f65d..8590f1db0 100644
--- a/software/control/core/job_processing.py
+++ b/software/control/core/job_processing.py
@@ -1103,6 +1103,12 @@ def kill(self):
             self._shutdown_event.set()
         super().kill()
 
+    def terminate(self):
+        # Mark as expected so the watchdog treats the exit as intentional.
+        if self._shutdown_event is not None:
+            self._shutdown_event.set()
+        super().terminate()
+
     def _watch_subprocess(self) -> None:
         """Block until the subprocess exits, then distinguish expected vs. unexpected death."""
         # Capture references; shutdown() clears self._shutdown_event after join().
diff --git a/software/control/core/multi_point_worker.py b/software/control/core/multi_point_worker.py
index 3afb47325..5ce756778 100644
--- a/software/control/core/multi_point_worker.py
+++ b/software/control/core/multi_point_worker.py
@@ -335,6 +335,11 @@ def __init__(
                     if prewarmed_job_runner.is_ready():
                         self._log.info(f"Using pre-warmed job runner for {job_class.__name__} jobs")
                         job_runner = prewarmed_job_runner
+                        # Register abort handler as early as possible on adoption. The
+                        # window between controller-side start() and this point remains
+                        # uncovered for pre-warmed runners, but beyond here any
+                        # unexpected death triggers an abort.
+                        job_runner.set_unexpected_exit_handler(self._on_job_runner_died)
                         # Configure it with current acquisition settings
                         job_runner.set_acquisition_info(self.acquisition_info)
                         if zarr_writer_info:
@@ -366,12 +371,13 @@ def __init__(
                         # Pass zarr writer info for ZARR_V3 format
                         zarr_writer_info=zarr_writer_info,
                     )
+                    # Register abort handler before start() so the watchdog always has
+                    # a handler available, even if the subprocess dies during warmup.
+                    job_runner.set_unexpected_exit_handler(self._on_job_runner_died)
                     job_runner.start()
                     # Subprocess starts warming up in background - don't block here
 
             self._job_runners.append((job_class, job_runner))
-            if job_runner is not None:
-                job_runner.set_unexpected_exit_handler(self._on_job_runner_died)
         self._abort_on_failed_job = abort_on_failed_jobs
         self._first_job_dispatched = False  # Track if we've waited for subprocess warmup
 

From 24751ec9478365503ea3a7a19cbfc479fd804349 Mon Sep 17 00:00:00 2001
From: You Yan <yy2900@columbia.edu>
Date: Sat, 18 Apr 2026 00:03:53 -0700
Subject: [PATCH 4/5] refactor: simplify watchdog intent tracking and trim
 comments

- Replace the "shutdown_event is set" signal with a dedicated
  _intentional_exit bool. _shutdown_event is a multiprocessing
  primitive that shutdown() nulls during cleanup, which opened a
  narrow race where the watchdog could read None and misclassify an
  intentional shutdown as unexpected death. The new flag survives
  cleanup and is set by all three explicit-stop paths: kill(),
  terminate(), and shutdown().
- Drop narrative comments that restate what the code already says,
  per project style. Keep the WHY comment about _intentional_exit
  vs _shutdown_event.
- Collapse a split f-string log message to a single literal.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 software/control/core/job_processing.py     | 23 ++++++++-------------
 software/control/core/multi_point_worker.py |  9 +++-----
 2 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/software/control/core/job_processing.py b/software/control/core/job_processing.py
index 8590f1db0..8a72615d2 100644
--- a/software/control/core/job_processing.py
+++ b/software/control/core/job_processing.py
@@ -1066,11 +1066,12 @@ def __init__(
         self._bp_pending_bytes = bp_pending_bytes
         self._bp_capacity_event = bp_capacity_event
 
-        # Watchdog for detecting unexpected subprocess death (segfault, OOM kill, etc.).
-        # Without this, a dead JobRunner silently rots the acquisition: the parent keeps
-        # queuing save jobs that no one consumes.
         self._on_unexpected_exit: Optional[Callable[[Optional[int]], None]] = None
         self._watchdog: Optional[threading.Thread] = None
+        # Set by kill()/terminate()/shutdown() so the watchdog can distinguish
+        # intentional exit from segfault/OOM. Separate from _shutdown_event, which
+        # is a multiprocessing primitive that shutdown() nulls during cleanup.
+        self._intentional_exit = False
 
         # Clean up stale metadata files from previous crashed acquisitions
         # Only run when explicitly requested (i.e., when OME-TIFF saving is being used)
@@ -1089,7 +1090,6 @@ def set_unexpected_exit_handler(self, handler: Optional[Callable[[Optional[int]]
 
     def start(self):
         super().start()
-        # Watchdog must start after super().start() so self.pid and self.sentinel are set.
         self._watchdog = threading.Thread(
             target=self._watch_subprocess,
             daemon=True,
@@ -1098,29 +1098,23 @@ def start(self):
         self._watchdog.start()
 
     def kill(self):
-        # Mark as expected so the watchdog treats the exit as intentional.
-        if self._shutdown_event is not None:
-            self._shutdown_event.set()
+        self._intentional_exit = True
         super().kill()
 
     def terminate(self):
-        # Mark as expected so the watchdog treats the exit as intentional.
-        if self._shutdown_event is not None:
-            self._shutdown_event.set()
+        self._intentional_exit = True
         super().terminate()
 
     def _watch_subprocess(self) -> None:
         """Block until the subprocess exits, then distinguish expected vs. unexpected death."""
-        # Capture references; shutdown() clears self._shutdown_event after join().
-        shutdown_event = self._shutdown_event
         pid = self.pid
         multiprocessing.connection.wait([self.sentinel])
         exitcode = self.exitcode
-        if shutdown_event is not None and shutdown_event.is_set():
+        if self._intentional_exit:
             self._log.info(f"JobRunner PID={pid} exited cleanly (exitcode={exitcode})")
             return
         self._log.error(
-            f"JobRunner PID={pid} died UNEXPECTEDLY (exitcode={exitcode}). " f"Pending save jobs will not complete."
+            f"JobRunner PID={pid} died UNEXPECTEDLY (exitcode={exitcode}). Pending save jobs will not complete."
         )
         handler = self._on_unexpected_exit
         if handler is not None:
@@ -1232,6 +1226,7 @@ def shutdown(self, timeout_s=1.0):
         # Guard against double shutdown
         if self._shutdown_event is None:
             return
+        self._intentional_exit = True
         self._shutdown_event.set()
         # Send sentinel to wake up worker blocked on queue.get()
         try:
diff --git a/software/control/core/multi_point_worker.py b/software/control/core/multi_point_worker.py
index 5ce756778..5786d09ef 100644
--- a/software/control/core/multi_point_worker.py
+++ b/software/control/core/multi_point_worker.py
@@ -335,10 +335,8 @@ def __init__(
                     if prewarmed_job_runner.is_ready():
                         self._log.info(f"Using pre-warmed job runner for {job_class.__name__} jobs")
                         job_runner = prewarmed_job_runner
-                        # Register abort handler as early as possible on adoption. The
-                        # window between controller-side start() and this point remains
-                        # uncovered for pre-warmed runners, but beyond here any
-                        # unexpected death triggers an abort.
+                        # Pre-warmed runners were started by the controller without a
+                        # handler; the pre-handoff window stays uncovered by design.
                         job_runner.set_unexpected_exit_handler(self._on_job_runner_died)
                         # Configure it with current acquisition settings
                         job_runner.set_acquisition_info(self.acquisition_info)
@@ -371,8 +369,7 @@ def __init__(
                         # Pass zarr writer info for ZARR_V3 format
                         zarr_writer_info=zarr_writer_info,
                     )
-                    # Register abort handler before start() so the watchdog always has
-                    # a handler available, even if the subprocess dies during warmup.
+                    # Must precede start() so the watchdog covers warmup-time deaths.
                     job_runner.set_unexpected_exit_handler(self._on_job_runner_died)
                     job_runner.start()
                     # Subprocess starts warming up in background - don't block here

From 9ae07e330334f80005e263c82a94a990df15cb27 Mon Sep 17 00:00:00 2001
From: You Yan <yy2900@columbia.edu>
Date: Sun, 26 Apr 2026 21:09:12 -0700
Subject: [PATCH 5/5] test: add watchdog regression tests; close pre-warm
 adoption window

- Add tests/control/core/test_job_runner_watchdog.py covering SIGKILL
  detection, intentional kill/terminate/shutdown suppression, handler
  exception isolation, and a regression guard for the
  _intentional_exit-survives-shutdown fix made in commit 24751ec9.
- multi_point_worker: check is_alive() alongside is_ready() before
  adopting a pre-warmed runner. is_ready() reads a multiprocessing.Event
  the subprocess sets early in run(); once the subprocess dies the Event
  remains set in shared memory, so is_ready() alone can't distinguish a
  live runner from a corpse. Without is_alive(), a runner that segfaults
  during pre-warm would be adopted into the acquisition and resume the
  silent-rot failure mode this PR is meant to fix.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 software/control/core/multi_point_worker.py   |  13 +-
 .../control/core/test_job_runner_watchdog.py  | 147 ++++++++++++++++++
 2 files changed, 153 insertions(+), 7 deletions(-)
 create mode 100644 software/tests/control/core/test_job_runner_watchdog.py

diff --git a/software/control/core/multi_point_worker.py b/software/control/core/multi_point_worker.py
index 5786d09ef..3df033581 100644
--- a/software/control/core/multi_point_worker.py
+++ b/software/control/core/multi_point_worker.py
@@ -332,27 +332,26 @@ def __init__(
             if Acquisition.USE_MULTIPROCESSING:
                 # Try to use pre-warmed runner for the first job class
                 if can_use_prewarmed and not used_prewarmed:
-                    if prewarmed_job_runner.is_ready():
+                    # is_alive() must be checked alongside is_ready(): the subprocess sets
+                    # _ready_event early in run() and the Event survives in shared memory
+                    # after death, so is_ready() alone can't detect a corpse.
+                    if prewarmed_job_runner.is_alive() and prewarmed_job_runner.is_ready():
                         self._log.info(f"Using pre-warmed job runner for {job_class.__name__} jobs")
                         job_runner = prewarmed_job_runner
-                        # Pre-warmed runners were started by the controller without a
-                        # handler; the pre-handoff window stays uncovered by design.
                         job_runner.set_unexpected_exit_handler(self._on_job_runner_died)
-                        # Configure it with current acquisition settings
                         job_runner.set_acquisition_info(self.acquisition_info)
                         if zarr_writer_info:
                             job_runner.set_zarr_writer_info(zarr_writer_info)
                         used_prewarmed = True
                     else:
                         self._log.warning(
-                            f"Pre-warmed job runner not ready (possibly hung during warmup), "
+                            f"Pre-warmed job runner unavailable (died or hung during warmup); "
                             f"shutting it down and creating new one for {job_class.__name__}"
                         )
-                        # Shutdown the hung pre-warmed runner to avoid resource leak
                         try:
                             prewarmed_job_runner.shutdown(timeout_s=1.0)
                         except Exception as e:
-                            self._log.error(f"Error shutting down hung pre-warmed runner: {e}")
+                            self._log.error(f"Error shutting down unusable pre-warmed runner: {e}")
                         # Don't try to use pre-warmed runner again for subsequent job classes
                         can_use_prewarmed = False
 
diff --git a/software/tests/control/core/test_job_runner_watchdog.py b/software/tests/control/core/test_job_runner_watchdog.py
new file mode 100644
index 000000000..cde9dacb3
--- /dev/null
+++ b/software/tests/control/core/test_job_runner_watchdog.py
@@ -0,0 +1,147 @@
+"""Tests for JobRunner watchdog (unexpected subprocess death detection).
+
+These tests cover the watchdog thread that distinguishes intentional shutdown
+from unexpected subprocess death (segfault, SIGKILL, OOM kill) and invokes a
+registered handler so an acquisition can abort instead of silently rotting.
+"""
+
+import os
+import signal
+import threading
+import time
+
+import pytest
+
+from control.core.job_processing import JobRunner
+
+
+@pytest.fixture
+def runner():
+    """Provide an unstarted JobRunner; ensure cleanup even if the test crashes mid-run."""
+    r = JobRunner()
+    r.daemon = True
+    yield r
+    if r.is_alive():
+        try:
+            r.kill()
+            r.join(timeout=2.0)
+        except Exception:
+            pass
+
+
+# Watchdog runs in a daemon thread; allow it to finish after the sentinel fires.
+_WATCHDOG_GRACE_S = 0.3
+
+
+class TestWatchdogUnexpectedDeath:
+    """Verify the watchdog detects unexpected subprocess death and invokes the handler."""
+
+    def test_sigkill_fires_handler_with_negative_exitcode(self, runner):
+        handler_fired = threading.Event()
+        received_exitcode = []
+
+        def handler(exitcode):
+            received_exitcode.append(exitcode)
+            handler_fired.set()
+
+        runner.set_unexpected_exit_handler(handler)
+        runner.start()
+        assert runner.wait_ready(timeout_s=5.0)
+
+        os.kill(runner.pid, signal.SIGKILL)
+
+        assert handler_fired.wait(timeout=5.0), "Watchdog handler did not fire after SIGKILL"
+        assert received_exitcode == [-signal.SIGKILL]
+
+
+class TestWatchdogIntentionalExit:
+    """Verify intentional stop paths (kill/terminate/shutdown) do NOT fire the handler."""
+
+    def test_kill_does_not_fire_handler(self, runner):
+        handler_fired = threading.Event()
+        runner.set_unexpected_exit_handler(lambda ec: handler_fired.set())
+        runner.start()
+        assert runner.wait_ready(timeout_s=5.0)
+
+        runner.kill()
+        runner.join(timeout=2.0)
+        time.sleep(_WATCHDOG_GRACE_S)
+
+        assert not handler_fired.is_set(), "Handler fired despite intentional kill()"
+
+    def test_terminate_does_not_fire_handler(self, runner):
+        handler_fired = threading.Event()
+        runner.set_unexpected_exit_handler(lambda ec: handler_fired.set())
+        runner.start()
+        assert runner.wait_ready(timeout_s=5.0)
+
+        runner.terminate()
+        runner.join(timeout=2.0)
+        time.sleep(_WATCHDOG_GRACE_S)
+
+        assert not handler_fired.is_set(), "Handler fired despite intentional terminate()"
+
+    def test_shutdown_does_not_fire_handler(self, runner):
+        handler_fired = threading.Event()
+        runner.set_unexpected_exit_handler(lambda ec: handler_fired.set())
+        runner.start()
+        assert runner.wait_ready(timeout_s=5.0)
+
+        runner.shutdown(timeout_s=2.0)
+        time.sleep(_WATCHDOG_GRACE_S)
+
+        assert not handler_fired.is_set(), "Handler fired despite intentional shutdown()"
+
+
+class TestWatchdogResilience:
+    """Verify the watchdog is robust to handler misbehavior and shutdown ordering."""
+
+    def test_handler_exception_does_not_propagate(self, runner):
+        # The watchdog daemon thread must catch handler exceptions (it logs them).
+        # NOTE: an escaped exception would only reach threading.excepthook, so this is a smoke test.
+        runner.set_unexpected_exit_handler(lambda ec: (_ for _ in ()).throw(RuntimeError("boom")))
+        runner.start()
+        assert runner.wait_ready(timeout_s=5.0)
+
+        os.kill(runner.pid, signal.SIGKILL)
+        runner.join(timeout=5.0)
+        time.sleep(_WATCHDOG_GRACE_S)
+
+        assert not runner.is_alive()
+
+    def test_intentional_exit_survives_shutdown_cleanup(self, runner):
+        """Regression: shutdown() nulls _shutdown_event during cleanup. The intent flag
+        must be a separate attribute that survives that nullification, or the watchdog
+        could read None and misclassify intentional shutdown as unexpected death.
+        """
+        handler_fired = threading.Event()
+        runner.set_unexpected_exit_handler(lambda ec: handler_fired.set())
+        runner.start()
+        assert runner.wait_ready(timeout_s=5.0)
+
+        runner.shutdown(timeout_s=2.0)
+
+        assert runner._intentional_exit is True
+        assert runner._shutdown_event is None
+
+        time.sleep(_WATCHDOG_GRACE_S)
+        assert not handler_fired.is_set()
+
+
+class TestPreWarmedAdoption:
+    """Document the load-bearing assumption behind the is_alive() check at adoption."""
+
+    def test_is_ready_returns_true_for_dead_subprocess(self, runner):
+        """is_ready() reads a multiprocessing.Event the subprocess sets early in run().
+        After SIGKILL the Event remains set in shared memory, so is_ready() alone cannot
+        distinguish a live runner from a corpse. is_alive() must also be checked before
+        adopting a pre-warmed runner.
+        """
+        runner.start()
+        assert runner.wait_ready(timeout_s=5.0)
+
+        os.kill(runner.pid, signal.SIGKILL)
+        runner.join(timeout=5.0)
+
+        assert runner.is_ready() is True, "is_ready() should still report True even after death"
+        assert runner.is_alive() is False, "is_alive() should report False after death"