From 6e4cdd14809b0329198e373db6146650422a866f Mon Sep 17 00:00:00 2001
From: p4gs <10093271+p4gs@users.noreply.github.com>
Date: Sat, 30 May 2026 20:14:44 -0400
Subject: [PATCH] fix(GRC-500): stop cleanup_orphans deleting live sinks on
 macOS/Windows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The format-matrix "bugs" (HTML exit 137/no output, markdown cold-cache empty,
JSON ZERO_VENDORS) shared one root cause: a /tmp result-sink race, not any
writer defect. All four writers receive the identical relationships slice, so
format never changes discovery.

`is_process_running` checked `/proc/{pid}`, which only exists on Linux. On
macOS/Windows it returned false for every PID, so each run's startup call to
`ResultSink::cleanup_orphans("/tmp")` deleted the live PID-stamped sink files
of concurrently-running scans. The victim then panicked in `drain_all()` with
ENOENT (exit 101) before any output was written — surfacing under whichever
format ran cold while CSV got lucky on timing.

Evidence: test-output/campaign-post-tf5/vanta.com/d2_fmt/stderr.txt (html run)
panicked at app.rs:1608 "Failed to open result file: /tmp/...-59762.jsonl.zst:
No such file or directory" right after "0 vendors found".

Changes:
- result_sink.rs: portable liveness via `sysinfo` (already a dep); correct on
  Linux/macOS/Windows so live sibling sinks are preserved. Fixed two tests that
  had codified the buggy macOS behavior; added a regression test that models the
  race (live sink preserved, dead orphan removed).
- app.rs: the disk-sink read path no longer `.expect()`-panics; on a genuine
  read failure it fails loudly with a clear message and exit code 4 instead of
  emitting a silently-empty report.
- cli.rs: `--timeout` help now explains deep/cold scans exceed the 600s default
  and that output format does not affect discovery time.
- CHANGELOG.md: document the fix.

Verified: cargo build + clippy + fmt clean; 39 result_sink tests pass incl. the
new regression. The 53 failing dns/discovery tests are pre-existing and
environmental (network/DoH) — confirmed failing identically on clean master.
---
 nthpartyfinder/CHANGELOG.md       |  20 +++++
 nthpartyfinder/src/app.rs         |  44 +++++++++--
 nthpartyfinder/src/cli.rs         |   5 ++
 nthpartyfinder/src/result_sink.rs | 122 ++++++++++++++++++++++--------
 4 files changed, 154 insertions(+), 37 deletions(-)

diff --git a/nthpartyfinder/CHANGELOG.md b/nthpartyfinder/CHANGELOG.md
index 272bc2a..32aaf84 100644
--- a/nthpartyfinder/CHANGELOG.md
+++ b/nthpartyfinder/CHANGELOG.md
@@ -1,5 +1,25 @@
 # Changelog
 
+## [Unreleased]
+
+### Fixed
+- GRC-500: `cleanup_orphans` deleted the live result-sink files of
+  concurrently-running scans on macOS/Windows. `is_process_running` checked
+  `/proc/{pid}` (Linux-only), so every PID read as "not running" off Linux and
+  a sibling run's startup cleanup removed an active scan's `/tmp` sink. The
+  victim then panicked in `drain_all()` with ENOENT (exit 101) before writing
+  output — surfacing as HTML/JSON/markdown "crashes" and silently-empty
+  reports in the format matrix while CSV got lucky on timing. Liveness now uses
+  `sysinfo` for correct cross-platform detection.
+- The disk-sink read path no longer panics when results can't be read back; it
+  fails loudly with a clear message and a dedicated exit code (4) instead of
+  emitting a silently-empty report.
+
+### Changed
+- `--timeout` help now explains that depth-3+/cold-cache scans routinely exceed
+  the 600s default (raise it or use `--timeout 0`) and that the output format
+  does not affect discovery time.
+
 ## [1.0.0] - 2026-04-28
 
 ### Fixed
diff --git a/nthpartyfinder/src/app.rs b/nthpartyfinder/src/app.rs
index 028c721..5e30844 100644
--- a/nthpartyfinder/src/app.rs
+++ b/nthpartyfinder/src/app.rs
@@ -32,6 +32,14 @@ use crate::whois;
 
 use std::path::PathBuf;
 
+/// Process exit code carried as an error so `run_inner` can bubble it up to
+/// `main` for a clean `std::process::exit`.
+///
+/// Known codes:
+/// - `1` — generic failure (config error, analysis timeout, batch failure)
+/// - `2` — invalid CLI arguments
+/// - `4` — analysis results could not be read back from the disk sink
+/// - `130` — interrupted by the user (SIGINT)
 #[derive(Debug)]
 pub struct AppExitCode(pub i32);
 
@@ -1601,16 +1609,38 @@ pub async fn run_inner(args: Args, input: &dyn InputSource) -> Result<()> {
             let _ = sink.flush();
             sink_path = sink.path().to_path_buf();
         }
-        match Arc::try_unwrap(result_sink) {
-            Ok(mutex) => {
-                let sink = mutex.into_inner();
-                sink.drain_all()
-                    .expect("Failed to read results from disk sink")
-            }
+        let drained = match Arc::try_unwrap(result_sink) {
+            Ok(mutex) => mutex.into_inner().drain_all(),
             Err(_arc) => {
                 logger.debug("ResultSink has outstanding references, reading from file path");
                 ResultSink::read_results(&sink_path)
-                    .expect("Failed to read results from disk sink file")
+            }
+        };
+        match drained {
+            Ok(results) => results,
+            // Fail loudly instead of panicking. A genuinely empty scan returns an
+            // intact (zero-row) file here, so reaching this arm means the sink
+            // file was unreadable or missing — historically because a concurrent
+            // run deleted the shared /tmp sink (GRC-500). Never silently emit an
+            // empty report in that case.
+            Err(e) => {
+                logger.error(&format!(
+                    "Failed to read analysis results from disk sink {}: {}",
+                    sink_path.display(),
+                    e
+                ));
+                eprintln!();
+                eprintln!(
+                    "Failed to read analysis results from the disk sink at {}.",
+                    sink_path.display()
+                );
+                eprintln!("The result file was missing or unreadable, so no report was written.");
+                eprintln!(
+                    "This can happen when a concurrent nthpartyfinder run removes the shared \
+                     /tmp sink file. Re-run the scan, and when running scans in parallel give \
+                     each its own --output-dir."
+                );
+                bail!(AppExitCode(4));
             }
         }
     };
diff --git a/nthpartyfinder/src/cli.rs b/nthpartyfinder/src/cli.rs
index bdd9b3a..8ffee44 100644
--- a/nthpartyfinder/src/cli.rs
+++ b/nthpartyfinder/src/cli.rs
@@ -156,6 +156,11 @@ pub struct Cli {
 
     /// Analysis timeout in seconds (default: 600). Use 0 for no timeout.
     /// Overrides NTHPARTY_ANALYSIS_TIMEOUT_SECS environment variable.
+    /// The default suits a depth-1 scan; depth 3+ or cold-cache runs routinely
+    /// exceed 600s (e.g. ~1500-3000s), so raise this (e.g. --timeout 1800) or
+    /// disable it (--timeout 0) for deep scans. The output format does not
+    /// change discovery time. On timeout the scan exits non-zero with a
+    /// checkpoint rather than emitting an empty report.
     #[arg(long, value_name = "SECONDS")]
     pub timeout: Option<u64>,
 
diff --git a/nthpartyfinder/src/result_sink.rs b/nthpartyfinder/src/result_sink.rs
index 320ae21..b8aac43 100644
--- a/nthpartyfinder/src/result_sink.rs
+++ b/nthpartyfinder/src/result_sink.rs
@@ -229,14 +229,38 @@ impl ResultSink {
     }
 }
 
-// cfg(not(coverage)): uses /proc which only exists on Linux — result is platform-dependent
+/// Returns `true` if a process with the given PID is currently running.
+///
+/// Uses `sysinfo` for portable liveness detection across Linux, macOS, and
+/// Windows. The previous implementation only checked `/proc/{pid}`, which does
+/// not exist on macOS/Windows: there it reported *every* PID as "not running",
+/// so `cleanup_orphans` deleted the live result-sink files of concurrently
+/// running scans. The victim run then panicked in `drain_all()` with ENOENT
+/// (exit 101) before any output was written (GRC-500).
+//
+// cfg(not(coverage)): queries the host process table via platform-specific
+// syscalls, so the result is environment-dependent and excluded from coverage.
 #[cfg(not(coverage))]
 fn is_process_running(pid: u32) -> bool {
-    Path::new(&format!("/proc/{}", pid)).exists()
+    use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System};
+
+    let target = Pid::from_u32(pid);
+    let mut system = System::new();
+    system.refresh_processes_specifics(
+        ProcessesToUpdate::Some(&[target]),
+        false,
+        ProcessRefreshKind::new(),
+    );
+    system.process(target).is_some()
 }
+
+// cfg(coverage): the sysinfo path above is excluded from coverage, so this
+// deterministic stub stands in. The current process is always alive; every
+// other PID is treated as dead so both branches of `cleanup_orphans` are
+// exercised without depending on the host process table.
 #[cfg(coverage)]
-fn is_process_running(_pid: u32) -> bool {
-    false
+fn is_process_running(pid: u32) -> bool {
+    pid == std::process::id()
 }
 
 // cfg(not(coverage)): df --output=avail is Linux-only; macOS df writes nothing to stdout, so the parse closure is unreachable
@@ -489,6 +513,34 @@ mod tests {
         assert_eq!(result.unwrap(), 0);
     }
 
+    // GRC-500: when a sibling scan runs cleanup_orphans, it must remove only
+    // truly-orphaned sinks (dead PIDs) and leave a concurrently-running scan's
+    // live sink untouched. Before the fix, macOS deleted both, so the live scan
+    // later panicked reading its own (now missing) sink.
+    #[test]
+    fn test_orphan_cleanup_preserves_live_removes_dead() {
+        let tmp = TempDir::new().unwrap();
+
+        // Live sibling: our own PID stands in for a concurrently-running scan.
+        let live_file = tmp.path().join(format!(
+            "nthpartyfinder-results-{}.jsonl.zst",
+            std::process::id()
+        ));
+        std::fs::write(&live_file, b"live").unwrap();
+
+        // Orphan: an impossible PID can never be running.
+        let dead_file = tmp
+            .path()
+            .join(format!("nthpartyfinder-results-{}.jsonl.zst", u32::MAX));
+        std::fs::write(&dead_file, b"dead").unwrap();
+
+        let cleaned = ResultSink::cleanup_orphans(tmp.path()).unwrap();
+
+        assert_eq!(cleaned, 1, "only the dead-PID orphan should be cleaned");
+        assert!(live_file.exists(), "live sibling sink must be preserved");
+        assert!(!dead_file.exists(), "dead orphan sink must be removed");
+    }
+
     #[test]
     fn test_orphan_cleanup_skips_current_process() {
         let tmp = TempDir::new().unwrap();
@@ -501,10 +553,18 @@ mod tests {
         std::fs::write(&own_file, b"data").unwrap();
 
         let cleaned = ResultSink::cleanup_orphans(tmp.path()).unwrap();
-        // On macOS /proc doesn't exist, so is_process_running returns false
-        // On Linux, our own PID would be detected as running
-        // Either way, the function should not panic
-        let _ = cleaned;
+        // The current process is alive on every platform, so its sink file must
+        // be preserved. GRC-500 regression: macOS previously reported the live
+        // PID as "not running" and deleted the file out from under a running
+        // scan, which then panicked in drain_all().
+        assert_eq!(
+            cleaned, 0,
+            "a live current-process sink must not be cleaned"
+        );
+        assert!(
+            own_file.exists(),
+            "current-process sink file must survive cleanup_orphans"
+        );
     }
 
     #[test]
@@ -521,11 +581,12 @@ mod tests {
 
     #[test]
     fn test_is_process_running_nonexistent() {
-        // PID 999999 is very unlikely to be running
-        // On macOS /proc doesn't exist, so this always returns false
-        let result = is_process_running(999999);
-        // Just verify it doesn't panic
-        let _ = result;
+        // u32::MAX exceeds the maximum PID on every supported platform, so it
+        // can never correspond to a live process and must read as not running.
+        assert!(
+            !is_process_running(u32::MAX),
+            "an impossible PID must read as not running"
+        );
     }
 
     #[test]
@@ -815,42 +876,43 @@ mod tests {
 
     // ── is_process_running additional coverage ───────────────────────
 
-    // cfg(not(coverage)): /proc platform branch — only one arm executes per OS
+    // cfg(not(coverage)): exercises the real sysinfo-backed liveness check.
     #[cfg(not(coverage))]
     #[test]
     fn test_is_process_running_current_process() {
-        let pid = std::process::id();
-        let result = is_process_running(pid);
-        if Path::new("/proc").exists() {
-            assert!(result, "current process should be running");
-        } else {
-            assert!(!result, "without /proc, is_process_running returns false");
-        }
+        // The current process is always running, on every platform. GRC-500
+        // regression: this previously asserted `false` on hosts without /proc.
+        assert!(
+            is_process_running(std::process::id()),
+            "the current process must be detected as running on every platform"
+        );
     }
 
-    // cfg(not(coverage)): /proc platform branch — macOS vs Linux behavior
+    // cfg(not(coverage)): exercises the real liveness check + a removal failure.
     #[cfg(not(coverage))]
     #[cfg(unix)]
     #[test]
     fn test_cleanup_orphans_remove_fails_readonly_dir() {
         use std::os::unix::fs::PermissionsExt;
         let dir = TempDir::new().unwrap();
-        // Create an orphaned result file with a PID that's definitely not running
-        let orphan_name = "nthpartyfinder-results-999999.jsonl.zst";
-        let orphan_path = dir.path().join(orphan_name);
+        // An impossible PID is guaranteed dead, so cleanup is attempted.
+        let orphan_name = format!("nthpartyfinder-results-{}.jsonl.zst", u32::MAX);
+        let orphan_path = dir.path().join(&orphan_name);
         std::fs::write(&orphan_path, b"dummy").unwrap();
 
         // Make directory read-only to prevent file removal
         std::fs::set_permissions(dir.path(), std::fs::Permissions::from_mode(0o555)).unwrap();
 
         let result = ResultSink::cleanup_orphans(dir.path());
-        // On macOS (no /proc), PID 999999 is always "not running" so cleanup is attempted
-        // but remove_file fails because dir is read-only
-        if !Path::new("/proc").exists() {
-            // macOS: cleanup attempted, remove fails, cleaned count = 0
+        // Skip the removal-failure assertions when running as root (common in
+        // Linux CI containers), where read-only directory permissions are
+        // ignored and the file would actually be removed.
+        let running_as_root = std::fs::write(dir.path().join(".root-probe"), b"x").is_ok();
+        if !running_as_root {
+            // Cleanup is attempted, but remove_file fails because the directory
+            // is read-only: cleaned count stays 0 and the file survives.
             assert!(result.is_ok());
             assert_eq!(result.unwrap(), 0);
-            // File should still exist since removal failed
             assert!(orphan_path.exists());
         }