From 6e4cdd14809b0329198e373db6146650422a866f Mon Sep 17 00:00:00 2001 From: p4gs <10093271+p4gs@users.noreply.github.com> Date: Sat, 30 May 2026 20:14:44 -0400 Subject: [PATCH] fix(GRC-500): stop cleanup_orphans deleting live sinks on macOS/Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The format-matrix "bugs" (HTML exit 137/no output, markdown cold-cache empty, JSON ZERO_VENDORS) shared one root cause: a /tmp result-sink race, not any writer defect. All four writers receive the identical relationships slice, so format never changes discovery. `is_process_running` checked `/proc/{pid}`, which only exists on Linux. On macOS/Windows it returned false for every PID, so each run's startup call to `ResultSink::cleanup_orphans("/tmp")` deleted the live PID-stamped sink files of concurrently-running scans. The victim then panicked in `drain_all()` with ENOENT (exit 101) before any output was written — surfacing under whichever format ran cold while CSV got lucky on timing. Evidence: test-output/campaign-post-tf5/vanta.com/d2_fmt/stderr.txt (html run) panicked at app.rs:1608 "Failed to open result file: /tmp/...-59762.jsonl.zst: No such file or directory" right after "0 vendors found". Changes: - result_sink.rs: portable liveness via `sysinfo` (already a dep); correct on Linux/macOS/Windows so live sibling sinks are preserved. Fixed two tests that had codified the buggy macOS behavior; added a regression test that models the race (live sink preserved, dead orphan removed). - app.rs: the disk-sink read path no longer `.expect()`-panics; on a genuine read failure it fails loudly with a clear message and exit code 4 instead of emitting a silently-empty report. - cli.rs: `--timeout` help now explains deep/cold scans exceed the 600s default and that output format does not affect discovery time. - CHANGELOG.md: document the fix. Verified: cargo build + clippy + fmt clean; 39 result_sink tests pass incl. the new regression. The 53 failing dns/discovery tests are pre-existing and environmental (network/DoH) — confirmed failing identically on clean master. --- nthpartyfinder/CHANGELOG.md | 20 +++++ nthpartyfinder/src/app.rs | 44 +++++++++-- nthpartyfinder/src/cli.rs | 5 ++ nthpartyfinder/src/result_sink.rs | 122 ++++++++++++++++++++++-------- 4 files changed, 154 insertions(+), 37 deletions(-) diff --git a/nthpartyfinder/CHANGELOG.md b/nthpartyfinder/CHANGELOG.md index 272bc2a..32aaf84 100644 --- a/nthpartyfinder/CHANGELOG.md +++ b/nthpartyfinder/CHANGELOG.md @@ -1,5 +1,25 @@ # Changelog +## [Unreleased] + +### Fixed +- GRC-500: `cleanup_orphans` deleted the live result-sink files of + concurrently-running scans on macOS/Windows. `is_process_running` checked + `/proc/{pid}` (Linux-only), so every PID read as "not running" off Linux and + a sibling run's startup cleanup removed an active scan's `/tmp` sink. The + victim then panicked in `drain_all()` with ENOENT (exit 101) before writing + output — surfacing as HTML/JSON/markdown "crashes" and silently-empty + reports in the format matrix while CSV got lucky on timing. Liveness now uses + `sysinfo` for correct cross-platform detection. +- The disk-sink read path no longer panics when results can't be read back; it + fails loudly with a clear message and a dedicated exit code (4) instead of + emitting a silently-empty report. + +### Changed +- `--timeout` help now explains that depth-3+/cold-cache scans routinely exceed + the 600s default (raise it or use `--timeout 0`) and that the output format + does not affect discovery time. + ## [1.0.0] - 2026-04-28 ### Fixed diff --git a/nthpartyfinder/src/app.rs b/nthpartyfinder/src/app.rs index 028c721..5e30844 100644 --- a/nthpartyfinder/src/app.rs +++ b/nthpartyfinder/src/app.rs @@ -32,6 +32,14 @@ use crate::whois; use std::path::PathBuf; +/// Process exit code carried as an error so `run_inner` can bubble it up to +/// `main` for a clean `std::process::exit`. +/// +/// Known codes: +/// - `1` — generic failure (config error, analysis timeout, batch failure) +/// - `2` — invalid CLI arguments +/// - `4` — analysis results could not be read back from the disk sink +/// - `130` — interrupted by the user (SIGINT) #[derive(Debug)] pub struct AppExitCode(pub i32); @@ -1601,16 +1609,38 @@ pub async fn run_inner(args: Args, input: &dyn InputSource) -> Result<()> { let _ = sink.flush(); sink_path = sink.path().to_path_buf(); } - match Arc::try_unwrap(result_sink) { - Ok(mutex) => { - let sink = mutex.into_inner(); - sink.drain_all() - .expect("Failed to read results from disk sink") - } + let drained = match Arc::try_unwrap(result_sink) { + Ok(mutex) => mutex.into_inner().drain_all(), Err(_arc) => { logger.debug("ResultSink has outstanding references, reading from file path"); ResultSink::read_results(&sink_path) - .expect("Failed to read results from disk sink file") + } + }; + match drained { + Ok(results) => results, + // Fail loudly instead of panicking. A genuinely empty scan returns an + // intact (zero-row) file here, so reaching this arm means the sink + // file was unreadable or missing — historically because a concurrent + // run deleted the shared /tmp sink (GRC-500). Never silently emit an + // empty report in that case. + Err(e) => { + logger.error(&format!( + "Failed to read analysis results from disk sink {}: {}", + sink_path.display(), + e + )); + eprintln!(); + eprintln!( + "Failed to read analysis results from the disk sink at {}.", + sink_path.display() + ); + eprintln!("The result file was missing or unreadable, so no report was written."); + eprintln!( + "This can happen when a concurrent nthpartyfinder run removes the shared \ + /tmp sink file. Re-run the scan, and when running scans in parallel give \ + each its own --output-dir." + ); + bail!(AppExitCode(4)); } } }; diff --git a/nthpartyfinder/src/cli.rs b/nthpartyfinder/src/cli.rs index bdd9b3a..8ffee44 100644 --- a/nthpartyfinder/src/cli.rs +++ b/nthpartyfinder/src/cli.rs @@ -156,6 +156,11 @@ pub struct Cli { /// Analysis timeout in seconds (default: 600). Use 0 for no timeout. /// Overrides NTHPARTY_ANALYSIS_TIMEOUT_SECS environment variable. + /// The default suits a depth-1 scan; depth 3+ or cold-cache runs routinely + /// exceed 600s (e.g. ~1500-3000s), so raise this (e.g. --timeout 1800) or + /// disable it (--timeout 0) for deep scans. The output format does not + /// change discovery time. On timeout the scan exits non-zero with a + /// checkpoint rather than emitting an empty report. #[arg(long, value_name = "SECONDS")] pub timeout: Option, diff --git a/nthpartyfinder/src/result_sink.rs b/nthpartyfinder/src/result_sink.rs index 320ae21..b8aac43 100644 --- a/nthpartyfinder/src/result_sink.rs +++ b/nthpartyfinder/src/result_sink.rs @@ -229,14 +229,38 @@ impl ResultSink { } } -// cfg(not(coverage)): uses /proc which only exists on Linux — result is platform-dependent +/// Returns `true` if a process with the given PID is currently running. +/// +/// Uses `sysinfo` for portable liveness detection across Linux, macOS, and +/// Windows. The previous implementation only checked `/proc/{pid}`, which does +/// not exist on macOS/Windows: there it reported *every* PID as "not running", +/// so `cleanup_orphans` deleted the live result-sink files of concurrently +/// running scans. The victim run then panicked in `drain_all()` with ENOENT +/// (exit 101) before any output was written (GRC-500). +// +// cfg(not(coverage)): queries the host process table via platform-specific +// syscalls, so the result is environment-dependent and excluded from coverage. #[cfg(not(coverage))] fn is_process_running(pid: u32) -> bool { - Path::new(&format!("/proc/{}", pid)).exists() + use sysinfo::{Pid, ProcessRefreshKind, ProcessesToUpdate, System}; + + let target = Pid::from_u32(pid); + let mut system = System::new(); + system.refresh_processes_specifics( + ProcessesToUpdate::Some(&[target]), + false, + ProcessRefreshKind::new(), + ); + system.process(target).is_some() } + +// cfg(coverage): the sysinfo path above is excluded from coverage, so this +// deterministic stub stands in. The current process is always alive; every +// other PID is treated as dead so both branches of `cleanup_orphans` are +// exercised without depending on the host process table. #[cfg(coverage)] -fn is_process_running(_pid: u32) -> bool { - false +fn is_process_running(pid: u32) -> bool { + pid == std::process::id() } // cfg(not(coverage)): df --output=avail is Linux-only; macOS df writes nothing to stdout, so the parse closure is unreachable @@ -489,6 +513,34 @@ mod tests { assert_eq!(result.unwrap(), 0); } + // GRC-500: when a sibling scan runs cleanup_orphans, it must remove only + // truly-orphaned sinks (dead PIDs) and leave a concurrently-running scan's + // live sink untouched. Before the fix, macOS deleted both, so the live scan + // later panicked reading its own (now missing) sink. + #[test] + fn test_orphan_cleanup_preserves_live_removes_dead() { + let tmp = TempDir::new().unwrap(); + + // Live sibling: our own PID stands in for a concurrently-running scan. + let live_file = tmp.path().join(format!( + "nthpartyfinder-results-{}.jsonl.zst", + std::process::id() + )); + std::fs::write(&live_file, b"live").unwrap(); + + // Orphan: an impossible PID can never be running. + let dead_file = tmp + .path() + .join(format!("nthpartyfinder-results-{}.jsonl.zst", u32::MAX)); + std::fs::write(&dead_file, b"dead").unwrap(); + + let cleaned = ResultSink::cleanup_orphans(tmp.path()).unwrap(); + + assert_eq!(cleaned, 1, "only the dead-PID orphan should be cleaned"); + assert!(live_file.exists(), "live sibling sink must be preserved"); + assert!(!dead_file.exists(), "dead orphan sink must be removed"); + } + #[test] fn test_orphan_cleanup_skips_current_process() { let tmp = TempDir::new().unwrap(); @@ -501,10 +553,18 @@ mod tests { std::fs::write(&own_file, b"data").unwrap(); let cleaned = ResultSink::cleanup_orphans(tmp.path()).unwrap(); - // On macOS /proc doesn't exist, so is_process_running returns false - // On Linux, our own PID would be detected as running - // Either way, the function should not panic - let _ = cleaned; + // The current process is alive on every platform, so its sink file must + // be preserved. GRC-500 regression: macOS previously reported the live + // PID as "not running" and deleted the file out from under a running + // scan, which then panicked in drain_all(). + assert_eq!( + cleaned, 0, + "a live current-process sink must not be cleaned" + ); + assert!( + own_file.exists(), + "current-process sink file must survive cleanup_orphans" + ); } #[test] @@ -521,11 +581,12 @@ mod tests { #[test] fn test_is_process_running_nonexistent() { - // PID 999999 is very unlikely to be running - // On macOS /proc doesn't exist, so this always returns false - let result = is_process_running(999999); - // Just verify it doesn't panic - let _ = result; + // u32::MAX exceeds the maximum PID on every supported platform, so it + // can never correspond to a live process and must read as not running. + assert!( + !is_process_running(u32::MAX), + "an impossible PID must read as not running" + ); } #[test] @@ -815,42 +876,43 @@ mod tests { // ── is_process_running additional coverage ─────────────────────── - // cfg(not(coverage)): /proc platform branch — only one arm executes per OS + // cfg(not(coverage)): exercises the real sysinfo-backed liveness check. #[cfg(not(coverage))] #[test] fn test_is_process_running_current_process() { - let pid = std::process::id(); - let result = is_process_running(pid); - if Path::new("/proc").exists() { - assert!(result, "current process should be running"); - } else { - assert!(!result, "without /proc, is_process_running returns false"); - } + // The current process is always running, on every platform. GRC-500 + // regression: this previously asserted `false` on hosts without /proc. + assert!( + is_process_running(std::process::id()), + "the current process must be detected as running on every platform" + ); } - // cfg(not(coverage)): /proc platform branch — macOS vs Linux behavior + // cfg(not(coverage)): exercises the real liveness check + a removal failure. #[cfg(not(coverage))] #[cfg(unix)] #[test] fn test_cleanup_orphans_remove_fails_readonly_dir() { use std::os::unix::fs::PermissionsExt; let dir = TempDir::new().unwrap(); - // Create an orphaned result file with a PID that's definitely not running - let orphan_name = "nthpartyfinder-results-999999.jsonl.zst"; - let orphan_path = dir.path().join(orphan_name); + // An impossible PID is guaranteed dead, so cleanup is attempted. + let orphan_name = format!("nthpartyfinder-results-{}.jsonl.zst", u32::MAX); + let orphan_path = dir.path().join(&orphan_name); std::fs::write(&orphan_path, b"dummy").unwrap(); // Make directory read-only to prevent file removal std::fs::set_permissions(dir.path(), std::fs::Permissions::from_mode(0o555)).unwrap(); let result = ResultSink::cleanup_orphans(dir.path()); - // On macOS (no /proc), PID 999999 is always "not running" so cleanup is attempted - // but remove_file fails because dir is read-only - if !Path::new("/proc").exists() { - // macOS: cleanup attempted, remove fails, cleaned count = 0 + // Skip the removal-failure assertions when running as root (common in + // Linux CI containers), where read-only directory permissions are + // ignored and the file would actually be removed. + let running_as_root = std::fs::write(dir.path().join(".root-probe"), b"x").is_ok(); + if !running_as_root { + // Cleanup is attempted, but remove_file fails because the directory + // is read-only: cleaned count stays 0 and the file survives. assert!(result.is_ok()); assert_eq!(result.unwrap(), 0); - // File should still exist since removal failed assert!(orphan_path.exists()); }