diff --git a/Cargo.lock b/Cargo.lock index 43ef6f13..a1020b14 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -509,7 +509,7 @@ checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "cli-sub-agent" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "chrono", @@ -701,7 +701,7 @@ dependencies = [ [[package]] name = "csa-acp" -version = "0.1.734" +version = "0.1.736" dependencies = [ "agent-client-protocol", "anyhow", @@ -721,7 +721,7 @@ dependencies = [ [[package]] name = "csa-config" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "chrono", @@ -739,7 +739,7 @@ dependencies = [ [[package]] name = "csa-core" -version = "0.1.734" +version = "0.1.736" dependencies = [ "agent-teams", "chrono", @@ -755,7 +755,7 @@ dependencies = [ [[package]] name = "csa-eval" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "chrono", @@ -769,7 +769,7 @@ dependencies = [ [[package]] name = "csa-executor" -version = "0.1.734" +version = "0.1.736" dependencies = [ "agent-teams", "anyhow", @@ -797,7 +797,7 @@ dependencies = [ [[package]] name = "csa-hooks" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "chrono", @@ -816,7 +816,7 @@ dependencies = [ [[package]] name = "csa-lock" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "chrono", @@ -830,7 +830,7 @@ dependencies = [ [[package]] name = "csa-mcp-hub" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "axum", @@ -853,7 +853,7 @@ dependencies = [ [[package]] name = "csa-memory" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "async-trait", @@ -872,7 +872,7 @@ dependencies = [ [[package]] name = "csa-process" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "chrono", @@ -891,7 +891,7 @@ dependencies = [ [[package]] name = "csa-resource" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "csa-core", @@ -907,7 +907,7 @@ dependencies = [ [[package]] name = "csa-scheduler" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "chrono", @@ -925,7 +925,7 @@ dependencies = [ [[package]] name = "csa-session" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "chrono", @@ -951,7 +951,7 @@ dependencies = [ [[package]] name = "csa-todo" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "chrono", @@ -4516,7 +4516,7 @@ dependencies = [ [[package]] name = "weave" -version = "0.1.734" +version = "0.1.736" dependencies = [ "anyhow", "clap", diff --git a/Cargo.toml b/Cargo.toml index a628703f..9c2a8be9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ default-members = [ resolver = "2" [workspace.package] -version = "0.1.734" +version = "0.1.736" edition = "2024" rust-version = "1.88" license = "Apache-2.0" diff --git a/crates/cli-sub-agent/src/session_cmds_daemon_wait.rs b/crates/cli-sub-agent/src/session_cmds_daemon_wait.rs index f05fe9e8..f778afd7 100644 --- a/crates/cli-sub-agent/src/session_cmds_daemon_wait.rs +++ b/crates/cli-sub-agent/src/session_cmds_daemon_wait.rs @@ -7,6 +7,11 @@ use std::os::fd::AsRawFd; pub(crate) const SESSION_WAIT_MEMORY_WARN_EXIT_CODE: i32 = 33; const SESSION_WAIT_SUCCESS_EXIT_CODE: i32 = 0; const SESSION_WAIT_FAILURE_EXIT_CODE: i32 = 1; +/// Healthy poll-cap exit when the session is still alive: callers should +/// process tokens (warming their KV cache) and re-wait. See #1439. +const SESSION_WAIT_KV_WARM_EXIT_CODE: i32 = 0; +/// Reserved for the rare case where the wait cap is reached but the session +/// daemon is no longer alive and no result.toml was produced. const SESSION_WAIT_TIMEOUT_EXIT_CODE: i32 = 124; const SESSION_WAIT_MEMORY_SAMPLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(15); @@ -308,28 +313,12 @@ where let refreshed_result = refreshed_result.ok().flatten(); let mut synthetic = false; let refreshed_result_available = refreshed_result.is_some(); - let mut loaded_result = refreshed_result.filter(|result| { - match ( - fs::metadata(session_dir.join(csa_session::result::RESULT_FILE_NAME)) - .ok() - .and_then(|metadata| metadata.modified().ok()), - fs::metadata(daemon_completion_path(&session_dir)) - .ok() - .and_then(|metadata| metadata.modified().ok()), - ) { - (Some(result_modified), Some(completion_modified)) - if result_modified > completion_modified => - { - true - } - (Some(result_modified), Some(completion_modified)) - if result_modified == completion_modified => - { - completion.exit_code == 0 && result.exit_code != 0 - } - _ => false, - } - }); + // result.toml is the authoritative session artifact; trust it over the daemon + // completion packet (which records the daemon process exit, not the session + // outcome). The daemon may exit non-zero after writing a successful result.toml + // (e.g., post-write cleanup failure, parent SIGTERM), so prior mtime-based + // filtering caused #1442 false failures. See #1442. + let mut loaded_result = refreshed_result; if refreshed_result_available { crate::session_cmds::retire_if_dead_with_result( effective_root, @@ -487,38 +476,46 @@ where let elapsed = start.elapsed().as_secs(); if elapsed >= wait_behavior.wait_timeout_secs { - eprintln!( - "Timeout: session {} did not complete within {}s", - resolved.session_id, wait_behavior.wait_timeout_secs, - ); - // Emit structured retry hint for orchestrators / agents. let cd_arg = cd .as_ref() .map(|path| format!(" --cd '{}'", path)) .unwrap_or_default(); + let session_alive = session_has_terminal_process(&session_dir) + || csa_process::ToolLiveness::is_alive(&session_dir); + if session_alive { + // KV-warm exit: session still alive at the wait cap. See #1439. + eprintln!( + "Session {} still running after {}s wait cap; returning so caller can warm its KV cache before re-waiting.", + resolved.session_id, wait_behavior.wait_timeout_secs, + ); + eprintln!( + "", + resolved.session_id, elapsed, resolved.session_id, cd_arg, + ); + eprintln!( + "", + sid = resolved.session_id, + cd = cd + .as_ref() + .map(|p| format!(" --cd '{p}'")) + .unwrap_or_default(), + ); + return Ok(SESSION_WAIT_KV_WARM_EXIT_CODE); + } + // Defensive: daemon gone with no result.toml (rare; earlier loop branches usually exit-1 first). eprintln!( - "", - resolved.session_id, elapsed, resolved.session_id, cd_arg, - ); - eprintln!( - "Hint: Call `csa session wait` again individually (not in a tight loop script). \ - The {}s timeout is designed to let the calling agent generate tokens between waits, \ - keeping its KV cache warm.", - wait_behavior.wait_timeout_secs, + "Timeout: session {} did not complete within {}s and no live daemon process remains.", + resolved.session_id, wait_behavior.wait_timeout_secs, ); eprintln!( - "", - sid = resolved.session_id, - cd = cd - .as_ref() - .map(|p| format!(" --cd '{p}'")) - .unwrap_or_default(), + "", + resolved.session_id, elapsed, resolved.session_id, cd_arg, ); return Ok(SESSION_WAIT_TIMEOUT_EXIT_CODE); } diff --git a/crates/cli-sub-agent/src/session_cmds_tests.rs b/crates/cli-sub-agent/src/session_cmds_tests.rs index 8f4a3b14..69642961 100644 --- a/crates/cli-sub-agent/src/session_cmds_tests.rs +++ b/crates/cli-sub-agent/src/session_cmds_tests.rs @@ -743,6 +743,8 @@ include!("session_cmds_tests_fork_tail.rs"); #[path = "session_cmds_tests_daemon_pid_tail.rs"] mod daemon_pid_tail_tests; +#[path = "session_cmds_tests_kv_warm.rs"] +mod kv_warm_tests; #[path = "session_cmds_tests_list_format.rs"] mod list_format_tests; #[path = "session_cmds_tests_result_cli.rs"] diff --git a/crates/cli-sub-agent/src/session_cmds_tests_daemon_pid_tail.rs b/crates/cli-sub-agent/src/session_cmds_tests_daemon_pid_tail.rs index 44dc079a..947c115e 100644 --- a/crates/cli-sub-agent/src/session_cmds_tests_daemon_pid_tail.rs +++ b/crates/cli-sub-agent/src/session_cmds_tests_daemon_pid_tail.rs @@ -68,9 +68,12 @@ fn handle_session_wait_ignores_completion_packet_while_raw_daemon_pid_is_alive() let exit_code = handle_session_wait(session_id, Some(project.to_string_lossy().into_owned()), 1).unwrap(); + // daemon.pid is still alive when the 1s wait cap fires, so the completion + // packet (which only records the daemon-process exit) must be ignored. + // Under #1439, alive-at-cap returns the KV-warm code (0) rather than 124. assert_eq!( - exit_code, 124, - "wait should time out instead of trusting daemon-completion while daemon.pid is still alive" + exit_code, 0, + "wait should emit the KV-warm exit instead of trusting daemon-completion while daemon.pid is still alive" ); child.kill().ok(); diff --git a/crates/cli-sub-agent/src/session_cmds_tests_kv_warm.rs b/crates/cli-sub-agent/src/session_cmds_tests_kv_warm.rs new file mode 100644 index 00000000..a996bea0 --- /dev/null +++ b/crates/cli-sub-agent/src/session_cmds_tests_kv_warm.rs @@ -0,0 +1,122 @@ +use super::*; +use crate::session_cmds_daemon::{ + WaitBehavior, WaitLoopTiming, WaitReconciliationOutcome, handle_session_wait_with_hooks, +}; +use crate::test_env_lock::TEST_ENV_LOCK; +use tempfile::tempdir; + +struct EnvVarGuard { + key: &'static str, + original: Option, +} + +impl EnvVarGuard { + fn set(key: &'static str, value: impl AsRef) -> Self { + let original = std::env::var(key).ok(); + // SAFETY: test-scoped env mutation guarded by a process-wide mutex. + unsafe { std::env::set_var(key, value) }; + Self { key, original } + } +} + +impl Drop for EnvVarGuard { + fn drop(&mut self) { + // SAFETY: test-scoped env mutation guarded by a process-wide mutex. + unsafe { + match self.original.as_deref() { + Some(value) => std::env::set_var(self.key, value), + None => std::env::remove_var(self.key), + } + } + } +} + +#[cfg(target_os = "linux")] +fn read_process_start_time_ticks(pid: u32) -> u64 { + let stat_path = format!("/proc/{pid}/stat"); + let content = std::fs::read_to_string(stat_path).expect("read /proc stat"); + let close_paren = content.rfind(')').expect("stat comm terminator"); + let after_comm = &content[close_paren + 1..]; + let mut parts = after_comm.split_whitespace(); + parts.next().expect("state"); + parts.next().expect("ppid"); + parts.next().expect("pgrp"); + for _ in 0..16 { + parts.next().expect("intermediate stat field"); + } + parts + .next() + .expect("starttime") + .parse::() + .expect("starttime parse") +} + +#[cfg(target_os = "linux")] +fn daemon_pid_record(pid: u32) -> String { + format!("{pid} {}\n", read_process_start_time_ticks(pid)) +} + +/// Regression test for #1439: when the wait cap fires and the session daemon is +/// still alive, the wait must exit with code 0 (KV-warm), not 124 (legacy +/// timeout). The accompanying `CSA:SESSION_WAIT_KV_WARM` marker is verified by +/// code inspection in `session_cmds_daemon_wait.rs`; this test pins the +/// exit-code contract that callers (especially AI agents in `run_in_background` +/// task-notification loops) depend on. +#[cfg(target_os = "linux")] +#[test] +fn handle_session_wait_kv_warm_exit_when_daemon_alive_at_cap() { + let td = tempdir().expect("tempdir"); + let _env_lock = TEST_ENV_LOCK.blocking_lock(); + let state_home = td.path().join("xdg-state"); + std::fs::create_dir_all(&state_home).expect("create state home"); + let _home_guard = EnvVarGuard::set("HOME", td.path()); + let _state_guard = EnvVarGuard::set("XDG_STATE_HOME", &state_home); + let project = td.path(); + + let session = + create_session(project, Some("wait-kv-warm-alive"), None, Some("codex")).expect("create"); + let session_id = session.meta_session_id; + let session_dir = get_session_dir(project, &session_id).expect("session dir"); + + let mut child = std::process::Command::new("sleep") + .arg("30") + .spawn() + .expect("spawn child"); + std::fs::write( + session_dir.join("daemon.pid"), + daemon_pid_record(child.id()), + ) + .expect("write daemon pid"); + assert!(csa_process::ToolLiveness::daemon_pid_is_alive(&session_dir)); + + let exit_code = handle_session_wait_with_hooks( + session_id, + Some(project.to_string_lossy().into_owned()), + WaitBehavior { + wait_timeout_secs: 0, + memory_warn_mb: None, + timing: WaitLoopTiming { + poll_interval: std::time::Duration::from_millis(1), + memory_sample_interval: std::time::Duration::from_secs(15), + }, + }, + |_project_root, _current_session_id, _trigger| { + Ok(WaitReconciliationOutcome { + result_became_available: false, + synthetic: false, + }) + }, + |_sid, _status, _exit_code, _synthetic, _mirror_to_stdout| { + panic!("alive-at-cap path must not emit a completion signal"); + }, + ) + .expect("wait should reach KV-warm exit"); + + let _ = child.kill(); + let _ = child.wait(); + + assert_eq!( + exit_code, 0, + "live daemon at wait cap must emit KV-warm exit (0), not legacy timeout (124)" + ); +} diff --git a/crates/cli-sub-agent/src/session_cmds_tests_tail.rs b/crates/cli-sub-agent/src/session_cmds_tests_tail.rs index 42a927be..fbcf0bc7 100644 --- a/crates/cli-sub-agent/src/session_cmds_tests_tail.rs +++ b/crates/cli-sub-agent/src/session_cmds_tests_tail.rs @@ -568,9 +568,15 @@ fn handle_session_wait_ignores_incomplete_result_while_daemon_alive() { assert!(status.success()); } +/// Regression test for #1442: result.toml is the authoritative session artifact. +/// When result.toml records success/exit_code=0 but the daemon process exited with +/// non-zero (recording failure in daemon-completion.toml), the wait command MUST +/// report the session's actual outcome from result.toml — not the daemon's process +/// exit code. The daemon may exit non-zero for reasons unrelated to the session +/// outcome (post-write cleanup failure, parent SIGTERM, etc.). #[cfg(unix)] #[test] -fn handle_session_wait_prefers_daemon_completion_exit_code() { +fn handle_session_wait_prefers_result_toml_over_daemon_completion_exit_code() { let td = tempdir().unwrap(); let _env_lock = TEST_ENV_LOCK.blocking_lock(); let state_home = td.path().join("xdg-state"); @@ -594,8 +600,8 @@ fn handle_session_wait_prefers_daemon_completion_exit_code() { handle_session_wait(session_id, Some(project.to_string_lossy().into_owned()), 1).unwrap(); assert_eq!( - exit_code, 1, - "session wait should return the daemon's final exit code, not the intermediate result.toml exit code" + exit_code, 0, + "result.toml records the session's actual outcome and must override the daemon process exit code" ); } @@ -643,9 +649,13 @@ fn handle_session_wait_ignores_completion_packet_while_daemon_alive() { let exit_code = handle_session_wait(session_id, Some(project.to_string_lossy().into_owned()), 1).unwrap(); + // Daemon is still alive when the 1s wait cap fires; the completion packet + // recorded the daemon process exit, not the session outcome, so the wait + // must NOT surface it. Under #1439, alive-at-cap returns the KV-warm code + // (0) rather than the legacy 124. assert_eq!( - exit_code, 124, - "wait should time out instead of reporting completion while the daemon is still alive" + exit_code, 0, + "wait should emit the KV-warm exit instead of reporting completion while the daemon is still alive" ); child.kill().ok(); diff --git a/crates/cli-sub-agent/src/session_cmds_tests_tail_wait.rs b/crates/cli-sub-agent/src/session_cmds_tests_tail_wait.rs index 4e334cba..c52d3b96 100644 --- a/crates/cli-sub-agent/src/session_cmds_tests_tail_wait.rs +++ b/crates/cli-sub-agent/src/session_cmds_tests_tail_wait.rs @@ -634,12 +634,12 @@ fn test_session_wait_memory_warn_disabled_when_zero() { panic!("disabled sampler must not emit marker"); }, ) - .expect("wait should fall back to timeout"); + .expect("wait should fall back to KV-warm exit"); let _ = child.kill(); let _ = child.wait(); - assert_eq!(exit_code, 124); + assert_eq!(exit_code, 0); assert_eq!(sample_calls, 0); } @@ -704,12 +704,12 @@ fn test_session_wait_procfs_failure_fallback() { panic!("sampling failure fallback must not emit marker"); }, ) - .expect("wait should fall back to classic timeout"); + .expect("wait should fall back to KV-warm exit"); let _ = child.kill(); let _ = child.wait(); - assert_eq!(exit_code, 124); + assert_eq!(exit_code, 0); assert_eq!(sample_calls, 1); } diff --git a/crates/cli-sub-agent/src/session_cmds_tests_tail_wait_liveness.rs b/crates/cli-sub-agent/src/session_cmds_tests_tail_wait_liveness.rs index ebb48a47..a459786c 100644 --- a/crates/cli-sub-agent/src/session_cmds_tests_tail_wait_liveness.rs +++ b/crates/cli-sub-agent/src/session_cmds_tests_tail_wait_liveness.rs @@ -33,9 +33,9 @@ impl Drop for EnvVarGuard { /// When PID-level detection misses (session_has_terminal_process=false) but broader /// filesystem liveness signals remain (is_alive=true due to recent session writes), -/// the wait must continue polling and eventually return 124 (timeout) — not 1 (failure). -/// -/// Regression test for #1396. +/// the wait must continue polling and eventually return the KV-warm exit (0) — not +/// failure (1). Original regression for #1396; updated for #1439 which reclassifies +/// the alive-at-cap exit code from 124 to 0. #[test] fn handle_session_wait_continues_polling_when_pid_missing_but_liveness_signals_present() { let td = tempdir().expect("tempdir"); @@ -95,7 +95,7 @@ fn handle_session_wait_continues_polling_when_pid_missing_but_liveness_signals_p .expect("wait should succeed"); assert_eq!( - exit_code, 124, - "wait must time out (124) not report failure (1) when liveness signals exist" + exit_code, 0, + "wait must emit the KV-warm exit (0), not report failure (1), when liveness signals exist" ); }