Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 16 additions & 16 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ default-members = [
resolver = "2"

[workspace.package]
version = "0.1.734"
version = "0.1.736"
edition = "2024"
rust-version = "1.88"
license = "Apache-2.0"
Expand Down
91 changes: 44 additions & 47 deletions crates/cli-sub-agent/src/session_cmds_daemon_wait.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ use std::os::fd::AsRawFd;
pub(crate) const SESSION_WAIT_MEMORY_WARN_EXIT_CODE: i32 = 33;
const SESSION_WAIT_SUCCESS_EXIT_CODE: i32 = 0;
const SESSION_WAIT_FAILURE_EXIT_CODE: i32 = 1;
/// Healthy poll-cap exit when the session is still alive: callers should
/// process tokens (warming their KV cache) and re-wait. See #1439.
const SESSION_WAIT_KV_WARM_EXIT_CODE: i32 = 0;
/// Reserved for the rare case where the wait cap is reached but the session
/// daemon is no longer alive and no result.toml was produced.
const SESSION_WAIT_TIMEOUT_EXIT_CODE: i32 = 124;
const SESSION_WAIT_MEMORY_SAMPLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(15);

Expand Down Expand Up @@ -308,28 +313,12 @@ where
let refreshed_result = refreshed_result.ok().flatten();
let mut synthetic = false;
let refreshed_result_available = refreshed_result.is_some();
let mut loaded_result = refreshed_result.filter(|result| {
match (
fs::metadata(session_dir.join(csa_session::result::RESULT_FILE_NAME))
.ok()
.and_then(|metadata| metadata.modified().ok()),
fs::metadata(daemon_completion_path(&session_dir))
.ok()
.and_then(|metadata| metadata.modified().ok()),
) {
(Some(result_modified), Some(completion_modified))
if result_modified > completion_modified =>
{
true
}
(Some(result_modified), Some(completion_modified))
if result_modified == completion_modified =>
{
completion.exit_code == 0 && result.exit_code != 0
}
_ => false,
}
});
// result.toml is the authoritative session artifact; trust it over the daemon
// completion packet (which records the daemon process exit, not the session
// outcome). The daemon may exit non-zero after writing a successful result.toml
// (e.g., post-write cleanup failure, parent SIGTERM), so prior mtime-based
// filtering caused #1442 false failures. See #1442.
let mut loaded_result = refreshed_result;
if refreshed_result_available {
crate::session_cmds::retire_if_dead_with_result(
effective_root,
Expand Down Expand Up @@ -487,38 +476,46 @@ where

let elapsed = start.elapsed().as_secs();
if elapsed >= wait_behavior.wait_timeout_secs {
eprintln!(
"Timeout: session {} did not complete within {}s",
resolved.session_id, wait_behavior.wait_timeout_secs,
);
// Emit structured retry hint for orchestrators / agents.
let cd_arg = cd
.as_ref()
.map(|path| format!(" --cd '{}'", path))
.unwrap_or_default();
let session_alive = session_has_terminal_process(&session_dir)
|| csa_process::ToolLiveness::is_alive(&session_dir);
Comment on lines +483 to +484
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The check session_has_terminal_process(&session_dir) is redundant here because csa_process::ToolLiveness::is_alive(&session_dir) already includes checks for live processes (both via daemon.pid and lock files). Simplifying this to just use is_alive improves readability and reduces redundant filesystem operations.

Suggested change
let session_alive = session_has_terminal_process(&session_dir)
|| csa_process::ToolLiveness::is_alive(&session_dir);
let session_alive = csa_process::ToolLiveness::is_alive(&session_dir);

if session_alive {
// KV-warm exit: session still alive at the wait cap. See #1439.
eprintln!(
"Session {} still running after {}s wait cap; returning so caller can warm its KV cache before re-waiting.",
resolved.session_id, wait_behavior.wait_timeout_secs,
);
eprintln!(
"<!-- CSA:SESSION_WAIT_KV_WARM session={} status=alive elapsed={}s action=re-wait cmd=\"csa session wait --session {}{}\" -->",
resolved.session_id, elapsed, resolved.session_id, cd_arg,
);
eprintln!(
"<!-- CSA:CALLER_HINT action=\"retry_wait\" \
rule=\"Session is alive; this exit is healthy. Process this output NOW and generate tokens to warm your KV cache, \
then call 'csa session wait --session {sid}{cd}' again in a NEW Bash call. \
NEVER batch multiple session waits in one Bash call. \
If you background the wait (run_in_background: true), the completion task-notification IS your wake signal — do NOT stack ScheduleWakeup, /loop, or sleep loops on top; that's pure redundancy and wastes tokens. \
FORBIDDEN: ls/cat/wc/grep on session-dir, state.toml reads, ps checks on daemon PID — \
any manual polling wastes caller tokens with zero benefit.\" -->",
sid = resolved.session_id,
cd = cd
.as_ref()
.map(|p| format!(" --cd '{p}'"))
.unwrap_or_default(),
Comment on lines +504 to +507
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The cd argument formatting is redundant here. It was already calculated and stored in cd_arg at line 479. Using cd_arg directly simplifies the code and ensures consistency with the other eprintln! calls in this block.

Suggested change
cd = cd
.as_ref()
.map(|p| format!(" --cd '{p}'"))
.unwrap_or_default(),
cd = cd_arg,

);
return Ok(SESSION_WAIT_KV_WARM_EXIT_CODE);
}
// Defensive: daemon gone with no result.toml (rare; earlier loop branches usually exit-1 first).
eprintln!(
"<!-- CSA:SESSION_WAIT_TIMEOUT session={} elapsed={}s cmd=\"csa session wait --session {}{}\" -->",
resolved.session_id, elapsed, resolved.session_id, cd_arg,
);
eprintln!(
"Hint: Call `csa session wait` again individually (not in a tight loop script). \
The {}s timeout is designed to let the calling agent generate tokens between waits, \
keeping its KV cache warm.",
wait_behavior.wait_timeout_secs,
"Timeout: session {} did not complete within {}s and no live daemon process remains.",
resolved.session_id, wait_behavior.wait_timeout_secs,
);
eprintln!(
"<!-- CSA:CALLER_HINT action=\"retry_wait\" \
rule=\"Process this output NOW and generate tokens to warm your KV cache, \
then call 'csa session wait --session {sid}{cd}' again in a NEW Bash call. \
NEVER batch multiple session waits in one Bash call. \
If you background the wait (run_in_background: true), the completion task-notification IS your wake signal — do NOT stack ScheduleWakeup, /loop, or sleep loops on top; that's pure redundancy and wastes tokens. \
FORBIDDEN: ls/cat/wc/grep on session-dir, state.toml reads, ps checks on daemon PID — \
any manual polling wastes caller tokens with zero benefit.\" -->",
sid = resolved.session_id,
cd = cd
.as_ref()
.map(|p| format!(" --cd '{p}'"))
.unwrap_or_default(),
"<!-- CSA:SESSION_WAIT_TIMEOUT session={} elapsed={}s status=dead cmd=\"csa session result --session {}{}\" -->",
resolved.session_id, elapsed, resolved.session_id, cd_arg,
);
return Ok(SESSION_WAIT_TIMEOUT_EXIT_CODE);
}
Expand Down
2 changes: 2 additions & 0 deletions crates/cli-sub-agent/src/session_cmds_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,8 @@ include!("session_cmds_tests_fork_tail.rs");

#[path = "session_cmds_tests_daemon_pid_tail.rs"]
mod daemon_pid_tail_tests;
#[path = "session_cmds_tests_kv_warm.rs"]
mod kv_warm_tests;
#[path = "session_cmds_tests_list_format.rs"]
mod list_format_tests;
#[path = "session_cmds_tests_result_cli.rs"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,12 @@ fn handle_session_wait_ignores_completion_packet_while_raw_daemon_pid_is_alive()
let exit_code =
handle_session_wait(session_id, Some(project.to_string_lossy().into_owned()), 1).unwrap();

// daemon.pid is still alive when the 1s wait cap fires, so the completion
// packet (which only records the daemon-process exit) must be ignored.
// Under #1439, alive-at-cap returns the KV-warm code (0) rather than 124.
assert_eq!(
exit_code, 124,
"wait should time out instead of trusting daemon-completion while daemon.pid is still alive"
exit_code, 0,
"wait should emit the KV-warm exit instead of trusting daemon-completion while daemon.pid is still alive"
);

child.kill().ok();
Expand Down
Loading
Loading