-
Notifications
You must be signed in to change notification settings - Fork 2
fix(session): accurate wait exit codes + KV-warm distinction (#1442, #1439) #1445
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -7,6 +7,11 @@ use std::os::fd::AsRawFd; | |||||||||||
| pub(crate) const SESSION_WAIT_MEMORY_WARN_EXIT_CODE: i32 = 33; | ||||||||||||
| const SESSION_WAIT_SUCCESS_EXIT_CODE: i32 = 0; | ||||||||||||
| const SESSION_WAIT_FAILURE_EXIT_CODE: i32 = 1; | ||||||||||||
| /// Healthy poll-cap exit when the session is still alive: callers should | ||||||||||||
| /// process tokens (warming their KV cache) and re-wait. See #1439. | ||||||||||||
| const SESSION_WAIT_KV_WARM_EXIT_CODE: i32 = 0; | ||||||||||||
| /// Reserved for the rare case where the wait cap is reached but the session | ||||||||||||
| /// daemon is no longer alive and no result.toml was produced. | ||||||||||||
| const SESSION_WAIT_TIMEOUT_EXIT_CODE: i32 = 124; | ||||||||||||
| const SESSION_WAIT_MEMORY_SAMPLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(15); | ||||||||||||
|
|
||||||||||||
|
|
@@ -308,28 +313,12 @@ where | |||||||||||
| let refreshed_result = refreshed_result.ok().flatten(); | ||||||||||||
| let mut synthetic = false; | ||||||||||||
| let refreshed_result_available = refreshed_result.is_some(); | ||||||||||||
| let mut loaded_result = refreshed_result.filter(|result| { | ||||||||||||
| match ( | ||||||||||||
| fs::metadata(session_dir.join(csa_session::result::RESULT_FILE_NAME)) | ||||||||||||
| .ok() | ||||||||||||
| .and_then(|metadata| metadata.modified().ok()), | ||||||||||||
| fs::metadata(daemon_completion_path(&session_dir)) | ||||||||||||
| .ok() | ||||||||||||
| .and_then(|metadata| metadata.modified().ok()), | ||||||||||||
| ) { | ||||||||||||
| (Some(result_modified), Some(completion_modified)) | ||||||||||||
| if result_modified > completion_modified => | ||||||||||||
| { | ||||||||||||
| true | ||||||||||||
| } | ||||||||||||
| (Some(result_modified), Some(completion_modified)) | ||||||||||||
| if result_modified == completion_modified => | ||||||||||||
| { | ||||||||||||
| completion.exit_code == 0 && result.exit_code != 0 | ||||||||||||
| } | ||||||||||||
| _ => false, | ||||||||||||
| } | ||||||||||||
| }); | ||||||||||||
| // result.toml is the authoritative session artifact; trust it over the daemon | ||||||||||||
| // completion packet (which records the daemon process exit, not the session | ||||||||||||
| // outcome). The daemon may exit non-zero after writing a successful result.toml | ||||||||||||
| // (e.g., post-write cleanup failure, parent SIGTERM), so prior mtime-based | ||||||||||||
| // filtering caused #1442 false failures. See #1442. | ||||||||||||
| let mut loaded_result = refreshed_result; | ||||||||||||
| if refreshed_result_available { | ||||||||||||
| crate::session_cmds::retire_if_dead_with_result( | ||||||||||||
| effective_root, | ||||||||||||
|
|
@@ -487,38 +476,46 @@ where | |||||||||||
|
|
||||||||||||
| let elapsed = start.elapsed().as_secs(); | ||||||||||||
| if elapsed >= wait_behavior.wait_timeout_secs { | ||||||||||||
| eprintln!( | ||||||||||||
| "Timeout: session {} did not complete within {}s", | ||||||||||||
| resolved.session_id, wait_behavior.wait_timeout_secs, | ||||||||||||
| ); | ||||||||||||
| // Emit structured retry hint for orchestrators / agents. | ||||||||||||
| let cd_arg = cd | ||||||||||||
| .as_ref() | ||||||||||||
| .map(|path| format!(" --cd '{}'", path)) | ||||||||||||
| .unwrap_or_default(); | ||||||||||||
| let session_alive = session_has_terminal_process(&session_dir) | ||||||||||||
| || csa_process::ToolLiveness::is_alive(&session_dir); | ||||||||||||
| if session_alive { | ||||||||||||
| // KV-warm exit: session still alive at the wait cap. See #1439. | ||||||||||||
| eprintln!( | ||||||||||||
| "Session {} still running after {}s wait cap; returning so caller can warm its KV cache before re-waiting.", | ||||||||||||
| resolved.session_id, wait_behavior.wait_timeout_secs, | ||||||||||||
| ); | ||||||||||||
| eprintln!( | ||||||||||||
| "<!-- CSA:SESSION_WAIT_KV_WARM session={} status=alive elapsed={}s action=re-wait cmd=\"csa session wait --session {}{}\" -->", | ||||||||||||
| resolved.session_id, elapsed, resolved.session_id, cd_arg, | ||||||||||||
| ); | ||||||||||||
| eprintln!( | ||||||||||||
| "<!-- CSA:CALLER_HINT action=\"retry_wait\" \ | ||||||||||||
| rule=\"Session is alive; this exit is healthy. Process this output NOW and generate tokens to warm your KV cache, \ | ||||||||||||
| then call 'csa session wait --session {sid}{cd}' again in a NEW Bash call. \ | ||||||||||||
| NEVER batch multiple session waits in one Bash call. \ | ||||||||||||
| If you background the wait (run_in_background: true), the completion task-notification IS your wake signal — do NOT stack ScheduleWakeup, /loop, or sleep loops on top; that's pure redundancy and wastes tokens. \ | ||||||||||||
| FORBIDDEN: ls/cat/wc/grep on session-dir, state.toml reads, ps checks on daemon PID — \ | ||||||||||||
| any manual polling wastes caller tokens with zero benefit.\" -->", | ||||||||||||
| sid = resolved.session_id, | ||||||||||||
| cd = cd | ||||||||||||
| .as_ref() | ||||||||||||
| .map(|p| format!(" --cd '{p}'")) | ||||||||||||
| .unwrap_or_default(), | ||||||||||||
|
Comment on lines
+504
to
+507
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
Suggested change
|
||||||||||||
| ); | ||||||||||||
| return Ok(SESSION_WAIT_KV_WARM_EXIT_CODE); | ||||||||||||
| } | ||||||||||||
| // Defensive: daemon gone with no result.toml (rare; earlier loop branches usually exit-1 first). | ||||||||||||
| eprintln!( | ||||||||||||
| "<!-- CSA:SESSION_WAIT_TIMEOUT session={} elapsed={}s cmd=\"csa session wait --session {}{}\" -->", | ||||||||||||
| resolved.session_id, elapsed, resolved.session_id, cd_arg, | ||||||||||||
| ); | ||||||||||||
| eprintln!( | ||||||||||||
| "Hint: Call `csa session wait` again individually (not in a tight loop script). \ | ||||||||||||
| The {}s timeout is designed to let the calling agent generate tokens between waits, \ | ||||||||||||
| keeping its KV cache warm.", | ||||||||||||
| wait_behavior.wait_timeout_secs, | ||||||||||||
| "Timeout: session {} did not complete within {}s and no live daemon process remains.", | ||||||||||||
| resolved.session_id, wait_behavior.wait_timeout_secs, | ||||||||||||
| ); | ||||||||||||
| eprintln!( | ||||||||||||
| "<!-- CSA:CALLER_HINT action=\"retry_wait\" \ | ||||||||||||
| rule=\"Process this output NOW and generate tokens to warm your KV cache, \ | ||||||||||||
| then call 'csa session wait --session {sid}{cd}' again in a NEW Bash call. \ | ||||||||||||
| NEVER batch multiple session waits in one Bash call. \ | ||||||||||||
| If you background the wait (run_in_background: true), the completion task-notification IS your wake signal — do NOT stack ScheduleWakeup, /loop, or sleep loops on top; that's pure redundancy and wastes tokens. \ | ||||||||||||
| FORBIDDEN: ls/cat/wc/grep on session-dir, state.toml reads, ps checks on daemon PID — \ | ||||||||||||
| any manual polling wastes caller tokens with zero benefit.\" -->", | ||||||||||||
| sid = resolved.session_id, | ||||||||||||
| cd = cd | ||||||||||||
| .as_ref() | ||||||||||||
| .map(|p| format!(" --cd '{p}'")) | ||||||||||||
| .unwrap_or_default(), | ||||||||||||
| "<!-- CSA:SESSION_WAIT_TIMEOUT session={} elapsed={}s status=dead cmd=\"csa session result --session {}{}\" -->", | ||||||||||||
| resolved.session_id, elapsed, resolved.session_id, cd_arg, | ||||||||||||
| ); | ||||||||||||
| return Ok(SESSION_WAIT_TIMEOUT_EXIT_CODE); | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The check
session_has_terminal_process(&session_dir)is redundant here becausecsa_process::ToolLiveness::is_alive(&session_dir)already includes checks for live processes (both viadaemon.pidand lock files). Simplifying this to just useis_aliveimproves readability and reduces redundant filesystem operations.