diff --git a/docs/content/docs/(configuration)/config.mdx b/docs/content/docs/(configuration)/config.mdx index f098b6d73..386cd7f9d 100644 --- a/docs/content/docs/(configuration)/config.mdx +++ b/docs/content/docs/(configuration)/config.mdx @@ -84,6 +84,12 @@ background_threshold = 0.80 # background summarization aggressive_threshold = 0.85 # aggressive summarization emergency_threshold = 0.95 # drop oldest 50%, no LLM +# Deterministic worker task contract timing. +[defaults.worker_contract] +ack_secs = 5 # seconds before first ack checkpoint +progress_secs = 45 # seconds between progress heartbeat nudges +tick_secs = 2 # scheduler tick interval for contract deadline checks + # Cortex (system observer) settings. [defaults.cortex] tick_interval_secs = 30 @@ -103,6 +109,7 @@ startup_delay_secs = 5 enabled = true headless = true evaluate_enabled = false +browser_action_timeout_secs = 45 executable_path = "/path/to/chrome" # optional, auto-detected screenshot_dir = "/path/to/screenshots" # optional, defaults to data_dir/screenshots @@ -119,6 +126,12 @@ cron_timezone = "America/Los_Angeles" # optional per-agent cron timezone overri [agents.routing] channel = "anthropic/claude-opus-4-20250514" +# Per-agent worker contract overrides (inherits defaults when omitted). +[agents.worker_contract] +ack_secs = 8 +progress_secs = 60 +tick_secs = 3 + # Per-agent sandbox configuration. [agents.sandbox] mode = "enabled" # "enabled" (default) or "disabled" @@ -227,6 +240,7 @@ Most config values are hot-reloaded when their files change. Spacebot watches `c | `max_concurrent_branches` | Yes | Next branch spawn checks new limit | | Browser config | Yes | Next worker spawn uses new config | | Warmup config | Yes | Next warmup pass uses new values | +| `[defaults.worker_contract]` / `[agents.worker_contract]` (`ack_secs`, `progress_secs`, `tick_secs`) | Yes | Runtime and agent-level contract deadlines and polling update without restart | | Identity files (SOUL.md, etc.) 
| Yes | Next channel message renders new identity | | Skills (SKILL.md files) | Yes | Next message / worker spawn sees new skills | | Bindings | Yes | Next message routes using new bindings | @@ -471,12 +485,22 @@ Map of model names to ordered fallback chains. Used when the primary model retur Thresholds are fractions of `context_window`. +### `[defaults.worker_contract]` + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| `ack_secs` | integer | 5 | Deadline to confirm a worker start was surfaced | +| `progress_secs` | integer | 45 | Deadline between meaningful worker progress updates | +| `tick_secs` | integer | 2 | Poll interval for worker contract deadline checks | + +Setting `ack_secs`, `progress_secs`, or `tick_secs` to `0` is treated as unset and falls back to the resolved default for that scope. + ### `[defaults.cortex]` | Key | Type | Default | Description | |-----|------|---------|-------------| | `tick_interval_secs` | integer | 30 | How often the cortex checks system state | -| `worker_timeout_secs` | integer | 300 | Worker timeout before cancellation | +| `worker_timeout_secs` | integer | 300 | Inactivity timeout for worker progress events before forced cancellation | | `branch_timeout_secs` | integer | 60 | Branch timeout before cancellation | | `circuit_breaker_threshold` | integer | 3 | Consecutive failures before auto-disable | @@ -504,6 +528,7 @@ When branch/worker/cron dispatch happens before readiness is satisfied, Spacebot | `enabled` | bool | true | Whether workers have browser tools | | `headless` | bool | true | Run Chrome headless | | `evaluate_enabled` | bool | false | Allow JavaScript evaluation | +| `browser_action_timeout_secs` | integer | 45 | Per-action timeout for browser operations | | `executable_path` | string | None | Custom Chrome/Chromium path | | `screenshot_dir` | string | None | Directory for screenshots | @@ -518,9 +543,31 @@ When branch/worker/cron dispatch happens before readiness is 
satisfied, Spacebot | `max_concurrent_branches` | integer | inherits | Override instance default | | `max_turns` | integer | inherits | Override instance default | | `context_window` | integer | inherits | Override instance default | +| `worker_contract` | table | inherits | Per-agent worker contract override | Agent-specific routing is set via `[agents.routing]` with the same keys as `[defaults.routing]`. +### `[agents.worker_contract]` + +Per-agent worker contract override. +Unset keys inherit from `[defaults.worker_contract]`. + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| `ack_secs` | integer | inherits | Deadline to confirm a worker start was surfaced | +| `progress_secs` | integer | inherits | Deadline between meaningful worker progress updates | +| `tick_secs` | integer | inherits | Poll interval for worker contract deadline checks | + +Setting `ack_secs`, `progress_secs`, or `tick_secs` to `0` is treated as unset and falls back to the resolved default for that scope. + +```toml +[agents.worker_contract] +# Setting any field to 0 treats it as unset and falls back to the resolved default. +ack_secs = 8 +progress_secs = 60 +tick_secs = 3 +``` + ### `[agents.sandbox]` OS-level filesystem containment for shell and exec tool subprocesses. Uses bubblewrap (Linux) or sandbox-exec (macOS) to enforce read-only access to everything outside the workspace. 
diff --git a/docs/content/docs/(deployment)/roadmap.mdx b/docs/content/docs/(deployment)/roadmap.mdx index 6711bdd2d..2ce2006a3 100644 --- a/docs/content/docs/(deployment)/roadmap.mdx +++ b/docs/content/docs/(deployment)/roadmap.mdx @@ -39,6 +39,7 @@ The full message-in → LLM → response-out pipeline is wired end-to-end across - **Tools** — 16 tools implement Rig's `Tool` trait with real logic (reply, branch, spawn_worker, route, cancel, skip, react, memory_save, memory_recall, set_status, shell, file, exec, browser, cron, web_search) - **Workspace containment** — file tool validates paths stay within workspace boundary, shell/exec tools block instance directory traversal, sensitive file access, and secret env var leakage - **Conversation persistence** — `ConversationLogger` with fire-and-forget SQLite writes, compaction archiving +- **Worker task contracts** — deterministic worker ack/progress/terminal deadlines with one-time SLA nudge and durable terminal convergence (`terminal_acked` / `terminal_failed`) - **Cron** — scheduler with timers, active hours, circuit breaker (3 failures → disable), creates real channels. CronTool wired into channel tool factory. - **Message routing** — full event loop with binding resolution, channel lifecycle, outbound routing - **Settings store** — redb key-value with WorkerLogMode diff --git a/docs/content/docs/(features)/workers.mdx b/docs/content/docs/(features)/workers.mdx index 3e0a1096a..ae0c1e286 100644 --- a/docs/content/docs/(features)/workers.mdx +++ b/docs/content/docs/(features)/workers.mdx @@ -62,13 +62,17 @@ Workers don't get memory tools, channel tools, or branch tools. 
They can't talk ``` Running ──→ Done (fire-and-forget completed) -Running ──→ Failed (error or cancellation) +Running ──→ Failed (error) +Running ──→ Cancelled (cancelled by channel/system) +Running ──→ timed_out (inactivity timeout elapsed) Running ──→ WaitingForInput (interactive worker finished initial task) WaitingForInput ──→ Running (follow-up message received via route) WaitingForInput ──→ Failed (follow-up processing failed) +WaitingForInput ──→ Cancelled (cancelled by channel/system) +WaitingForInput ──→ timed_out (inactivity timeout elapsed) ``` -`Done` and `Failed` are terminal. Illegal transitions are runtime errors. +`Done`, `Failed`, `Cancelled`, and `timed_out` are terminal. Illegal transitions are runtime errors. ## Context and History @@ -95,7 +99,7 @@ Workers run in segments of 25 turns each. After each segment: - If the agent returned a result: done - If max turns hit: compact if needed, continue with "Continue where you left off" -- If cancelled: state = Failed +- If cancelled: state = Cancelled - If context overflow: force compact, retry This prevents runaway workers and handles long tasks that exceed a single agent loop. @@ -111,10 +115,52 @@ Workers report progress via the `set_status` tool. The status string (max 256 ch The channel LLM sees this and can decide whether to wait, ask for more info, or cancel. +Spacebot also forwards throttled worker checkpoints to the user-facing adapter: + +- Start and completion updates are always surfaced. +- Mid-run checkpoints are deduped and rate-limited (default: at most one every 20s per worker, with urgent states bypassing the limit). +- Adapters that support message editing (for example Discord) update a single progress message in place to avoid channel spam. + ## Concurrency Workers run concurrently. The default limit is `max_concurrent_workers: 5` per channel (configurable per agent). Attempting to spawn beyond the limit returns an error to the LLM so it can wait or cancel an existing worker. 
+## Timeouts + +Worker runs are bounded by `worker_timeout_secs` (default `300`) as an inactivity timeout. Any worker progress event (status updates, tool activity, permission/question prompts) resets the timer. + +If no progress arrives within the timeout window, Spacebot marks the worker as `timed_out`, records a terminal result, and removes it from active worker state so the channel can continue delegating work. + +## Deterministic Task Contracts + +Each worker run now gets an internal task contract with three deadlines: + +- **Acknowledge deadline** — confirms the worker start was surfaced to the user-facing adapter. +- **Progress deadline** — expects a meaningful heartbeat before the deadline. +- **Terminal deadline** — tracks terminal delivery lifecycle until receipt ack/failure. + +If the acknowledge deadline is missed, Spacebot emits a synthesized "running" checkpoint. If the progress deadline is missed, it emits one synthesized "still working" nudge (one-time, no spam loop). Terminal receipt ack/failure then closes the contract as `terminal_acked` or `terminal_failed`. + +## Terminal Delivery Reliability + +Terminal worker notices (`done`, `failed`, `timed_out`, `cancelled`) are queued as durable delivery receipts before they are sent to the messaging adapter. + +- Receipts are retried with bounded backoff on adapter delivery errors. +- Successful delivery marks the receipt as acknowledged. +- On process restart, in-flight (`sending`) receipts are re-queued so completion notices are not silently dropped. +- Old terminal receipts (`acked`, `failed`) are pruned periodically to keep storage bounded. + +## Canonical Timeline Projection + +Worker execution truth stays in `worker_runs.transcript`. Delivery truth stays in `worker_task_contracts` and `worker_delivery_receipts`. + +Spacebot computes a read-time projection (it does not persist a second event log): + +- Transcript steps are ordered by step index. 
+- Delivery/contract snapshots are ordered by timestamp. +- `workers/detail?include_timeline=true` returns the synthesized timeline plus a `terminal_converged` flag. +- `worker_inspect` shows the same projection so transcript and delivery state can be audited together. + ## Model Routing Workers default to `anthropic/claude-haiku-4.5-20250514`. Task-type overrides apply — for example, a `coding` task type routes to `anthropic/claude-sonnet-4-20250514`. Fallback chains are supported. All hot-reloadable. diff --git a/migrations/20260224000001_worker_delivery_receipts.sql b/migrations/20260224000001_worker_delivery_receipts.sql new file mode 100644 index 000000000..4c31c75ea --- /dev/null +++ b/migrations/20260224000001_worker_delivery_receipts.sql @@ -0,0 +1,28 @@ +-- Durable delivery receipts for terminal worker notifications. +-- +-- Tracks whether a terminal worker completion notice has been delivered to the +-- user-facing channel, with bounded retry metadata for transient adapter +-- failures. 
+ +CREATE TABLE IF NOT EXISTS worker_delivery_receipts ( + id TEXT PRIMARY KEY, + worker_id TEXT NOT NULL, + channel_id TEXT NOT NULL, + kind TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'pending', + terminal_state TEXT NOT NULL, + payload_text TEXT NOT NULL, + attempt_count INTEGER NOT NULL DEFAULT 0, + last_error TEXT, + next_attempt_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + acked_at TIMESTAMP, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + UNIQUE(worker_id, kind) +); + +CREATE INDEX IF NOT EXISTS idx_worker_delivery_receipts_due + ON worker_delivery_receipts(status, next_attempt_at); + +CREATE INDEX IF NOT EXISTS idx_worker_delivery_receipts_channel + ON worker_delivery_receipts(channel_id, created_at); diff --git a/migrations/20260224000002_worker_task_contracts.sql b/migrations/20260224000002_worker_task_contracts.sql new file mode 100644 index 000000000..f21ccb1cb --- /dev/null +++ b/migrations/20260224000002_worker_task_contracts.sql @@ -0,0 +1,35 @@ +-- Deterministic worker task contracts. +-- +-- Tracks acknowledgement/progress/terminal guarantees for worker executions so +-- long-running tasks always provide bounded feedback and reach terminal states.
+ +CREATE TABLE IF NOT EXISTS worker_task_contracts ( + id TEXT PRIMARY KEY, + agent_id TEXT NOT NULL, + channel_id TEXT NOT NULL, + worker_id TEXT NOT NULL UNIQUE, + task_summary TEXT NOT NULL, + state TEXT NOT NULL DEFAULT 'created', + ack_deadline_at TIMESTAMP NOT NULL, + progress_deadline_at TIMESTAMP NOT NULL, + terminal_deadline_at TIMESTAMP NOT NULL, + last_progress_at TIMESTAMP, + last_status_hash TEXT, + attempt_count INTEGER NOT NULL DEFAULT 0, + sla_nudge_sent INTEGER NOT NULL DEFAULT 0, + terminal_state TEXT, + created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP +); + +CREATE INDEX IF NOT EXISTS idx_worker_task_contracts_channel_state + ON worker_task_contracts(channel_id, state); + +CREATE INDEX IF NOT EXISTS idx_worker_task_contracts_ack_due + ON worker_task_contracts(state, ack_deadline_at); + +CREATE INDEX IF NOT EXISTS idx_worker_task_contracts_progress_due + ON worker_task_contracts(state, progress_deadline_at); + +CREATE INDEX IF NOT EXISTS idx_worker_task_contracts_terminal_due + ON worker_task_contracts(state, terminal_deadline_at); diff --git a/migrations/20260224000001_worker_tool_calls.sql b/migrations/20260224000003_worker_tool_calls.sql similarity index 100% rename from migrations/20260224000001_worker_tool_calls.sql rename to migrations/20260224000003_worker_tool_calls.sql diff --git a/src/agent/channel.rs b/src/agent/channel.rs index 7157bbf9d..6fa3696d4 100644 --- a/src/agent/channel.rs +++ b/src/agent/channel.rs @@ -10,8 +10,8 @@ use crate::error::{AgentError, Result}; use crate::hooks::SpacebotHook; use crate::llm::SpacebotModel; use crate::{ - AgentDeps, BranchId, ChannelId, InboundMessage, OutboundResponse, ProcessEvent, ProcessId, - ProcessType, WorkerId, + AgentDeps, BranchId, ChannelId, InboundMessage, OutboundEnvelope, OutboundResponse, + ProcessEvent, ProcessId, ProcessType, WorkerId, }; use rig::agent::AgentBuilder; use rig::completion::{CompletionModel, Prompt}; @@ -21,6 +21,7 @@ use rig::tool::server::ToolServer; use
std::collections::HashMap; use std::collections::HashSet; use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use tokio::sync::broadcast; use tokio::sync::{RwLock, mpsc}; use tracing::Instrument as _; @@ -33,6 +34,92 @@ const RETRIGGER_DEBOUNCE_MS: u64 = 500; /// infinite retrigger cascades where each retrigger spawns more work. const MAX_RETRIGGERS_PER_TURN: usize = 3; +/// Minimum interval between user-facing worker checkpoint updates. +/// This keeps progress useful without flooding channel messages. +const WORKER_CHECKPOINT_MIN_INTERVAL_SECS: u64 = 20; + +/// Maximum length for user-facing checkpoint text. +const WORKER_CHECKPOINT_MAX_CHARS: usize = 220; +/// How often terminal delivery receipts are drained from SQLite. +/// +/// Keep this small enough for low completion latency, but not so small that +/// the dispatcher loops too aggressively under idle load. +const WORKER_RECEIPT_DISPATCH_INTERVAL_SECS: u64 = 5; +/// Max receipt rows to claim per dispatch pass. +/// +/// `i64` matches SQL bind/count types; conversion to `usize` only happens when +/// allocating local vectors from fetched row counts. +const WORKER_RECEIPT_DISPATCH_BATCH_SIZE: i64 = 8; +/// Max contract rows to claim per acknowledgement deadline scan. +/// +/// `i64` is used for direct SQL LIMIT binding. +const WORKER_CONTRACT_ACK_BATCH_SIZE: i64 = 8; +/// Max contract rows to claim per progress-SLA scan. +/// +/// Tune with `WORKER_CONTRACT_ACK_BATCH_SIZE` to avoid large burst writes. +const WORKER_CONTRACT_PROGRESS_BATCH_SIZE: i64 = 8; +/// Max contract rows to claim per terminal deadline scan. +/// +/// Uses `i64` for SQL LIMIT compatibility; callers only cast when needed. 
+const WORKER_CONTRACT_TERMINAL_BATCH_SIZE: i64 = 8; +const WORKER_FLUSH_FAILURE_THRESHOLD: usize = 3; +const WORKER_FAILED_PREFIX: &str = "Worker failed:"; +const WORKER_TIMED_OUT_PREFIX: &str = "Worker timed out after "; +const WORKER_CANCELLED_PREFIX: &str = "Worker cancelled:"; +const WORKER_LAG_RECONCILE_FAILURE_RESULT: &str = + "Worker failed: completion event dropped after channel lag; final result unavailable."; + +#[derive(Debug, Clone)] +struct WorkerCheckpointState { + last_status: String, + last_sent_at: tokio::time::Instant, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum FallbackReplyOutcome { + Sent, + EmptyInput, + EmptyNormalized, + SuppressedLowValue, + SendFailed, +} + +struct ResetFlag { + flag: Arc, +} + +impl ResetFlag { + fn new(flag: Arc) -> Self { + Self { flag } + } +} + +impl Drop for ResetFlag { + fn drop(&mut self) { + self.flag.store(false, Ordering::Release); + } +} + +fn apply_periodic_flush_circuit( + task_result: std::result::Result<(), &'static str>, + failure_count: &AtomicUsize, + circuit_open: &AtomicBool, +) -> std::result::Result<(), (usize, &'static str)> { + match task_result { + Ok(()) => { + failure_count.store(0, Ordering::Release); + Ok(()) + } + Err(task_error) => { + let failures = failure_count.fetch_add(1, Ordering::AcqRel) + 1; + if failures >= WORKER_FLUSH_FAILURE_THRESHOLD { + circuit_open.store(true, Ordering::Release); + } + Err((failures, task_error)) + } + } +} + /// Shared state that channel tools need to act on the channel. 
/// /// Wrapped in Arc and passed to tools (branch, spawn_worker, route, cancel) @@ -60,9 +147,60 @@ pub struct ChannelState { } impl ChannelState { + fn send_worker_terminal_events( + &self, + worker_id: WorkerId, + status: &str, + result: String, + success: bool, + ) { + if let Err(error) = self.deps.event_tx.send(crate::ProcessEvent::WorkerStatus { + agent_id: self.deps.agent_id.clone(), + worker_id, + channel_id: Some(self.channel_id.clone()), + status: status.to_string(), + }) { + tracing::warn!( + %error, + channel_id = %self.channel_id, + worker_id = %worker_id, + status, + "failed to emit worker terminal status event" + ); + } + if let Err(error) = self + .deps + .event_tx + .send(crate::ProcessEvent::WorkerComplete { + agent_id: self.deps.agent_id.clone(), + worker_id, + channel_id: Some(self.channel_id.clone()), + result, + notify: true, + success, + }) + { + tracing::warn!( + %error, + channel_id = %self.channel_id, + worker_id = %worker_id, + success, + "failed to emit worker terminal completion event" + ); + } + } + /// Cancel a running worker by aborting its tokio task and cleaning up state. /// Returns an error message if the worker is not found. 
- pub async fn cancel_worker(&self, worker_id: WorkerId) -> std::result::Result<(), String> { + pub async fn cancel_worker( + &self, + worker_id: WorkerId, + reason: Option<&str>, + ) -> std::result::Result<(), String> { + let reason = reason + .map(str::trim) + .filter(|value| !value.is_empty()) + .unwrap_or("cancelled by request"); let handle = self.worker_handles.write().await.remove(&worker_id); let removed = self .active_workers @@ -74,13 +212,51 @@ impl ChannelState { if let Some(handle) = handle { handle.abort(); - // Mark the DB row as cancelled since the abort prevents WorkerComplete from firing - self.process_run_logger - .log_worker_completed(worker_id, "Worker cancelled", false); + let state = self.clone(); + let reason = reason.to_string(); + tokio::spawn(async move { + match handle.await { + Err(join_error) if join_error.is_cancelled() => { + state.send_worker_terminal_events( + worker_id, + "cancelled", + format!("{WORKER_CANCELLED_PREFIX} {reason}."), + false, + ); + } + Err(join_error) => { + let failure = format!("Worker failed during cancellation: {join_error}"); + tracing::warn!( + %join_error, + worker_id = %worker_id, + channel_id = %state.channel_id, + "worker join failed after cancellation request" + ); + state.send_worker_terminal_events(worker_id, "failed", failure, false); + } + Ok(()) => { + tracing::debug!( + worker_id = %worker_id, + channel_id = %state.channel_id, + "worker finished before cancellation took effect" + ); + } + } + }); Ok(()) } else if removed { - self.process_run_logger - .log_worker_completed(worker_id, "Worker cancelled", false); + // Worker was in active_workers but had no handle (shouldn't happen, but handle gracefully) + tracing::warn!( + worker_id = %worker_id, + channel_id = %self.channel_id, + "worker cancellation requested but no join handle was present" + ); + self.send_worker_terminal_events( + worker_id, + "cancelled", + format!("{WORKER_CANCELLED_PREFIX} {reason}."), + false, + ); Ok(()) } else { 
Err(format!("Worker {worker_id} not found")) @@ -122,7 +298,7 @@ pub struct Channel { /// Event receiver for process events. pub event_rx: broadcast::Receiver, /// Outbound response sender for the messaging layer. - pub response_tx: mpsc::Sender, + pub response_tx: mpsc::Sender, /// Self-sender for re-triggering the channel after background process completion. pub self_tx: mpsc::Sender, /// Conversation ID from the first message (for synthetic re-trigger messages). @@ -149,19 +325,36 @@ pub struct Channel { pending_retrigger_metadata: HashMap, /// Deadline for firing the pending retrigger (debounce timer). retrigger_deadline: Option, - /// Optional send_agent_message tool (only when agent has active links). + /// Non-terminal worker completion payloads waiting to be relayed on retrigger. + pending_worker_completion_results: Vec, + /// Optional cross-agent messaging tool for linked agent conversations. send_agent_message_tool: Option, - /// Turn counter for link channels (used for safety cap). + /// Number of turns processed in a link channel. link_turn_count: u32, - /// Originating channel that triggered this link conversation (for routing conclusions back). + /// Originating channel id propagated through link channels. originating_channel: Option, - /// Messaging adapter name from the originating channel (e.g. "webchat", "discord"). - /// Used by `route_link_conclusion` to set the correct `source` on injected messages. + /// Originating adapter source propagated through link channels. originating_source: Option, - /// Set after `conclude_link` fires. Prevents the channel from processing - /// further messages, stopping the ping-pong that happens when both sides - /// keep responding to each other after the task is done. + /// Set once a link conversation has been explicitly concluded. link_concluded: bool, + /// Per-worker checkpoint state used for status dedupe/throttling. 
+ worker_checkpoints: HashMap, + /// Periodic deadline for checking due worker terminal delivery receipts. + worker_receipt_dispatch_deadline: tokio::time::Instant, + /// Consecutive failures for flush_due_worker_delivery_receipts(). + worker_receipt_failure_count: Arc, + /// Circuit-breaker state for worker_receipt_dispatch_deadline scheduling. + worker_receipt_circuit_open: Arc, + /// True while a worker terminal receipt flush task is running. + worker_receipt_flush_in_progress: Arc, + /// Periodic deadline for deterministic worker task contract checks. + worker_contract_tick_deadline: tokio::time::Instant, + /// Consecutive failures for flush_due_worker_task_contract_deadlines(). + worker_contract_failure_count: Arc, + /// Circuit-breaker state for worker_contract_tick_deadline scheduling. + worker_contract_circuit_open: Arc, + /// True while a worker task-contract deadline flush task is running. + worker_contract_flush_in_progress: Arc, } impl Channel { @@ -173,7 +366,7 @@ impl Channel { pub fn new( id: ChannelId, deps: AgentDeps, - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, event_rx: broadcast::Receiver, screenshot_dir: std::path::PathBuf, logs_dir: std::path::PathBuf, @@ -242,6 +435,7 @@ impl Channel { }; let self_tx = message_tx.clone(); + let worker_contract_tick_secs = deps.runtime_config.worker_contract.load().tick_secs.max(1); let channel = Self { id: id.clone(), title: None, @@ -265,11 +459,23 @@ impl Channel { pending_retrigger: false, pending_retrigger_metadata: HashMap::new(), retrigger_deadline: None, + pending_worker_completion_results: Vec::new(), send_agent_message_tool, link_turn_count: 0, originating_channel: None, originating_source: None, link_concluded: false, + worker_checkpoints: HashMap::new(), + worker_receipt_dispatch_deadline: tokio::time::Instant::now() + + std::time::Duration::from_secs(WORKER_RECEIPT_DISPATCH_INTERVAL_SECS), + worker_receipt_failure_count: Arc::new(AtomicUsize::new(0)), + worker_receipt_circuit_open: 
Arc::new(AtomicBool::new(false)), + worker_receipt_flush_in_progress: Arc::new(AtomicBool::new(false)), + worker_contract_tick_deadline: tokio::time::Instant::now() + + std::time::Duration::from_secs(worker_contract_tick_secs), + worker_contract_failure_count: Arc::new(AtomicUsize::new(0)), + worker_contract_circuit_open: Arc::new(AtomicBool::new(false)), + worker_contract_flush_in_progress: Arc::new(AtomicBool::new(false)), }; (channel, message_tx) @@ -289,13 +495,23 @@ impl Channel { tracing::info!(channel_id = %self.id, "channel started"); loop { - // Compute next deadline from coalesce and retrigger timers - let next_deadline = match (self.coalesce_deadline, self.retrigger_deadline) { - (Some(a), Some(b)) => Some(a.min(b)), - (Some(a), None) => Some(a), - (None, Some(b)) => Some(b), - (None, None) => None, - }; + // Compute next deadline from coalesce/retrigger timers and periodic flushes. + // Circuit-open flushers are omitted from scheduling until explicitly reset. + let worker_receipt_deadline = + (!self.worker_receipt_circuit_open.load(Ordering::Acquire)) + .then_some(self.worker_receipt_dispatch_deadline); + let worker_contract_deadline = + (!self.worker_contract_circuit_open.load(Ordering::Acquire)) + .then_some(self.worker_contract_tick_deadline); + let next_deadline = [ + self.coalesce_deadline, + self.retrigger_deadline, + worker_receipt_deadline, + worker_contract_deadline, + ] + .into_iter() + .flatten() + .min(); let sleep_duration = next_deadline .map(|deadline| { let now = tokio::time::Instant::now(); @@ -323,13 +539,29 @@ impl Channel { } } } - Ok(event) = self.event_rx.recv() => { - // Events bypass coalescing - flush buffer first if needed - if let Err(error) = self.flush_coalesce_buffer().await { - tracing::error!(%error, channel_id = %self.id, "error flushing coalesce buffer"); - } - if let Err(error) = self.handle_event(event).await { - tracing::error!(%error, channel_id = %self.id, "error handling event"); + event = self.event_rx.recv() => 
{ + match event { + Ok(event) => { + // Events bypass coalescing - flush buffer first if needed + if let Err(error) = self.flush_coalesce_buffer().await { + tracing::error!(%error, channel_id = %self.id, "error flushing coalesce buffer"); + } + if let Err(error) = self.handle_event(event).await { + tracing::error!(%error, channel_id = %self.id, "error handling event"); + } + } + Err(tokio::sync::broadcast::error::RecvError::Lagged(skipped)) => { + tracing::warn!( + channel_id = %self.id, + skipped, + "channel event stream lagged; continuing after dropping stale events" + ); + self.reconcile_finished_workers_after_lag(skipped).await; + } + Err(tokio::sync::broadcast::error::RecvError::Closed) => { + tracing::warn!(channel_id = %self.id, "channel event stream closed"); + break; + } } } _ = tokio::time::sleep(sleep_duration), if next_deadline.is_some() => { @@ -344,6 +576,31 @@ impl Channel { if self.retrigger_deadline.is_some_and(|d| d <= now) { self.flush_pending_retrigger().await; } + // Check worker terminal receipt dispatch deadline + if !self.worker_receipt_circuit_open.load(Ordering::Acquire) + && self.worker_receipt_dispatch_deadline <= now + { + self.flush_due_worker_delivery_receipts(); + self.worker_receipt_dispatch_deadline = tokio::time::Instant::now() + + std::time::Duration::from_secs( + WORKER_RECEIPT_DISPATCH_INTERVAL_SECS, + ); + } + // Check worker task contract deadline + if !self.worker_contract_circuit_open.load(Ordering::Acquire) + && self.worker_contract_tick_deadline <= now + { + self.flush_due_worker_task_contract_deadlines(); + let tick_secs = self + .deps + .runtime_config + .worker_contract + .load() + .tick_secs + .max(1); + self.worker_contract_tick_deadline = tokio::time::Instant::now() + + std::time::Duration::from_secs(tick_secs); + } } else => break, } @@ -358,6 +615,244 @@ impl Channel { Ok(()) } + async fn reconcile_finished_workers_after_lag(&mut self, skipped: u64) { + // Lagged event streams can drop WorkerComplete events, 
leaving finished + // handles behind and causing check_worker_limit() to overcount workers. + let finished_worker_ids: Vec = { + let worker_handles = self.state.worker_handles.read().await; + worker_handles + .iter() + .filter_map(|(worker_id, handle)| handle.is_finished().then_some(*worker_id)) + .collect() + }; + + if finished_worker_ids.is_empty() { + return; + } + + let notify_by_worker: HashMap = { + let status_block = self.state.status_block.read().await; + finished_worker_ids + .iter() + .map(|worker_id| { + let notify = status_block + .active_workers + .iter() + .any(|worker| worker.id == *worker_id && worker.notify_on_complete); + (*worker_id, notify) + }) + .collect() + }; + + let mut removed_handles = 0usize; + { + let mut worker_handles = self.state.worker_handles.write().await; + for worker_id in &finished_worker_ids { + if worker_handles.remove(worker_id).is_some() { + removed_handles += 1; + } + } + } + + { + // Replay status-block convergence for dropped WorkerComplete events. + let mut status_block = self.state.status_block.write().await; + for worker_id in &finished_worker_ids { + let notify = *notify_by_worker.get(worker_id).unwrap_or(&false); + let synthetic = ProcessEvent::WorkerComplete { + agent_id: self.deps.agent_id.clone(), + worker_id: *worker_id, + channel_id: Some(self.id.clone()), + result: WORKER_LAG_RECONCILE_FAILURE_RESULT.to_string(), + notify, + success: false, + }; + status_block.update(&synthetic); + } + } + + for worker_id in &finished_worker_ids { + let notify = *notify_by_worker.get(worker_id).unwrap_or(&false); + self.apply_worker_completion_side_effects( + *worker_id, + WORKER_LAG_RECONCILE_FAILURE_RESULT, + notify, + false, + ) + .await; + } + + if self.pending_retrigger { + // Ensure reconciling lagged completions does not stall retrigger dispatch. 
+ if self.retrigger_deadline.is_none() { + self.retrigger_deadline = Some( + tokio::time::Instant::now() + + std::time::Duration::from_millis(RETRIGGER_DEBOUNCE_MS), + ); + } + } + + tracing::warn!( + channel_id = %self.id, + skipped, + removed_handles, + reconciled_workers = finished_worker_ids.len(), + "reconciled finished workers after lagged channel events" + ); + } + + async fn upsert_worker_terminal_delivery_receipt( + &mut self, + worker_id: WorkerId, + terminal_state: &str, + result_text: &str, + ) -> Option { + let terminal_secs = self + .deps + .runtime_config + .cortex + .load() + .worker_timeout_secs + .max(1); + self.worker_receipt_dispatch_deadline = tokio::time::Instant::now(); + let run_logger = self.state.process_run_logger.clone(); + let channel_id = self.id.clone(); + if let Err(error) = run_logger + .mark_worker_task_contract_terminal_pending(worker_id, terminal_state, terminal_secs) + .await + { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %worker_id, + terminal_state = %terminal_state, + "failed to mark worker contract terminal pending" + ); + return None; + } + + let payload_text = build_worker_terminal_receipt_payload(terminal_state, result_text); + match run_logger + .upsert_worker_terminal_receipt(&channel_id, worker_id, terminal_state, &payload_text) + .await + { + Ok(receipt_id) => { + tracing::info!( + channel_id = %channel_id, + worker_id = %worker_id, + receipt_id = %receipt_id, + terminal_state = %terminal_state, + "queued worker terminal receipt" + ); + Some(receipt_id) + } + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %worker_id, + terminal_state = %terminal_state, + "failed to queue worker terminal receipt" + ); + None + } + } + } + + async fn apply_worker_completion_side_effects( + &mut self, + worker_id: WorkerId, + result: &str, + notify: bool, + success: bool, + ) { + let run_logger = self.state.process_run_logger.clone(); + 
run_logger.log_worker_completed(worker_id, result, success); + self.worker_checkpoints.remove(&worker_id); + + if notify { + let terminal_state = classify_worker_terminal_state(result); + let terminal_receipt_id = self + .upsert_worker_terminal_delivery_receipt(worker_id, terminal_state, result) + .await; + self.send_status_update_with_receipt( + crate::StatusUpdate::WorkerCompleted { + worker_id, + result: summarize_worker_result_for_status(result), + }, + terminal_receipt_id, + ) + .await; + } + + let mut workers = self.state.active_workers.write().await; + workers.remove(&worker_id); + drop(workers); + + self.state.worker_handles.write().await.remove(&worker_id); + self.state.worker_inputs.write().await.remove(&worker_id); + + if notify && !is_worker_terminal_failure(result) { + self.queue_pending_worker_completion_result(result); + let mut history = self.state.history.write().await; + let worker_message = format!("[Worker {worker_id} completed]: {result}"); + history.push(rig::message::Message::from(worker_message)); + self.pending_retrigger = true; + self.retrigger_deadline = Some( + tokio::time::Instant::now() + + std::time::Duration::from_millis(RETRIGGER_DEBOUNCE_MS), + ); + } + } + + fn queue_pending_worker_completion_result(&mut self, result: &str) { + let trimmed = result.trim(); + if trimmed.is_empty() { + return; + } + if self.pending_worker_completion_results.len() >= 8 { + self.pending_worker_completion_results.remove(0); + } + self.pending_worker_completion_results.push(trimmed.to_string()); + } + + pub fn reset_worker_receipt_circuit(&mut self) { + self.worker_receipt_failure_count + .store(0, Ordering::Release); + self.worker_receipt_circuit_open + .store(false, Ordering::Release); + self.worker_receipt_dispatch_deadline = tokio::time::Instant::now() + + std::time::Duration::from_secs(WORKER_RECEIPT_DISPATCH_INTERVAL_SECS); + tracing::info!( + channel_id = %self.id, + worker_receipt_failure_count = 0, + worker_receipt_circuit_open = false, + 
"reset worker receipt circuit breaker for flush_due_worker_delivery_receipts / worker_receipt_dispatch_deadline" + ); + } + + pub fn reset_worker_contract_circuit(&mut self) { + self.worker_contract_failure_count + .store(0, Ordering::Release); + self.worker_contract_circuit_open + .store(false, Ordering::Release); + let tick_secs = self + .deps + .runtime_config + .worker_contract + .load() + .tick_secs + .max(1); + self.worker_contract_tick_deadline = + tokio::time::Instant::now() + std::time::Duration::from_secs(tick_secs); + tracing::info!( + channel_id = %self.id, + worker_contract_failure_count = 0, + worker_contract_circuit_open = false, + "reset worker contract circuit breaker for flush_due_worker_task_contract_deadlines / worker_contract_tick_deadline" + ); + } + /// Determine if a message should be coalesced (batched with other messages). /// /// Returns false for: @@ -1316,10 +1811,19 @@ impl Channel { .tool_server_handle(self.tool_server.clone()) .build(); - let _ = self + if let Err(error) = self .response_tx - .send(OutboundResponse::Status(crate::StatusUpdate::Thinking)) - .await; + .send(OutboundEnvelope::from(OutboundResponse::Status( + crate::StatusUpdate::Thinking, + ))) + .await + { + tracing::warn!( + %error, + channel_id = %self.id, + "failed to send thinking status update" + ); + } // Inject attachments as a user message before the text prompt if !attachment_content.is_empty() { @@ -1347,24 +1851,11 @@ impl Channel { .await; // If the LLM responded with text that looks like tool call syntax, it failed - // to use the tool calling API. Inject a correction and retry a couple - // times so the model can recover by calling `reply` or `skip`. 
- const TOOL_SYNTAX_RECOVERY_MAX_ATTEMPTS: usize = 2; - let mut recovery_attempts = 0; - while let Ok(ref response) = result { - if !crate::tools::should_block_user_visible_text(response) - || recovery_attempts >= TOOL_SYNTAX_RECOVERY_MAX_ATTEMPTS - { - break; - } - - recovery_attempts += 1; - tracing::warn!( - channel_id = %self.id, - attempt = recovery_attempts, - "LLM emitted blocked structured output, retrying with correction" - ); - + // to use the tool calling API. Inject a correction and give it one more try. + if let Ok(ref response) = result + && extract_reply_from_tool_syntax(response.trim()).is_some() + { + tracing::warn!(channel_id = %self.id, "LLM emitted tool syntax as text, retrying with correction"); let prompt_engine = self.deps.runtime_config.prompts.load(); let correction = prompt_engine.render_system_tool_syntax_correction()?; result = agent @@ -1400,12 +1891,13 @@ impl Channel { /// The LLM sometimes incorrectly skips on retrigger turns thinking the /// result was "already processed" when the user hasn't seen it yet. async fn handle_agent_result( - &self, + &mut self, result: std::result::Result, skip_flag: &crate::tools::SkipFlag, replied_flag: &crate::tools::RepliedFlag, is_retrigger: bool, ) { + let mut relayed_retrigger_result = false; match result { Ok(response) => { let skipped = skip_flag.load(std::sync::atomic::Ordering::Relaxed); @@ -1418,44 +1910,22 @@ impl Channel { // fallback since the user hasn't seen the result yet. 
let text = response.trim(); if !text.is_empty() { - if crate::tools::should_block_user_visible_text(text) { - tracing::warn!( - channel_id = %self.id, - "blocked retrigger fallback output containing structured or tool syntax" - ); - } else { - tracing::info!( - channel_id = %self.id, - response_len = text.len(), - "LLM skipped on retrigger but produced text, sending as fallback" - ); - let extracted = extract_reply_from_tool_syntax(text); - let source = self - .conversation_id - .as_deref() - .and_then(|conversation_id| conversation_id.split(':').next()) - .unwrap_or("unknown"); - let final_text = crate::tools::reply::normalize_discord_mention_tokens( - extracted.as_deref().unwrap_or(text), - source, - ); - if !final_text.is_empty() { - if extracted.is_some() { - tracing::warn!(channel_id = %self.id, "extracted reply from malformed tool syntax in retrigger fallback"); - } - self.state - .conversation_logger - .log_bot_message(&self.state.channel_id, &final_text); - if let Err(error) = self - .response_tx - .send(OutboundResponse::Text(final_text)) - .await - { - tracing::error!(%error, channel_id = %self.id, "failed to send retrigger fallback reply"); - } - } - } - } else { + tracing::info!( + channel_id = %self.id, + response_len = text.len(), + "LLM skipped on retrigger but produced text, sending as fallback" + ); + } + + let fallback_outcome = self + .emit_fallback_reply( + &response, + "LLM skipped on retrigger but produced text fallback", + ) + .await; + relayed_retrigger_result = + relayed_retrigger_result || fallback_outcome == FallbackReplyOutcome::Sent; + if fallback_outcome == FallbackReplyOutcome::EmptyInput { tracing::warn!( channel_id = %self.id, "LLM skipped on retrigger with no text — worker/branch result may not have been relayed" @@ -1465,92 +1935,40 @@ impl Channel { tracing::debug!(channel_id = %self.id, "channel turn skipped (no response)"); } else if replied { tracing::debug!(channel_id = %self.id, "channel turn replied via tool (fallback 
suppressed)"); + if is_retrigger { + relayed_retrigger_result = true; + } } else if is_retrigger { - // On retrigger turns the LLM should use the reply tool, but - // some models return the result as raw text instead. Send it - // as a fallback so the user still gets the worker/branch output. - let text = response.trim(); - if !text.is_empty() { - if crate::tools::should_block_user_visible_text(text) { - tracing::warn!( + // Retrigger turns are vulnerable to tool-call misses; when the + // model emits substantive text without calling `reply`, relay it. + // Keep suppressing low-value "still waiting" chatter. + let fallback_outcome = self + .emit_fallback_reply(&response, "retrigger text output fallback") + .await; + relayed_retrigger_result = + relayed_retrigger_result || fallback_outcome == FallbackReplyOutcome::Sent; + match fallback_outcome { + FallbackReplyOutcome::EmptyInput => { + tracing::debug!( channel_id = %self.id, - "blocked retrigger output containing structured or tool syntax" + "retrigger turn fallback suppressed (empty text)" ); - } else { - tracing::info!( + } + FallbackReplyOutcome::EmptyNormalized => { + tracing::debug!( channel_id = %self.id, - response_len = text.len(), - "retrigger produced text without reply tool, sending as fallback" + "retrigger turn fallback suppressed (empty normalized text)" ); - let extracted = extract_reply_from_tool_syntax(text); - let source = self - .conversation_id - .as_deref() - .and_then(|conversation_id| conversation_id.split(':').next()) - .unwrap_or("unknown"); - let final_text = crate::tools::reply::normalize_discord_mention_tokens( - extracted.as_deref().unwrap_or(text), - source, - ); - if !final_text.is_empty() { - self.state - .conversation_logger - .log_bot_message(&self.state.channel_id, &final_text); - if let Err(error) = self - .response_tx - .send(OutboundResponse::Text(final_text)) - .await - { - tracing::error!(%error, channel_id = %self.id, "failed to send retrigger fallback reply"); - } - } } - 
} else { - tracing::debug!( - channel_id = %self.id, - "retrigger turn produced no text and no reply tool call" - ); + _ => {} } } else { // If the LLM returned text without using the reply tool, send it // directly. Some models respond with text instead of tool calls. // When the text looks like tool call syntax (e.g. "[reply]\n{\"content\": \"hi\"}"), // attempt to extract the reply content and send that instead. - let text = response.trim(); - if crate::tools::should_block_user_visible_text(text) { - tracing::warn!( - channel_id = %self.id, - "blocked fallback output containing structured or tool syntax" - ); - } else { - let extracted = extract_reply_from_tool_syntax(text); - let source = self - .conversation_id - .as_deref() - .and_then(|conversation_id| conversation_id.split(':').next()) - .unwrap_or("unknown"); - let final_text = crate::tools::reply::normalize_discord_mention_tokens( - extracted.as_deref().unwrap_or(text), - source, - ); - if !final_text.is_empty() { - if extracted.is_some() { - tracing::warn!(channel_id = %self.id, "extracted reply from malformed tool syntax in LLM text output"); - } - self.state.conversation_logger.log_bot_message_with_name( - &self.state.channel_id, - &final_text, - Some(self.agent_display_name()), - ); - if let Err(error) = self - .response_tx - .send(OutboundResponse::Text(final_text)) - .await - { - tracing::error!(%error, channel_id = %self.id, "failed to send fallback reply"); - } - } - } + self.emit_fallback_reply(&response, "LLM text output fallback") + .await; tracing::debug!(channel_id = %self.id, "channel turn completed"); } @@ -1561,6 +1979,9 @@ impl Channel { Err(rig::completion::PromptError::PromptCancelled { reason, .. 
}) => { if reason == "reply delivered" { tracing::debug!(channel_id = %self.id, "channel turn completed via reply tool"); + if is_retrigger { + relayed_retrigger_result = true; + } } else { tracing::info!(channel_id = %self.id, %reason, "channel turn cancelled"); } @@ -1570,39 +1991,127 @@ impl Channel { } } - // Ensure typing indicator is always cleaned up, even on error paths - let _ = self - .response_tx - .send(OutboundResponse::Status(crate::StatusUpdate::StopTyping)) - .await; - } + if is_retrigger && !relayed_retrigger_result { + relayed_retrigger_result = self + .emit_pending_worker_completion_fallback( + "retrigger produced no user-visible reply; relaying worker result fallback", + ) + .await + == FallbackReplyOutcome::Sent; + } - /// Handle a process event (branch results, worker completions, status updates). - async fn handle_event(&mut self, event: ProcessEvent) -> Result<()> { - // Only process events targeted at this channel - if !event_is_for_channel(&event, &self.id) { - return Ok(()); + if is_retrigger && relayed_retrigger_result { + self.pending_worker_completion_results.clear(); } - // Update status block + // Ensure typing indicator is always cleaned up, even on error paths + if let Err(error) = self + .response_tx + .send(OutboundEnvelope::from(OutboundResponse::Status( + crate::StatusUpdate::StopTyping, + ))) + .await { - let mut status = self.state.status_block.write().await; - status.update(&event); + tracing::warn!( + %error, + channel_id = %self.id, + "failed to send stop-typing status update" + ); } + } - let mut should_retrigger = false; - let mut retrigger_metadata = std::collections::HashMap::new(); - let run_logger = &self.state.process_run_logger; + async fn emit_pending_worker_completion_fallback(&self, source: &str) -> FallbackReplyOutcome { + let Some(fallback_text) = + format_pending_worker_completion_fallback(&self.pending_worker_completion_results) + else { + return FallbackReplyOutcome::EmptyInput; + }; - match &event { - 
ProcessEvent::BranchStarted { - branch_id, - channel_id, - description, - reply_to_message_id, - .. - } => { - run_logger.log_branch_started(channel_id, *branch_id, description); + self.emit_fallback_reply(&fallback_text, source).await + } + + async fn emit_fallback_reply(&self, raw_response: &str, source: &str) -> FallbackReplyOutcome { + let text = raw_response.trim(); + if text.is_empty() { + return FallbackReplyOutcome::EmptyInput; + } + + let extracted = extract_reply_from_tool_syntax(text); + let message_source = self + .conversation_id + .as_deref() + .and_then(|conversation_id| conversation_id.split(':').next()) + .unwrap_or("unknown"); + let final_text = crate::tools::reply::normalize_discord_mention_tokens( + extracted.as_deref().unwrap_or(text), + message_source, + ); + if final_text.is_empty() { + return FallbackReplyOutcome::EmptyNormalized; + } + if crate::tools::reply::is_low_value_waiting_update(&final_text) { + tracing::info!( + channel_id = %self.id, + source, + "suppressing low-value waiting fallback text" + ); + return FallbackReplyOutcome::SuppressedLowValue; + } + + if extracted.is_some() { + tracing::warn!( + channel_id = %self.id, + source, + "extracted reply from malformed tool syntax in fallback output" + ); + } + + self.state + .conversation_logger + .log_bot_message(&self.state.channel_id, &final_text); + if let Err(error) = self + .response_tx + .send(OutboundEnvelope::from(OutboundResponse::Text(final_text))) + .await + { + tracing::error!( + %error, + channel_id = %self.id, + source, + "failed to send fallback reply" + ); + return FallbackReplyOutcome::SendFailed; + } + + FallbackReplyOutcome::Sent + } + + /// Handle a process event (branch results, worker completions, status updates). 
+ async fn handle_event(&mut self, event: ProcessEvent) -> Result<()> { + // Only process events targeted at this channel + if !event_is_for_channel(&event, &self.id) { + return Ok(()); + } + + // Update status block + { + let mut status = self.state.status_block.write().await; + status.update(&event); + } + + let mut should_retrigger = false; + let mut retrigger_metadata = std::collections::HashMap::new(); + let run_logger = self.state.process_run_logger.clone(); + + match &event { + ProcessEvent::BranchStarted { + branch_id, + channel_id, + description, + reply_to_message_id, + .. + } => { + run_logger.log_branch_started(channel_id, *branch_id, description); if let Some(message_id) = reply_to_message_id { self.branch_reply_targets.insert(*branch_id, *message_id); } @@ -1661,11 +2170,128 @@ impl Channel { worker_type, &self.deps.agent_id, ); + let worker_contract_config = self.deps.runtime_config.worker_contract.load(); + let terminal_secs = self + .deps + .runtime_config + .cortex + .load() + .worker_timeout_secs + .max(1); + let public_task_summary = summarize_worker_start_for_status(task); + let timing = crate::conversation::history::WorkerTaskContractTiming { + ack_secs: worker_contract_config.ack_secs.max(1), + progress_secs: worker_contract_config.progress_secs.max(1), + terminal_secs, + }; + let run_logger = run_logger.clone(); + let agent_id = self.deps.agent_id.clone(); + let channel_id = self.id.clone(); + let event_worker_id = *worker_id; + let task_summary = public_task_summary.clone(); + tokio::spawn(async move { + if let Err(error) = run_logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + event_worker_id, + &task_summary, + timing, + ) + .await + { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %event_worker_id, + "failed to upsert worker task contract" + ); + } + }); + self.worker_contract_tick_deadline = tokio::time::Instant::now(); + if self.worker_is_user_visible(*worker_id).await { + 
self.send_status_update(crate::StatusUpdate::WorkerStarted { + worker_id: *worker_id, + task: public_task_summary.clone(), + }) + .await; + if let Some(status) = normalize_worker_checkpoint_status(&public_task_summary) { + self.worker_checkpoints.insert( + *worker_id, + WorkerCheckpointState { + last_status: status, + last_sent_at: tokio::time::Instant::now(), + }, + ); + } + } } ProcessEvent::WorkerStatus { worker_id, status, .. } => { run_logger.log_worker_status(*worker_id, status); + self.spawn_worker_progress_refresh( + run_logger.clone(), + *worker_id, + Some(status.clone()), + "worker status", + ); + if self.worker_is_user_visible(*worker_id).await { + self.maybe_send_worker_checkpoint(*worker_id, status).await; + } + } + ProcessEvent::ToolStarted { + process_id: ProcessId::Worker(worker_id), + channel_id, + tool_name, + .. + } if channel_id.as_ref() == Some(&self.id) => { + self.spawn_worker_progress_refresh( + run_logger.clone(), + *worker_id, + Some(tool_name.clone()), + "tool_started", + ); + } + ProcessEvent::ToolCompleted { + process_id: ProcessId::Worker(worker_id), + channel_id, + tool_name, + .. + } if channel_id.as_ref() == Some(&self.id) => { + self.spawn_worker_progress_refresh( + run_logger.clone(), + *worker_id, + Some(tool_name.clone()), + "tool_completed", + ); + } + ProcessEvent::WorkerPermission { + worker_id, + channel_id, + permission_id: _, + description, + .. + } if channel_id.as_ref() == Some(&self.id) => { + self.spawn_worker_progress_refresh( + run_logger.clone(), + *worker_id, + Some(description.clone()), + "permission", + ); + } + ProcessEvent::WorkerQuestion { + worker_id, + channel_id, + question_id: _, + .. + } if channel_id.as_ref() == Some(&self.id) => { + self.spawn_worker_progress_refresh( + run_logger.clone(), + *worker_id, + None, + "question", + ); } ProcessEvent::WorkerComplete { worker_id, @@ -1674,22 +2300,11 @@ impl Channel { success, .. 
} => { - run_logger.log_worker_completed(*worker_id, result, *success); - - let mut workers = self.state.active_workers.write().await; - workers.remove(worker_id); - drop(workers); - - self.state.worker_handles.write().await.remove(worker_id); - self.state.worker_inputs.write().await.remove(worker_id); - - if *notify { - let mut history = self.state.history.write().await; - let worker_message = format!("[Worker {worker_id} completed]: {result}"); - history.push(rig::message::Message::from(worker_message)); - should_retrigger = true; - } - + let retrigger_before = self.pending_retrigger; + self.apply_worker_completion_side_effects(*worker_id, result, *notify, *success) + .await; + should_retrigger = + should_retrigger || (!retrigger_before && self.pending_retrigger); tracing::info!(worker_id = %worker_id, "worker completed"); } _ => {} @@ -1722,6 +2337,420 @@ impl Channel { Ok(()) } + async fn send_status_update(&self, status: crate::StatusUpdate) { + self.send_status_update_with_receipt(status, None).await; + } + + async fn send_status_update_with_receipt( + &self, + status: crate::StatusUpdate, + receipt_id: Option, + ) { + let response = OutboundResponse::Status(status); + let envelope = match receipt_id { + Some(receipt_id) => OutboundEnvelope::tracked(response, receipt_id), + None => OutboundEnvelope::from(response), + }; + if let Err(error) = self + .response_tx + .send(envelope) + .await + { + tracing::debug!( + %error, + channel_id = %self.id, + "failed to route status update to messaging adapter" + ); + } + } + + fn spawn_worker_progress_refresh( + &self, + run_logger: ProcessRunLogger, + worker_id: WorkerId, + status_text: Option, + event_label: &'static str, + ) { + let progress_secs = self + .deps + .runtime_config + .worker_contract + .load() + .progress_secs + .max(1); + let channel_id = self.id.clone(); + + tokio::spawn(async move { + if let Err(error) = run_logger + .touch_worker_task_contract_progress( + worker_id, + status_text.as_deref(), + 
progress_secs, + ) + .await + { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %worker_id, + event_label, + "failed to refresh worker task contract progress" + ); + } + }); + } + + async fn maybe_send_worker_checkpoint(&mut self, worker_id: WorkerId, raw_status: &str) { + let Some(status) = normalize_worker_checkpoint_status(raw_status) else { + return; + }; + + let now = tokio::time::Instant::now(); + let previous = self.worker_checkpoints.get(&worker_id); + if !should_emit_worker_checkpoint(previous, &status, now) { + return; + } + + self.send_status_update(crate::StatusUpdate::WorkerCheckpoint { + worker_id, + status: status.clone(), + }) + .await; + + self.worker_checkpoints.insert( + worker_id, + WorkerCheckpointState { + last_status: status, + last_sent_at: now, + }, + ); + } + + fn flush_due_worker_delivery_receipts(&self) { + if self + .worker_receipt_flush_in_progress + .swap(true, Ordering::AcqRel) + { + tracing::debug!( + channel_id = %self.id, + "worker terminal receipt flush already in progress; skipping tick" + ); + return; + } + + let run_logger = self.state.process_run_logger.clone(); + let response_tx = self.response_tx.clone(); + let channel_id = self.id.clone(); + let in_progress = self.worker_receipt_flush_in_progress.clone(); + let failure_count = self.worker_receipt_failure_count.clone(); + let circuit_open = self.worker_receipt_circuit_open.clone(); + tokio::spawn(async move { + let _reset_flag = ResetFlag::new(in_progress); + let task_result = Self::flush_due_worker_delivery_receipts_task( + run_logger, + response_tx, + channel_id.clone(), + ) + .await; + if let Err((failures, task_error)) = + apply_periodic_flush_circuit(task_result, &failure_count, &circuit_open) + { + tracing::warn!( + channel_id = %channel_id, + task_error, + worker_receipt_failure_count = failures, + "flush_due_worker_delivery_receipts failed; incrementing worker_receipt_failure_count" + ); + if failures == WORKER_FLUSH_FAILURE_THRESHOLD { + 
tracing::error!( + channel_id = %channel_id, + worker_receipt_failure_count = failures, + threshold = WORKER_FLUSH_FAILURE_THRESHOLD, + worker_receipt_circuit_open = true, + "opening worker receipt circuit breaker after repeated flush_due_worker_delivery_receipts failures; scheduling via worker_receipt_dispatch_deadline is disabled until reset_worker_receipt_circuit" + ); + } + } + }); + } + + async fn flush_due_worker_delivery_receipts_task( + run_logger: ProcessRunLogger, + response_tx: mpsc::Sender, + channel_id: ChannelId, + ) -> std::result::Result<(), &'static str> { + let due = match run_logger + .claim_due_worker_terminal_receipts(&channel_id, WORKER_RECEIPT_DISPATCH_BATCH_SIZE) + .await + { + Ok(receipts) => receipts, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + "failed to claim due worker terminal receipts" + ); + return Err("claim_due_worker_terminal_receipts"); + } + }; + + if due.is_empty() { + return Ok(()); + } + + let mut had_errors = false; + for receipt in due { + let message = OutboundResponse::Text(receipt.payload_text.clone()); + let envelope = OutboundEnvelope::tracked(message, receipt.id.clone()); + + if let Err(error) = response_tx.send(envelope).await { + had_errors = true; + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + "failed to queue worker terminal receipt for outbound delivery" + ); + + if let Err(update_error) = run_logger + .fail_worker_delivery_receipt_attempt(&receipt.id, &error.to_string()) + .await + { + tracing::warn!( + %update_error, + channel_id = %channel_id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + "failed to mark worker terminal receipt send failure" + ); + had_errors = true; + } + } + } + + if had_errors { + Err("dispatch_due_worker_terminal_receipts") + } else { + Ok(()) + } + } + + fn flush_due_worker_task_contract_deadlines(&self) { + if self + .worker_contract_flush_in_progress + .swap(true, 
Ordering::AcqRel) + { + tracing::debug!( + channel_id = %self.id, + "worker task-contract flush already in progress; skipping tick" + ); + return; + } + + let run_logger = self.state.process_run_logger.clone(); + let response_tx = self.response_tx.clone(); + let channel_id = self.id.clone(); + let status_block = self.state.status_block.clone(); + let in_progress = self.worker_contract_flush_in_progress.clone(); + let failure_count = self.worker_contract_failure_count.clone(); + let circuit_open = self.worker_contract_circuit_open.clone(); + let ack_secs = self + .deps + .runtime_config + .worker_contract + .load() + .ack_secs + .max(1); + tokio::spawn(async move { + let _reset_flag = ResetFlag::new(in_progress); + let task_result = Self::flush_due_worker_task_contract_deadlines_task( + run_logger, + response_tx, + channel_id.clone(), + status_block, + ack_secs, + ) + .await; + if let Err((failures, task_error)) = + apply_periodic_flush_circuit(task_result, &failure_count, &circuit_open) + { + tracing::warn!( + channel_id = %channel_id, + task_error, + worker_contract_failure_count = failures, + "flush_due_worker_task_contract_deadlines failed; incrementing worker_contract_failure_count" + ); + if failures == WORKER_FLUSH_FAILURE_THRESHOLD { + tracing::error!( + channel_id = %channel_id, + worker_contract_failure_count = failures, + threshold = WORKER_FLUSH_FAILURE_THRESHOLD, + worker_contract_circuit_open = true, + "opening worker contract circuit breaker after repeated flush_due_worker_task_contract_deadlines failures; scheduling via worker_contract_tick_deadline is disabled until reset_worker_contract_circuit" + ); + } + } + }); + } + + async fn flush_due_worker_task_contract_deadlines_task( + run_logger: ProcessRunLogger, + response_tx: mpsc::Sender, + channel_id: ChannelId, + status_block: Arc>, + ack_secs: u64, + ) -> std::result::Result<(), &'static str> { + let due_ack = match run_logger + .claim_due_worker_task_contract_ack_deadlines( + &channel_id, + 
WORKER_CONTRACT_ACK_BATCH_SIZE, + ack_secs, + ) + .await + { + Ok(due) => due, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + "failed to claim due worker task contract ack deadlines" + ); + return Err("claim_due_worker_task_contract_ack_deadlines"); + } + }; + + let mut had_errors = false; + for due in due_ack { + if !Self::worker_is_user_visible_in_status_block(&status_block, due.worker_id).await { + if let Err(error) = run_logger + .mark_worker_task_contract_acknowledged(due.worker_id) + .await + { + tracing::warn!( + %error, + channel_id = %channel_id, + worker_id = %due.worker_id, + "failed to auto-ack hidden worker task contract" + ); + had_errors = true; + } + continue; + } + let status = build_worker_ack_checkpoint(&due.task_summary, due.attempt_count); + if let Err(error) = response_tx + .send(OutboundEnvelope::from(OutboundResponse::Status( + crate::StatusUpdate::WorkerCheckpoint { + worker_id: due.worker_id, + status, + }, + ))) + .await + { + tracing::debug!( + %error, + channel_id = %channel_id, + worker_id = %due.worker_id, + "failed to route worker ack checkpoint status update" + ); + had_errors = true; + } + } + + let due_progress = match run_logger + .claim_due_worker_task_contract_progress_deadlines( + &channel_id, + WORKER_CONTRACT_PROGRESS_BATCH_SIZE, + ) + .await + { + Ok(due) => due, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + "failed to claim due worker task contract progress deadlines" + ); + return Err("claim_due_worker_task_contract_progress_deadlines"); + } + }; + + for due in due_progress { + if !Self::worker_is_user_visible_in_status_block(&status_block, due.worker_id).await { + continue; + } + let status = build_worker_progress_sla_nudge(&due.task_summary); + if let Err(error) = response_tx + .send(OutboundEnvelope::from(OutboundResponse::Status( + crate::StatusUpdate::WorkerCheckpoint { + worker_id: due.worker_id, + status, + }, + ))) + .await + { + tracing::debug!( + %error, 
+ channel_id = %channel_id, + worker_id = %due.worker_id, + "failed to route worker progress checkpoint status update" + ); + had_errors = true; + } + } + + let due_terminal = match run_logger + .claim_due_worker_task_contract_terminal_deadlines( + &channel_id, + WORKER_CONTRACT_TERMINAL_BATCH_SIZE, + ) + .await + { + Ok(due) => due, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + "failed to claim due worker task contract terminal deadlines" + ); + return Err("claim_due_worker_task_contract_terminal_deadlines"); + } + }; + + for due in due_terminal { + tracing::warn!( + channel_id = %channel_id, + worker_id = %due.worker_id, + "worker terminal deadline elapsed before adapter acknowledgement; contract transitioned to terminal_failed and pending receipts were marked failed" + ); + } + + if had_errors { + Err("dispatch_due_worker_task_contract_deadlines") + } else { + Ok(()) + } + } + + async fn worker_is_user_visible_in_status_block( + status_block: &Arc>, + worker_id: WorkerId, + ) -> bool { + let status_block = status_block.read().await; + status_block + .active_workers + .iter() + .any(|worker| worker.id == worker_id && worker.notify_on_complete) + } + + async fn worker_is_user_visible(&self, worker_id: WorkerId) -> bool { + let status_block = self.state.status_block.read().await; + status_block + .active_workers + .iter() + .any(|worker| worker.id == worker_id && worker.notify_on_complete) + } + /// Flush the pending retrigger: send a synthetic system message to re-trigger /// the channel LLM so it can process background results and respond. async fn flush_pending_retrigger(&mut self) { @@ -2009,8 +3038,8 @@ async fn spawn_branch( /// Check whether the channel has capacity for another worker. 
async fn check_worker_limit(state: &ChannelState) -> std::result::Result<(), AgentError> { let max_workers = **state.deps.runtime_config.max_concurrent_workers.load(); - let workers = state.active_workers.read().await; - if workers.len() >= max_workers { + let worker_handles = state.worker_handles.read().await; + if worker_handles.len() >= max_workers { return Err(AgentError::WorkerLimitReached { channel_id: state.channel_id.to_string(), max: max_workers, @@ -2100,6 +3129,7 @@ pub async fn spawn_worker_from_state( state.deps.event_tx.clone(), state.deps.agent_id.clone(), Some(state.channel_id.clone()), + state.deps.runtime_config.cortex.load().worker_timeout_secs, worker.run().instrument(worker_span), ); @@ -2107,7 +3137,7 @@ pub async fn spawn_worker_from_state( { let mut status = state.status_block.write().await; - status.add_worker(worker_id, &task, false); + status.add_worker(worker_id, &task, true); } state @@ -2195,6 +3225,7 @@ pub async fn spawn_opencode_worker_from_state( state.deps.event_tx.clone(), state.deps.agent_id.clone(), Some(state.channel_id.clone()), + state.deps.runtime_config.cortex.load().worker_timeout_secs, async move { let result = worker.run().await?; Ok::(result.result_text) @@ -2207,7 +3238,7 @@ pub async fn spawn_opencode_worker_from_state( let opencode_task = format!("[opencode] {task}"); { let mut status = state.status_block.write().await; - status.add_worker(worker_id, &opencode_task, false); + status.add_worker(worker_id, &opencode_task, true); } state @@ -2237,6 +3268,7 @@ fn spawn_worker_task( event_tx: broadcast::Sender, agent_id: crate::AgentId, channel_id: Option, + timeout_secs: u64, future: F, ) -> tokio::task::JoinHandle<()> where @@ -2253,13 +3285,99 @@ where .with_label_values(&[&*agent_id]) .inc(); - let (result_text, notify, success) = match future.await { - Ok(text) => (text, true, true), - Err(error) => { - tracing::error!(worker_id = %worker_id, %error, "worker failed"); - (format!("Worker failed: {error}"), true, 
false) + let outcome = if timeout_secs == 0 { + match future.await { + Ok(text) => ("done", text, true, true), + Err(error) => { + tracing::error!(worker_id = %worker_id, %error, "worker failed"); + ( + "failed", + format!("{WORKER_FAILED_PREFIX} {error}"), + true, + false, + ) + } + } + } else { + let timeout_duration = std::time::Duration::from_secs(timeout_secs.max(1)); + let mut event_rx = event_tx.subscribe(); + let future = future; + tokio::pin!(future); + let mut deadline = tokio::time::Instant::now() + timeout_duration; + + loop { + let sleep = tokio::time::sleep_until(deadline); + tokio::pin!(sleep); + + tokio::select! { + result = &mut future => { + let outcome = match result { + Ok(text) => ("done", text, true, true), + Err(error) => { + tracing::error!(worker_id = %worker_id, %error, "worker failed"); + ("failed", format!("{WORKER_FAILED_PREFIX} {error}"), true, false) + } + }; + break outcome; + } + event = event_rx.recv() => { + match event { + Ok(event) => { + if is_worker_progress_event(&event, worker_id) { + deadline = tokio::time::Instant::now() + timeout_duration; + } + } + Err(tokio::sync::broadcast::error::RecvError::Lagged(skipped)) => { + tracing::warn!( + worker_id = %worker_id, + skipped, + "worker timeout watcher lagged on event stream" + ); + } + Err(tokio::sync::broadcast::error::RecvError::Closed) => { + tracing::warn!( + worker_id = %worker_id, + "worker timeout watcher event stream closed" + ); + let outcome = match future.await { + Ok(text) => ("done", text, true, true), + Err(error) => { + tracing::error!( + worker_id = %worker_id, + %error, + "worker failed after watcher channel closed" + ); + ( + "failed", + format!("{WORKER_FAILED_PREFIX} {error}"), + true, + false, + ) + } + }; + break outcome; + } + } + } + _ = &mut sleep => { + tracing::error!( + worker_id = %worker_id, + timeout_secs, + "worker timed out due to inactivity" + ); + break ( + "timed_out", + format!( + "{WORKER_TIMED_OUT_PREFIX}{timeout_secs} seconds without 
progress." + ), + true, + false, + ); + } + } } }; + let (terminal_status, result_text, notify, success) = outcome; #[cfg(feature = "metrics")] { let metrics = crate::telemetry::Metrics::global(); @@ -2273,17 +3391,71 @@ where .observe(worker_start.elapsed().as_secs_f64()); } - let _ = event_tx.send(ProcessEvent::WorkerComplete { + if let Err(error) = event_tx.send(ProcessEvent::WorkerStatus { + agent_id: agent_id.clone(), + worker_id, + channel_id: channel_id.clone(), + status: terminal_status.to_string(), + }) { + tracing::warn!( + %error, + agent_id = %agent_id, + worker_id = %worker_id, + channel_id = ?channel_id, + terminal_status, + "failed to send terminal worker status event" + ); + } + + let result_len = result_text.len(); + let completion_channel_id = channel_id.clone(); + if let Err(error) = event_tx.send(ProcessEvent::WorkerComplete { agent_id, worker_id, channel_id, result: result_text, notify, success, - }); + }) { + tracing::warn!( + %error, + worker_id = %worker_id, + channel_id = ?completion_channel_id, + result_len, + notify, + success, + "failed to send worker completion event" + ); + } }) } +fn is_worker_progress_event(event: &ProcessEvent, worker_id: WorkerId) -> bool { + match event { + ProcessEvent::WorkerStatus { + worker_id: event_worker_id, + .. + } => *event_worker_id == worker_id, + ProcessEvent::ToolStarted { + process_id: crate::ProcessId::Worker(event_worker_id), + .. + } => *event_worker_id == worker_id, + ProcessEvent::ToolCompleted { + process_id: crate::ProcessId::Worker(event_worker_id), + .. + } => *event_worker_id == worker_id, + ProcessEvent::WorkerPermission { + worker_id: event_worker_id, + .. + } => *event_worker_id == worker_id, + ProcessEvent::WorkerQuestion { + worker_id: event_worker_id, + .. + } => *event_worker_id == worker_id, + _ => false, + } +} + /// Some models emit tool call syntax as plain text instead of making actual tool calls. /// When the text starts with a tool-like prefix (e.g. 
`[reply]`, `(reply)`), try to /// extract the reply content so we can send it cleanly instead of showing raw JSON. @@ -2345,12 +3517,7 @@ fn extract_reply_from_tool_syntax(text: &str) -> Option { /// System-generated messages (re-triggers) are passed through as-is. fn format_user_message(raw_text: &str, message: &InboundMessage) -> String { if message.source == "system" { - // System messages should never be empty, but guard against it - return if raw_text.trim().is_empty() { - "[system event]".to_string() - } else { - raw_text.to_string() - }; + return raw_text.to_string(); } // Use platform-formatted author if available, fall back to metadata @@ -2395,15 +3562,7 @@ fn format_user_message(raw_text: &str, message: &InboundMessage) -> String { }) .unwrap_or_default(); - // If raw_text is empty or just whitespace, use a placeholder to avoid - // sending empty text content blocks to the LLM API. - let text_content = if raw_text.trim().is_empty() { - "[attachment or empty message]" - } else { - raw_text - }; - - format!("{display_name}{bot_tag}{reply_context}: {text_content}") + format!("{display_name}{bot_tag}{reply_context}: {raw_text}") } fn extract_discord_message_id(message: &InboundMessage) -> Option { @@ -2417,6 +3576,149 @@ fn extract_discord_message_id(message: &InboundMessage) -> Option { .and_then(|value| value.as_u64()) } +fn normalize_worker_checkpoint_status(status: &str) -> Option { + let trimmed = status.trim(); + if trimmed.is_empty() { + return None; + } + if trimmed.len() <= WORKER_CHECKPOINT_MAX_CHARS { + return Some(trimmed.to_string()); + } + + let end = trimmed.floor_char_boundary(WORKER_CHECKPOINT_MAX_CHARS); + let boundary = trimmed[..end].rfind(char::is_whitespace).unwrap_or(end); + Some(format!("{}...", &trimmed[..boundary])) +} + +fn is_high_priority_worker_checkpoint(status: &str) -> bool { + let normalized = status.to_ascii_lowercase(); + normalized.contains("waiting for input") + || normalized.contains("permission") + || 
normalized.contains("question") + || normalized.contains("failed") + || normalized.contains("error") + || normalized.contains("cancelled") + || normalized.contains("timed out") +} + +fn should_emit_worker_checkpoint( + previous: Option<&WorkerCheckpointState>, + next_status: &str, + now: tokio::time::Instant, +) -> bool { + let Some(previous) = previous else { + return true; + }; + + if previous.last_status == next_status { + return false; + } + + if is_high_priority_worker_checkpoint(next_status) { + return true; + } + + now.duration_since(previous.last_sent_at) + >= std::time::Duration::from_secs(WORKER_CHECKPOINT_MIN_INTERVAL_SECS) +} + +fn summarize_worker_result_for_status(result: &str) -> String { + let first_non_empty_line = result + .lines() + .find(|line| !line.trim().is_empty()) + .unwrap_or(result); + normalize_worker_checkpoint_status(first_non_empty_line).unwrap_or_else(|| "completed".into()) +} + +fn summarize_worker_start_for_status(task: &str) -> String { + let lowered = task.to_ascii_lowercase(); + if lowered.contains("research") + || lowered.contains("investigat") + || lowered.contains("verify") + || lowered.contains("source") + { + "research task".to_string() + } else if lowered.contains("[opencode]") + || lowered.contains("code") + || lowered.contains("implement") + || lowered.contains("refactor") + || lowered.contains("fix") + { + "coding task".to_string() + } else if lowered.contains("test") + || lowered.contains("pytest") + || lowered.contains("cargo test") + { + "test task".to_string() + } else if lowered.contains("summar") || lowered.contains("analy") || lowered.contains("review") + { + "analysis task".to_string() + } else { + "background task".to_string() + } +} + +fn build_worker_ack_checkpoint(task_summary: &str, attempt_count: i64) -> String { + let message = if attempt_count <= 1 { + format!("Acknowledged {task_summary}; running now.") + } else { + format!("Still running {task_summary}.") + }; + 
normalize_worker_checkpoint_status(&message) + .unwrap_or_else(|| "background task running".to_string()) +} + +fn build_worker_progress_sla_nudge(task_summary: &str) -> String { + let message = format!("Still working on {task_summary}. I will report back when complete."); + normalize_worker_checkpoint_status(&message) + .unwrap_or_else(|| "still working; I will report back when complete.".to_string()) +} + +fn is_worker_terminal_failure(result: &str) -> bool { + let trimmed = result.trim_start(); + trimmed.starts_with(WORKER_FAILED_PREFIX) + || trimmed.starts_with(WORKER_TIMED_OUT_PREFIX) + || trimmed.starts_with(WORKER_CANCELLED_PREFIX) +} + +fn classify_worker_terminal_state(result: &str) -> &'static str { + let trimmed = result.trim_start(); + if trimmed.starts_with(WORKER_FAILED_PREFIX) { + "failed" + } else if trimmed.starts_with(WORKER_TIMED_OUT_PREFIX) { + "timed_out" + } else if trimmed.starts_with(WORKER_CANCELLED_PREFIX) { + "cancelled" + } else { + "done" + } +} + +fn build_worker_terminal_receipt_payload(terminal_state: &str, result: &str) -> String { + let summary = summarize_worker_result_for_status(result); + match terminal_state { + "failed" => format!("Background task failed: {summary}"), + "timed_out" => format!("Background task timed out: {summary}"), + "cancelled" => "Background task was cancelled.".to_string(), + _ => format!("Background task completed: {summary}"), + } +} + +fn format_pending_worker_completion_fallback(pending_results: &[String]) -> Option { + if pending_results.is_empty() { + return None; + } + if pending_results.len() == 1 { + return Some(pending_results[0].clone()); + } + + Some(format!( + "Completed {} background tasks:\n\n{}", + pending_results.len(), + pending_results.join("\n\n---\n\n") + )) +} + /// Check if a ProcessEvent is targeted at a specific channel. /// /// Events from branches and workers carry a channel_id. 
We only process events @@ -2436,6 +3738,10 @@ fn event_is_for_channel(event: &ProcessEvent, channel_id: &ChannelId) -> bool { channel_id: event_channel, .. } => event_channel.as_ref() == Some(channel_id), + ProcessEvent::WorkerStarted { + channel_id: event_channel, + .. + } => event_channel.as_ref() == Some(channel_id), // Status block updates, tool events, etc. — match on agent_id which // is already filtered by the event bus subscription. Let them through. _ => true, @@ -2849,10 +4155,32 @@ fn apply_history_after_turn( #[cfg(test)] mod tests { + use super::WORKER_CHECKPOINT_MIN_INTERVAL_SECS; + use super::WORKER_FLUSH_FAILURE_THRESHOLD; + use super::WorkerCheckpointState; use super::apply_history_after_turn; + use super::apply_periodic_flush_circuit; + use super::build_worker_ack_checkpoint; + use super::build_worker_progress_sla_nudge; + use super::build_worker_terminal_receipt_payload; + use super::classify_worker_terminal_state; + use super::format_pending_worker_completion_fallback; + use super::is_worker_progress_event; + use super::is_worker_terminal_failure; + use super::normalize_worker_checkpoint_status; + use super::should_emit_worker_checkpoint; + use super::spawn_worker_task; + use super::summarize_worker_result_for_status; + use super::summarize_worker_start_for_status; + use crate::ProcessEvent; use rig::completion::{CompletionError, PromptError}; use rig::message::Message; use rig::tool::ToolSetError; + use std::sync::Arc; + use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; + use std::time::Duration; + use tokio::sync::{broadcast, oneshot}; + use uuid::Uuid; fn user_msg(text: &str) -> Message { Message::User { @@ -3083,71 +4411,399 @@ mod tests { ); } + #[tokio::test] + async fn worker_task_timeout_emits_terminal_events() { + let (event_tx, mut event_rx) = broadcast::channel(16); + let worker_id = Uuid::new_v4(); + let agent_id: crate::AgentId = Arc::from("test-agent"); + let channel_id: crate::ChannelId = Arc::from("test-channel"); + + 
let handle = spawn_worker_task(worker_id, event_tx, agent_id, Some(channel_id), 1, async { + tokio::time::sleep(Duration::from_secs(3)).await; + Ok::("should not complete".to_string()) + }); + + let mut saw_status = false; + let mut saw_complete = false; + let deadline = tokio::time::Instant::now() + Duration::from_secs(4); + + while tokio::time::Instant::now() < deadline && !(saw_status && saw_complete) { + let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); + let event = tokio::time::timeout(remaining, event_rx.recv()) + .await + .expect("timed out waiting for worker events") + .expect("failed to receive worker event"); + + match event { + ProcessEvent::WorkerStatus { + worker_id: event_worker_id, + status, + .. + } if event_worker_id == worker_id => { + saw_status = true; + assert_eq!(status, "timed_out"); + } + ProcessEvent::WorkerComplete { + worker_id: event_worker_id, + result, + .. + } if event_worker_id == worker_id => { + saw_complete = true; + assert!(result.contains("timed out after 1 seconds")); + } + _ => {} + } + } + + handle.await.expect("worker task join failed"); + assert!(saw_status, "expected terminal WorkerStatus event"); + assert!(saw_complete, "expected WorkerComplete event"); + } + + #[tokio::test(start_paused = true)] + async fn worker_timeout_resets_on_progress_events() { + let (event_tx, mut event_rx) = broadcast::channel(32); + let worker_id = Uuid::new_v4(); + let agent_id: crate::AgentId = Arc::from("test-agent"); + let channel_id: crate::ChannelId = Arc::from("test-channel"); + + let progress_tx = event_tx.clone(); + let progress_agent_id = agent_id.clone(); + let progress_channel_id = channel_id.clone(); + let progress_task = tokio::spawn(async move { + tokio::time::sleep(Duration::from_millis(700)).await; + progress_tx + .send(ProcessEvent::WorkerStatus { + agent_id: progress_agent_id, + worker_id, + channel_id: Some(progress_channel_id), + status: "still working".to_string(), + }) + .ok(); + }); + + let 
handle = spawn_worker_task(worker_id, event_tx, agent_id, Some(channel_id), 1, async { + tokio::time::sleep(Duration::from_millis(1500)).await; + Ok::("completed after progress heartbeat".to_string()) + }); + + let mut terminal_status = None::; + let mut complete_result = None::; + for _ in 0..20 { + while let Ok(event) = event_rx.try_recv() { + match event { + ProcessEvent::WorkerStatus { + worker_id: event_worker_id, + status, + .. + } if event_worker_id == worker_id => { + if status == "done" || status == "timed_out" || status == "failed" { + terminal_status = Some(status); + } + } + ProcessEvent::WorkerComplete { + worker_id: event_worker_id, + result, + .. + } if event_worker_id == worker_id => { + complete_result = Some(result); + } + _ => {} + } + } + + if terminal_status.is_some() && complete_result.is_some() { + break; + } + + tokio::time::advance(Duration::from_millis(100)).await; + tokio::task::yield_now().await; + } + + progress_task.await.expect("progress sender task failed"); + handle.await.expect("worker task join failed"); + + assert_eq!( + terminal_status.as_deref(), + Some("done"), + "worker should finish after progress heartbeat" + ); + assert_eq!( + complete_result.as_deref(), + Some("completed after progress heartbeat") + ); + } + + #[tokio::test] + async fn aborting_worker_task_drops_inner_future() { + struct DropSignal(Option>); + + impl Drop for DropSignal { + fn drop(&mut self) { + if let Some(sender) = self.0.take() { + sender.send(()).ok(); + } + } + } + + let (event_tx, _event_rx) = broadcast::channel(8); + let worker_id = Uuid::new_v4(); + let agent_id: crate::AgentId = Arc::from("test-agent"); + let channel_id: crate::ChannelId = Arc::from("test-channel"); + let (drop_tx, drop_rx) = oneshot::channel(); + let (started_tx, started_rx) = oneshot::channel(); + + let handle = + spawn_worker_task(worker_id, event_tx, agent_id, Some(channel_id), 30, async { + let _guard = DropSignal(Some(drop_tx)); + started_tx.send(()).ok(); + 
tokio::time::sleep(Duration::from_secs(120)).await; + Ok::("should not finish".to_string()) + }); + + tokio::time::timeout(Duration::from_secs(1), started_rx) + .await + .expect("future should start before cancellation") + .expect("start signal channel unexpectedly closed"); + handle.abort(); + let join_error = handle + .await + .expect_err("aborted worker task should not complete successfully"); + assert!( + join_error.is_cancelled(), + "aborted worker task should report cancellation" + ); + + tokio::time::timeout(Duration::from_secs(1), drop_rx) + .await + .expect("future should be dropped when worker task is aborted") + .expect("drop signal channel unexpectedly closed"); + } + #[test] - fn format_user_message_handles_empty_text() { - use super::format_user_message; - use crate::{Arc, InboundMessage}; - use chrono::Utc; - use std::collections::HashMap; - - // Test empty text with user message - let message = InboundMessage { - id: "test".to_string(), - agent_id: Some(Arc::from("test_agent")), - sender_id: "user123".to_string(), - conversation_id: "conv".to_string(), - content: crate::MessageContent::Text("".to_string()), - source: "discord".to_string(), - metadata: HashMap::new(), - formatted_author: Some("TestUser".to_string()), - timestamp: Utc::now(), + fn progress_event_detection_matches_worker_status_events() { + let worker_id = Uuid::new_v4(); + let other_worker_id = Uuid::new_v4(); + let agent_id: crate::AgentId = Arc::from("test-agent"); + let channel_id: crate::ChannelId = Arc::from("test-channel"); + + let progress = ProcessEvent::WorkerStatus { + agent_id: agent_id.clone(), + worker_id, + channel_id: Some(channel_id.clone()), + status: "working".to_string(), }; + let non_progress = ProcessEvent::WorkerStatus { + agent_id, + worker_id: other_worker_id, + channel_id: Some(channel_id), + status: "working".to_string(), + }; + + assert!(is_worker_progress_event(&progress, worker_id)); + assert!(!is_worker_progress_event(&non_progress, worker_id)); + } - let 
formatted = format_user_message("", &message); + #[test] + fn worker_failure_detection_matches_terminal_messages() { + assert!(is_worker_terminal_failure( + "Worker timed out after 300 seconds." + )); + assert!(is_worker_terminal_failure("Worker failed: request error")); + assert!(is_worker_terminal_failure( + "Worker cancelled: cancelled by request." + )); + assert!(!is_worker_terminal_failure( + "Completed summary with citations" + )); + } + + #[test] + fn worker_terminal_receipt_payload_reflects_terminal_state() { + assert_eq!( + classify_worker_terminal_state("Worker failed: provider error"), + "failed" + ); + assert_eq!( + classify_worker_terminal_state("Worker timed out after 45 seconds."), + "timed_out" + ); + assert_eq!( + classify_worker_terminal_state("Worker cancelled: cancelled by request."), + "cancelled" + ); + assert_eq!( + classify_worker_terminal_state("Completed report with citations"), + "done" + ); + + assert_eq!( + build_worker_terminal_receipt_payload("failed", "Worker failed: provider error"), + "Background task failed: Worker failed: provider error" + ); + assert_eq!( + build_worker_terminal_receipt_payload("done", "Completed report with citations"), + "Background task completed: Completed report with citations" + ); + } + + #[test] + fn pending_worker_completion_fallback_formats_consistently() { + assert_eq!( + format_pending_worker_completion_fallback(&[]), + None, + "empty pending set should not produce fallback text" + ); + + let single = vec!["## Summary\nLeafs game details".to_string()]; + assert_eq!( + format_pending_worker_completion_fallback(&single), + Some("## Summary\nLeafs game details".to_string()) + ); + + let multiple = vec![ + "First worker result".to_string(), + "Second worker result".to_string(), + ]; + assert_eq!( + format_pending_worker_completion_fallback(&multiple), + Some( + "Completed 2 background tasks:\n\nFirst worker result\n\n---\n\nSecond worker result" + .to_string() + ) + ); + } + + #[test] + fn 
worker_checkpoint_status_normalizes_and_truncates() { + assert_eq!(normalize_worker_checkpoint_status(" "), None); + assert_eq!( + normalize_worker_checkpoint_status("running tests"), + Some("running tests".to_string()) + ); + + let long_status = "word ".repeat(80); + let normalized = + normalize_worker_checkpoint_status(&long_status).expect("expected normalized status"); assert!( - !formatted.trim().is_empty(), - "formatted message should not be empty" + normalized.len() <= 223, + "status should be capped with ellipsis" ); assert!( - formatted.contains("[attachment or empty message]"), - "should use placeholder for empty text" + normalized.ends_with("..."), + "truncated checkpoint should end with ellipsis" ); + } - // Test whitespace-only text - let formatted_ws = format_user_message(" ", &message); + #[test] + fn worker_checkpoint_throttles_non_critical_updates() { + let now = tokio::time::Instant::now(); + let previous = WorkerCheckpointState { + last_status: "running".to_string(), + last_sent_at: now, + }; + + assert!( + !should_emit_worker_checkpoint(Some(&previous), "still running", now), + "non-critical updates should be throttled inside the interval" + ); + assert!( + should_emit_worker_checkpoint( + Some(&previous), + "waiting for input", + now + Duration::from_secs(1), + ), + "high-priority checkpoints should bypass throttle" + ); assert!( - formatted_ws.contains("[attachment or empty message]"), - "should use placeholder for whitespace-only text" + should_emit_worker_checkpoint( + Some(&previous), + "indexing repository", + now + Duration::from_secs(WORKER_CHECKPOINT_MIN_INTERVAL_SECS + 1), + ), + "non-critical updates should flow once interval elapsed" ); + } - // Test empty system message - let system_message = InboundMessage { - id: "test".to_string(), - agent_id: Some(Arc::from("test_agent")), - sender_id: "system".to_string(), - conversation_id: "conv".to_string(), - content: crate::MessageContent::Text("".to_string()), - source: 
"system".to_string(), - metadata: HashMap::new(), - formatted_author: None, - timestamp: Utc::now(), - }; + #[test] + fn worker_result_summary_uses_first_non_empty_line() { + let summary = summarize_worker_result_for_status( + "\n\nCompleted research with 8 cited sources.\nAdditional detail that should not be included.", + ); + assert_eq!(summary, "Completed research with 8 cited sources."); + } - let formatted_sys = format_user_message("", &system_message); - assert_eq!( - formatted_sys, "[system event]", - "system messages should use [system event] placeholder" + #[test] + fn worker_start_summary_redacts_task_details() { + let research = summarize_worker_start_for_status( + "Research this GitHub commit thoroughly: https://github.com/openai/codex/commit/...", ); + let coding = + summarize_worker_start_for_status("[opencode] Implement a retry loop in channel.rs"); - // Test normal message with text - let formatted_normal = format_user_message("hello", &message); + assert_eq!(research, "research task"); + assert_eq!(coding, "coding task"); assert!( - formatted_normal.contains("hello"), - "normal messages should preserve text" + !research.contains("http"), + "public summary should not expose raw task content" ); - assert!( - !formatted_normal.contains("[attachment or empty message]"), - "normal messages should not use placeholder" + } + + #[test] + fn worker_ack_checkpoint_is_deterministic() { + assert_eq!( + build_worker_ack_checkpoint("research task", 1), + "Acknowledged research task; running now." + ); + assert_eq!( + build_worker_ack_checkpoint("research task", 2), + "Still running research task." + ); + } + + #[test] + fn worker_progress_sla_nudge_is_deterministic() { + assert_eq!( + build_worker_progress_sla_nudge("analysis task"), + "Still working on analysis task. I will report back when complete." 
+ ); + } + + #[test] + fn periodic_flush_circuit_resets_failure_count_on_success() { + let failure_count = AtomicUsize::new(2); + let circuit_open = AtomicBool::new(false); + let result = apply_periodic_flush_circuit(Ok(()), &failure_count, &circuit_open); + assert!(result.is_ok()); + assert_eq!(failure_count.load(Ordering::Acquire), 0); + assert!(!circuit_open.load(Ordering::Acquire)); + } + + #[test] + fn periodic_flush_circuit_opens_after_three_failures() { + let failure_count = AtomicUsize::new(0); + let circuit_open = AtomicBool::new(false); + + for expected in 1..WORKER_FLUSH_FAILURE_THRESHOLD { + let result = apply_periodic_flush_circuit( + Err("dispatch_due_worker_terminal_receipts"), + &failure_count, + &circuit_open, + ); + let (count, error) = result.expect_err("expected failure path"); + assert_eq!(count, expected); + assert_eq!(error, "dispatch_due_worker_terminal_receipts"); + assert!(!circuit_open.load(Ordering::Acquire)); + } + + let threshold_result = apply_periodic_flush_circuit( + Err("dispatch_due_worker_terminal_receipts"), + &failure_count, + &circuit_open, ); + let (count, error) = threshold_result.expect_err("expected threshold failure path"); + assert_eq!(count, WORKER_FLUSH_FAILURE_THRESHOLD); + assert_eq!(error, "dispatch_due_worker_terminal_receipts"); + assert!(circuit_open.load(Ordering::Acquire)); } } diff --git a/src/api/agents.rs b/src/api/agents.rs index 154fe6d5d..c07924dc3 100644 --- a/src/api/agents.rs +++ b/src/api/agents.rs @@ -524,6 +524,7 @@ pub(super) async fn create_agent( memory_persistence: None, coalesce: None, ingestion: None, + worker_contract: None, cortex: None, warmup: None, browser: None, diff --git a/src/api/channels.rs b/src/api/channels.rs index 857c41674..4cc6bd87b 100644 --- a/src/api/channels.rs +++ b/src/api/channels.rs @@ -132,19 +132,39 @@ pub(super) async fn channel_messages( pub(super) async fn channel_status( State(state): State>, ) -> Json> { - let snapshot: Vec<_> = { + let status_snapshot: Vec<_> = 
{ let blocks = state.channel_status_blocks.read().await; blocks.iter().map(|(k, v)| (k.clone(), v.clone())).collect() }; + let state_snapshot: HashMap = { + let channel_states = state.channel_states.read().await; + channel_states + .iter() + .map(|(channel_id, channel_state)| (channel_id.clone(), channel_state.clone())) + .collect() + }; let mut result = HashMap::new(); - for (channel_id, status_block) in snapshot { + for (channel_id, status_block) in status_snapshot { let block = status_block.read().await; if let Ok(value) = serde_json::to_value(&*block) { result.insert(channel_id, value); } } + // Include channels that are active in state but missing from the + // channel_status_blocks snapshot (for example, during registration races). + for (channel_id, channel_state) in &state_snapshot { + if result.contains_key(channel_id) { + continue; + } + + let block = channel_state.status_block.read().await; + if let Ok(value) = serde_json::to_value(&*block) { + result.insert(channel_id.clone(), value); + } + } + Json(result) } @@ -198,7 +218,7 @@ pub(super) async fn cancel_process( .parse() .map_err(|_| StatusCode::BAD_REQUEST)?; channel_state - .cancel_worker(worker_id) + .cancel_worker(worker_id, None) .await .map_err(|_| StatusCode::NOT_FOUND)?; Ok(Json(CancelProcessResponse { diff --git a/src/api/workers.rs b/src/api/workers.rs index 39fcc8d73..4de049b96 100644 --- a/src/api/workers.rs +++ b/src/api/workers.rs @@ -52,6 +52,8 @@ pub(super) struct WorkerListItem { pub(super) struct WorkerDetailQuery { agent_id: String, worker_id: String, + #[serde(default)] + include_timeline: bool, } #[derive(Serialize)] @@ -67,6 +69,8 @@ pub(super) struct WorkerDetailResponse { completed_at: Option, transcript: Option>, tool_calls: i64, + #[serde(skip_serializing_if = "Option::is_none")] + timeline: Option, } /// List worker runs for an agent, with live status merged from StatusBlocks. 
@@ -164,6 +168,48 @@ pub(super) async fn worker_detail( .ok() }); + let timeline = if query.include_timeline { + let (contract, receipt) = tokio::try_join!( + async { + logger + .get_worker_task_contract_snapshot(&query.worker_id) + .await + .map_err(|error| { + tracing::warn!( + %error, + worker_id = %query.worker_id, + "failed to load worker task contract snapshot" + ); + StatusCode::INTERNAL_SERVER_ERROR + }) + }, + async { + logger + .get_worker_terminal_receipt_snapshot(&query.worker_id) + .await + .map_err(|error| { + tracing::warn!( + %error, + worker_id = %query.worker_id, + "failed to load worker receipt snapshot" + ); + StatusCode::INTERNAL_SERVER_ERROR + }) + } + )?; + let steps = transcript.as_deref().unwrap_or(&[]); + Some( + crate::conversation::history::build_worker_timeline_projection( + &detail.status, + steps, + contract.as_ref(), + receipt.as_ref(), + ), + ) + } else { + None + }; + Ok(Json(WorkerDetailResponse { id: detail.id, task: detail.task, @@ -176,5 +222,6 @@ pub(super) async fn worker_detail( completed_at: detail.completed_at, transcript, tool_calls: detail.tool_calls, + timeline, })) } diff --git a/src/config.rs b/src/config.rs index 4a54e8de7..21f0ea0f3 100644 --- a/src/config.rs +++ b/src/config.rs @@ -342,6 +342,7 @@ pub struct DefaultsConfig { pub memory_persistence: MemoryPersistenceConfig, pub coalesce: CoalesceConfig, pub ingestion: IngestionConfig, + pub worker_contract: WorkerContractConfig, pub cortex: CortexConfig, pub warmup: WarmupConfig, pub browser: BrowserConfig, @@ -370,6 +371,7 @@ impl std::fmt::Debug for DefaultsConfig { .field("memory_persistence", &self.memory_persistence) .field("coalesce", &self.coalesce) .field("ingestion", &self.ingestion) + .field("worker_contract", &self.worker_contract) .field("cortex", &self.cortex) .field("warmup", &self.warmup) .field("browser", &self.browser) @@ -523,6 +525,8 @@ pub struct BrowserConfig { pub headless: bool, /// Allow JavaScript evaluation via the browser tool. 
pub evaluate_enabled: bool, + /// Per-action timeout in seconds for browser operations. + pub browser_action_timeout_secs: u64, /// Custom Chrome/Chromium executable path. pub executable_path: Option, /// Directory for storing screenshots and other browser artifacts. @@ -535,6 +539,7 @@ impl Default for BrowserConfig { enabled: true, headless: true, evaluate_enabled: false, + browser_action_timeout_secs: 45, executable_path: None, screenshot_dir: None, } @@ -572,6 +577,27 @@ impl Default for OpenCodeConfig { } } +/// Worker task contract timing configuration. +#[derive(Debug, Clone, Copy)] +pub struct WorkerContractConfig { + /// Deadline (seconds) to confirm a spawned worker has been acknowledged. + pub ack_secs: u64, + /// Deadline (seconds) between meaningful progress updates. + pub progress_secs: u64, + /// Polling interval (seconds) for contract deadline checks. + pub tick_secs: u64, +} + +impl Default for WorkerContractConfig { + fn default() -> Self { + Self { + ack_secs: 5, + progress_secs: 45, + tick_secs: 2, + } + } +} + /// Cortex configuration. 
#[derive(Debug, Clone, Copy)] pub struct CortexConfig { @@ -761,6 +787,7 @@ pub struct AgentConfig { pub memory_persistence: Option, pub coalesce: Option, pub ingestion: Option, + pub worker_contract: Option, pub cortex: Option, pub warmup: Option, pub browser: Option, @@ -811,6 +838,7 @@ pub struct ResolvedAgentConfig { pub memory_persistence: MemoryPersistenceConfig, pub coalesce: CoalesceConfig, pub ingestion: IngestionConfig, + pub worker_contract: WorkerContractConfig, pub cortex: CortexConfig, pub warmup: WarmupConfig, pub browser: BrowserConfig, @@ -837,6 +865,7 @@ impl Default for DefaultsConfig { memory_persistence: MemoryPersistenceConfig::default(), coalesce: CoalesceConfig::default(), ingestion: IngestionConfig::default(), + worker_contract: WorkerContractConfig::default(), cortex: CortexConfig::default(), warmup: WarmupConfig::default(), browser: BrowserConfig::default(), @@ -885,6 +914,7 @@ impl AgentConfig { .unwrap_or(defaults.memory_persistence), coalesce: self.coalesce.unwrap_or(defaults.coalesce), ingestion: self.ingestion.unwrap_or(defaults.ingestion), + worker_contract: self.worker_contract.unwrap_or(defaults.worker_contract), cortex: self.cortex.unwrap_or(defaults.cortex), warmup: self.warmup.unwrap_or(defaults.warmup), browser: self @@ -1717,6 +1747,7 @@ struct TomlDefaultsConfig { memory_persistence: Option, coalesce: Option, ingestion: Option, + worker_contract: Option, cortex: Option, warmup: Option, browser: Option, @@ -1769,6 +1800,30 @@ struct TomlIngestionConfig { chunk_size: Option, } +#[derive(Deserialize)] +struct TomlWorkerContractConfig { + ack_secs: Option, + progress_secs: Option, + tick_secs: Option, +} + +fn resolve_nonzero_secs(value: Option, fallback: u64) -> u64 { + value + .and_then(|configured| (configured > 0).then_some(configured)) + .unwrap_or(fallback) +} + +fn normalize_worker_contract_config( + contract: TomlWorkerContractConfig, + base: WorkerContractConfig, +) -> WorkerContractConfig { + WorkerContractConfig { + 
ack_secs: resolve_nonzero_secs(contract.ack_secs, base.ack_secs), + progress_secs: resolve_nonzero_secs(contract.progress_secs, base.progress_secs), + tick_secs: resolve_nonzero_secs(contract.tick_secs, base.tick_secs), + } +} + #[derive(Deserialize)] struct TomlCompactionConfig { background_threshold: Option, @@ -1804,6 +1859,7 @@ struct TomlBrowserConfig { enabled: Option, headless: Option, evaluate_enabled: Option, + browser_action_timeout_secs: Option, executable_path: Option, screenshot_dir: Option, } @@ -1863,6 +1919,7 @@ struct TomlAgentConfig { memory_persistence: Option, coalesce: Option, ingestion: Option, + worker_contract: Option, cortex: Option, warmup: Option, browser: Option, @@ -2578,6 +2635,7 @@ impl Config { memory_persistence: None, coalesce: None, ingestion: None, + worker_contract: None, cortex: None, warmup: None, browser: None, @@ -3115,6 +3173,13 @@ impl Config { chunk_size: ig.chunk_size.unwrap_or(base_defaults.ingestion.chunk_size), }) .unwrap_or(base_defaults.ingestion), + worker_contract: toml + .defaults + .worker_contract + .map(|contract| { + normalize_worker_contract_config(contract, base_defaults.worker_contract) + }) + .unwrap_or(base_defaults.worker_contract), cortex: toml .defaults .cortex @@ -3177,6 +3242,10 @@ impl Config { enabled: b.enabled.unwrap_or(base.enabled), headless: b.headless.unwrap_or(base.headless), evaluate_enabled: b.evaluate_enabled.unwrap_or(base.evaluate_enabled), + browser_action_timeout_secs: resolve_nonzero_secs( + b.browser_action_timeout_secs, + base.browser_action_timeout_secs, + ), executable_path: b.executable_path.or_else(|| base.executable_path.clone()), screenshot_dir: b .screenshot_dir @@ -3310,6 +3379,9 @@ impl Config { .unwrap_or(defaults.ingestion.poll_interval_secs), chunk_size: ig.chunk_size.unwrap_or(defaults.ingestion.chunk_size), }), + worker_contract: a.worker_contract.map(|contract| { + normalize_worker_contract_config(contract, defaults.worker_contract) + }), cortex: a.cortex.map(|c| 
CortexConfig { tick_interval_secs: c .tick_interval_secs @@ -3361,6 +3433,10 @@ impl Config { evaluate_enabled: b .evaluate_enabled .unwrap_or(defaults.browser.evaluate_enabled), + browser_action_timeout_secs: resolve_nonzero_secs( + b.browser_action_timeout_secs, + defaults.browser.browser_action_timeout_secs, + ), executable_path: b .executable_path .or_else(|| defaults.browser.executable_path.clone()), @@ -3403,6 +3479,7 @@ impl Config { memory_persistence: None, coalesce: None, ingestion: None, + worker_contract: None, cortex: None, warmup: None, browser: None, @@ -3673,6 +3750,7 @@ pub struct RuntimeConfig { pub memory_persistence: ArcSwap, pub coalesce: ArcSwap, pub ingestion: ArcSwap, + pub worker_contract: ArcSwap, pub max_turns: ArcSwap, pub branch_max_turns: ArcSwap, pub context_window: ArcSwap, @@ -3733,6 +3811,7 @@ impl RuntimeConfig { memory_persistence: ArcSwap::from_pointee(agent_config.memory_persistence), coalesce: ArcSwap::from_pointee(agent_config.coalesce), ingestion: ArcSwap::from_pointee(agent_config.ingestion), + worker_contract: ArcSwap::from_pointee(agent_config.worker_contract), max_turns: ArcSwap::from_pointee(agent_config.max_turns), branch_max_turns: ArcSwap::from_pointee(agent_config.branch_max_turns), context_window: ArcSwap::from_pointee(agent_config.context_window), @@ -3815,6 +3894,8 @@ impl RuntimeConfig { .store(Arc::new(resolved.memory_persistence)); self.coalesce.store(Arc::new(resolved.coalesce)); self.ingestion.store(Arc::new(resolved.ingestion)); + self.worker_contract + .store(Arc::new(resolved.worker_contract)); self.max_turns.store(Arc::new(resolved.max_turns)); self.branch_max_turns .store(Arc::new(resolved.branch_max_turns)); @@ -5170,6 +5251,95 @@ id = "main" assert_eq!(provider.base_url, "http://remote-ollama:11434"); } + #[test] + fn worker_contract_zero_defaults_fallback_to_safe_defaults() { + let toml = r#" +[defaults.worker_contract] +ack_secs = 0 +progress_secs = 0 +tick_secs = 0 + +[[agents]] +id = "main" +"#; + 
let parsed: TomlConfig = toml::from_str(toml).expect("failed to parse test TOML"); + let config = Config::from_toml(parsed, PathBuf::from(".")).expect("failed to build Config"); + let defaults = WorkerContractConfig::default(); + + assert_eq!(config.defaults.worker_contract.ack_secs, defaults.ack_secs); + assert_eq!( + config.defaults.worker_contract.progress_secs, + defaults.progress_secs + ); + assert_eq!( + config.defaults.worker_contract.tick_secs, + defaults.tick_secs + ); + } + + #[test] + fn worker_contract_zero_agent_override_falls_back_to_instance_defaults() { + let toml = r#" +[defaults.worker_contract] +ack_secs = 9 +progress_secs = 27 +tick_secs = 3 + +[[agents]] +id = "main" + +[agents.worker_contract] +ack_secs = 0 +progress_secs = 0 +tick_secs = 0 +"#; + let parsed: TomlConfig = toml::from_str(toml).expect("failed to parse test TOML"); + let config = Config::from_toml(parsed, PathBuf::from(".")).expect("failed to build Config"); + let resolved = config.agents[0].resolve(&config.instance_dir, &config.defaults); + + assert_eq!(resolved.worker_contract.ack_secs, 9); + assert_eq!(resolved.worker_contract.progress_secs, 27); + assert_eq!(resolved.worker_contract.tick_secs, 3); + } + + #[test] + fn browser_action_timeout_zero_defaults_fallback_to_safe_default() { + let toml = r#" +[defaults.browser] +browser_action_timeout_secs = 0 + +[[agents]] +id = "main" +"#; + let parsed: TomlConfig = toml::from_str(toml).expect("failed to parse test TOML"); + let config = Config::from_toml(parsed, PathBuf::from(".")).expect("failed to build Config"); + let default_timeout = BrowserConfig::default().browser_action_timeout_secs; + + assert_eq!( + config.defaults.browser.browser_action_timeout_secs, + default_timeout + ); + } + + #[test] + fn browser_action_timeout_zero_agent_override_falls_back_to_instance_default() { + let toml = r#" +[defaults.browser] +browser_action_timeout_secs = 33 + +[[agents]] +id = "main" + +[agents.browser] +browser_action_timeout_secs = 0 
+"#; + let parsed: TomlConfig = toml::from_str(toml).expect("failed to parse test TOML"); + let config = Config::from_toml(parsed, PathBuf::from(".")).expect("failed to build Config"); + let resolved = config.agents[0].resolve(&config.instance_dir, &config.defaults); + + assert_eq!(resolved.browser.browser_action_timeout_secs, 33); + } + #[test] fn test_warmup_defaults_applied_when_not_configured() { let toml = r#" diff --git a/src/conversation.rs b/src/conversation.rs index bbfc5fac9..0f1a96e0d 100644 --- a/src/conversation.rs +++ b/src/conversation.rs @@ -7,6 +7,8 @@ pub mod worker_transcript; pub use channels::ChannelStore; pub use history::{ - ConversationLogger, ProcessRunLogger, TimelineItem, WorkerDetailRow, WorkerRunRow, + ConversationLogger, ProcessRunLogger, TimelineItem, WorkerDeliveryReceiptSnapshot, + WorkerDetailRow, WorkerRunRow, WorkerTaskContractSnapshot, WorkerTimelineEvent, + WorkerTimelineProjection, build_worker_timeline_projection, worker_terminal_delivery_converged, }; pub use worker_transcript::{ActionContent, TranscriptStep}; diff --git a/src/conversation/history.rs b/src/conversation/history.rs index 13d80dcf9..b9e77610f 100644 --- a/src/conversation/history.rs +++ b/src/conversation/history.rs @@ -1,8 +1,10 @@ //! Conversation message persistence (SQLite). 
+use crate::conversation::worker_transcript::{ActionContent, TranscriptStep}; use crate::{BranchId, ChannelId, WorkerId}; use serde::Serialize; +use sha2::{Digest as _, Sha256}; use sqlx::{Row as _, SqlitePool}; use std::collections::HashMap; @@ -214,6 +216,81 @@ pub enum TimelineItem { }, } +const WORKER_TERMINAL_RECEIPT_KIND: &str = "worker_terminal"; +const WORKER_RECEIPT_MAX_ATTEMPTS: i64 = 6; +const WORKER_RECEIPT_BACKOFF_SECS: [i64; 5] = [5, 15, 45, 120, 300]; +const WORKER_RECEIPT_RETENTION_DAYS: i64 = 30; +const WORKER_CONTRACT_STATE_CREATED: &str = "created"; +const WORKER_CONTRACT_STATE_ACKED: &str = "acked"; +const WORKER_CONTRACT_STATE_PROGRESSING: &str = "progressing"; +const WORKER_CONTRACT_STATE_SLA_MISSED: &str = "sla_missed"; +const WORKER_CONTRACT_STATE_TERMINAL_PENDING: &str = "terminal_pending"; +const WORKER_CONTRACT_STATE_TERMINAL_ACKED: &str = "terminal_acked"; +const WORKER_CONTRACT_STATE_TERMINAL_FAILED: &str = "terminal_failed"; + +fn worker_receipt_backoff_secs(attempt_count: i64) -> Option { + if attempt_count <= 0 { + return WORKER_RECEIPT_BACKOFF_SECS.first().copied(); + } + WORKER_RECEIPT_BACKOFF_SECS + .get((attempt_count - 1) as usize) + .copied() +} + +fn status_fingerprint(status: &str) -> String { + let mut hasher = Sha256::new(); + hasher.update(status.as_bytes()); + format!("{:x}", hasher.finalize()) +} + +#[derive(Debug, Clone)] +pub struct PendingWorkerDeliveryReceipt { + pub id: String, + pub worker_id: String, + pub channel_id: String, + pub terminal_state: String, + pub payload_text: String, + pub attempt_count: i64, +} + +#[derive(Debug, Clone, Serialize)] +pub struct WorkerDeliveryReceiptStats { + pub pending: u64, + pub failed: u64, +} + +#[derive(Debug, Clone, Serialize)] +pub struct WorkerDeliveryRetryOutcome { + pub status: String, + pub attempt_count: i64, + pub next_attempt_at: Option, +} + +#[derive(Debug, Clone)] +pub struct DueWorkerTaskContractAck { + pub worker_id: WorkerId, + pub task_summary: String, + pub 
attempt_count: i64, +} + +#[derive(Debug, Clone)] +pub struct DueWorkerTaskContractProgress { + pub worker_id: WorkerId, + pub task_summary: String, +} + +#[derive(Debug, Clone)] +pub struct DueWorkerTaskContractTerminal { + pub worker_id: WorkerId, +} + +#[derive(Debug, Clone, Copy)] +pub struct WorkerTaskContractTiming { + pub ack_secs: u64, + pub progress_secs: u64, + pub terminal_secs: u64, +} + /// Persists branch and worker run records for channel timeline history. /// /// All write methods are fire-and-forget, same pattern as ConversationLogger. @@ -309,14 +386,11 @@ impl ProcessRunLogger { } /// Update a worker's status. Fire-and-forget. - /// Worker status text updates are transient — they're available via the - /// in-memory StatusBlock for live workers and don't need to be persisted. - /// The `status` column is reserved for the state enum (running/done/failed). + /// Worker status text updates are transient and read from in-memory status + /// blocks while the worker is active. pub fn log_worker_status(&self, _worker_id: WorkerId, _status: &str) { - // Intentionally a no-op. Status text was previously written to the - // `status` column, overwriting the state enum with free-text like - // "Searching for weather in Germany" which broke badge rendering - // and status filtering. + // Intentionally a no-op. The worker_runs `status` column is reserved + // for terminal state values (running/done/failed/cancelled/timed_out). } /// Record a worker completing with its result. Fire-and-forget. @@ -324,14 +398,28 @@ impl ProcessRunLogger { let pool = self.pool.clone(); let id = worker_id.to_string(); let result = result.to_string(); - let status = if success { "done" } else { "failed" }; + let success_int = if success { 1_i64 } else { 0_i64 }; tokio::spawn(async move { if let Err(error) = sqlx::query( - "UPDATE worker_runs SET result = ?, status = ?, completed_at = CURRENT_TIMESTAMP WHERE id = ?" 
+ "UPDATE worker_runs \ + SET result = ?, \ + status = CASE \ + WHEN status IN ('cancelled', 'failed', 'timed_out') THEN status \ + WHEN ? LIKE 'Worker cancelled:%' THEN 'cancelled' \ + WHEN ? LIKE 'Worker failed:%' THEN 'failed' \ + WHEN ? LIKE 'Worker timed out after %' THEN 'timed_out' \ + WHEN ? = 1 THEN 'done' \ + ELSE 'failed' \ + END, \ + completed_at = CURRENT_TIMESTAMP \ + WHERE id = ?", ) .bind(&result) - .bind(status) + .bind(&result) + .bind(&result) + .bind(&result) + .bind(success_int) .bind(&id) .execute(&pool) .await @@ -341,260 +429,2825 @@ impl ProcessRunLogger { }); } - /// Load a unified timeline for a channel: messages, branch runs, and worker runs - /// interleaved chronologically (oldest first). - /// - /// When `before` is provided, only items with a timestamp strictly before that - /// value are returned, enabling cursor-based pagination. - pub async fn load_channel_timeline( + /// Create or refresh the deterministic task contract for a worker. + pub async fn upsert_worker_task_contract( &self, - channel_id: &str, - limit: i64, - before: Option<&str>, - ) -> crate::error::Result> { - let before_clause = if before.is_some() { - "AND datetime(timestamp) < datetime(?3)" - } else { - "" - }; - - let query_str = format!( - "SELECT * FROM ( \ - SELECT 'message' AS item_type, id, role, sender_name, sender_id, content, \ - NULL AS description, NULL AS conclusion, NULL AS task, NULL AS result, NULL AS status, \ - created_at AS timestamp, NULL AS completed_at \ - FROM conversation_messages WHERE channel_id = ?1 \ - UNION ALL \ - SELECT 'branch_run' AS item_type, id, NULL, NULL, NULL, NULL, \ - description, conclusion, NULL, NULL, NULL, \ - started_at AS timestamp, completed_at \ - FROM branch_runs WHERE channel_id = ?1 \ - UNION ALL \ - SELECT 'worker_run' AS item_type, id, NULL, NULL, NULL, NULL, \ - NULL, NULL, task, result, status, \ - started_at AS timestamp, completed_at \ - FROM worker_runs WHERE channel_id = ?1 \ - ) WHERE 1=1 {before_clause} 
ORDER BY timestamp DESC LIMIT ?2" - ); + agent_id: &crate::AgentId, + channel_id: &ChannelId, + worker_id: WorkerId, + task_summary: &str, + timing: WorkerTaskContractTiming, + ) -> crate::error::Result<()> { + let id = uuid::Uuid::new_v4().to_string(); + let worker_id = worker_id.to_string(); + let channel_id = channel_id.to_string(); + let status_hash = status_fingerprint(task_summary); - let mut query = sqlx::query(&query_str).bind(channel_id).bind(limit); + sqlx::query( + "INSERT OR IGNORE INTO worker_task_contracts \ + (id, agent_id, channel_id, worker_id, task_summary, state, \ + ack_deadline_at, progress_deadline_at, terminal_deadline_at, \ + last_status_hash, created_at, updated_at) \ + VALUES (?, ?, ?, ?, ?, ?, \ + datetime('now', '+' || ? || ' seconds'), \ + datetime('now', '+' || ? || ' seconds'), \ + datetime('now', '+' || ? || ' seconds'), \ + ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)", + ) + .bind(&id) + .bind(agent_id.as_ref()) + .bind(&channel_id) + .bind(&worker_id) + .bind(task_summary) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(timing.ack_secs as i64) + .bind(timing.progress_secs as i64) + .bind(timing.terminal_secs as i64) + .bind(&status_hash) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; - if let Some(before_ts) = before { - query = query.bind(before_ts); - } + sqlx::query( + "UPDATE worker_task_contracts \ + SET task_summary = ?, \ + ack_deadline_at = datetime('now', '+' || ? || ' seconds'), \ + progress_deadline_at = datetime('now', '+' || ? || ' seconds'), \ + terminal_deadline_at = datetime('now', '+' || ? || ' seconds'), \ + last_status_hash = ?, \ + sla_nudge_sent = 0, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ? 
\ + AND state NOT IN (?, ?)", + ) + .bind(task_summary) + .bind(timing.ack_secs as i64) + .bind(timing.progress_secs as i64) + .bind(timing.terminal_secs as i64) + .bind(&status_hash) + .bind(&worker_id) + .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; - let rows = query - .fetch_all(&self.pool) - .await - .map_err(|e| anyhow::anyhow!(e))?; + Ok(()) + } - let mut items: Vec = rows - .into_iter() - .filter_map(|row| { - let item_type: String = row.try_get("item_type").ok()?; - match item_type.as_str() { - "message" => Some(TimelineItem::Message { - id: row.try_get("id").unwrap_or_default(), - role: row.try_get("role").unwrap_or_default(), - sender_name: row.try_get("sender_name").ok(), - sender_id: row.try_get("sender_id").ok(), - content: row.try_get("content").unwrap_or_default(), - created_at: row - .try_get::, _>("timestamp") - .map(|t| t.to_rfc3339()) - .unwrap_or_default(), - }), - "branch_run" => Some(TimelineItem::BranchRun { - id: row.try_get("id").unwrap_or_default(), - description: row.try_get("description").unwrap_or_default(), - conclusion: row.try_get("conclusion").ok(), - started_at: row - .try_get::, _>("timestamp") - .map(|t| t.to_rfc3339()) - .unwrap_or_default(), - completed_at: row - .try_get::, _>("completed_at") - .ok() - .map(|t| t.to_rfc3339()), - }), - "worker_run" => Some(TimelineItem::WorkerRun { - id: row.try_get("id").unwrap_or_default(), - task: row.try_get("task").unwrap_or_default(), - result: row.try_get("result").ok(), - status: row.try_get("status").unwrap_or_default(), - started_at: row - .try_get::, _>("timestamp") - .map(|t| t.to_rfc3339()) - .unwrap_or_default(), - completed_at: row - .try_get::, _>("completed_at") - .ok() - .map(|t| t.to_rfc3339()), - }), - _ => None, - } - }) - .collect(); + /// Mark that a user-visible acknowledgement has been delivered for a worker. 
+ pub async fn mark_worker_task_contract_acknowledged( + &self, + worker_id: WorkerId, + ) -> crate::error::Result<()> { + sqlx::query( + "UPDATE worker_task_contracts \ + SET state = CASE \ + WHEN state IN (?, ?, ?, ?, ?) THEN state \ + ELSE ? \ + END, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ?", + ) + .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) + .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(WORKER_CONTRACT_STATE_SLA_MISSED) + .bind(WORKER_CONTRACT_STATE_ACKED) + .bind(worker_id.to_string()) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; - // Reverse to chronological order - items.reverse(); - Ok(items) + Ok(()) } - /// List worker runs for an agent, ordered by most recent first. - /// Does NOT include the transcript blob — that's fetched separately via `get_worker_detail`. - pub async fn list_worker_runs( + /// Refresh progress heartbeat information for a worker contract. 
+ pub async fn touch_worker_task_contract_progress( &self, - agent_id: &str, - limit: i64, - offset: i64, - status_filter: Option<&str>, - ) -> crate::error::Result<(Vec, i64)> { - let (count_where_clause, list_where_clause, has_status_filter) = if status_filter.is_some() - { - ( - "WHERE w.agent_id = ?1 AND w.status = ?2", - "WHERE w.agent_id = ?1 AND w.status = ?4", - true, - ) - } else { - ("WHERE w.agent_id = ?1", "WHERE w.agent_id = ?1", false) - }; + worker_id: WorkerId, + status: Option<&str>, + progress_secs: u64, + ) -> crate::error::Result<()> { + let status_hash = status.map(status_fingerprint); - let count_query = - format!("SELECT COUNT(*) as total FROM worker_runs w {count_where_clause}"); - let list_query = format!( - "SELECT w.id, w.task, w.status, w.worker_type, w.channel_id, w.started_at, \ - w.completed_at, w.transcript IS NOT NULL as has_transcript, \ - w.tool_calls, c.display_name as channel_name \ - FROM worker_runs w \ - LEFT JOIN channels c ON w.channel_id = c.id \ - {list_where_clause} \ - ORDER BY w.started_at DESC \ - LIMIT ?2 OFFSET ?3" - ); + sqlx::query( + "UPDATE worker_task_contracts \ + SET state = CASE \ + WHEN state IN (?, ?, ?, ?) THEN ? \ + ELSE state \ + END, \ + last_progress_at = CURRENT_TIMESTAMP, \ + progress_deadline_at = datetime('now', '+' || ? || ' seconds'), \ + last_status_hash = COALESCE(?, last_status_hash), \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ? 
\ + AND state NOT IN (?, ?)", + ) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(WORKER_CONTRACT_STATE_ACKED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(WORKER_CONTRACT_STATE_SLA_MISSED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(progress_secs as i64) + .bind(status_hash) + .bind(worker_id.to_string()) + .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; - let mut count_q = sqlx::query(&count_query).bind(agent_id); - let mut list_q = sqlx::query(&list_query) - .bind(agent_id) - .bind(limit) - .bind(offset); + Ok(()) + } - if has_status_filter { - let filter = status_filter.unwrap_or(""); - count_q = count_q.bind(filter); - list_q = list_q.bind(filter); - } + /// Mark a worker contract as terminal pending while delivery receipts are in-flight. + pub async fn mark_worker_task_contract_terminal_pending( + &self, + worker_id: WorkerId, + terminal_state: &str, + terminal_secs: u64, + ) -> crate::error::Result<()> { + sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + terminal_state = ?, \ + terminal_deadline_at = datetime('now', '+' || ? || ' seconds'), \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ? \ + AND state NOT IN (?, ?)", + ) + .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) + .bind(terminal_state) + .bind(terminal_secs as i64) + .bind(worker_id.to_string()) + .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; - let total: i64 = count_q - .fetch_one(&self.pool) + Ok(()) + } + + /// Claim workers whose acknowledgement deadline has expired. 
+ pub async fn claim_due_worker_task_contract_ack_deadlines( + &self, + channel_id: &ChannelId, + limit: i64, + retry_secs: u64, + ) -> crate::error::Result> { + let channel_id = channel_id.to_string(); + let mut tx = self + .pool + .begin() .await - .map(|row| row.try_get("total").unwrap_or(0)) - .map_err(|e| anyhow::anyhow!(e))?; + .map_err(|error| anyhow::anyhow!(error))?; - let rows = list_q - .fetch_all(&self.pool) + let rows = sqlx::query( + "SELECT id, worker_id, task_summary, attempt_count \ + FROM worker_task_contracts \ + WHERE channel_id = ? \ + AND state = ? \ + AND ack_deadline_at <= CURRENT_TIMESTAMP \ + ORDER BY ack_deadline_at ASC, created_at ASC \ + LIMIT ?", + ) + .bind(&channel_id) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(limit) + .fetch_all(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let mut due = Vec::with_capacity(rows.len()); + for row in rows { + let contract_id: String = match row.try_get("id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + column = "id", + "skipping malformed ack-deadline contract row" + ); + continue; + } + }; + let worker_id_raw: String = match row.try_get("worker_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + contract_id = %contract_id, + column = "worker_id", + "skipping malformed ack-deadline contract row" + ); + continue; + } + }; + let task_summary: String = match row.try_get("task_summary") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + contract_id = %contract_id, + worker_id = %worker_id_raw, + column = "task_summary", + "skipping malformed ack-deadline contract row" + ); + continue; + } + }; + let attempt_count: i64 = match row.try_get("attempt_count") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + contract_id = %contract_id, + worker_id = %worker_id_raw, + column = 
"attempt_count", + "skipping malformed ack-deadline contract row" + ); + continue; + } + }; + + let updated = sqlx::query( + "UPDATE worker_task_contracts \ + SET ack_deadline_at = datetime('now', '+' || ? || ' seconds'), \ + attempt_count = attempt_count + 1, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ? \ + AND state = ? \ + AND ack_deadline_at <= CURRENT_TIMESTAMP", + ) + .bind(retry_secs as i64) + .bind(&contract_id) + .bind(WORKER_CONTRACT_STATE_CREATED) + .execute(&mut *tx) .await - .map_err(|e| anyhow::anyhow!(e))?; + .map_err(|error| anyhow::anyhow!(error))? + .rows_affected(); - let items = rows - .into_iter() - .map(|row| WorkerRunRow { - id: row.try_get("id").unwrap_or_default(), - task: row.try_get("task").unwrap_or_default(), - status: row.try_get("status").unwrap_or_default(), - worker_type: row - .try_get("worker_type") - .unwrap_or_else(|_| "builtin".into()), - channel_id: row.try_get("channel_id").ok(), - channel_name: row.try_get("channel_name").ok(), - started_at: row - .try_get::, _>("started_at") - .map(|t| t.to_rfc3339()) - .unwrap_or_default(), - completed_at: row - .try_get::, _>("completed_at") - .ok() - .map(|t| t.to_rfc3339()), - has_transcript: row.try_get::("has_transcript").unwrap_or(false), - tool_calls: row.try_get::("tool_calls").unwrap_or(0), - }) - .collect(); + if updated == 0 { + continue; + } - Ok((items, total)) + match uuid::Uuid::parse_str(&worker_id_raw) { + Ok(worker_id) => due.push(DueWorkerTaskContractAck { + worker_id, + task_summary, + attempt_count: attempt_count + 1, + }), + Err(error) => { + tracing::warn!( + %error, + worker_id = %worker_id_raw, + "skipping malformed worker task contract id" + ); + } + } + } + + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + Ok(due) } - /// Get full detail for a single worker run, including the compressed transcript blob. - pub async fn get_worker_detail( + /// Claim workers whose progress deadline has expired and have not been nudged yet. 
+ pub async fn claim_due_worker_task_contract_progress_deadlines( &self, - agent_id: &str, - worker_id: &str, - ) -> crate::error::Result> { - let row = sqlx::query( - "SELECT w.id, w.task, w.result, w.status, w.worker_type, w.channel_id, \ - w.started_at, w.completed_at, w.transcript, w.tool_calls, \ - c.display_name as channel_name \ - FROM worker_runs w \ - LEFT JOIN channels c ON w.channel_id = c.id \ - WHERE w.agent_id = ? AND w.id = ?", - ) - .bind(agent_id) - .bind(worker_id) + channel_id: &ChannelId, + limit: i64, + ) -> crate::error::Result> { + let channel_id = channel_id.to_string(); + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let rows = sqlx::query( + "SELECT id, worker_id, task_summary \ + FROM worker_task_contracts \ + WHERE channel_id = ? \ + AND state IN (?, ?, ?) \ + AND sla_nudge_sent = 0 \ + AND progress_deadline_at <= CURRENT_TIMESTAMP \ + ORDER BY progress_deadline_at ASC, created_at ASC \ + LIMIT ?", + ) + .bind(&channel_id) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(WORKER_CONTRACT_STATE_ACKED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(limit) + .fetch_all(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let mut due = Vec::with_capacity(rows.len()); + for row in rows { + let contract_id: String = match row.try_get("id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + column = "id", + "skipping malformed progress-deadline contract row" + ); + continue; + } + }; + let worker_id_raw: String = match row.try_get("worker_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + contract_id = %contract_id, + column = "worker_id", + "skipping malformed progress-deadline contract row" + ); + continue; + } + }; + let task_summary: String = match row.try_get("task_summary") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + 
contract_id = %contract_id, + worker_id = %worker_id_raw, + column = "task_summary", + "skipping malformed progress-deadline contract row" + ); + continue; + } + }; + + let updated = sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + sla_nudge_sent = 1, \ + attempt_count = attempt_count + 1, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ? \ + AND state IN (?, ?, ?) \ + AND sla_nudge_sent = 0 \ + AND progress_deadline_at <= CURRENT_TIMESTAMP", + ) + .bind(WORKER_CONTRACT_STATE_SLA_MISSED) + .bind(&contract_id) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(WORKER_CONTRACT_STATE_ACKED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))? + .rows_affected(); + + if updated == 0 { + continue; + } + + match uuid::Uuid::parse_str(&worker_id_raw) { + Ok(worker_id) => due.push(DueWorkerTaskContractProgress { + worker_id, + task_summary, + }), + Err(error) => { + tracing::warn!( + %error, + worker_id = %worker_id_raw, + "skipping malformed worker task contract id" + ); + } + } + } + + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + Ok(due) + } + + /// Claim terminal-pending contracts whose delivery window elapsed. + /// + /// Overdue contracts are transitioned to `terminal_failed` and any pending + /// terminal delivery receipts are marked `failed` to stop retry churn. + pub async fn claim_due_worker_task_contract_terminal_deadlines( + &self, + channel_id: &ChannelId, + limit: i64, + ) -> crate::error::Result> { + let channel_id = channel_id.to_string(); + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let rows = sqlx::query( + "SELECT id, worker_id \ + FROM worker_task_contracts \ + WHERE channel_id = ? \ + AND state = ? 
\ + AND terminal_deadline_at <= CURRENT_TIMESTAMP \ + ORDER BY terminal_deadline_at ASC, created_at ASC \ + LIMIT ?", + ) + .bind(&channel_id) + .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) + .bind(limit) + .fetch_all(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let mut due = Vec::with_capacity(rows.len()); + for row in rows { + let contract_id: String = match row.try_get("id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + column = "id", + "skipping malformed terminal-deadline contract row" + ); + continue; + } + }; + let worker_id_raw: String = match row.try_get("worker_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + contract_id = %contract_id, + column = "worker_id", + "skipping malformed terminal-deadline contract row" + ); + continue; + } + }; + + let updated = sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + terminal_state = COALESCE(terminal_state, 'failed'), \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ? \ + AND state = ? \ + AND terminal_deadline_at <= CURRENT_TIMESTAMP", + ) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .bind(&contract_id) + .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))? + .rows_affected(); + + if updated == 0 { + continue; + } + + sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'failed', \ + last_error = ?, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ? \ + AND kind = ? 
\ + AND status IN ('pending', 'sending')", + ) + .bind("terminal deadline elapsed before adapter acknowledgement") + .bind(&worker_id_raw) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + match uuid::Uuid::parse_str(&worker_id_raw) { + Ok(worker_id) => due.push(DueWorkerTaskContractTerminal { worker_id }), + Err(error) => { + tracing::warn!( + %error, + worker_id = %worker_id_raw, + "skipping malformed worker task contract id" + ); + } + } + } + + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + Ok(due) + } + + /// Create (or refresh) the durable terminal delivery receipt for a worker. + /// + /// One terminal receipt exists per worker (`kind = worker_terminal`). If the + /// receipt already exists and is not acked, it is reset to pending so it can + /// be retried. + pub async fn upsert_worker_terminal_receipt( + &self, + channel_id: &ChannelId, + worker_id: WorkerId, + terminal_state: &str, + payload_text: &str, + ) -> crate::error::Result { + let worker_id = worker_id.to_string(); + let channel_id = channel_id.to_string(); + let candidate_receipt_id = uuid::Uuid::new_v4().to_string(); + + let receipt_id: String = sqlx::query_scalar( + "INSERT INTO worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at) \ + VALUES (?, ?, ?, ?, 'pending', ?, ?, CURRENT_TIMESTAMP) \ + ON CONFLICT(worker_id, kind) DO UPDATE SET \ + channel_id = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.channel_id \ + ELSE excluded.channel_id \ + END, \ + terminal_state = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.terminal_state \ + ELSE excluded.terminal_state \ + END, \ + payload_text = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.payload_text \ + ELSE excluded.payload_text \ + END, \ + status = CASE \ + WHEN 
worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.status \ + ELSE 'pending' \ + END, \ + last_error = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.last_error \ + ELSE NULL \ + END, \ + next_attempt_at = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.next_attempt_at \ + ELSE CURRENT_TIMESTAMP \ + END, \ + updated_at = CASE \ + WHEN worker_delivery_receipts.status = 'acked' THEN worker_delivery_receipts.updated_at \ + ELSE CURRENT_TIMESTAMP \ + END \ + RETURNING id", + ) + .bind(&candidate_receipt_id) + .bind(&worker_id) + .bind(&channel_id) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind(terminal_state) + .bind(payload_text) + .fetch_one(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + Ok(receipt_id) + } + + /// Claim due pending terminal receipts for delivery. + /// + /// Claimed receipts are transitioned to `sending` so we can distinguish in-flight + /// deliveries from queued retries. + pub async fn claim_due_worker_terminal_receipts( + &self, + channel_id: &ChannelId, + limit: i64, + ) -> crate::error::Result> { + let channel_id = channel_id.to_string(); + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let rows = sqlx::query( + "SELECT id, worker_id, channel_id, terminal_state, payload_text, attempt_count \ + FROM worker_delivery_receipts \ + WHERE channel_id = ? \ + AND kind = ? 
\ + AND status = 'pending' \ + AND next_attempt_at <= CURRENT_TIMESTAMP \ + ORDER BY next_attempt_at ASC, created_at ASC \ + LIMIT ?", + ) + .bind(&channel_id) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind(limit) + .fetch_all(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let mut claimed = Vec::with_capacity(rows.len()); + for row in rows { + let receipt_id: String = match row.try_get("id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + column = "id", + "skipping malformed terminal receipt row" + ); + continue; + } + }; + let updated = sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'sending', updated_at = CURRENT_TIMESTAMP \ + WHERE id = ? AND status = 'pending'", + ) + .bind(&receipt_id) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))? + .rows_affected(); + + if updated == 0 { + continue; + } + + let worker_id: String = match row.try_get("worker_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + receipt_id = %receipt_id, + column = "worker_id", + "skipping malformed terminal receipt row" + ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + &channel_id, + ) + .await; + continue; + } + }; + let receipt_channel_id: String = match row.try_get("channel_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "channel_id", + "skipping malformed terminal receipt row" + ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + &channel_id, + ) + .await; + continue; + } + }; + let terminal_state: String = match row.try_get("terminal_state") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "terminal_state", + "skipping malformed 
terminal receipt row" + ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + &channel_id, + ) + .await; + continue; + } + }; + let payload_text: String = match row.try_get("payload_text") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "payload_text", + "skipping malformed terminal receipt row" + ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + &channel_id, + ) + .await; + continue; + } + }; + let attempt_count: i64 = match row.try_get("attempt_count") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + channel_id = %channel_id, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "attempt_count", + "skipping malformed terminal receipt row" + ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + &channel_id, + ) + .await; + continue; + } + }; + + claimed.push(PendingWorkerDeliveryReceipt { + id: receipt_id, + worker_id, + channel_id: receipt_channel_id, + terminal_state, + payload_text, + attempt_count, + }); + } + + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + Ok(claimed) + } + + /// Claim due pending terminal receipts across all channels. + /// + /// Used by the global receipt dispatcher to drain terminal notices even + /// when no channel loop is currently active. + pub async fn claim_due_worker_terminal_receipts_any( + &self, + limit: i64, + ) -> crate::error::Result> { + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let rows = sqlx::query( + "SELECT id, worker_id, channel_id, terminal_state, payload_text, attempt_count \ + FROM worker_delivery_receipts \ + WHERE kind = ? 
\ + AND status = 'pending' \ + AND next_attempt_at <= CURRENT_TIMESTAMP \ + ORDER BY next_attempt_at ASC, created_at ASC \ + LIMIT ?", + ) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind(limit) + .fetch_all(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let mut claimed = Vec::with_capacity(rows.len()); + for row in rows { + let receipt_id: String = match row.try_get("id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + column = "id", + "skipping malformed terminal receipt row (global claim)" + ); + continue; + } + }; + let updated = sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'sending', updated_at = CURRENT_TIMESTAMP \ + WHERE id = ? AND status = 'pending'", + ) + .bind(&receipt_id) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))? + .rows_affected(); + + if updated == 0 { + continue; + } + + let worker_id: String = match row.try_get("worker_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + receipt_id = %receipt_id, + column = "worker_id", + "skipping malformed terminal receipt row (global claim)" + ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + "global", + ) + .await; + continue; + } + }; + let receipt_channel_id: String = match row.try_get("channel_id") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "channel_id", + "skipping malformed terminal receipt row (global claim)" + ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + "global", + ) + .await; + continue; + } + }; + let terminal_state: String = match row.try_get("terminal_state") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "terminal_state", + "skipping malformed terminal receipt row (global claim)" + ); + Self::revert_claimed_terminal_receipt_to_pending( 
+ &mut tx, + &receipt_id, + "global", + ) + .await; + continue; + } + }; + let payload_text: String = match row.try_get("payload_text") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "payload_text", + "skipping malformed terminal receipt row (global claim)" + ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + "global", + ) + .await; + continue; + } + }; + let attempt_count: i64 = match row.try_get("attempt_count") { + Ok(value) => value, + Err(error) => { + tracing::warn!( + %error, + receipt_id = %receipt_id, + worker_id = %worker_id, + column = "attempt_count", + "skipping malformed terminal receipt row (global claim)" + ); + Self::revert_claimed_terminal_receipt_to_pending( + &mut tx, + &receipt_id, + "global", + ) + .await; + continue; + } + }; + + claimed.push(PendingWorkerDeliveryReceipt { + id: receipt_id, + worker_id, + channel_id: receipt_channel_id, + terminal_state, + payload_text, + attempt_count, + }); + } + + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + Ok(claimed) + } + + /// Mark a terminal receipt as delivered. + /// + /// Returns true if this call transitioned the row to acked. + pub async fn ack_worker_delivery_receipt( + &self, + receipt_id: &str, + ) -> crate::error::Result { + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; + let updated = sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'acked', \ + acked_at = CURRENT_TIMESTAMP, \ + updated_at = CURRENT_TIMESTAMP, \ + last_error = NULL \ + WHERE id = ? AND status != 'acked'", + ) + .bind(receipt_id) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))? + .rows_affected(); + + if updated > 0 { + sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ( + SELECT worker_id FROM worker_delivery_receipts WHERE id = ? 
+ ) \ + AND state IN (?, ?, ?, ?, ?)", + ) + .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) + .bind(receipt_id) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(WORKER_CONTRACT_STATE_ACKED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(WORKER_CONTRACT_STATE_SLA_MISSED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + } + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + + Ok(updated > 0) + } + + /// Record a delivery failure and schedule the next retry (or terminal failure). + pub async fn fail_worker_delivery_receipt_attempt( + &self, + receipt_id: &str, + error: &str, + ) -> crate::error::Result { + let mut tx = self + .pool + .begin() + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; + let row = sqlx::query( + "SELECT status, attempt_count \ + FROM worker_delivery_receipts \ + WHERE id = ?", + ) + .bind(receipt_id) + .fetch_optional(&mut *tx) + .await + .map_err(|db_error| anyhow::anyhow!(db_error))? 
+ .ok_or_else(|| anyhow::anyhow!("worker delivery receipt not found: {receipt_id}"))?; + + let current_status: String = row.try_get("status").map_err(|decode_error| { + anyhow::anyhow!( + "failed to decode worker_delivery_receipts.status for {receipt_id}: {decode_error}" + ) + })?; + let current_attempts: i64 = row.try_get("attempt_count").map_err(|decode_error| { + anyhow::anyhow!( + "failed to decode worker_delivery_receipts.attempt_count for {receipt_id}: {decode_error}" + ) + })?; + + if current_status == "acked" { + tx.commit() + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; + return Ok(WorkerDeliveryRetryOutcome { + status: "acked".to_string(), + attempt_count: current_attempts, + next_attempt_at: None, + }); + } + + let attempt_count = current_attempts + 1; + if attempt_count >= WORKER_RECEIPT_MAX_ATTEMPTS { + sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'failed', \ + attempt_count = ?, \ + last_error = ?, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ?", + ) + .bind(attempt_count) + .bind(error) + .bind(receipt_id) + .execute(&mut *tx) + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; + + sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE worker_id = ( + SELECT worker_id FROM worker_delivery_receipts WHERE id = ? 
+ ) \ + AND state IN (?, ?, ?, ?, ?)", + ) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .bind(receipt_id) + .bind(WORKER_CONTRACT_STATE_CREATED) + .bind(WORKER_CONTRACT_STATE_ACKED) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(WORKER_CONTRACT_STATE_SLA_MISSED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_PENDING) + .execute(&mut *tx) + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; + tx.commit() + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; + + return Ok(WorkerDeliveryRetryOutcome { + status: "failed".to_string(), + attempt_count, + next_attempt_at: None, + }); + } + + let delay_secs = worker_receipt_backoff_secs(attempt_count).unwrap_or(300); + sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'pending', \ + attempt_count = ?, \ + last_error = ?, \ + next_attempt_at = datetime('now', '+' || ? || ' seconds'), \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ?", + ) + .bind(attempt_count) + .bind(error) + .bind(delay_secs) + .bind(receipt_id) + .execute(&mut *tx) + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; + tx.commit() + .await + .map_err(|db_error| anyhow::anyhow!(db_error))?; + + let next_attempt_at = chrono::Utc::now() + .checked_add_signed(chrono::TimeDelta::seconds(delay_secs)) + .map(|timestamp| timestamp.to_rfc3339()); + + Ok(WorkerDeliveryRetryOutcome { + status: "pending".to_string(), + attempt_count, + next_attempt_at, + }) + } + + /// Load worker delivery receipt stats for a channel. + pub async fn load_worker_delivery_receipt_stats( + &self, + channel_id: &str, + ) -> crate::error::Result { + let row = sqlx::query( + "SELECT \ + SUM(CASE WHEN status IN ('pending', 'sending') THEN 1 ELSE 0 END) AS pending_count, \ + SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) AS failed_count \ + FROM worker_delivery_receipts \ + WHERE channel_id = ? 
\ + AND kind = ?", + ) + .bind(channel_id) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .fetch_one(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let pending = row.try_get::("pending_count").unwrap_or(0).max(0) as u64; + let failed = row.try_get::("failed_count").unwrap_or(0).max(0) as u64; + + Ok(WorkerDeliveryReceiptStats { pending, failed }) + } + + /// Delete old terminal delivery receipts that are no longer actionable. + /// + /// Keeps `pending` and `sending` rows intact, and only removes terminal rows + /// (`acked`, `failed`) older than the configured retention period. + pub async fn prune_worker_delivery_receipts(&self) -> crate::error::Result { + let deleted = sqlx::query( + "DELETE FROM worker_delivery_receipts \ + WHERE status IN ('acked', 'failed') \ + AND julianday(updated_at) < julianday('now', '-' || ? || ' days')", + ) + .bind(WORKER_RECEIPT_RETENTION_DAYS) + .execute(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))? + .rows_affected(); + + Ok(deleted) + } + + /// Close orphaned branch and worker runs from a previous process lifetime. + /// + /// This is called on startup before channels begin handling messages. Any + /// rows with NULL `completed_at` cannot be resumed and should be marked + /// terminal so timelines and analytics stay accurate. 
+ pub async fn close_orphaned_runs(&self) -> crate::error::Result<(u64, u64, u64, u64)> { + let mut tx = self + .pool + .begin() + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let worker_result = sqlx::query( + "UPDATE worker_runs \ + SET status = 'failed', \ + result = COALESCE(result, 'Worker interrupted by restart before completion.'), \ + completed_at = CURRENT_TIMESTAMP \ + WHERE completed_at IS NULL", + ) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let branch_result = sqlx::query( + "UPDATE branch_runs \ + SET conclusion = COALESCE(conclusion, 'Branch interrupted by restart before completion.'), \ + completed_at = CURRENT_TIMESTAMP \ + WHERE completed_at IS NULL", + ) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let receipt_result = sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'pending', \ + next_attempt_at = CURRENT_TIMESTAMP, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE status = 'sending'", + ) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let contract_result = sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + terminal_state = COALESCE(terminal_state, 'failed'), \ + updated_at = CURRENT_TIMESTAMP \ + WHERE state NOT IN (?, ?) 
\ + AND NOT EXISTS ( \ + SELECT 1 \ + FROM worker_delivery_receipts \ + WHERE worker_delivery_receipts.worker_id = worker_task_contracts.worker_id \ + AND worker_delivery_receipts.status = 'acked' \ + )", + ) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_ACKED) + .bind(WORKER_CONTRACT_STATE_TERMINAL_FAILED) + .execute(&mut *tx) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + tx.commit().await.map_err(|error| anyhow::anyhow!(error))?; + + Ok(( + worker_result.rows_affected(), + branch_result.rows_affected(), + receipt_result.rows_affected(), + contract_result.rows_affected(), + )) + } + + async fn revert_claimed_terminal_receipt_to_pending( + tx: &mut sqlx::Transaction<'_, sqlx::Sqlite>, + receipt_id: &str, + scope: &str, + ) { + if let Err(error) = sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'pending', \ + next_attempt_at = CURRENT_TIMESTAMP, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ? AND status = 'sending'", + ) + .bind(receipt_id) + .execute(&mut **tx) + .await + { + tracing::warn!( + %error, + receipt_id = %receipt_id, + scope, + "failed to revert malformed terminal receipt claim" + ); + } + } + + /// Load a unified timeline for a channel: messages, branch runs, and worker runs + /// interleaved chronologically (oldest first). + /// + /// When `before` is provided, only items with a timestamp strictly before that + /// value are returned, enabling cursor-based pagination. 
+ pub async fn load_channel_timeline( + &self, + channel_id: &str, + limit: i64, + before: Option<&str>, + ) -> crate::error::Result> { + let before_clause = if before.is_some() { + "AND datetime(timestamp) < datetime(?3)" + } else { + "" + }; + + let query_str = format!( + "SELECT * FROM ( \ + SELECT 'message' AS item_type, id, role, sender_name, sender_id, content, \ + NULL AS description, NULL AS conclusion, NULL AS task, NULL AS result, NULL AS status, \ + created_at AS timestamp, NULL AS completed_at \ + FROM conversation_messages WHERE channel_id = ?1 \ + UNION ALL \ + SELECT 'branch_run' AS item_type, id, NULL, NULL, NULL, NULL, \ + description, conclusion, NULL, NULL, NULL, \ + started_at AS timestamp, completed_at \ + FROM branch_runs WHERE channel_id = ?1 \ + UNION ALL \ + SELECT 'worker_run' AS item_type, id, NULL, NULL, NULL, NULL, \ + NULL, NULL, task, result, status, \ + started_at AS timestamp, completed_at \ + FROM worker_runs WHERE channel_id = ?1 \ + ) WHERE 1=1 {before_clause} ORDER BY timestamp DESC LIMIT ?2" + ); + + let mut query = sqlx::query(&query_str).bind(channel_id).bind(limit); + + if let Some(before_ts) = before { + query = query.bind(before_ts); + } + + let rows = query + .fetch_all(&self.pool) + .await + .map_err(|e| anyhow::anyhow!(e))?; + + let mut items: Vec = rows + .into_iter() + .filter_map(|row| { + let item_type: String = row.try_get("item_type").ok()?; + match item_type.as_str() { + "message" => Some(TimelineItem::Message { + id: row.try_get("id").unwrap_or_default(), + role: row.try_get("role").unwrap_or_default(), + sender_name: row.try_get("sender_name").ok(), + sender_id: row.try_get("sender_id").ok(), + content: row.try_get("content").unwrap_or_default(), + created_at: row + .try_get::, _>("timestamp") + .map(|t| t.to_rfc3339()) + .unwrap_or_default(), + }), + "branch_run" => Some(TimelineItem::BranchRun { + id: row.try_get("id").unwrap_or_default(), + description: row.try_get("description").unwrap_or_default(), + 
conclusion: row.try_get("conclusion").ok(), + started_at: row + .try_get::, _>("timestamp") + .map(|t| t.to_rfc3339()) + .unwrap_or_default(), + completed_at: row + .try_get::, _>("completed_at") + .ok() + .map(|t| t.to_rfc3339()), + }), + "worker_run" => Some(TimelineItem::WorkerRun { + id: row.try_get("id").unwrap_or_default(), + task: row.try_get("task").unwrap_or_default(), + result: row.try_get("result").ok(), + status: row.try_get("status").unwrap_or_default(), + started_at: row + .try_get::, _>("timestamp") + .map(|t| t.to_rfc3339()) + .unwrap_or_default(), + completed_at: row + .try_get::, _>("completed_at") + .ok() + .map(|t| t.to_rfc3339()), + }), + _ => None, + } + }) + .collect(); + + // Reverse to chronological order + items.reverse(); + Ok(items) + } + + /// List worker runs for an agent, ordered by most recent first. + /// Does NOT include the transcript blob — that's fetched separately via `get_worker_detail`. + pub async fn list_worker_runs( + &self, + agent_id: &str, + limit: i64, + offset: i64, + status_filter: Option<&str>, + ) -> crate::error::Result<(Vec, i64)> { + let (count_where_clause, list_where_clause, has_status_filter) = if status_filter.is_some() + { + ( + "WHERE w.agent_id = ?1 AND w.status = ?2", + "WHERE w.agent_id = ?1 AND w.status = ?4", + true, + ) + } else { + ("WHERE w.agent_id = ?1", "WHERE w.agent_id = ?1", false) + }; + + let count_query = + format!("SELECT COUNT(*) as total FROM worker_runs w {count_where_clause}"); + let list_query = format!( + "SELECT w.id, w.task, w.status, w.worker_type, w.channel_id, w.started_at, \ + w.completed_at, w.transcript IS NOT NULL as has_transcript, \ + w.tool_calls, c.display_name as channel_name \ + FROM worker_runs w \ + LEFT JOIN channels c ON w.channel_id = c.id \ + {list_where_clause} \ + ORDER BY w.started_at DESC \ + LIMIT ?2 OFFSET ?3" + ); + + let mut count_q = sqlx::query(&count_query).bind(agent_id); + let mut list_q = sqlx::query(&list_query) + .bind(agent_id) + .bind(limit) + 
.bind(offset); + + if has_status_filter { + let filter = status_filter.unwrap_or(""); + count_q = count_q.bind(filter); + list_q = list_q.bind(filter); + } + + let total: i64 = count_q + .fetch_one(&self.pool) + .await + .map(|row| row.try_get("total").unwrap_or(0)) + .map_err(|e| anyhow::anyhow!(e))?; + + let rows = list_q + .fetch_all(&self.pool) + .await + .map_err(|e| anyhow::anyhow!(e))?; + + let items = rows + .into_iter() + .map(|row| WorkerRunRow { + id: row.try_get("id").unwrap_or_default(), + task: row.try_get("task").unwrap_or_default(), + status: row.try_get("status").unwrap_or_default(), + worker_type: row + .try_get("worker_type") + .unwrap_or_else(|_| "builtin".into()), + channel_id: row.try_get("channel_id").ok(), + channel_name: row.try_get("channel_name").ok(), + started_at: row + .try_get::, _>("started_at") + .map(|t| t.to_rfc3339()) + .unwrap_or_default(), + completed_at: row + .try_get::, _>("completed_at") + .ok() + .map(|t| t.to_rfc3339()), + has_transcript: row.try_get::("has_transcript").unwrap_or(false), + tool_calls: row.try_get::("tool_calls").unwrap_or(0), + }) + .collect(); + + Ok((items, total)) + } + + /// Get full detail for a single worker run, including the compressed transcript blob. + pub async fn get_worker_detail( + &self, + agent_id: &str, + worker_id: &str, + ) -> crate::error::Result> { + let row = sqlx::query( + "SELECT w.id, w.task, w.result, w.status, w.worker_type, w.channel_id, \ + w.started_at, w.completed_at, w.transcript, w.tool_calls, \ + c.display_name as channel_name \ + FROM worker_runs w \ + LEFT JOIN channels c ON w.channel_id = c.id \ + WHERE w.agent_id = ? 
AND w.id = ?", + ) + .bind(agent_id) + .bind(worker_id) .fetch_optional(&self.pool) .await .map_err(|e| anyhow::anyhow!(e))?; - Ok(row.map(|row| WorkerDetailRow { - id: row.try_get("id").unwrap_or_default(), - task: row.try_get("task").unwrap_or_default(), - result: row.try_get("result").ok(), - status: row.try_get("status").unwrap_or_default(), - worker_type: row - .try_get("worker_type") - .unwrap_or_else(|_| "builtin".into()), - channel_id: row.try_get("channel_id").ok(), - channel_name: row.try_get("channel_name").ok(), - started_at: row - .try_get::, _>("started_at") - .map(|t| t.to_rfc3339()) - .unwrap_or_default(), - completed_at: row - .try_get::, _>("completed_at") - .ok() - .map(|t| t.to_rfc3339()), - transcript_blob: row.try_get("transcript").ok(), - tool_calls: row.try_get::("tool_calls").unwrap_or(0), - })) + Ok(row.map(|row| WorkerDetailRow { + id: row.try_get("id").unwrap_or_default(), + task: row.try_get("task").unwrap_or_default(), + result: row.try_get("result").ok(), + status: row.try_get("status").unwrap_or_default(), + worker_type: row + .try_get("worker_type") + .unwrap_or_else(|_| "builtin".into()), + channel_id: row.try_get("channel_id").ok(), + channel_name: row.try_get("channel_name").ok(), + started_at: row + .try_get::, _>("started_at") + .map(|t| t.to_rfc3339()) + .unwrap_or_default(), + completed_at: row + .try_get::, _>("completed_at") + .ok() + .map(|t| t.to_rfc3339()), + transcript_blob: row.try_get("transcript").ok(), + tool_calls: row.try_get::("tool_calls").unwrap_or(0), + })) + } + + /// Load the current worker task contract snapshot for a worker. 
+ pub async fn get_worker_task_contract_snapshot( + &self, + worker_id: &str, + ) -> crate::error::Result> { + let row = sqlx::query( + "SELECT id, worker_id, task_summary, state, attempt_count, sla_nudge_sent, \ + terminal_state, ack_deadline_at, progress_deadline_at, terminal_deadline_at, \ + last_progress_at, created_at, updated_at \ + FROM worker_task_contracts \ + WHERE worker_id = ? \ + LIMIT 1", + ) + .bind(worker_id) + .fetch_optional(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let Some(row) = row else { + return Ok(None); + }; + + let id: String = row.try_get("id").map_err(|error| anyhow::anyhow!(error))?; + let worker_id: String = row + .try_get("worker_id") + .map_err(|error| anyhow::anyhow!(error))?; + let task_summary: String = row + .try_get("task_summary") + .map_err(|error| anyhow::anyhow!(error))?; + let state: String = row + .try_get("state") + .map_err(|error| anyhow::anyhow!(error))?; + let attempt_count: i64 = row + .try_get("attempt_count") + .map_err(|error| anyhow::anyhow!(error))?; + let sla_nudge_sent = row.try_get::("sla_nudge_sent").unwrap_or(0) != 0; + let terminal_state = row.try_get("terminal_state").ok(); + let ack_deadline_at = row + .try_get::, _>("ack_deadline_at") + .ok() + .map(|timestamp| timestamp.to_rfc3339()); + let progress_deadline_at = row + .try_get::, _>("progress_deadline_at") + .ok() + .map(|timestamp| timestamp.to_rfc3339()); + let terminal_deadline_at = row + .try_get::, _>("terminal_deadline_at") + .ok() + .map(|timestamp| timestamp.to_rfc3339()); + let last_progress_at = row + .try_get::, _>("last_progress_at") + .ok() + .map(|timestamp| timestamp.to_rfc3339()); + let created_at = row + .try_get::, _>("created_at") + .ok() + .map(|timestamp| timestamp.to_rfc3339()); + let updated_at = row + .try_get::, _>("updated_at") + .ok() + .map(|timestamp| timestamp.to_rfc3339()); + + Ok(Some(WorkerTaskContractSnapshot { + id, + worker_id, + task_summary, + state, + attempt_count, + sla_nudge_sent, 
+ terminal_state, + ack_deadline_at, + progress_deadline_at, + terminal_deadline_at, + last_progress_at, + created_at, + updated_at, + })) + } + + /// Load the current terminal delivery receipt snapshot for a worker. + pub async fn get_worker_terminal_receipt_snapshot( + &self, + worker_id: &str, + ) -> crate::error::Result> { + let row = sqlx::query( + "SELECT id, worker_id, channel_id, status, terminal_state, payload_text, \ + attempt_count, last_error, next_attempt_at, acked_at, created_at, updated_at \ + FROM worker_delivery_receipts \ + WHERE worker_id = ? \ + AND kind = ? \ + LIMIT 1", + ) + .bind(worker_id) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .fetch_optional(&self.pool) + .await + .map_err(|error| anyhow::anyhow!(error))?; + + let Some(row) = row else { + return Ok(None); + }; + + let id: String = row.try_get("id").map_err(|error| anyhow::anyhow!(error))?; + let worker_id: String = row + .try_get("worker_id") + .map_err(|error| anyhow::anyhow!(error))?; + let channel_id: String = row + .try_get("channel_id") + .map_err(|error| anyhow::anyhow!(error))?; + let status: String = row + .try_get("status") + .map_err(|error| anyhow::anyhow!(error))?; + let terminal_state: String = row + .try_get("terminal_state") + .map_err(|error| anyhow::anyhow!(error))?; + let payload_text: String = row + .try_get("payload_text") + .map_err(|error| anyhow::anyhow!(error))?; + let attempt_count: i64 = row + .try_get("attempt_count") + .map_err(|error| anyhow::anyhow!(error))?; + let last_error = row.try_get("last_error").ok(); + let next_attempt_at = row + .try_get::, _>("next_attempt_at") + .ok() + .map(|timestamp| timestamp.to_rfc3339()); + let acked_at = row + .try_get::, _>("acked_at") + .ok() + .map(|timestamp| timestamp.to_rfc3339()); + let created_at = row + .try_get::, _>("created_at") + .ok() + .map(|timestamp| timestamp.to_rfc3339()); + let updated_at = row + .try_get::, _>("updated_at") + .ok() + .map(|timestamp| timestamp.to_rfc3339()); + + 
Ok(Some(WorkerDeliveryReceiptSnapshot { + id, + worker_id, + channel_id, + status, + terminal_state, + payload_text, + attempt_count, + last_error, + next_attempt_at, + acked_at, + created_at, + updated_at, + })) + } +} + +/// A worker run row without the transcript blob (for list queries). +#[derive(Debug, Clone, Serialize)] +pub struct WorkerRunRow { + pub id: String, + pub task: String, + pub status: String, + pub worker_type: String, + pub channel_id: Option, + pub channel_name: Option, + pub started_at: String, + pub completed_at: Option, + pub has_transcript: bool, + pub tool_calls: i64, +} + +/// A worker run row with full detail including the transcript blob. +#[derive(Debug, Clone)] +pub struct WorkerDetailRow { + pub id: String, + pub task: String, + pub result: Option, + pub status: String, + pub worker_type: String, + pub channel_id: Option, + pub channel_name: Option, + pub started_at: String, + pub completed_at: Option, + pub transcript_blob: Option>, + pub tool_calls: i64, +} + +/// Snapshot of a worker task contract row. +#[derive(Debug, Clone, Serialize)] +pub struct WorkerTaskContractSnapshot { + pub id: String, + pub worker_id: String, + pub task_summary: String, + pub state: String, + pub attempt_count: i64, + pub sla_nudge_sent: bool, + pub terminal_state: Option, + pub ack_deadline_at: Option, + pub progress_deadline_at: Option, + pub terminal_deadline_at: Option, + pub last_progress_at: Option, + pub created_at: Option, + pub updated_at: Option, +} + +/// Snapshot of a worker terminal delivery receipt row. 
+#[derive(Debug, Clone, Serialize)] +pub struct WorkerDeliveryReceiptSnapshot { + pub id: String, + pub worker_id: String, + pub channel_id: String, + pub status: String, + pub terminal_state: String, + pub payload_text: String, + pub attempt_count: i64, + pub last_error: Option, + pub next_attempt_at: Option, + pub acked_at: Option, + pub created_at: Option, + pub updated_at: Option, +} + +/// One synthesized entry in the canonical worker timeline projection. +#[derive(Debug, Clone, Serialize)] +pub struct WorkerTimelineEvent { + pub sequence: usize, + pub source: String, + pub kind: String, + pub summary: String, + pub at: Option, + pub step_index: Option, + pub details: Option, +} + +/// Computed worker timeline projection used by APIs and tooling. +#[derive(Debug, Clone, Serialize)] +pub struct WorkerTimelineProjection { + pub terminal_converged: bool, + pub events: Vec, +} + +/// Build a deterministic worker timeline projection. +/// +/// Transcript entries are ordered strictly by transcript step index. +/// Delivery and contract snapshots are ordered by timestamp and appended after transcript entries. 
+pub fn build_worker_timeline_projection( + worker_status: &str, + transcript_steps: &[TranscriptStep], + contract: Option<&WorkerTaskContractSnapshot>, + receipt: Option<&WorkerDeliveryReceiptSnapshot>, +) -> WorkerTimelineProjection { + let mut events: Vec = transcript_steps + .iter() + .enumerate() + .map(|(step_index, step)| project_transcript_step(step_index, step)) + .collect(); + + let mut delivery_events = Vec::new(); + if let Some(contract) = contract { + let at = contract + .updated_at + .clone() + .or_else(|| contract.created_at.clone()); + let mut summary = format!("Contract state: {}", contract.state); + if let Some(terminal_state) = &contract.terminal_state { + summary.push_str(&format!(" ({terminal_state})")); + } + delivery_events.push(WorkerTimelineEvent { + sequence: 0, + source: "delivery".to_string(), + kind: "contract_state".to_string(), + summary, + at, + step_index: None, + details: Some(serde_json::json!({ + "attempt_count": contract.attempt_count, + "task_summary": contract.task_summary, + "sla_nudge_sent": contract.sla_nudge_sent, + "ack_deadline_at": contract.ack_deadline_at, + "progress_deadline_at": contract.progress_deadline_at, + "terminal_deadline_at": contract.terminal_deadline_at, + "last_progress_at": contract.last_progress_at, + })), + }); + } + + if let Some(receipt) = receipt { + let at = if receipt.status == "acked" { + receipt + .acked_at + .clone() + .or_else(|| receipt.updated_at.clone()) + .or_else(|| receipt.created_at.clone()) + } else { + receipt + .updated_at + .clone() + .or_else(|| receipt.created_at.clone()) + }; + let summary = format!( + "Terminal receipt: {} ({}, attempts={})", + receipt.status, receipt.terminal_state, receipt.attempt_count + ); + delivery_events.push(WorkerTimelineEvent { + sequence: 0, + source: "delivery".to_string(), + kind: "terminal_receipt".to_string(), + summary, + at, + step_index: None, + details: Some(serde_json::json!({ + "id": receipt.id, + "status": receipt.status, + 
"terminal_state": receipt.terminal_state, + "attempt_count": receipt.attempt_count, + "next_attempt_at": receipt.next_attempt_at, + "last_error": receipt.last_error, + })), + }); + } + + delivery_events.sort_by(|left, right| { + timeline_timestamp_key(left.at.as_deref()) + .cmp(&timeline_timestamp_key(right.at.as_deref())) + .then_with(|| left.kind.cmp(&right.kind)) + .then_with(|| left.summary.cmp(&right.summary)) + }); + events.extend(delivery_events); + + for (sequence, event) in events.iter_mut().enumerate() { + event.sequence = sequence; + } + + WorkerTimelineProjection { + terminal_converged: worker_terminal_delivery_converged(worker_status, contract), + events, + } +} + +/// Whether a terminal worker has converged to a terminal delivery contract state. +pub fn worker_terminal_delivery_converged( + worker_status: &str, + contract: Option<&WorkerTaskContractSnapshot>, +) -> bool { + if !is_terminal_worker_status(worker_status) { + return true; + } + + matches!( + contract.map(|value| value.state.as_str()), + Some(WORKER_CONTRACT_STATE_TERMINAL_ACKED | WORKER_CONTRACT_STATE_TERMINAL_FAILED) + ) +} + +fn project_transcript_step(step_index: usize, step: &TranscriptStep) -> WorkerTimelineEvent { + match step { + TranscriptStep::Action { content } => { + let mut tool_names = Vec::new(); + let mut text_preview = None; + + for item in content { + match item { + ActionContent::Text { text } => { + if text_preview.is_none() { + text_preview = Some(truncate_timeline_text(text, 180)); + } + } + ActionContent::ToolCall { name, .. 
} => tool_names.push(name.clone()), + } + } + + if !tool_names.is_empty() { + let summary = format!("Tool call(s): {}", tool_names.join(", ")); + WorkerTimelineEvent { + sequence: 0, + source: "transcript".to_string(), + kind: "tool_call".to_string(), + summary, + at: None, + step_index: Some(step_index), + details: Some(serde_json::json!({ + "tool_names": tool_names, + "text_preview": text_preview, + })), + } + } else { + let text_preview = text_preview.unwrap_or_default(); + let summary = if text_preview.is_empty() { + "Agent message".to_string() + } else { + format!("Agent message: {text_preview}") + }; + WorkerTimelineEvent { + sequence: 0, + source: "transcript".to_string(), + kind: "agent_text".to_string(), + summary, + at: None, + step_index: Some(step_index), + details: None, + } + } + } + TranscriptStep::ToolResult { + call_id, + name, + text, + } => { + let label = if name.is_empty() { "tool" } else { name }; + let preview = truncate_timeline_text(text, 180); + let summary = if preview.is_empty() { + format!("Tool result ({label})") + } else { + format!("Tool result ({label}): {preview}") + }; + WorkerTimelineEvent { + sequence: 0, + source: "transcript".to_string(), + kind: "tool_result".to_string(), + summary, + at: None, + step_index: Some(step_index), + details: Some(serde_json::json!({ + "call_id": call_id, + "bytes": text.len(), + })), + } + } + } +} + +fn timeline_timestamp_key(value: Option<&str>) -> (i64, u32) { + let Some(value) = value else { + return (i64::MAX, u32::MAX); + }; + let Ok(parsed) = chrono::DateTime::parse_from_rfc3339(value) else { + return (i64::MAX, u32::MAX); + }; + (parsed.timestamp(), parsed.timestamp_subsec_nanos()) +} + +fn truncate_timeline_text(text: &str, max_chars: usize) -> String { + if text.chars().count() <= max_chars { + return text.to_string(); + } + + let mut end = text.len(); + for (index, (byte_index, _)) in text.char_indices().enumerate() { + if index >= max_chars { + end = byte_index; + break; + } } + 
format!("{}...", &text[..end]) } -/// A worker run row without the transcript blob (for list queries). -#[derive(Debug, Clone, Serialize)] -pub struct WorkerRunRow { - pub id: String, - pub task: String, - pub status: String, - pub worker_type: String, - pub channel_id: Option, - pub channel_name: Option, - pub started_at: String, - pub completed_at: Option, - pub has_transcript: bool, - pub tool_calls: i64, +fn is_terminal_worker_status(status: &str) -> bool { + matches!(status, "done" | "failed" | "cancelled" | "timed_out") } -/// A worker run row with full detail including the transcript blob. -#[derive(Debug, Clone)] -pub struct WorkerDetailRow { - pub id: String, - pub task: String, - pub result: Option, - pub status: String, - pub worker_type: String, - pub channel_id: Option, - pub channel_name: Option, - pub started_at: String, - pub completed_at: Option, - pub transcript_blob: Option>, - pub tool_calls: i64, +#[cfg(test)] +mod tests { + use super::*; + use sqlx::sqlite::SqliteConnectOptions; + use std::sync::Arc; + + async fn connect_logger() -> ProcessRunLogger { + let options = SqliteConnectOptions::new() + .in_memory(true) + .create_if_missing(true); + let pool = sqlx::pool::PoolOptions::::new() + .max_connections(1) + .connect_with(options) + .await + .expect("in-memory SQLite"); + sqlx::migrate!("./migrations") + .run(&pool) + .await + .expect("migrations"); + ProcessRunLogger::new(pool) + } + + #[test] + fn worker_timeline_projection_orders_transcript_then_delivery() { + let transcript = vec![ + TranscriptStep::Action { + content: vec![ActionContent::Text { + text: "starting task".to_string(), + }], + }, + TranscriptStep::Action { + content: vec![ActionContent::ToolCall { + id: "call-1".to_string(), + name: "read_file".to_string(), + args: "{\"path\":\"README.md\"}".to_string(), + }], + }, + TranscriptStep::ToolResult { + call_id: "call-1".to_string(), + name: "read_file".to_string(), + text: "file contents".to_string(), + }, + ]; + let contract = 
WorkerTaskContractSnapshot { + id: "contract-1".to_string(), + worker_id: "worker-1".to_string(), + task_summary: "test task".to_string(), + state: "terminal_acked".to_string(), + attempt_count: 2, + sla_nudge_sent: true, + terminal_state: Some("done".to_string()), + ack_deadline_at: None, + progress_deadline_at: None, + terminal_deadline_at: None, + last_progress_at: None, + created_at: Some("2026-02-25T01:00:00+00:00".to_string()), + updated_at: Some("2026-02-25T01:10:00+00:00".to_string()), + }; + let receipt = WorkerDeliveryReceiptSnapshot { + id: "receipt-1".to_string(), + worker_id: "worker-1".to_string(), + channel_id: "channel-1".to_string(), + status: "acked".to_string(), + terminal_state: "done".to_string(), + payload_text: "done".to_string(), + attempt_count: 1, + last_error: None, + next_attempt_at: None, + acked_at: Some("2026-02-25T01:11:00+00:00".to_string()), + created_at: Some("2026-02-25T01:05:00+00:00".to_string()), + updated_at: Some("2026-02-25T01:11:00+00:00".to_string()), + }; + + let projection = + build_worker_timeline_projection("done", &transcript, Some(&contract), Some(&receipt)); + assert_eq!(projection.events.len(), 5); + + assert_eq!(projection.events[0].step_index, Some(0)); + assert_eq!(projection.events[1].step_index, Some(1)); + assert_eq!(projection.events[2].step_index, Some(2)); + assert_eq!(projection.events[3].kind, "contract_state"); + assert_eq!(projection.events[4].kind, "terminal_receipt"); + + for (index, event) in projection.events.iter().enumerate() { + assert_eq!(event.sequence, index); + } + } + + #[test] + fn worker_terminal_convergence_requires_terminal_contract_state() { + assert!(worker_terminal_delivery_converged("running", None)); + assert!(!worker_terminal_delivery_converged("done", None)); + + let acked = WorkerTaskContractSnapshot { + id: "contract-acked".to_string(), + worker_id: "worker-1".to_string(), + task_summary: "task".to_string(), + state: "terminal_acked".to_string(), + attempt_count: 0, + 
sla_nudge_sent: false, + terminal_state: Some("done".to_string()), + ack_deadline_at: None, + progress_deadline_at: None, + terminal_deadline_at: None, + last_progress_at: None, + created_at: None, + updated_at: None, + }; + let failed = WorkerTaskContractSnapshot { + state: "terminal_failed".to_string(), + ..acked.clone() + }; + let progressing = WorkerTaskContractSnapshot { + state: "progressing".to_string(), + ..acked.clone() + }; + + assert!(worker_terminal_delivery_converged("done", Some(&acked))); + assert!(worker_terminal_delivery_converged("failed", Some(&failed))); + assert!(!worker_terminal_delivery_converged( + "timed_out", + Some(&progressing) + )); + } + + #[tokio::test] + async fn worker_terminal_receipt_claim_ack_and_stats() { + let logger = connect_logger().await; + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "done", + "Background task completed: finished indexing", + ) + .await + .expect("upsert receipt"); + + let initial_stats = logger + .load_worker_delivery_receipt_stats(channel_id.as_ref()) + .await + .expect("load initial stats"); + assert_eq!(initial_stats.pending, 1); + assert_eq!(initial_stats.failed, 0); + + let claimed = logger + .claim_due_worker_terminal_receipts(&channel_id, 8) + .await + .expect("claim due receipts"); + assert_eq!(claimed.len(), 1); + assert_eq!(claimed[0].id, receipt_id); + assert_eq!(claimed[0].terminal_state, "done"); + assert_eq!(claimed[0].attempt_count, 0); + + let acked_now = logger + .ack_worker_delivery_receipt(&receipt_id) + .await + .expect("ack receipt"); + assert!(acked_now); + + let acked_again = logger + .ack_worker_delivery_receipt(&receipt_id) + .await + .expect("idempotent ack"); + assert!(!acked_again); + + let final_stats = logger + .load_worker_delivery_receipt_stats(channel_id.as_ref()) + .await + .expect("load final stats"); + 
assert_eq!(final_stats.pending, 0); + assert_eq!(final_stats.failed, 0); + } + + #[tokio::test] + async fn worker_terminal_receipt_upsert_preserves_acked_rows() { + let logger = connect_logger().await; + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "done", + "Background task completed: first payload", + ) + .await + .expect("upsert initial receipt"); + + sqlx::query( + "UPDATE worker_delivery_receipts \ + SET status = 'acked', \ + acked_at = CURRENT_TIMESTAMP, \ + updated_at = CURRENT_TIMESTAMP \ + WHERE id = ?", + ) + .bind(&receipt_id) + .execute(&logger.pool) + .await + .expect("mark receipt acked"); + + let second_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "failed", + "Background task failed: should not overwrite acked receipt", + ) + .await + .expect("upsert should preserve acked row"); + assert_eq!(second_id, receipt_id); + + let row = sqlx::query( + "SELECT status, terminal_state, payload_text \ + FROM worker_delivery_receipts \ + WHERE id = ?", + ) + .bind(&receipt_id) + .fetch_one(&logger.pool) + .await + .expect("load receipt row"); + let status: String = row.try_get("status").unwrap_or_default(); + let terminal_state: String = row.try_get("terminal_state").unwrap_or_default(); + let payload_text: String = row.try_get("payload_text").unwrap_or_default(); + + assert_eq!(status, "acked"); + assert_eq!(terminal_state, "done"); + assert_eq!(payload_text, "Background task completed: first payload"); + } + + #[tokio::test] + async fn worker_terminal_receipt_failure_retries_then_fails() { + let logger = connect_logger().await; + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "failed", + "Background task failed: network timeout", + ) + .await + 
.expect("upsert receipt"); + + let first_outcome = logger + .fail_worker_delivery_receipt_attempt(&receipt_id, "temporary send failure") + .await + .expect("record first failure"); + assert_eq!(first_outcome.status, "pending"); + assert_eq!(first_outcome.attempt_count, 1); + assert!(first_outcome.next_attempt_at.is_some()); + + sqlx::query( + "UPDATE worker_delivery_receipts \ + SET next_attempt_at = CURRENT_TIMESTAMP \ + WHERE id = ?", + ) + .bind(&receipt_id) + .execute(&logger.pool) + .await + .expect("advance retry deadline"); + + let claimed = logger + .claim_due_worker_terminal_receipts(&channel_id, 8) + .await + .expect("claim receipt after retry scheduling"); + assert_eq!(claimed.len(), 1); + assert_eq!(claimed[0].attempt_count, 1); + + for attempt in 2..=WORKER_RECEIPT_MAX_ATTEMPTS { + let outcome = logger + .fail_worker_delivery_receipt_attempt(&receipt_id, "adapter unavailable") + .await + .expect("record retry failure"); + assert_eq!(outcome.attempt_count, attempt); + if attempt < WORKER_RECEIPT_MAX_ATTEMPTS { + assert_eq!(outcome.status, "pending"); + assert!(outcome.next_attempt_at.is_some()); + } else { + assert_eq!(outcome.status, "failed"); + assert!(outcome.next_attempt_at.is_none()); + } + } + + let stats = logger + .load_worker_delivery_receipt_stats(channel_id.as_ref()) + .await + .expect("load retry stats"); + assert_eq!(stats.pending, 0); + assert_eq!(stats.failed, 1); + } + + #[tokio::test] + async fn close_orphaned_runs_requeues_sending_receipts() { + let logger = connect_logger().await; + let receipt_id = "receipt-test"; + + sqlx::query( + "INSERT INTO worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at) \ + VALUES (?, ?, ?, ?, 'sending', ?, ?, CURRENT_TIMESTAMP)", + ) + .bind(receipt_id) + .bind(uuid::Uuid::new_v4().to_string()) + .bind("channel:test") + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind("done") + .bind("Background task completed: done") + .execute(&logger.pool) + 
.await + .expect("insert sending receipt"); + + let (_, _, recovered_receipts, recovered_contracts) = logger + .close_orphaned_runs() + .await + .expect("recover orphaned runs"); + assert_eq!(recovered_receipts, 1); + assert_eq!(recovered_contracts, 0); + + let status: String = + sqlx::query_scalar("SELECT status FROM worker_delivery_receipts WHERE id = ?") + .bind(receipt_id) + .fetch_one(&logger.pool) + .await + .expect("load receipt status"); + assert_eq!(status, "pending"); + } + + #[tokio::test] + async fn worker_task_contract_deadline_claims_and_terminal_ack_flow() { + let logger = connect_logger().await; + let agent_id: crate::AgentId = Arc::from("agent:test"); + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + worker_id, + "research task", + WorkerTaskContractTiming { + ack_secs: 0, + progress_secs: 0, + terminal_secs: 60, + }, + ) + .await + .expect("upsert contract"); + + let due_ack = logger + .claim_due_worker_task_contract_ack_deadlines(&channel_id, 10, 5) + .await + .expect("claim due ack deadlines"); + assert_eq!(due_ack.len(), 1); + assert_eq!(due_ack[0].worker_id, worker_id); + assert_eq!(due_ack[0].attempt_count, 1); + + logger + .mark_worker_task_contract_acknowledged(worker_id) + .await + .expect("mark acknowledged"); + logger + .touch_worker_task_contract_progress(worker_id, Some("indexing source data"), 30) + .await + .expect("touch progress"); + + sqlx::query( + "UPDATE worker_task_contracts \ + SET state = ?, \ + progress_deadline_at = CURRENT_TIMESTAMP, \ + sla_nudge_sent = 0 \ + WHERE worker_id = ?", + ) + .bind(WORKER_CONTRACT_STATE_PROGRESSING) + .bind(worker_id.to_string()) + .execute(&logger.pool) + .await + .expect("force progress deadline"); + + let due_progress = logger + .claim_due_worker_task_contract_progress_deadlines(&channel_id, 10) + .await + .expect("claim due progress deadlines"); + 
assert_eq!(due_progress.len(), 1); + assert_eq!(due_progress[0].worker_id, worker_id); + + let due_progress_again = logger + .claim_due_worker_task_contract_progress_deadlines(&channel_id, 10) + .await + .expect("second progress claim should be empty"); + assert!(due_progress_again.is_empty()); + + logger + .mark_worker_task_contract_terminal_pending(worker_id, "done", 60) + .await + .expect("mark terminal pending"); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "done", + "Background task completed: done", + ) + .await + .expect("upsert receipt"); + let acked = logger + .ack_worker_delivery_receipt(&receipt_id) + .await + .expect("ack receipt"); + assert!(acked); + + let state: String = + sqlx::query_scalar("SELECT state FROM worker_task_contracts WHERE worker_id = ?") + .bind(worker_id.to_string()) + .fetch_one(&logger.pool) + .await + .expect("load contract state"); + assert_eq!(state, WORKER_CONTRACT_STATE_TERMINAL_ACKED); + } + + #[tokio::test] + async fn upsert_worker_task_contract_preserves_existing_state() { + let logger = connect_logger().await; + let agent_id: crate::AgentId = Arc::from("agent:test"); + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + worker_id, + "first summary", + WorkerTaskContractTiming { + ack_secs: 5, + progress_secs: 45, + terminal_secs: 60, + }, + ) + .await + .expect("upsert initial contract"); + logger + .touch_worker_task_contract_progress(worker_id, Some("indexing"), 45) + .await + .expect("mark contract progressing"); + + logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + worker_id, + "updated summary", + WorkerTaskContractTiming { + ack_secs: 10, + progress_secs: 30, + terminal_secs: 120, + }, + ) + .await + .expect("refresh contract"); + + let state: String = + sqlx::query_scalar("SELECT state FROM worker_task_contracts WHERE worker_id = ?") + 
.bind(worker_id.to_string()) + .fetch_one(&logger.pool) + .await + .expect("load contract state"); + assert_eq!(state, WORKER_CONTRACT_STATE_PROGRESSING); + } + + #[tokio::test] + async fn worker_task_contract_moves_to_terminal_failed_on_receipt_exhaustion() { + let logger = connect_logger().await; + let agent_id: crate::AgentId = Arc::from("agent:test"); + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + worker_id, + "analysis task", + WorkerTaskContractTiming { + ack_secs: 5, + progress_secs: 45, + terminal_secs: 60, + }, + ) + .await + .expect("upsert contract"); + logger + .mark_worker_task_contract_terminal_pending(worker_id, "failed", 60) + .await + .expect("mark terminal pending"); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "failed", + "Background task failed: request error", + ) + .await + .expect("upsert receipt"); + + for attempt in 1..=WORKER_RECEIPT_MAX_ATTEMPTS { + let outcome = logger + .fail_worker_delivery_receipt_attempt(&receipt_id, "adapter unavailable") + .await + .expect("record delivery failure"); + if attempt < WORKER_RECEIPT_MAX_ATTEMPTS { + assert_eq!(outcome.status, "pending"); + assert_eq!(outcome.attempt_count, attempt); + assert!(outcome.next_attempt_at.is_some()); + } else { + assert_eq!(outcome.status, "failed"); + assert_eq!(outcome.attempt_count, attempt); + assert!(outcome.next_attempt_at.is_none()); + } + } + + let state: String = + sqlx::query_scalar("SELECT state FROM worker_task_contracts WHERE worker_id = ?") + .bind(worker_id.to_string()) + .fetch_one(&logger.pool) + .await + .expect("load contract state"); + assert_eq!(state, WORKER_CONTRACT_STATE_TERMINAL_FAILED); + } + + #[tokio::test] + async fn worker_task_contract_terminal_deadline_claim_marks_failed_and_stops_receipts() { + let logger = connect_logger().await; + let agent_id: crate::AgentId = 
Arc::from("agent:test"); + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + worker_id, + "deadline task", + WorkerTaskContractTiming { + ack_secs: 5, + progress_secs: 45, + terminal_secs: 1, + }, + ) + .await + .expect("upsert contract"); + logger + .mark_worker_task_contract_terminal_pending(worker_id, "done", 0) + .await + .expect("mark terminal pending"); + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "done", + "Background task completed: done", + ) + .await + .expect("upsert receipt"); + + let due_terminal = logger + .claim_due_worker_task_contract_terminal_deadlines(&channel_id, 10) + .await + .expect("claim due terminal deadlines"); + assert_eq!(due_terminal.len(), 1); + assert_eq!(due_terminal[0].worker_id, worker_id); + + let state: String = + sqlx::query_scalar("SELECT state FROM worker_task_contracts WHERE worker_id = ?") + .bind(worker_id.to_string()) + .fetch_one(&logger.pool) + .await + .expect("load contract state"); + assert_eq!(state, WORKER_CONTRACT_STATE_TERMINAL_FAILED); + + let receipt_status: String = + sqlx::query_scalar("SELECT status FROM worker_delivery_receipts WHERE id = ?") + .bind(receipt_id) + .fetch_one(&logger.pool) + .await + .expect("load receipt status"); + assert_eq!(receipt_status, "failed"); + } + + #[tokio::test] + async fn close_orphaned_runs_does_not_fail_contract_with_acked_receipt() { + let logger = connect_logger().await; + let agent_id: crate::AgentId = Arc::from("agent:test"); + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + logger + .upsert_worker_task_contract( + &agent_id, + &channel_id, + worker_id, + "orphaned contract", + WorkerTaskContractTiming { + ack_secs: 5, + progress_secs: 45, + terminal_secs: 60, + }, + ) + .await + .expect("upsert contract"); + + sqlx::query( + "INSERT INTO 
worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at, acked_at) \ + VALUES (?, ?, ?, ?, 'acked', ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)", + ) + .bind(uuid::Uuid::new_v4().to_string()) + .bind(worker_id.to_string()) + .bind(channel_id.as_ref()) + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind("done") + .bind("Background task completed: already delivered") + .execute(&logger.pool) + .await + .expect("insert acked receipt"); + + let (_, _, _, recovered_contracts) = logger + .close_orphaned_runs() + .await + .expect("close orphaned runs"); + assert_eq!(recovered_contracts, 0); + + let state: String = + sqlx::query_scalar("SELECT state FROM worker_task_contracts WHERE worker_id = ?") + .bind(worker_id.to_string()) + .fetch_one(&logger.pool) + .await + .expect("load contract state"); + assert_eq!(state, WORKER_CONTRACT_STATE_CREATED); + } + + #[tokio::test] + async fn claim_due_worker_terminal_receipts_any_claims_multiple_channels() { + let logger = connect_logger().await; + let channel_a: ChannelId = Arc::from("discord:1:100"); + let channel_b: ChannelId = Arc::from("discord:1:200"); + + logger + .upsert_worker_terminal_receipt( + &channel_a, + uuid::Uuid::new_v4(), + "done", + "Background task completed: channel a", + ) + .await + .expect("upsert channel a receipt"); + logger + .upsert_worker_terminal_receipt( + &channel_b, + uuid::Uuid::new_v4(), + "done", + "Background task completed: channel b", + ) + .await + .expect("upsert channel b receipt"); + + let claimed = logger + .claim_due_worker_terminal_receipts_any(10) + .await + .expect("claim due receipts across channels"); + assert_eq!(claimed.len(), 2); + assert!( + claimed + .iter() + .any(|receipt| receipt.channel_id == channel_a.as_ref()) + ); + assert!( + claimed + .iter() + .any(|receipt| receipt.channel_id == channel_b.as_ref()) + ); + } + + #[tokio::test] + async fn worker_terminal_receipt_cancelled_claim_ack_round_trip() { + let logger = 
connect_logger().await; + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "cancelled", + "Background task was cancelled.", + ) + .await + .expect("upsert cancelled receipt"); + + let claimed = logger + .claim_due_worker_terminal_receipts(&channel_id, 8) + .await + .expect("claim cancelled receipt"); + assert_eq!(claimed.len(), 1); + assert_eq!(claimed[0].id, receipt_id); + assert_eq!(claimed[0].terminal_state, "cancelled"); + assert_eq!(claimed[0].payload_text, "Background task was cancelled."); + + let acked = logger + .ack_worker_delivery_receipt(&receipt_id) + .await + .expect("ack cancelled receipt"); + assert!(acked); + + let stats = logger + .load_worker_delivery_receipt_stats(channel_id.as_ref()) + .await + .expect("load stats"); + assert_eq!(stats.pending, 0); + assert_eq!(stats.failed, 0); + } + + #[tokio::test] + async fn cancelled_receipt_delivery_failure_retries_then_acks() { + let logger = connect_logger().await; + let channel_id: ChannelId = Arc::from("channel:test"); + let worker_id = uuid::Uuid::new_v4(); + + let receipt_id = logger + .upsert_worker_terminal_receipt( + &channel_id, + worker_id, + "cancelled", + "Background task was cancelled.", + ) + .await + .expect("upsert cancelled receipt"); + + let first_claim = logger + .claim_due_worker_terminal_receipts(&channel_id, 8) + .await + .expect("first claim"); + assert_eq!(first_claim.len(), 1); + assert_eq!(first_claim[0].id, receipt_id); + assert_eq!(first_claim[0].attempt_count, 0); + + let retry = logger + .fail_worker_delivery_receipt_attempt(&receipt_id, "adapter unavailable") + .await + .expect("record first delivery failure"); + assert_eq!(retry.status, "pending"); + assert_eq!(retry.attempt_count, 1); + assert!(retry.next_attempt_at.is_some()); + + sqlx::query( + "UPDATE worker_delivery_receipts \ + SET next_attempt_at = CURRENT_TIMESTAMP \ + WHERE id 
= ?", + ) + .bind(&receipt_id) + .execute(&logger.pool) + .await + .expect("advance retry deadline"); + + let second_claim = logger + .claim_due_worker_terminal_receipts(&channel_id, 8) + .await + .expect("second claim after retry"); + assert_eq!(second_claim.len(), 1); + assert_eq!(second_claim[0].id, receipt_id); + assert_eq!(second_claim[0].attempt_count, 1); + + let acked = logger + .ack_worker_delivery_receipt(&receipt_id) + .await + .expect("ack retried receipt"); + assert!(acked); + + let status: String = + sqlx::query_scalar("SELECT status FROM worker_delivery_receipts WHERE id = ?") + .bind(&receipt_id) + .fetch_one(&logger.pool) + .await + .expect("load receipt status"); + assert_eq!(status, "acked"); + + let stats = logger + .load_worker_delivery_receipt_stats(channel_id.as_ref()) + .await + .expect("load stats after ack"); + assert_eq!(stats.pending, 0); + assert_eq!(stats.failed, 0); + } + + #[tokio::test] + async fn prune_worker_delivery_receipts_deletes_only_old_terminal_rows() { + let logger = connect_logger().await; + let worker_old_acked = uuid::Uuid::new_v4().to_string(); + let worker_old_failed = uuid::Uuid::new_v4().to_string(); + let worker_old_pending = uuid::Uuid::new_v4().to_string(); + let worker_recent_acked = uuid::Uuid::new_v4().to_string(); + + sqlx::query( + "INSERT INTO worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at, updated_at) \ + VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, ?)", + ) + .bind("old-acked") + .bind(&worker_old_acked) + .bind("channel:test") + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind("acked") + .bind("done") + .bind("Background task completed: old") + .bind("2000-01-01T00:00:00Z") + .execute(&logger.pool) + .await + .expect("insert old acked"); + + sqlx::query( + "INSERT INTO worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at, updated_at) \ + VALUES (?, ?, ?, ?, ?, ?, ?, 
CURRENT_TIMESTAMP, ?)", + ) + .bind("old-failed") + .bind(&worker_old_failed) + .bind("channel:test") + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind("failed") + .bind("failed") + .bind("Background task failed: old") + .bind("2000-01-01T00:00:00Z") + .execute(&logger.pool) + .await + .expect("insert old failed"); + + sqlx::query( + "INSERT INTO worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at, updated_at) \ + VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, ?)", + ) + .bind("old-pending") + .bind(&worker_old_pending) + .bind("channel:test") + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind("pending") + .bind("done") + .bind("Background task completed: pending") + .bind("2000-01-01T00:00:00Z") + .execute(&logger.pool) + .await + .expect("insert old pending"); + + sqlx::query( + "INSERT INTO worker_delivery_receipts \ + (id, worker_id, channel_id, kind, status, terminal_state, payload_text, next_attempt_at, updated_at) \ + VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP)", + ) + .bind("recent-acked") + .bind(&worker_recent_acked) + .bind("channel:test") + .bind(WORKER_TERMINAL_RECEIPT_KIND) + .bind("acked") + .bind("done") + .bind("Background task completed: recent") + .execute(&logger.pool) + .await + .expect("insert recent acked"); + + let deleted = logger + .prune_worker_delivery_receipts() + .await + .expect("prune receipts"); + assert_eq!(deleted, 2); + + let remaining: Vec = + sqlx::query_scalar("SELECT id FROM worker_delivery_receipts ORDER BY id ASC") + .fetch_all(&logger.pool) + .await + .expect("load remaining receipt ids"); + assert_eq!(remaining, vec!["old-pending", "recent-acked"]); + } } diff --git a/src/cron/scheduler.rs b/src/cron/scheduler.rs index bf8b841d8..21a55d878 100644 --- a/src/cron/scheduler.rs +++ b/src/cron/scheduler.rs @@ -564,7 +564,7 @@ async fn run_cron_job(job: &CronJob, context: &CronContext) -> Result<()> { let channel_id: crate::ChannelId = 
Arc::from(format!("cron:{}", job.id).as_str()); // Create the outbound response channel to collect whatever the channel produces - let (response_tx, mut response_rx) = tokio::sync::mpsc::channel::(32); + let (response_tx, mut response_rx) = tokio::sync::mpsc::channel::(32); // Subscribe to the agent's event bus (the channel needs this for branch/worker events) let event_rx = context.deps.event_tx.subscribe(); @@ -615,10 +615,16 @@ async fn run_cron_job(job: &CronJob, context: &CronContext) -> Result<()> { loop { match tokio::time::timeout(timeout, response_rx.recv()).await { - Ok(Some(OutboundResponse::Text(text))) => { + Ok(Some(crate::OutboundEnvelope { + response: OutboundResponse::Text(text), + .. + })) => { collected_text.push(text); } - Ok(Some(OutboundResponse::RichMessage { text, .. })) => { + Ok(Some(crate::OutboundEnvelope { + response: OutboundResponse::RichMessage { text, .. }, + .. + })) => { collected_text.push(text); } Ok(Some(_)) => { diff --git a/src/db.rs b/src/db.rs index ac41768af..a418e54e0 100644 --- a/src/db.rs +++ b/src/db.rs @@ -66,3 +66,47 @@ impl Db { // LanceDB and redb close automatically when dropped } } + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + #[test] + fn migration_versions_are_unique() { + let migrations_dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("migrations"); + let entries = std::fs::read_dir(&migrations_dir).expect("read migrations directory"); + + let mut seen_versions = HashSet::new(); + for entry in entries { + let entry = entry.expect("read migration directory entry"); + let path = entry.path(); + if path.extension().and_then(|extension| extension.to_str()) != Some("sql") { + continue; + } + + let file_name = path + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or_default(); + let (version, _) = file_name + .split_once('_') + .expect("migration filename should contain version prefix"); + assert!( + !version.is_empty(), + "migration version should not be empty: {file_name}" 
+ ); + assert!( + version.chars().all(|character| character.is_ascii_digit()), + "migration version should be numeric: {file_name}" + ); + assert!( + seen_versions.insert(version.to_string()), + "duplicate migration version detected: {version} ({file_name})" + ); + } + assert!( + !seen_versions.is_empty(), + "no migrations found in migrations/" + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index 0e28ff0a3..a123f204b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -380,6 +380,38 @@ pub enum OutboundResponse { Status(StatusUpdate), } +/// Internal wrapper for outbound channel routing. +/// +/// Carries an optional durable delivery receipt ID for messages that must be +/// acknowledged by the outbound adapter path. +#[derive(Debug, Clone)] +pub struct OutboundEnvelope { + pub response: OutboundResponse, + pub receipt_id: Option, +} + +impl OutboundEnvelope { + pub fn untracked(response: OutboundResponse) -> Self { + Self { + response, + receipt_id: None, + } + } + + pub fn tracked(response: OutboundResponse, receipt_id: String) -> Self { + Self { + response, + receipt_id: Some(receipt_id), + } + } +} + +impl From for OutboundEnvelope { + fn from(response: OutboundResponse) -> Self { + Self::untracked(response) + } +} + /// A generic rich-formatted card (maps to Embeds in Discord). #[derive(Debug, Clone, Serialize, Deserialize, Default, schemars::JsonSchema)] pub struct Card { @@ -503,4 +535,10 @@ pub enum StatusUpdate { worker_id: WorkerId, result: String, }, + /// Progress checkpoint from a running worker. + /// Intended for sparse, meaningful user-facing updates. 
+ WorkerCheckpoint { + worker_id: WorkerId, + status: String, + }, } diff --git a/src/llm/model.rs b/src/llm/model.rs index 4ba76e03d..8d961af76 100644 --- a/src/llm/model.rs +++ b/src/llm/model.rs @@ -519,7 +519,7 @@ impl SpacebotModel { "messages": messages, }); - if let Some(max_tokens) = request.max_tokens { + if let Some(max_tokens) = positive_max_tokens(request.max_tokens) { body["max_tokens"] = serde_json::json!(max_tokens); } @@ -632,7 +632,7 @@ impl SpacebotModel { ); } - if !is_chatgpt_codex && let Some(max_tokens) = request.max_tokens { + if !is_chatgpt_codex && let Some(max_tokens) = positive_max_tokens(request.max_tokens) { body["max_output_tokens"] = serde_json::json!(max_tokens); } @@ -760,7 +760,7 @@ impl SpacebotModel { "messages": messages, }); - if let Some(max_tokens) = request.max_tokens { + if let Some(max_tokens) = positive_max_tokens(request.max_tokens) { body["max_tokens"] = serde_json::json!(max_tokens); } @@ -824,16 +824,7 @@ impl SpacebotModel { /// Remap model name for providers that require a different format in API calls. fn remap_model_name_for_api(&self) -> String { - if self.provider == "zai-coding-plan" { - // Z.AI Coding Plan API expects "zai/glm-5" not "glm-5" - let model_name = self - .model_name - .strip_prefix("zai/") - .unwrap_or(&self.model_name); - format!("zai/{model_name}") - } else { - self.model_name.clone() - } + remap_model_name_for_api(&self.provider, &self.model_name) } /// Generic OpenAI-compatible API call with optional bearer auth. 
@@ -861,7 +852,7 @@ impl SpacebotModel { "messages": messages, }); - if let Some(max_tokens) = request.max_tokens { + if let Some(max_tokens) = positive_max_tokens(request.max_tokens) { body["max_tokens"] = serde_json::json!(max_tokens); } @@ -943,6 +934,10 @@ fn reverse_map_tool_names( } } +fn positive_max_tokens(max_tokens: Option) -> Option { + max_tokens.filter(|value| *value > 0) +} + fn tool_result_content_to_string(content: &OneOrMany) -> String { content .iter() @@ -1526,6 +1521,18 @@ fn parse_openai_error_message(response_text: &str) -> Option { .map(ToOwned::to_owned) } +fn remap_model_name_for_api(provider: &str, model_name: &str) -> String { + if provider == "zai-coding-plan" { + // Coding Plan endpoint expects plain model ids (e.g. "glm-5"). + model_name + .strip_prefix("zai/") + .unwrap_or(model_name) + .to_string() + } else { + model_name.to_string() + } +} + #[cfg(test)] mod tests { use super::*; @@ -1568,4 +1575,32 @@ mod tests { panic!("expected ToolCall"); } } + + #[test] + fn positive_max_tokens_omits_none_and_zero() { + assert_eq!(positive_max_tokens(None), None); + assert_eq!(positive_max_tokens(Some(0)), None); + } + + #[test] + fn positive_max_tokens_keeps_positive_values() { + assert_eq!(positive_max_tokens(Some(1)), Some(1)); + assert_eq!(positive_max_tokens(Some(2048)), Some(2048)); + } + + #[test] + fn coding_plan_model_name_uses_plain_glm_id() { + assert_eq!( + remap_model_name_for_api("zai-coding-plan", "glm-5"), + "glm-5" + ); + assert_eq!( + remap_model_name_for_api("zai-coding-plan", "zai/glm-5"), + "glm-5" + ); + assert_eq!( + remap_model_name_for_api("openai", "gpt-4o-mini"), + "gpt-4o-mini" + ); + } } diff --git a/src/main.rs b/src/main.rs index e1eec905c..9a54ae462 100644 --- a/src/main.rs +++ b/src/main.rs @@ -126,6 +126,400 @@ struct ActiveChannel { _outbound_handle: tokio::task::JoinHandle<()>, } +const WORKER_RECEIPT_GLOBAL_DISPATCH_INTERVAL_SECS: u64 = 5; +const WORKER_RECEIPT_GLOBAL_DISPATCH_BATCH_SIZE: i64 = 32; +const 
WORKER_RECEIPT_PRUNE_INTERVAL_SECS: u64 = 60 * 60; + +struct OutboundRouteContext<'a> { + messaging_for_outbound: &'a spacebot::messaging::MessagingManager, + current_message: &'a spacebot::InboundMessage, + outbound_conversation_id: &'a str, + outbound_agent_names: &'a HashMap, + sse_agent_id: &'a str, + api_event_tx: &'a tokio::sync::broadcast::Sender, +} + +struct RoutedOutboundResponse { + delivery_result: spacebot::Result<()>, + delivery_outcome: spacebot::messaging::traits::DeliveryOutcome, + status_surfaced: bool, + is_status_update: bool, + acknowledged_worker_id: Option, +} + +struct ReceiptDeliveryContext<'a> { + outbound_process_logger: &'a spacebot::conversation::history::ProcessRunLogger, + outbound_conversation_logger: &'a spacebot::conversation::history::ConversationLogger, + outbound_channel_id: &'a spacebot::ChannelId, + outbound_conversation_id: &'a str, +} + +fn outbound_response_text(response: &spacebot::OutboundResponse) -> Option { + match response { + spacebot::OutboundResponse::Text(text) + | spacebot::OutboundResponse::StreamChunk(text) + | spacebot::OutboundResponse::Ephemeral { text, .. } + | spacebot::OutboundResponse::ScheduledMessage { text, .. } + | spacebot::OutboundResponse::RichMessage { text, .. } + | spacebot::OutboundResponse::ThreadReply { text, .. } => Some(text.clone()), + _ => None, + } +} + +fn acknowledged_worker_id_from_response( + response: &spacebot::OutboundResponse, +) -> Option { + match response { + spacebot::OutboundResponse::Status(spacebot::StatusUpdate::WorkerStarted { + worker_id, + .. + }) + | spacebot::OutboundResponse::Status(spacebot::StatusUpdate::WorkerCheckpoint { + worker_id, + .. + }) + | spacebot::OutboundResponse::Status(spacebot::StatusUpdate::WorkerCompleted { + worker_id, + .. 
+ }) => Some(*worker_id), + _ => None, + } +} + +fn emit_outbound_sse_event( + api_event_tx: &tokio::sync::broadcast::Sender, + sse_agent_id: &str, + sse_channel_id: &str, + response: &spacebot::OutboundResponse, +) { + match response { + spacebot::OutboundResponse::Text(text) + | spacebot::OutboundResponse::RichMessage { text, .. } + | spacebot::OutboundResponse::ThreadReply { text, .. } => { + api_event_tx + .send(spacebot::api::ApiEvent::OutboundMessage { + agent_id: sse_agent_id.to_string(), + channel_id: sse_channel_id.to_string(), + text: text.clone(), + }) + .ok(); + } + spacebot::OutboundResponse::Status(spacebot::StatusUpdate::Thinking) => { + api_event_tx + .send(spacebot::api::ApiEvent::TypingState { + agent_id: sse_agent_id.to_string(), + channel_id: sse_channel_id.to_string(), + is_typing: true, + }) + .ok(); + } + spacebot::OutboundResponse::Status(spacebot::StatusUpdate::StopTyping) => { + api_event_tx + .send(spacebot::api::ApiEvent::TypingState { + agent_id: sse_agent_id.to_string(), + channel_id: sse_channel_id.to_string(), + is_typing: false, + }) + .ok(); + } + _ => {} + } +} + +fn should_emit_outbound_sse_event(routed: &RoutedOutboundResponse) -> bool { + if routed.delivery_result.is_err() { + return false; + } + if routed.is_status_update { + return routed.status_surfaced; + } + true +} + +async fn route_internal_link_reply( + context: &OutboundRouteContext<'_>, + response: &spacebot::OutboundResponse, +) -> spacebot::Result { + let Some(text) = outbound_response_text(response) else { + return Ok(spacebot::messaging::traits::DeliveryOutcome::NotSurfaced); + }; + + let reply_to_agent = context + .current_message + .metadata + .get("reply_to_agent") + .and_then(|value| value.as_str()) + .map(str::to_owned); + let reply_to_channel = context + .current_message + .metadata + .get("reply_to_channel") + .and_then(|value| value.as_str()) + .map(str::to_owned); + + let (Some(target_agent), Some(target_channel)) = (reply_to_agent, reply_to_channel) else 
{ + return Err(spacebot::Error::Other(anyhow::anyhow!( + "internal link reply missing reply_to_agent/reply_to_channel metadata" + ))); + }; + + let agent_display = context + .outbound_agent_names + .get(context.sse_agent_id) + .cloned() + .unwrap_or_else(|| context.sse_agent_id.to_string()); + + let original_text = match &context.current_message.content { + spacebot::MessageContent::Text(text) => Some(text.clone()), + spacebot::MessageContent::Media { text, .. } => text.clone(), + _ => None, + }; + + let mut metadata = HashMap::new(); + metadata.insert( + "from_agent_id".to_string(), + serde_json::json!(context.sse_agent_id), + ); + metadata.insert( + "reply_to_agent".to_string(), + serde_json::json!(context.sse_agent_id), + ); + metadata.insert( + "reply_to_channel".to_string(), + serde_json::json!(context.outbound_conversation_id), + ); + if let Some(original) = original_text { + metadata.insert( + "original_sent_message".to_string(), + serde_json::json!(original), + ); + } + if let Some(originating) = context.current_message.metadata.get("originating_channel") { + metadata.insert("originating_channel".to_string(), originating.clone()); + } + if let Some(source) = context.current_message.metadata.get("originating_source") { + metadata.insert("originating_source".to_string(), source.clone()); + } + + let reply_message = spacebot::InboundMessage { + id: uuid::Uuid::new_v4().to_string(), + source: "internal".to_string(), + conversation_id: target_channel.clone(), + sender_id: context.sse_agent_id.to_string(), + agent_id: Some(Arc::from(target_agent.as_str())), + content: spacebot::MessageContent::Text(text), + timestamp: chrono::Utc::now(), + metadata, + formatted_author: Some(format!("[{agent_display}]")), + }; + + context + .messaging_for_outbound + .inject_message(reply_message) + .await?; + + context + .api_event_tx + .send(spacebot::api::ApiEvent::AgentMessageSent { + from_agent_id: context.sse_agent_id.to_string(), + to_agent_id: target_agent.clone(), + 
link_id: target_channel.clone(), + channel_id: target_channel.clone(), + }) + .ok(); + + tracing::info!( + from = %context.sse_agent_id, + to = %target_agent, + channel = %target_channel, + "routed link channel reply" + ); + + Ok(spacebot::messaging::traits::DeliveryOutcome::Surfaced) +} + +async fn route_outbound_response( + context: &OutboundRouteContext<'_>, + response: spacebot::OutboundResponse, +) -> RoutedOutboundResponse { + if context.current_message.source == "internal" { + let acknowledged_worker_id = acknowledged_worker_id_from_response(&response); + if matches!(response, spacebot::OutboundResponse::Status(_)) { + return RoutedOutboundResponse { + delivery_result: Ok(()), + delivery_outcome: spacebot::messaging::traits::DeliveryOutcome::Surfaced, + status_surfaced: true, + is_status_update: true, + acknowledged_worker_id, + }; + } + + let (delivery_result, delivery_outcome) = + match route_internal_link_reply(context, &response).await { + Ok(outcome) => (Ok(()), outcome), + Err(error) => ( + Err(error), + spacebot::messaging::traits::DeliveryOutcome::NotSurfaced, + ), + }; + let status_surfaced = delivery_outcome.is_surfaced(); + return RoutedOutboundResponse { + delivery_result, + delivery_outcome, + status_surfaced, + is_status_update: false, + acknowledged_worker_id, + }; + } + + let acknowledged_worker_id = acknowledged_worker_id_from_response(&response); + match response { + spacebot::OutboundResponse::Status(status) => { + let (delivery_result, delivery_outcome) = match context + .messaging_for_outbound + .send_status(context.current_message, status) + .await + { + Ok(outcome) => (Ok(()), outcome), + Err(error) => ( + Err(error), + spacebot::messaging::traits::DeliveryOutcome::NotSurfaced, + ), + }; + let status_surfaced = delivery_outcome.is_surfaced(); + RoutedOutboundResponse { + delivery_result, + delivery_outcome, + status_surfaced, + is_status_update: true, + acknowledged_worker_id, + } + } + response => { + tracing::info!( + conversation_id 
= %context.outbound_conversation_id, + "routing outbound response to messaging adapter" + ); + let delivery_result = context + .messaging_for_outbound + .respond(context.current_message, response) + .await; + let delivery_outcome = if delivery_result.is_ok() { + spacebot::messaging::traits::DeliveryOutcome::Surfaced + } else { + spacebot::messaging::traits::DeliveryOutcome::NotSurfaced + }; + let status_surfaced = delivery_outcome.is_surfaced(); + RoutedOutboundResponse { + delivery_result, + delivery_outcome, + status_surfaced, + is_status_update: false, + acknowledged_worker_id, + } + } + } +} + +async fn handle_delivery_receipt( + context: &ReceiptDeliveryContext<'_>, + receipt_id: &str, + routed: &RoutedOutboundResponse, + receipt_log_text: Option<&str>, +) { + if routed.is_status_update && !routed.status_surfaced { + let failure_reason = unsurfaced_status_failure_reason(&routed.delivery_result); + match context + .outbound_process_logger + .fail_worker_delivery_receipt_attempt(receipt_id, &failure_reason) + .await + { + Ok(outcome) => { + tracing::warn!( + channel_id = %context.outbound_conversation_id, + receipt_id = %receipt_id, + attempt_count = outcome.attempt_count, + status = %outcome.status, + next_attempt_at = ?outcome.next_attempt_at, + "worker terminal receipt was not surfaced; scheduled retry" + ); + } + Err(update_error) => { + tracing::warn!( + %update_error, + channel_id = %context.outbound_conversation_id, + receipt_id = %receipt_id, + "failed to record unsurfaced worker terminal receipt" + ); + } + } + return; + } + + match &routed.delivery_result { + Ok(()) => match context + .outbound_process_logger + .ack_worker_delivery_receipt(receipt_id) + .await + { + Ok(acked_now) => { + if acked_now { + if let Some(text) = receipt_log_text { + context + .outbound_conversation_logger + .log_bot_message(context.outbound_channel_id, text); + } + tracing::info!( + channel_id = %context.outbound_conversation_id, + receipt_id = %receipt_id, + "worker 
terminal receipt delivered" + ); + } + } + Err(error) => { + tracing::warn!( + %error, + channel_id = %context.outbound_conversation_id, + receipt_id = %receipt_id, + "failed to ack worker terminal receipt" + ); + } + }, + Err(error) => match context + .outbound_process_logger + .fail_worker_delivery_receipt_attempt(receipt_id, &error.to_string()) + .await + { + Ok(outcome) => { + tracing::warn!( + channel_id = %context.outbound_conversation_id, + receipt_id = %receipt_id, + attempt_count = outcome.attempt_count, + status = %outcome.status, + next_attempt_at = ?outcome.next_attempt_at, + "worker terminal receipt delivery failed" + ); + } + Err(update_error) => { + tracing::warn!( + %update_error, + channel_id = %context.outbound_conversation_id, + receipt_id = %receipt_id, + "failed to record worker terminal receipt delivery failure" + ); + } + }, + } +} + +fn unsurfaced_status_failure_reason(delivery_result: &spacebot::Result<()>) -> String { + delivery_result + .as_ref() + .err() + .map(std::string::ToString::to_string) + .unwrap_or_else(|| "status update not surfaced by adapter".to_string()) +} + fn main() -> anyhow::Result<()> { rustls::crypto::ring::default_provider() .install_default() @@ -243,6 +637,217 @@ async fn cmd_stop() -> anyhow::Result<()> { Ok(()) } +fn spawn_worker_receipt_dispatch_loop( + agent_id: String, + process_run_logger: spacebot::conversation::history::ProcessRunLogger, + channel_store: spacebot::conversation::ChannelStore, + conversation_logger: spacebot::conversation::history::ConversationLogger, + messaging_manager: Arc, + api_event_tx: tokio::sync::broadcast::Sender, +) -> tokio::task::JoinHandle<()> { + tokio::spawn(async move { + const FAILURE_THRESHOLD: usize = 3; + let mut next_prune_at = tokio::time::Instant::now() + std::time::Duration::from_secs(5); + let mut consecutive_failures: usize = 0; + loop { + if tokio::time::Instant::now() >= next_prune_at { + match process_run_logger.prune_worker_delivery_receipts().await { + 
Ok(deleted) if deleted > 0 => { + tracing::info!( + agent_id = %agent_id, + deleted, + "pruned old worker delivery receipts" + ); + } + Ok(_) => {} + Err(error) => { + tracing::warn!( + agent_id = %agent_id, + %error, + "failed to prune worker delivery receipts" + ); + } + } + next_prune_at = tokio::time::Instant::now() + + std::time::Duration::from_secs(WORKER_RECEIPT_PRUNE_INTERVAL_SECS); + } + + let due = match process_run_logger + .claim_due_worker_terminal_receipts_any(WORKER_RECEIPT_GLOBAL_DISPATCH_BATCH_SIZE) + .await + { + Ok(receipts) => { + consecutive_failures = 0; + receipts + } + Err(error) => { + consecutive_failures = consecutive_failures.saturating_add(1); + tracing::warn!( + agent_id = %agent_id, + consecutive_failures, + %error, + "global worker receipt dispatcher failed to claim receipts" + ); + if consecutive_failures >= FAILURE_THRESHOLD { + tracing::error!( + agent_id = %agent_id, + consecutive_failures, + threshold = FAILURE_THRESHOLD, + next_prune_at = ?next_prune_at, + "worker receipt dispatcher circuit breaker opened after repeated claim failures" + ); + return; + } + Vec::new() + } + }; + + for receipt in due { + let delivery_result = async { + let channel_info = channel_store + .get(&receipt.channel_id) + .await + .map_err(|error| { + anyhow::anyhow!("failed to resolve channel info: {error}") + })? 
+ .ok_or_else(|| { + anyhow::anyhow!( + "cannot deliver worker receipt: channel '{}' not found", + receipt.channel_id + ) + })?; + + let target = + spacebot::messaging::target::resolve_broadcast_target(&channel_info) + .ok_or_else(|| { + anyhow::anyhow!( + "cannot resolve broadcast target for channel '{}'", + receipt.channel_id + ) + })?; + + messaging_manager + .broadcast( + &target.adapter, + &target.target, + spacebot::OutboundResponse::Text(receipt.payload_text.clone()), + ) + .await + } + .await; + + match delivery_result { + Ok(()) => match process_run_logger + .ack_worker_delivery_receipt(&receipt.id) + .await + { + Ok(acked_now) => { + consecutive_failures = 0; + if acked_now { + let channel_id: spacebot::ChannelId = + Arc::from(receipt.channel_id.as_str()); + conversation_logger + .log_bot_message(&channel_id, &receipt.payload_text); + api_event_tx + .send(spacebot::api::ApiEvent::OutboundMessage { + agent_id: agent_id.clone(), + channel_id: receipt.channel_id.clone(), + text: receipt.payload_text.clone(), + }) + .ok(); + tracing::info!( + agent_id = %agent_id, + channel_id = %receipt.channel_id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + "global worker receipt dispatcher delivered terminal receipt" + ); + } + } + Err(error) => { + consecutive_failures = consecutive_failures.saturating_add(1); + tracing::warn!( + agent_id = %agent_id, + channel_id = %receipt.channel_id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + consecutive_failures, + %error, + "failed to ack globally delivered worker terminal receipt" + ); + if consecutive_failures >= FAILURE_THRESHOLD { + tracing::error!( + agent_id = %agent_id, + channel_id = %receipt.channel_id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + consecutive_failures, + threshold = FAILURE_THRESHOLD, + next_prune_at = ?next_prune_at, + "worker receipt dispatcher circuit breaker opened after repeated ack failures" + ); + return; + } + } + }, + Err(error) => { + 
match process_run_logger + .fail_worker_delivery_receipt_attempt(&receipt.id, &error.to_string()) + .await + { + Ok(outcome) => { + consecutive_failures = 0; + tracing::warn!( + agent_id = %agent_id, + channel_id = %receipt.channel_id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + attempt_count = outcome.attempt_count, + status = %outcome.status, + next_attempt_at = ?outcome.next_attempt_at, + consecutive_failures, + %error, + "global worker receipt dispatcher failed to deliver terminal receipt" + ); + } + Err(update_error) => { + consecutive_failures = consecutive_failures.saturating_add(1); + tracing::warn!( + agent_id = %agent_id, + channel_id = %receipt.channel_id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + consecutive_failures, + %update_error, + "failed to record global worker receipt delivery failure" + ); + } + } + if consecutive_failures >= FAILURE_THRESHOLD { + tracing::error!( + agent_id = %agent_id, + channel_id = %receipt.channel_id, + worker_id = %receipt.worker_id, + receipt_id = %receipt.id, + consecutive_failures, + threshold = FAILURE_THRESHOLD, + next_prune_at = ?next_prune_at, + "worker receipt dispatcher circuit breaker opened after repeated delivery/update failures" + ); + return; + } + } + } + } + + tokio::time::sleep(std::time::Duration::from_secs( + WORKER_RECEIPT_GLOBAL_DISPATCH_INTERVAL_SECS, + )) + .await; + } + }) +} + /// Stop if running, don't error if not. 
fn cmd_stop_if_running() { let paths = spacebot::daemon::DaemonPaths::from_default(); @@ -856,7 +1461,8 @@ async fn run( }; // Create outbound response channel - let (response_tx, mut response_rx) = mpsc::channel::(32); + let (response_tx, mut response_rx) = + mpsc::channel::(32); // Subscribe to the agent's event bus let event_rx = agent.deps.event_tx.subscribe(); @@ -930,169 +1536,89 @@ async fn run( let latest_message = Arc::new(tokio::sync::RwLock::new(message.clone())); let outbound_message = latest_message.clone(); let outbound_conversation_id = conversation_id.clone(); + let outbound_channel_id: spacebot::ChannelId = + Arc::from(outbound_conversation_id.clone()); + let outbound_process_logger = + spacebot::conversation::history::ProcessRunLogger::new( + agent.db.sqlite.clone(), + ); + let outbound_conversation_logger = + spacebot::conversation::history::ConversationLogger::new( + agent.db.sqlite.clone(), + ); + let outbound_agent_names = agent.deps.agent_names.clone(); let api_event_tx = api_state.event_tx.clone(); let sse_agent_id = agent_id.to_string(); let sse_channel_id = conversation_id.clone(); - let outbound_agent_names = agent.deps.agent_names.clone(); let outbound_handle = tokio::spawn(async move { - while let Some(response) = response_rx.recv().await { - // Forward relevant events to SSE clients - match &response { - spacebot::OutboundResponse::Text(text) => { - api_event_tx.send(spacebot::api::ApiEvent::OutboundMessage { - agent_id: sse_agent_id.clone(), - channel_id: sse_channel_id.clone(), - text: text.clone(), - }).ok(); - } - spacebot::OutboundResponse::RichMessage { text, .. } => { - api_event_tx.send(spacebot::api::ApiEvent::OutboundMessage { - agent_id: sse_agent_id.clone(), - channel_id: sse_channel_id.clone(), - text: text.clone(), - }).ok(); - } - spacebot::OutboundResponse::ThreadReply { text, .. 
} => { - api_event_tx.send(spacebot::api::ApiEvent::OutboundMessage { - agent_id: sse_agent_id.clone(), - channel_id: sse_channel_id.clone(), - text: text.clone(), - }).ok(); - } - spacebot::OutboundResponse::Status(spacebot::StatusUpdate::Thinking) => { - api_event_tx.send(spacebot::api::ApiEvent::TypingState { - agent_id: sse_agent_id.clone(), - channel_id: sse_channel_id.clone(), - is_typing: true, - }).ok(); - } - spacebot::OutboundResponse::Status(spacebot::StatusUpdate::StopTyping) => { - api_event_tx.send(spacebot::api::ApiEvent::TypingState { - agent_id: sse_agent_id.clone(), - channel_id: sse_channel_id.clone(), - is_typing: false, - }).ok(); - } - _ => {} - } + while let Some(envelope) = response_rx.recv().await { + let receipt_id = envelope.receipt_id; + let response = envelope.response; + let receipt_log_text = outbound_response_text(&response); + let response_for_sse = response.clone(); let current_message = outbound_message.read().await.clone(); + let route_context = OutboundRouteContext { + messaging_for_outbound: &messaging_for_outbound, + current_message: ¤t_message, + outbound_conversation_id: &outbound_conversation_id, + outbound_agent_names: &outbound_agent_names, + sse_agent_id: &sse_agent_id, + api_event_tx: &api_event_tx, + }; + let routed = route_outbound_response(&route_context, response).await; + if should_emit_outbound_sse_event(&routed) { + emit_outbound_sse_event( + &api_event_tx, + &sse_agent_id, + &sse_channel_id, + &response_for_sse, + ); + } - // Internal link channels: route replies back to the sender's link channel - if current_message.source == "internal" { - let reply_text = match &response { - spacebot::OutboundResponse::Text(t) => Some(t.clone()), - spacebot::OutboundResponse::RichMessage { text, .. } => Some(text.clone()), - spacebot::OutboundResponse::ThreadReply { text, .. 
} => Some(text.clone()), - spacebot::OutboundResponse::Status(_) => None, - _ => None, - }; + if let (Ok(()), Some(worker_id)) = + (&routed.delivery_result, routed.acknowledged_worker_id) + && routed.status_surfaced + && let Err(error) = outbound_process_logger + .mark_worker_task_contract_acknowledged(worker_id) + .await + { + tracing::warn!( + %error, + channel_id = %outbound_conversation_id, + worker_id = %worker_id, + "failed to mark worker task contract acknowledged" + ); + } - if let Some(text) = reply_text { - let reply_to_agent = current_message.metadata - .get("reply_to_agent") - .and_then(|v| v.as_str()) - .map(String::from); - let reply_to_channel = current_message.metadata - .get("reply_to_channel") - .and_then(|v| v.as_str()) - .map(String::from); - - if let (Some(target_agent), Some(target_channel)) = (reply_to_agent, reply_to_channel) { - let agent_display = outbound_agent_names - .get(&sse_agent_id) - .cloned() - .unwrap_or_else(|| sse_agent_id.clone()); - - // Include the original sent message so the receiving - // agent's link channel can seed its history with context - let original_text = match ¤t_message.content { - spacebot::MessageContent::Text(t) => Some(t.clone()), - spacebot::MessageContent::Media { text, .. } => text.clone(), - _ => None, - }; - - let mut metadata = std::collections::HashMap::from([ - ("from_agent_id".into(), serde_json::json!(&sse_agent_id)), - ("reply_to_agent".into(), serde_json::json!(&sse_agent_id)), - ("reply_to_channel".into(), serde_json::json!(&outbound_conversation_id)), - ]); - if let Some(original) = original_text { - metadata.insert("original_sent_message".into(), serde_json::json!(original)); - } - // Propagate originating_channel and originating_source so both sides - // know where to route conclusions and which adapter to use. 
- if let Some(originating) = current_message.metadata.get("originating_channel") { - metadata.insert("originating_channel".into(), originating.clone()); - } - if let Some(source) = current_message.metadata.get("originating_source") { - metadata.insert("originating_source".into(), source.clone()); - } - - let reply_message = spacebot::InboundMessage { - id: uuid::Uuid::new_v4().to_string(), - source: "internal".into(), - conversation_id: target_channel.clone(), - sender_id: sse_agent_id.clone(), - agent_id: Some(Arc::from(target_agent.as_str())), - content: spacebot::MessageContent::Text(text), - timestamp: chrono::Utc::now(), - metadata, - formatted_author: Some(format!("[{agent_display}]")), - }; - - if let Err(error) = messaging_for_outbound - .inject_message(reply_message) - .await - { - tracing::error!( - %error, - from = %sse_agent_id, - to = %target_agent, - "failed to route link channel reply" - ); - } else { - // Emit SSE event so the dashboard animates the edge - api_event_tx.send(spacebot::api::ApiEvent::AgentMessageSent { - from_agent_id: sse_agent_id.clone(), - to_agent_id: target_agent.clone(), - link_id: target_channel.clone(), - channel_id: target_channel.clone(), - }).ok(); - - tracing::info!( - from = %sse_agent_id, - to = %target_agent, - channel = %target_channel, - "routed link channel reply" - ); - } - } - } - continue; + if let Some(receipt_id) = receipt_id.as_deref() { + let receipt_context = ReceiptDeliveryContext { + outbound_process_logger: &outbound_process_logger, + outbound_conversation_logger: &outbound_conversation_logger, + outbound_channel_id: &outbound_channel_id, + outbound_conversation_id: &outbound_conversation_id, + }; + handle_delivery_receipt( + &receipt_context, + receipt_id, + &routed, + receipt_log_text.as_deref(), + ) + .await; } - match response { - spacebot::OutboundResponse::Status(status) => { - if let Err(error) = messaging_for_outbound - .send_status(¤t_message, status) - .await - { - tracing::warn!(%error, "failed 
to send status update"); - } - } - response => { - tracing::info!( - conversation_id = %outbound_conversation_id, - "routing outbound response to messaging adapter" - ); - if let Err(error) = messaging_for_outbound - .respond(¤t_message, response) - .await - { - tracing::error!(%error, "failed to send outbound response"); - } + if let Err(error) = &routed.delivery_result { + if routed.is_status_update { + tracing::warn!(%error, "failed to send status update"); + } else { + tracing::error!(%error, "failed to send outbound response"); } + } else if routed.is_status_update && !routed.status_surfaced { + tracing::warn!( + channel_id = %outbound_conversation_id, + delivery_outcome = ?routed.delivery_outcome, + "status update was accepted by adapter but not surfaced" + ); } } }); @@ -1372,6 +1898,33 @@ async fn initialize_agents( ) })?; + let process_run_logger = + spacebot::conversation::history::ProcessRunLogger::new(db.sqlite.clone()); + let (recovered_workers, recovered_branches, recovered_receipts, recovered_contracts) = + process_run_logger + .close_orphaned_runs() + .await + .with_context(|| { + format!( + "failed to recover orphaned runs for agent '{}'", + agent_config.id + ) + })?; + if recovered_workers > 0 + || recovered_branches > 0 + || recovered_receipts > 0 + || recovered_contracts > 0 + { + tracing::warn!( + agent_id = %agent_config.id, + recovered_workers, + recovered_branches, + recovered_receipts, + recovered_contracts, + "recovered orphaned process runs from previous startup" + ); + } + // Per-agent settings store (redb-backed) let settings_path = agent_config.data_dir.join("settings.redb"); let settings_store = Arc::new( @@ -1657,6 +2210,21 @@ async fn initialize_agents( tracing::info!("messaging adapters started"); + // Start a global worker terminal receipt dispatcher for each agent so + // pending receipts are delivered even when no channel loop is active. 
+ for (agent_id, agent) in agents.iter() { + let handle = spawn_worker_receipt_dispatch_loop( + agent_id.to_string(), + spacebot::conversation::history::ProcessRunLogger::new(agent.db.sqlite.clone()), + spacebot::conversation::ChannelStore::new(agent.db.sqlite.clone()), + spacebot::conversation::history::ConversationLogger::new(agent.db.sqlite.clone()), + messaging_manager.clone(), + api_state.event_tx.clone(), + ); + cortex_handles.push(handle); + tracing::info!(agent_id = %agent_id, "worker receipt dispatcher loop started"); + } + // Initialize cron schedulers for each agent let mut cron_stores_map = std::collections::HashMap::new(); let mut cron_schedulers_map = std::collections::HashMap::new(); @@ -1800,3 +2368,28 @@ async fn initialize_agents( Ok(()) } + +#[cfg(test)] +mod tests { + use super::unsurfaced_status_failure_reason; + + #[test] + fn unsurfaced_status_failure_reason_uses_adapter_error_when_present() { + let result: spacebot::Result<()> = Err(spacebot::Error::from(anyhow::anyhow!( + "discord adapter rejected update" + ))); + assert_eq!( + unsurfaced_status_failure_reason(&result), + "discord adapter rejected update" + ); + } + + #[test] + fn unsurfaced_status_failure_reason_falls_back_for_non_error_result() { + let result: spacebot::Result<()> = Ok(()); + assert_eq!( + unsurfaced_status_failure_reason(&result), + "status update not surfaced by adapter" + ); + } +} diff --git a/src/messaging/discord.rs b/src/messaging/discord.rs index ab433dd3a..2d5e1f53a 100644 --- a/src/messaging/discord.rs +++ b/src/messaging/discord.rs @@ -1,7 +1,7 @@ //! Discord messaging adapter using serenity. 
use crate::config::DiscordPermissions; -use crate::messaging::traits::{HistoryMessage, InboundStream, Messaging}; +use crate::messaging::traits::{DeliveryOutcome, HistoryMessage, InboundStream, Messaging}; use crate::{InboundMessage, MessageContent, OutboundResponse, StatusUpdate}; use anyhow::Context as _; @@ -26,6 +26,9 @@ pub struct DiscordAdapter { bot_user_id: Arc>>, /// Maps InboundMessage.id to the Discord MessageId being edited during streaming. active_messages: Arc>>, + /// Per-worker-per-channel progress message used for worker checkpoint edits. + /// Keys are generated by `progress_message_key()` as `{channel_id}:{worker_id}`. + progress_messages: Arc>>, /// Typing handles per message. Typing stops when the handle is dropped. typing_tasks: Arc>>, shard_manager: Arc>>>, @@ -39,6 +42,7 @@ impl DiscordAdapter { http: Arc::new(RwLock::new(None)), bot_user_id: Arc::new(RwLock::new(None)), active_messages: Arc::new(RwLock::new(HashMap::new())), + progress_messages: Arc::new(RwLock::new(HashMap::new())), typing_tasks: Arc::new(RwLock::new(HashMap::new())), shard_manager: Arc::new(RwLock::new(None)), } @@ -85,6 +89,83 @@ impl DiscordAdapter { .and_then(|value| value.as_u64()) .map(MessageId::new) } + + fn progress_message_key(message: &InboundMessage, worker_id: crate::WorkerId) -> String { + format!("{}:{worker_id}", Self::channel_key(message)) + } + + async fn upsert_progress_message( + &self, + message: &InboundMessage, + worker_id: crate::WorkerId, + content: &str, + ) -> anyhow::Result<()> { + let http = self.get_http().await?; + let channel_id = self.extract_channel_id(message)?; + let key = Self::progress_message_key(message, worker_id); + let display_text = if content.len() > 2000 { + let end = content.floor_char_boundary(1997); + format!("{}...", &content[..end]) + } else { + content.to_string() + }; + let existing_message_id = { + let progress_messages = self.progress_messages.read().await; + progress_messages.get(&key).copied() + }; + + if let 
Some(message_id) = existing_message_id { + let builder = EditMessage::new().content(display_text.clone()); + match channel_id.edit_message(&*http, message_id, builder).await { + Ok(_) => return Ok(()), + Err(error) => { + tracing::warn!(%error, "failed to edit progress message; creating a new one"); + self.progress_messages.write().await.remove(&key); + } + } + } + + let reply_to = Self::extract_reply_message_id(message); + let mut builder = CreateMessage::new().content(display_text); + if let Some(reply_message_id) = reply_to { + builder = builder.reference_message((channel_id, reply_message_id)); + } + let sent = channel_id + .send_message(&*http, builder) + .await + .context("failed to send worker progress message")?; + self.progress_messages.write().await.insert(key, sent.id); + Ok(()) + } + + async fn clear_progress_message(&self, message: &InboundMessage, worker_id: crate::WorkerId) { + self.progress_messages + .write() + .await + .remove(&Self::progress_message_key(message, worker_id)); + } + + async fn handle_worker_progress( + &self, + message: &InboundMessage, + worker_id: crate::WorkerId, + text: String, + clear_on_success: bool, + ) -> bool { + self.stop_typing(message).await; + if let Err(error) = self + .upsert_progress_message(message, worker_id, &text) + .await + { + tracing::debug!(%error, "failed to update discord progress message"); + false + } else { + if clear_on_success { + self.clear_progress_message(message, worker_id).await; + } + true + } + } } impl Messaging for DiscordAdapter { @@ -359,8 +440,8 @@ impl Messaging for DiscordAdapter { &self, message: &InboundMessage, status: StatusUpdate, - ) -> crate::Result<()> { - match status { + ) -> crate::Result { + let surfaced = match status { StatusUpdate::Thinking => { let http = self.get_http().await?; let channel_id = self.extract_channel_id(message)?; @@ -370,13 +451,48 @@ impl Messaging for DiscordAdapter { .write() .await .insert(Self::channel_key(message), typing); + true + } + 
StatusUpdate::WorkerStarted { worker_id, task } => { + let text = format!( + "Background task `{}` started: {}", + short_worker_id(worker_id), + task + ); + self.handle_worker_progress(message, worker_id, text, false) + .await + } + StatusUpdate::WorkerCheckpoint { worker_id, status } => { + let text = format!( + "Background task `{}`: {}", + short_worker_id(worker_id), + status + ); + self.handle_worker_progress(message, worker_id, text, false) + .await } - _ => { + StatusUpdate::WorkerCompleted { worker_id, result } => { + let text = format!( + "Background task `{}` completed: {}", + short_worker_id(worker_id), + result + ); + self.handle_worker_progress(message, worker_id, text, true) + .await + } + StatusUpdate::StopTyping + | StatusUpdate::ToolStarted { .. } + | StatusUpdate::ToolCompleted { .. } + | StatusUpdate::BranchStarted { .. } => { self.stop_typing(message).await; + false } - } - - Ok(()) + }; + Ok(if surfaced { + DeliveryOutcome::Surfaced + } else { + DeliveryOutcome::NotSurfaced + }) } async fn broadcast(&self, target: &str, response: OutboundResponse) -> crate::Result<()> { @@ -542,6 +658,7 @@ impl Messaging for DiscordAdapter { async fn shutdown(&self) -> crate::Result<()> { self.typing_tasks.write().await.clear(); + self.progress_messages.write().await.clear(); if let Some(shard_manager) = self.shard_manager.read().await.as_ref() { shard_manager.shutdown_all().await; @@ -916,6 +1033,11 @@ async fn build_metadata( (metadata, formatted_author) } +fn short_worker_id(worker_id: crate::WorkerId) -> String { + let full = worker_id.to_string(); + full.chars().take(8).collect() +} + /// Split a message into chunks that fit within Discord's 2000 char limit. /// Tries to split at newlines, then spaces, then hard-cuts. 
fn split_message(text: &str, max_len: usize) -> Vec { @@ -1075,7 +1197,7 @@ fn build_poll( #[cfg(test)] mod tests { use super::*; - use crate::{Button, ButtonStyle, Card, CardField, InteractiveElements, Poll}; + use crate::{Button, ButtonStyle, Card, CardField, InteractiveElements, MessageContent, Poll}; #[test] fn test_build_embed_limits() { @@ -1133,4 +1255,34 @@ mod tests { let _ = build_poll(&poll); // Again, can't easily inspect CreatePoll fields, but we verify it runs. } + + #[test] + fn progress_message_key_is_scoped_per_worker() { + let worker_a = + uuid::Uuid::parse_str("11111111-1111-1111-1111-111111111111").expect("valid uuid"); + let worker_b = + uuid::Uuid::parse_str("22222222-2222-2222-2222-222222222222").expect("valid uuid"); + + let mut metadata = std::collections::HashMap::new(); + metadata.insert( + "discord_channel_id".to_string(), + serde_json::Value::from(42_u64), + ); + + let message = InboundMessage { + id: "msg-1".to_string(), + source: "discord".to_string(), + conversation_id: "discord:42".to_string(), + sender_id: "user-1".to_string(), + agent_id: None, + content: MessageContent::Text("hello".to_string()), + timestamp: chrono::Utc::now(), + metadata, + formatted_author: None, + }; + + let key_a = DiscordAdapter::progress_message_key(&message, worker_a); + let key_b = DiscordAdapter::progress_message_key(&message, worker_b); + assert_ne!(key_a, key_b, "workers in same channel need distinct keys"); + } } diff --git a/src/messaging/manager.rs b/src/messaging/manager.rs index c08014676..5df68dd87 100644 --- a/src/messaging/manager.rs +++ b/src/messaging/manager.rs @@ -1,6 +1,8 @@ //! MessagingManager: Fan-in and routing for all adapters. 
-use crate::messaging::traits::{HistoryMessage, InboundStream, Messaging, MessagingDyn}; +use crate::messaging::traits::{ + DeliveryOutcome, HistoryMessage, InboundStream, Messaging, MessagingDyn, +}; use crate::{InboundMessage, OutboundResponse, StatusUpdate}; use anyhow::Context as _; @@ -217,7 +219,7 @@ impl MessagingManager { &self, message: &InboundMessage, status: StatusUpdate, - ) -> crate::Result<()> { + ) -> crate::Result { let adapters = self.adapters.read().await; let adapter = adapters .get(&message.source) diff --git a/src/messaging/slack.rs b/src/messaging/slack.rs index 2aa1e3cbf..fbee1bf88 100644 --- a/src/messaging/slack.rs +++ b/src/messaging/slack.rs @@ -22,7 +22,7 @@ //! - DM broadcast via `conversations.open` use crate::config::{SlackCommandConfig, SlackPermissions}; -use crate::messaging::traits::{HistoryMessage, InboundStream, Messaging}; +use crate::messaging::traits::{DeliveryOutcome, HistoryMessage, InboundStream, Messaging}; use crate::{InboundMessage, MessageContent, OutboundResponse, StatusUpdate}; use anyhow::Context as _; @@ -787,7 +787,7 @@ impl Messaging for SlackAdapter { &self, message: &InboundMessage, status: StatusUpdate, - ) -> crate::Result<()> { + ) -> crate::Result { let thread_ts = match extract_thread_ts(message) { Some(ts) => ts, None => { @@ -796,12 +796,21 @@ impl Messaging for SlackAdapter { "skipping assistant.threads.setStatus — message has no thread_ts \ (typing indicators only work in Slack Assistant threads)" ); - return Ok(()); + return Ok(DeliveryOutcome::NotSurfaced); } }; let channel_id = match extract_channel_id(message) { Ok(id) => id, - Err(_) => return Ok(()), + Err(error) => { + tracing::debug!( + %error, + message_id = %message.id, + conversation_id = %message.conversation_id, + source = %message.source, + "skipping assistant.threads.setStatus — failed to extract channel id" + ); + return Ok(DeliveryOutcome::NotSurfaced); + } }; let status_text = match &status { @@ -809,7 +818,14 @@ impl Messaging for 
SlackAdapter { StatusUpdate::StopTyping => String::new(), // empty string clears the status StatusUpdate::ToolStarted { .. } => "Working…".to_string(), StatusUpdate::ToolCompleted { .. } => "Working…".to_string(), - _ => "Working…".to_string(), + StatusUpdate::WorkerStarted { task, .. } => { + format!("Starting: {}", truncate_status_text(task, 120)) + } + StatusUpdate::WorkerCheckpoint { status, .. } => truncate_status_text(status, 140), + StatusUpdate::WorkerCompleted { result, .. } => { + format!("Done: {}", truncate_status_text(result, 120)) + } + StatusUpdate::BranchStarted { .. } => "Branch started…".to_string(), }; let session = self.session(); @@ -823,9 +839,10 @@ impl Messaging for SlackAdapter { // Best-effort — don't propagate status errors into the main response pipeline. if let Err(err) = session.assistant_threads_set_status(&req).await { tracing::debug!(error = %err, "failed to set slack assistant thread status (non-fatal)"); + return Ok(DeliveryOutcome::NotSurfaced); } - Ok(()) + Ok(DeliveryOutcome::Surfaced) } async fn respond( @@ -1258,6 +1275,24 @@ fn markdown_content(text: impl Into) -> SlackMessageContent { } } +fn truncate_status_text(text: &str, max_chars: usize) -> String { + let char_count = text.chars().count(); + if char_count <= max_chars { + return text.to_string(); + } + + let visible_chars = max_chars.saturating_sub(3); + let end = if visible_chars == 0 { + 0 + } else { + text.char_indices() + .nth(visible_chars) + .map(|(index, _)| index) + .unwrap_or(text.len()) + }; + format!("{}...", &text[..end]) +} + /// Extract `MessageContent` from an optional `SlackMessageContent`. fn extract_message_content(content: &Option) -> MessageContent { let Some(msg_content) = content else { diff --git a/src/messaging/telegram.rs b/src/messaging/telegram.rs index eab7994c2..9d8cd83c1 100644 --- a/src/messaging/telegram.rs +++ b/src/messaging/telegram.rs @@ -1,7 +1,7 @@ //! Telegram messaging adapter using teloxide. 
use crate::config::TelegramPermissions; -use crate::messaging::traits::{InboundStream, Messaging}; +use crate::messaging::traits::{DeliveryOutcome, InboundStream, Messaging}; use crate::{Attachment, InboundMessage, MessageContent, OutboundResponse, StatusUpdate}; use anyhow::Context as _; @@ -472,7 +472,7 @@ impl Messaging for TelegramAdapter { .remove(&message.conversation_id); } OutboundResponse::Status(status) => { - self.send_status(message, status).await?; + let _ = self.send_status(message, status).await?; } // Slack-specific variants — graceful fallbacks for Telegram OutboundResponse::RemoveReaction(_) => {} // no-op @@ -493,7 +493,7 @@ impl Messaging for TelegramAdapter { &self, message: &InboundMessage, status: StatusUpdate, - ) -> crate::Result<()> { + ) -> crate::Result { match status { StatusUpdate::Thinking => { let chat_id = self.extract_chat_id(message)?; @@ -520,13 +520,22 @@ impl Messaging for TelegramAdapter { .write() .await .insert(conversation_id, handle); + Ok(DeliveryOutcome::Surfaced) } - _ => { + StatusUpdate::StopTyping + | StatusUpdate::ToolStarted { .. } + | StatusUpdate::ToolCompleted { .. } + | StatusUpdate::BranchStarted { .. } => { self.stop_typing(&message.conversation_id).await; + Ok(DeliveryOutcome::Surfaced) + } + StatusUpdate::WorkerStarted { .. } + | StatusUpdate::WorkerCheckpoint { .. } + | StatusUpdate::WorkerCompleted { .. } => { + // Telegram adapter does not currently surface worker status updates. + Ok(DeliveryOutcome::NotSurfaced) } } - - Ok(()) } async fn broadcast(&self, target: &str, response: OutboundResponse) -> crate::Result<()> { diff --git a/src/messaging/traits.rs b/src/messaging/traits.rs index 1f5e10f00..12870cb44 100644 --- a/src/messaging/traits.rs +++ b/src/messaging/traits.rs @@ -8,6 +8,21 @@ use std::pin::Pin; /// Message stream type. pub type InboundStream = Pin + Send>>; +/// Result of attempting to deliver a status update. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DeliveryOutcome { + /// Adapter surfaced the status update to the user. + Surfaced, + /// Adapter accepted the call but did not surface user-visible output. + NotSurfaced, +} + +impl DeliveryOutcome { + pub fn is_surfaced(self) -> bool { + matches!(self, Self::Surfaced) + } +} + /// A message from platform history used for backfilling channel context. #[derive(Debug, Clone)] pub struct HistoryMessage { @@ -37,8 +52,8 @@ pub trait Messaging: Send + Sync + 'static { &self, _message: &InboundMessage, _status: StatusUpdate, - ) -> impl std::future::Future> + Send { - async { Ok(()) } + ) -> impl std::future::Future> + Send { + async { Ok(DeliveryOutcome::NotSurfaced) } } /// Broadcast a message. @@ -90,7 +105,7 @@ pub trait MessagingDyn: Send + Sync + 'static { &'a self, message: &'a InboundMessage, status: StatusUpdate, - ) -> Pin> + Send + 'a>>; + ) -> Pin> + Send + 'a>>; fn broadcast<'a>( &'a self, @@ -136,7 +151,7 @@ impl MessagingDyn for T { &'a self, message: &'a InboundMessage, status: StatusUpdate, - ) -> Pin> + Send + 'a>> { + ) -> Pin> + Send + 'a>> { Box::pin(Messaging::send_status(self, message, status)) } diff --git a/src/messaging/webchat.rs b/src/messaging/webchat.rs index a016df1f4..267f63185 100644 --- a/src/messaging/webchat.rs +++ b/src/messaging/webchat.rs @@ -4,7 +4,7 @@ //! Inbound messages are injected by the API handler via `MessagingManager::inject_message`, //! and outbound responses are routed to per-session channels consumed as SSE streams. 
-use crate::messaging::traits::{InboundStream, Messaging}; +use crate::messaging::traits::{DeliveryOutcome, InboundStream, Messaging}; use crate::{InboundMessage, OutboundResponse, StatusUpdate}; use std::collections::HashMap; @@ -106,10 +106,10 @@ impl Messaging for WebChatAdapter { &self, message: &InboundMessage, status: StatusUpdate, - ) -> crate::Result<()> { + ) -> crate::Result { let sessions = self.sessions.read().await; let Some(tx) = sessions.get(&message.conversation_id) else { - return Ok(()); + return Ok(DeliveryOutcome::NotSurfaced); }; let event = match status { @@ -117,11 +117,17 @@ impl Messaging for WebChatAdapter { StatusUpdate::StopTyping => WebChatEvent::StopTyping, StatusUpdate::ToolStarted { tool_name } => WebChatEvent::ToolStarted { tool_name }, StatusUpdate::ToolCompleted { tool_name } => WebChatEvent::ToolCompleted { tool_name }, - _ => return Ok(()), + StatusUpdate::BranchStarted { .. } => return Ok(DeliveryOutcome::NotSurfaced), + StatusUpdate::WorkerStarted { .. } => return Ok(DeliveryOutcome::NotSurfaced), + StatusUpdate::WorkerCompleted { .. } => return Ok(DeliveryOutcome::NotSurfaced), + StatusUpdate::WorkerCheckpoint { .. 
} => return Ok(DeliveryOutcome::NotSurfaced), }; - let _ = tx.send(event).await; - Ok(()) + Ok(if tx.send(event).await.is_ok() { + DeliveryOutcome::Surfaced + } else { + DeliveryOutcome::NotSurfaced + }) } async fn health_check(&self) -> crate::Result<()> { diff --git a/src/tools.rs b/src/tools.rs index 28ea3ef3c..4f94e320f 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -97,7 +97,7 @@ use crate::agent::channel::ChannelState; use crate::config::{BrowserConfig, RuntimeConfig}; use crate::memory::MemorySearch; use crate::sandbox::Sandbox; -use crate::{AgentId, ChannelId, OutboundResponse, ProcessEvent, WorkerId}; +use crate::{AgentId, ChannelId, OutboundEnvelope, ProcessEvent, WorkerId}; use rig::tool::Tool as _; use rig::tool::server::{ToolServer, ToolServerHandle}; use std::path::PathBuf; @@ -227,7 +227,7 @@ pub fn should_block_user_visible_text(value: &str) -> bool { pub async fn add_channel_tools( handle: &ToolServerHandle, state: ChannelState, - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, conversation_id: impl Into, skip_flag: SkipFlag, replied_flag: RepliedFlag, diff --git a/src/tools/browser.rs b/src/tools/browser.rs index f6e692f9b..dcf8e9a39 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -24,6 +24,7 @@ use std::collections::HashMap; use std::net::{IpAddr, Ipv4Addr, Ipv6Addr}; use std::path::PathBuf; use std::sync::Arc; +use std::time::Duration; use tokio::sync::Mutex; use tokio::task::JoinHandle; @@ -127,6 +128,11 @@ fn is_v4_mapped_blocked(ip: Ipv6Addr) -> bool { /// Tool for browser automation (worker-only). #[derive(Debug, Clone)] pub struct BrowserTool { + /// Shared browser session state for this worker. + /// + /// Operations intentionally hold this mutex across long awaits (including + /// `with_action_timeout(...)` and `Browser::launch`) so actions are + /// serialized per worker and cannot interleave unpredictably. 
state: Arc>, config: BrowserConfig, screenshot_dir: PathBuf, @@ -465,6 +471,29 @@ impl Tool for BrowserTool { } impl BrowserTool { + fn action_timeout_secs(&self) -> u64 { + self.config.browser_action_timeout_secs.max(1) + } + + async fn with_action_timeout( + &self, + action_name: &str, + action_future: F, + ) -> Result + where + F: std::future::Future>, + E: std::fmt::Display, + { + let timeout_secs = self.action_timeout_secs(); + match tokio::time::timeout(Duration::from_secs(timeout_secs), action_future).await { + Ok(Ok(value)) => Ok(value), + Ok(Err(error)) => Err(BrowserError::new(format!("{action_name} failed: {error}"))), + Err(_) => Err(BrowserError::new(format!( + "{action_name} timed out after {timeout_secs}s" + ))), + } + } + async fn handle_launch(&self) -> Result { let mut state = self.state.lock().await; @@ -492,9 +521,9 @@ impl BrowserTool { "launching chrome" ); - let (browser, mut handler) = Browser::launch(chrome_config) - .await - .map_err(|error| BrowserError::new(format!("failed to launch browser: {error}")))?; + let (browser, mut handler) = self + .with_action_timeout("browser launch", Browser::launch(chrome_config)) + .await?; let handler_task = tokio::spawn(async move { while handler.next().await.is_some() {} }); @@ -515,9 +544,8 @@ impl BrowserTool { let mut state = self.state.lock().await; let page = self.get_or_create_page(&mut state, Some(&url)).await?; - page.goto(&url) - .await - .map_err(|error| BrowserError::new(format!("navigation failed: {error}")))?; + self.with_action_timeout("navigation", page.goto(&url)) + .await?; let title = page.get_title().await.ok().flatten(); let current_url = page.url().await.ok().flatten(); @@ -545,10 +573,9 @@ impl BrowserTool { validate_url(target_url)?; } - let page = browser - .new_page(target_url) - .await - .map_err(|error| BrowserError::new(format!("failed to open tab: {error}")))?; + let page = self + .with_action_timeout("open tab", browser.new_page(target_url)) + .await?; let target_id = 
page_target_id(&page); let title = page.get_title().await.ok().flatten(); @@ -637,12 +664,13 @@ impl BrowserTool { let page = state .pages - .remove(&id) + .get(&id) + .cloned() .ok_or_else(|| BrowserError::new(format!("no tab with target_id '{id}'")))?; - page.close() - .await - .map_err(|error| BrowserError::new(format!("failed to close tab: {error}")))?; + self.with_action_timeout("close tab", page.close()).await?; + + state.pages.remove(&id); if state.active_target.as_ref() == Some(&id) { state.active_target = state.pages.keys().next().cloned(); @@ -659,18 +687,18 @@ impl BrowserTool { let page = self.require_active_page(&state)?.clone(); // Enable accessibility domain if not already enabled - page.execute(AxEnableParams::default()) - .await - .map_err(|error| { - BrowserError::new(format!("failed to enable accessibility: {error}")) - })?; + self.with_action_timeout( + "snapshot accessibility enable", + page.execute(AxEnableParams::default()), + ) + .await?; - let ax_tree = page - .execute(GetFullAxTreeParams::default()) - .await - .map_err(|error| { - BrowserError::new(format!("failed to get accessibility tree: {error}")) - })?; + let ax_tree = self + .with_action_timeout( + "snapshot accessibility tree", + page.execute(GetFullAxTreeParams::default()), + ) + .await?; state.element_refs.clear(); state.next_ref = 0; @@ -758,10 +786,8 @@ impl BrowserTool { match act_kind { ActKind::Click => { let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .click() - .await - .map_err(|error| BrowserError::new(format!("click failed: {error}")))?; + self.with_action_timeout("act click", element.click()) + .await?; Ok(BrowserOutput::success("Clicked element")) } ActKind::Type => { @@ -769,14 +795,11 @@ impl BrowserTool { return Err(BrowserError::new("text is required for act:type")); }; let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .click() - .await - .map_err(|error| BrowserError::new(format!("focus 
failed: {error}")))?; - element - .type_str(&text) - .await - .map_err(|error| BrowserError::new(format!("type failed: {error}")))?; + self.with_action_timeout("act type", async { + element.click().await?; + element.type_str(&text).await + }) + .await?; Ok(BrowserOutput::success(format!( "Typed '{}' into element", truncate_for_display(&text, 50) @@ -788,36 +811,30 @@ impl BrowserTool { }; if element_ref.is_some() { let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .press_key(&key) - .await - .map_err(|error| BrowserError::new(format!("press_key failed: {error}")))?; + self.with_action_timeout("act press_key", element.press_key(&key)) + .await?; } else { - dispatch_key_press(page, &key).await?; + self.with_action_timeout("act press_key", dispatch_key_press(page, &key)) + .await?; } Ok(BrowserOutput::success(format!("Pressed key '{key}'"))) } ActKind::Hover => { let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .hover() - .await - .map_err(|error| BrowserError::new(format!("hover failed: {error}")))?; + self.with_action_timeout("act hover", element.hover()) + .await?; Ok(BrowserOutput::success("Hovered over element")) } ActKind::ScrollIntoView => { let element = self.resolve_element_ref(&state, page, element_ref).await?; - element.scroll_into_view().await.map_err(|error| { - BrowserError::new(format!("scroll_into_view failed: {error}")) - })?; + self.with_action_timeout("act scroll_into_view", element.scroll_into_view()) + .await?; Ok(BrowserOutput::success("Scrolled element into view")) } ActKind::Focus => { let element = self.resolve_element_ref(&state, page, element_ref).await?; - element - .focus() - .await - .map_err(|error| BrowserError::new(format!("focus failed: {error}")))?; + self.with_action_timeout("act focus", element.focus()) + .await?; Ok(BrowserOutput::success("Focused element")) } } @@ -833,18 +850,18 @@ impl BrowserTool { let screenshot_data = if let Some(ref_id) = element_ref { 
let element = self.resolve_element_ref(&state, page, Some(ref_id)).await?; - element - .screenshot(CaptureScreenshotFormat::Png) - .await - .map_err(|error| BrowserError::new(format!("element screenshot failed: {error}")))? + self.with_action_timeout( + "element screenshot", + element.screenshot(CaptureScreenshotFormat::Png), + ) + .await? } else { let params = ScreenshotParams::builder() .format(CaptureScreenshotFormat::Png) .full_page(full_page) .build(); - page.screenshot(params) - .await - .map_err(|error| BrowserError::new(format!("screenshot failed: {error}")))? + self.with_action_timeout("page screenshot", page.screenshot(params)) + .await? }; // Save to disk @@ -896,10 +913,9 @@ impl BrowserTool { let state = self.state.lock().await; let page = self.require_active_page(&state)?; - let result = page - .evaluate(script) - .await - .map_err(|error| BrowserError::new(format!("evaluate failed: {error}")))?; + let result = self + .with_action_timeout("evaluate", page.evaluate(script)) + .await?; let value = result.value().cloned(); @@ -920,19 +936,19 @@ impl BrowserTool { let state = self.state.lock().await; let page = self.require_active_page(&state)?; - let html = page - .content() - .await - .map_err(|error| BrowserError::new(format!("failed to get page content: {error}")))?; + let html = self + .with_action_timeout("page content", page.content()) + .await?; let title = page.get_title().await.ok().flatten(); let url = page.url().await.ok().flatten(); // Truncate very large pages for LLM consumption let truncated = if html.len() > 100_000 { + let end = html.floor_char_boundary(100_000); format!( "{}... 
[truncated, {} bytes total]", - &html[..100_000], + &html[..end], html.len() ) } else { @@ -954,18 +970,29 @@ impl BrowserTool { async fn handle_close(&self) -> Result { let mut state = self.state.lock().await; - - if let Some(mut browser) = state.browser.take() - && let Err(error) = browser.close().await - { - tracing::warn!(%error, "browser close returned error"); + let mut close_error: Option = None; + + if let Some(mut browser) = state.browser.take() { + let close_result = self + .with_action_timeout("browser close", async { browser.close().await }) + .await; + if let Err(error) = close_result { + tracing::warn!(%error, "browser close returned error"); + close_error = Some(error); + } } state.pages.clear(); state.active_target = None; state.element_refs.clear(); state.next_ref = 0; - state._handler_task = None; + if let Some(handler_task) = state._handler_task.take() { + handler_task.abort(); + } + + if let Some(error) = close_error { + return Err(error); + } tracing::info!("browser closed"); Ok(BrowserOutput::success("Browser closed")) @@ -989,10 +1016,9 @@ impl BrowserTool { .ok_or_else(|| BrowserError::new("browser not launched — call launch first"))?; let target_url = url.unwrap_or("about:blank"); - let page = browser - .new_page(target_url) - .await - .map_err(|error| BrowserError::new(format!("failed to create page: {error}")))?; + let page = self + .with_action_timeout("create page", browser.new_page(target_url)) + .await?; let target_id = page_target_id(&page); state.pages.insert(target_id.clone(), page); @@ -1105,6 +1131,7 @@ fn truncate_for_display(text: &str, max_len: usize) -> String { if text.len() <= max_len { text.to_string() } else { - format!("{}...", &text[..max_len]) + let end = text.floor_char_boundary(max_len); + format!("{}...", &text[..end]) } } diff --git a/src/tools/cancel.rs b/src/tools/cancel.rs index ea1fd950c..0cfa4ce84 100644 --- a/src/tools/cancel.rs +++ b/src/tools/cancel.rs @@ -100,16 +100,23 @@ impl Tool for CancelTool { 
.parse::() .map_err(|e| CancelError(format!("Invalid worker ID: {e}")))?; self.state - .cancel_worker(worker_id) + .cancel_worker(worker_id, args.reason.as_deref()) .await .map_err(CancelError)?; } other => return Err(CancelError(format!("Unknown process type: {other}"))), } - let message = if let Some(reason) = &args.reason { + let display_reason = args + .reason + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + .unwrap_or("cancelled by request"); + + let message = if args.process_type == "worker" { format!( - "{} {} cancelled: {reason}", + "{} {} cancelled: {display_reason}", args.process_type, args.process_id ) } else { diff --git a/src/tools/conclude_link.rs b/src/tools/conclude_link.rs index 315a3f2d5..d348eff01 100644 --- a/src/tools/conclude_link.rs +++ b/src/tools/conclude_link.rs @@ -4,7 +4,7 @@ //! tool with a summary. The channel checks the flag after the LLM turn and //! routes the summary back to the originating channel as a system message. -use crate::OutboundResponse; +use crate::{OutboundEnvelope, OutboundResponse}; use rig::completion::ToolDefinition; use rig::tool::Tool; use schemars::JsonSchema; @@ -32,14 +32,14 @@ pub fn new_conclude_link() -> (ConcludeLinkFlag, ConcludeLinkSummary) { pub struct ConcludeLinkTool { flag: ConcludeLinkFlag, summary: ConcludeLinkSummary, - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, } impl ConcludeLinkTool { pub fn new( flag: ConcludeLinkFlag, summary: ConcludeLinkSummary, - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, ) -> Self { Self { flag, @@ -94,10 +94,10 @@ impl Tool for ConcludeLinkTool { let summary_len = args.summary.len(); *self.summary.write().await = Some(args.summary); - let _ = self - .response_tx - .send(OutboundResponse::Status(crate::StatusUpdate::StopTyping)) - .await; + self.response_tx + .send(OutboundResponse::Status(crate::StatusUpdate::StopTyping).into()) + .await + .ok(); tracing::info!( summary_len, diff --git a/src/tools/react.rs 
b/src/tools/react.rs index 6679d5edf..ce3aa4ef2 100644 --- a/src/tools/react.rs +++ b/src/tools/react.rs @@ -1,6 +1,6 @@ //! React tool for adding emoji reactions to messages (channel only). -use crate::OutboundResponse; +use crate::{OutboundEnvelope, OutboundResponse}; use rig::completion::ToolDefinition; use rig::tool::Tool; use schemars::JsonSchema; @@ -10,11 +10,11 @@ use tokio::sync::mpsc; /// Tool for reacting to messages with emoji. #[derive(Debug, Clone)] pub struct ReactTool { - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, } impl ReactTool { - pub fn new(response_tx: mpsc::Sender) -> Self { + pub fn new(response_tx: mpsc::Sender) -> Self { Self { response_tx } } } @@ -66,7 +66,7 @@ impl Tool for ReactTool { tracing::info!(emoji = %args.emoji, "react tool called"); self.response_tx - .send(OutboundResponse::Reaction(args.emoji.clone())) + .send(OutboundResponse::Reaction(args.emoji.clone()).into()) .await .map_err(|error| ReactError(format!("failed to send reaction: {error}")))?; diff --git a/src/tools/reply.rs b/src/tools/reply.rs index f90f2e891..1ca845621 100644 --- a/src/tools/reply.rs +++ b/src/tools/reply.rs @@ -2,7 +2,7 @@ use crate::conversation::ConversationLogger; -use crate::{ChannelId, OutboundResponse}; +use crate::{ChannelId, OutboundEnvelope, OutboundResponse}; use regex::Regex; use rig::completion::ToolDefinition; use rig::tool::Tool; @@ -40,7 +40,7 @@ pub fn new_replied_flag() -> RepliedFlag { /// tools once and shares them across calls. #[derive(Debug, Clone)] pub struct ReplyTool { - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, conversation_id: String, conversation_logger: ConversationLogger, channel_id: ChannelId, @@ -51,7 +51,7 @@ pub struct ReplyTool { impl ReplyTool { /// Create a new reply tool bound to a conversation's response channel. 
pub fn new( - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, conversation_id: impl Into, conversation_logger: ConversationLogger, channel_id: ChannelId, @@ -220,6 +220,31 @@ pub(crate) fn normalize_discord_mention_tokens(content: &str, source: &str) -> S normalized } +pub(crate) fn is_low_value_waiting_update(content: &str) -> bool { + let lowered = content.to_ascii_lowercase(); + + let spawned = lowered.contains("worker was spawned") + || lowered.contains("spawned a worker") + || lowered.contains("worker was started"); + let no_report = lowered.contains("hasn't reported back") + || lowered.contains("has not reported back") + || lowered.contains("hasn't come back") + || lowered.contains("has not come back"); + if spawned && no_report { + return true; + } + + let known_template = lowered.contains("still waiting on the research results") + || lowered.contains("still waiting on the worker results") + || lowered.contains("still waiting on the results") + || lowered.contains("still waiting for the worker results"); + if known_template { + return true; + } + + false +} + impl Tool for ReplyTool { const NAME: &'static str = "reply"; @@ -395,7 +420,7 @@ impl Tool for ReplyTool { }; self.response_tx - .send(response) + .send(response.into()) .await .map_err(|e| ReplyError(format!("failed to send reply: {e}")))?; @@ -414,7 +439,9 @@ impl Tool for ReplyTool { #[cfg(test)] mod tests { - use super::{normalize_discord_mention_tokens, sanitize_discord_user_id}; + use super::{ + is_low_value_waiting_update, normalize_discord_mention_tokens, sanitize_discord_user_id, + }; #[test] fn normalizes_broken_discord_mentions() { @@ -445,4 +472,16 @@ mod tests { let parsed = sanitize_discord_user_id(">234152400653385729").expect("should parse id"); assert_eq!(parsed, "234152400653385729"); } + + #[test] + fn suppresses_low_value_waiting_updates() { + let content = "Still waiting on the research results — the worker was spawned and hasn't reported back yet."; + 
assert!(is_low_value_waiting_update(content)); + assert!(!is_low_value_waiting_update( + "Still waiting on your approval before I run anything." + )); + assert!(!is_low_value_waiting_update( + "I found 3 key findings and linked the sources below." + )); + } } diff --git a/src/tools/send_file.rs b/src/tools/send_file.rs index 02437e829..f5ac47a27 100644 --- a/src/tools/send_file.rs +++ b/src/tools/send_file.rs @@ -1,6 +1,6 @@ //! Send file tool for delivering file attachments to users (channel only). -use crate::OutboundResponse; +use crate::{OutboundEnvelope, OutboundResponse}; use rig::completion::ToolDefinition; use rig::tool::Tool; use schemars::JsonSchema; @@ -16,12 +16,12 @@ use tokio::sync::mpsc; /// File access is restricted to the agent's workspace boundary. #[derive(Debug, Clone)] pub struct SendFileTool { - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, workspace: PathBuf, } impl SendFileTool { - pub fn new(response_tx: mpsc::Sender, workspace: PathBuf) -> Self { + pub fn new(response_tx: mpsc::Sender, workspace: PathBuf) -> Self { Self { response_tx, workspace, @@ -161,7 +161,7 @@ impl Tool for SendFileTool { }; self.response_tx - .send(response) + .send(response.into()) .await .map_err(|error| SendFileError(format!("failed to send file: {error}")))?; diff --git a/src/tools/skip.rs b/src/tools/skip.rs index 481b5fb8b..d1fad2377 100644 --- a/src/tools/skip.rs +++ b/src/tools/skip.rs @@ -5,7 +5,7 @@ //! instead of `reply`. The channel checks the skip flag after the LLM turn and //! suppresses any fallback text output. 
-use crate::OutboundResponse; +use crate::{OutboundEnvelope, OutboundResponse}; use rig::completion::ToolDefinition; use rig::tool::Tool; use schemars::JsonSchema; @@ -29,11 +29,11 @@ pub fn new_skip_flag() -> SkipFlag { #[derive(Debug, Clone)] pub struct SkipTool { flag: SkipFlag, - response_tx: mpsc::Sender, + response_tx: mpsc::Sender, } impl SkipTool { - pub fn new(flag: SkipFlag, response_tx: mpsc::Sender) -> Self { + pub fn new(flag: SkipFlag, response_tx: mpsc::Sender) -> Self { Self { flag, response_tx } } } @@ -84,10 +84,10 @@ impl Tool for SkipTool { self.flag.store(true, Ordering::Relaxed); // Cancel the typing indicator so it doesn't linger - let _ = self - .response_tx - .send(OutboundResponse::Status(crate::StatusUpdate::StopTyping)) - .await; + self.response_tx + .send(OutboundResponse::Status(crate::StatusUpdate::StopTyping).into()) + .await + .ok(); let reason = args.reason.as_deref().unwrap_or("no reason given"); tracing::info!(reason, "skip tool called, suppressing response"); diff --git a/src/tools/worker_inspect.rs b/src/tools/worker_inspect.rs index 960d52972..8e91f8c02 100644 --- a/src/tools/worker_inspect.rs +++ b/src/tools/worker_inspect.rs @@ -115,9 +115,11 @@ impl Tool for WorkerInspectTool { summary.push_str(&format!("\n### Result\n\n{result}\n")); } + let mut transcript_steps = Vec::new(); if let Some(blob) = &detail.transcript_blob { match worker_transcript::deserialize_transcript(blob) { Ok(steps) => { + transcript_steps = steps.clone(); summary.push_str(&format!("\n### Transcript ({} steps)\n\n", steps.len())); for step in &steps { match step { @@ -169,6 +171,46 @@ impl Tool for WorkerInspectTool { summary.push_str("\n*No transcript available for this worker.*\n"); } + let contract = self + .run_logger + .get_worker_task_contract_snapshot(&worker_id) + .await + .map_err(|e| WorkerInspectError(format!("Failed to load worker contract: {e}")))?; + let receipt = self + .run_logger + .get_worker_terminal_receipt_snapshot(&worker_id) + 
.await + .map_err(|e| WorkerInspectError(format!("Failed to load worker receipt: {e}")))?; + let projection = crate::conversation::history::build_worker_timeline_projection( + &detail.status, + &transcript_steps, + contract.as_ref(), + receipt.as_ref(), + ); + if !projection.events.is_empty() { + summary.push_str("\n### Projected Timeline\n\n"); + summary.push_str(&format!( + "- terminal_converged: {}\n", + projection.terminal_converged + )); + + let max_events = 80; + for event in projection.events.iter().take(max_events) { + let timestamp = event.at.as_deref().unwrap_or("transcript"); + summary.push_str(&format!( + "- [{}] {}: {}\n", + timestamp, event.kind, event.summary + )); + } + if projection.events.len() > max_events { + summary.push_str(&format!( + "- ... {} additional projected events omitted\n", + projection.events.len() - max_events + )); + } + summary.push('\n'); + } + Ok(WorkerInspectOutput { action: "inspect".to_string(), summary,