diff --git a/crates/core/src/query.rs b/crates/core/src/query.rs index f7807c9c..fb309aed 100644 --- a/crates/core/src/query.rs +++ b/crates/core/src/query.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::collections::HashSet; use std::sync::Arc; use std::time::Duration; @@ -96,6 +97,8 @@ pub enum QueryEvent { /// A tool call completed. ToolResult { tool_use_id: String, + tool_name: String, + input: serde_json::Value, content: ToolContent, display_content: Option, is_error: bool, @@ -610,7 +613,8 @@ pub async fn query( let mut assistant_text = String::new(); let mut reasoning_text = String::new(); - let mut tool_uses: Vec<(String, String, serde_json::Value, String, bool)> = Vec::new(); + let mut tool_uses: Vec<(usize, String, String, serde_json::Value, String, bool)> = + Vec::new(); let mut emitted_tool_use_starts: HashSet = HashSet::new(); let mut final_response = None; let mut stop_reason = None; @@ -631,7 +635,10 @@ pub async fn query( emit(QueryEvent::ReasoningCompleted); } Ok(StreamEvent::ToolCallStart { - id, name, input, .. + index, + id, + name, + input, }) => { if emitted_tool_use_starts.insert(id.clone()) { emit(QueryEvent::ToolUseStart { @@ -640,12 +647,19 @@ pub async fn query( input: input.clone(), }); } - tool_uses.push((id, name, input, String::new(), false)); + tool_uses.push((index, id, name, input, String::new(), false)); } - Ok(StreamEvent::ToolCallInputDelta { partial_json, .. 
}) => { - if let Some(last) = tool_uses.last_mut() { - last.3.push_str(&partial_json); - last.4 = true; + Ok(StreamEvent::ToolCallInputDelta { + index, + partial_json, + }) => { + if let Some(tool_use) = tool_uses + .iter_mut() + .rev() + .find(|(tool_index, ..)| *tool_index == index) + { + tool_use.4.push_str(&partial_json); + tool_use.5 = true; } } Ok(StreamEvent::MessageDone { response }) => { @@ -741,8 +755,10 @@ pub async fn query( tool_uses = response .content .iter() - .filter_map(|block| match block { + .enumerate() + .filter_map(|(index, block)| match block { ResponseContent::ToolUse { id, name, input } => Some(( + index, id.clone(), name.clone(), input.clone(), @@ -785,13 +801,31 @@ pub async fn query( }); } + let final_tool_inputs: HashMap = final_response + .as_ref() + .map(|response| { + response + .content + .iter() + .filter_map(|block| match block { + ResponseContent::ToolUse { id, input, .. } => { + Some((id.clone(), input.clone())) + } + ResponseContent::Text(_) => None, + }) + .collect() + }) + .unwrap_or_default(); + let tool_calls: Vec = tool_uses .into_iter() - .map(|(id, name, initial_input, json_str, saw_delta)| { + .map(|(_index, id, name, initial_input, json_str, saw_delta)| { let input = if saw_delta { - serde_json::from_str(&json_str).unwrap_or(initial_input) + serde_json::from_str(&json_str).unwrap_or_else(|_| { + final_tool_inputs.get(&id).cloned().unwrap_or(initial_input) + }) } else { - initial_input + final_tool_inputs.get(&id).cloned().unwrap_or(initial_input) }; if emitted_tool_use_starts.insert(id.clone()) { emit(QueryEvent::ToolUseStart { @@ -831,12 +865,20 @@ pub async fn query( return Ok(()); } - let tool_result_summaries: std::collections::HashMap = tool_calls + let tool_result_metadata: HashMap = tool_calls .iter() .map(|call| { ( call.id.clone(), - devo_tools::tool_summary::tool_summary(&call.name, &call.input, &session.cwd), + ( + call.name.clone(), + call.input.clone(), + devo_tools::tool_summary::tool_summary( + 
&call.name, + &call.input, + &session.cwd, + ), + ), ) }) .collect(); @@ -846,7 +888,7 @@ pub async fn query( // long-running and parallel tools can render before the whole batch ends. let results = if let Some(progress_events) = on_event.clone() { let completion_events = Arc::clone(&progress_events); - let summaries = Arc::new(tool_result_summaries.clone()); + let metadata = Arc::new(tool_result_metadata.clone()); runtime .execute_batch_streaming_with_completion( &tool_calls, @@ -859,12 +901,16 @@ pub async fn query( move |result| { let content = compact_tool_content(result.content.clone()); let display_content = result.display_content.clone().map(micro_compact); - let summary = summaries + let (tool_name, input, summary) = metadata .get(result.tool_use_id.as_str()) .cloned() - .unwrap_or_default(); + .unwrap_or_else(|| { + (String::new(), serde_json::Value::Null, String::new()) + }); completion_events(QueryEvent::ToolResult { tool_use_id: result.tool_use_id.clone(), + tool_name, + input, content, display_content, is_error: result.is_error, @@ -1025,6 +1071,10 @@ mod tests { requests: AtomicUsize, } + struct InterleavedToolUseProvider { + requests: AtomicUsize, + } + struct ParallelToolUseProvider { requests: AtomicUsize, } @@ -1093,6 +1143,87 @@ mod tests { } } + #[async_trait] + impl devo_provider::ModelProviderSDK for InterleavedToolUseProvider { + async fn completion(&self, _request: ModelRequest) -> Result { + unreachable!("tests stream responses only") + } + + async fn completion_stream( + &self, + _request: ModelRequest, + ) -> Result> + Send>>> { + let request_number = self.requests.fetch_add(1, Ordering::SeqCst); + + let events = if request_number == 0 { + vec![ + Ok(StreamEvent::ToolCallStart { + index: 0, + id: "tool-1".into(), + name: "mutating_tool".into(), + input: json!({}), + }), + Ok(StreamEvent::ToolCallStart { + index: 1, + id: "tool-2".into(), + name: "mutating_tool".into(), + input: json!({}), + }), + Ok(StreamEvent::ToolCallInputDelta { + 
index: 0, + partial_json: r#"{"value":1}"#.into(), + }), + Ok(StreamEvent::ToolCallInputDelta { + index: 1, + partial_json: r#"{"value":2}"#.into(), + }), + Ok(StreamEvent::MessageDone { + response: ModelResponse { + id: "resp-1".into(), + content: vec![ + ResponseContent::ToolUse { + id: "tool-1".into(), + name: "mutating_tool".into(), + input: json!({}), + }, + ResponseContent::ToolUse { + id: "tool-2".into(), + name: "mutating_tool".into(), + input: json!({}), + }, + ], + stop_reason: Some(StopReason::ToolUse), + usage: Usage::default(), + metadata: Default::default(), + }, + }), + ] + } else { + vec![ + Ok(StreamEvent::TextDelta { + index: 0, + text: "done".into(), + }), + Ok(StreamEvent::MessageDone { + response: ModelResponse { + id: "resp-2".into(), + content: vec![ResponseContent::Text("done".into())], + stop_reason: Some(StopReason::EndTurn), + usage: Usage::default(), + metadata: Default::default(), + }, + }), + ] + }; + + Ok(Box::pin(futures::stream::iter(events))) + } + + fn name(&self) -> &str { + "interleaved-test-provider" + } + } + #[async_trait] impl devo_provider::ModelProviderSDK for ParallelToolUseProvider { async fn completion(&self, _request: ModelRequest) -> Result { @@ -2012,6 +2143,115 @@ mod tests { } } + #[tokio::test] + async fn query_tool_result_event_includes_final_tool_input() { + let mut builder = ToolRegistryBuilder::new(); + builder.register_handler("mutating_tool", Arc::new(DisplayContentTool)); + builder.push_spec(ToolSpec { + name: "mutating_tool".into(), + description: String::new(), + input_schema: JsonSchema::object(Default::default(), None, None), + output_mode: ToolOutputMode::Text, + execution_mode: ToolExecutionMode::ReadOnly, + capability_tags: vec![], + supports_parallel: false, + preparation_feedback: ToolPreparationFeedback::None, + }); + let registry = Arc::new(builder.build()); + let runtime = ToolRuntime::new_without_permissions(Arc::clone(®istry)); + + let mut session = SessionState::new(SessionConfig::default(), 
std::env::temp_dir()); + session.push_message(Message::user("run the tool")); + + let seen = Arc::new(Mutex::new(Vec::new())); + let seen_clone = Arc::clone(&seen); + let callback = Arc::new(move |event: QueryEvent| { + if let QueryEvent::ToolResult { + tool_name, input, .. + } = event + { + seen_clone.lock().unwrap().push((tool_name, input)); + } + }); + + query( + &mut session, + &TurnConfig { + model: Model::default(), + thinking_selection: None, + }, + Arc::new(SingleToolUseProvider { + requests: AtomicUsize::new(0), + }), + registry, + &runtime, + Some(callback), + ) + .await + .expect("query should complete"); + + assert_eq!( + seen.lock().unwrap().as_slice(), + &[(String::from("mutating_tool"), json!({ "value": 1 }))] + ); + } + + #[tokio::test] + async fn query_tool_result_event_matches_input_delta_by_tool_index() { + let mut builder = ToolRegistryBuilder::new(); + builder.register_handler("mutating_tool", Arc::new(DisplayContentTool)); + builder.push_spec(ToolSpec { + name: "mutating_tool".into(), + description: String::new(), + input_schema: JsonSchema::object(Default::default(), None, None), + output_mode: ToolOutputMode::Text, + execution_mode: ToolExecutionMode::ReadOnly, + capability_tags: vec![], + supports_parallel: false, + preparation_feedback: ToolPreparationFeedback::None, + }); + let registry = Arc::new(builder.build()); + let runtime = ToolRuntime::new_without_permissions(Arc::clone(®istry)); + + let mut session = SessionState::new(SessionConfig::default(), std::env::temp_dir()); + session.push_message(Message::user("run the tools")); + + let seen = Arc::new(Mutex::new(Vec::new())); + let seen_clone = Arc::clone(&seen); + let callback = Arc::new(move |event: QueryEvent| { + if let QueryEvent::ToolResult { + tool_use_id, input, .. 
+ } = event + { + seen_clone.lock().unwrap().push((tool_use_id, input)); + } + }); + + query( + &mut session, + &TurnConfig { + model: Model::default(), + thinking_selection: None, + }, + Arc::new(InterleavedToolUseProvider { + requests: AtomicUsize::new(0), + }), + registry, + &runtime, + Some(callback), + ) + .await + .expect("query should complete"); + + assert_eq!( + seen.lock().unwrap().as_slice(), + &[ + (String::from("tool-1"), json!({ "value": 1 })), + (String::from("tool-2"), json!({ "value": 2 })), + ] + ); + } + #[tokio::test] async fn query_emits_tool_result_display_content() { let mut builder = ToolRegistryBuilder::new(); diff --git a/crates/server/src/lib.rs b/crates/server/src/lib.rs index 45d1d569..1b6b06ab 100644 --- a/crates/server/src/lib.rs +++ b/crates/server/src/lib.rs @@ -13,6 +13,7 @@ mod provider_config; mod runtime; mod session; mod titles; +mod tool_actions; mod transport; mod turn; diff --git a/crates/server/src/projection.rs b/crates/server/src/projection.rs index d929c491..743ddadc 100644 --- a/crates/server/src/projection.rs +++ b/crates/server/src/projection.rs @@ -152,28 +152,12 @@ pub(crate) fn history_item_from_turn_item(item: &TurnItem) -> Option { - let path = input - .get("filePath") - .or_else(|| input.get("path")) - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - let name = std::path::Path::new(path) - .file_name() - .map(|name| name.to_string_lossy().to_string()) - .unwrap_or_else(|| path.to_string()); - vec![devo_protocol::parse_command::ParsedCommand::Read { - cmd: title.clone(), - name, - path: std::path::PathBuf::from(path), - }] - } + "read" => crate::tool_actions::read_action_from_tool_input(&title, input) + .into_iter() + .collect(), "glob" => vec![devo_protocol::parse_command::ParsedCommand::ListFiles { cmd: title.clone(), - path: input - .get("path") - .and_then(serde_json::Value::as_str) - .map(ToOwned::to_owned), + path: glob_display_from_input(input), }], "grep" => 
vec![devo_protocol::parse_command::ParsedCommand::Search { cmd: title.clone(), @@ -188,7 +172,9 @@ pub(crate) fn history_item_from_turn_item(item: &TurnItem) -> Option Vec::new(), }; - item = item.with_metadata(SessionHistoryMetadata::Explored { actions: parsed }); + if !parsed.is_empty() { + item = item.with_metadata(SessionHistoryMetadata::Explored { actions: parsed }); + } } Some(item) } @@ -439,6 +425,18 @@ fn summarize_tool_call(tool_name: &str, input: &serde_json::Value) -> String { devo_tools::tool_summary::tool_summary(tool_name, input, &cwd).replacen(": ", " ", 1) } +fn glob_display_from_input(input: &serde_json::Value) -> Option { + let pattern = input + .get("pattern") + .and_then(serde_json::Value::as_str) + .filter(|pattern| !pattern.is_empty())?; + let path = input.get("path").and_then(serde_json::Value::as_str); + Some(match path.filter(|path| !path.is_empty()) { + Some(path) => format!("{pattern} in {path}"), + None => pattern.to_string(), + }) +} + fn summarize_tool_result(tool_name: Option<&str>, is_error: bool) -> String { match (tool_name, is_error) { (Some(tool_name), true) => format!("{tool_name} error"), @@ -532,7 +530,7 @@ mod tests { tool_call_id: "call-1".to_string(), tool_name: "read".to_string(), input: serde_json::json!({ - "filePath": "crates/tui/src/chatwidget.rs" + "filePath": "crates/tui/src/mod.rs" }), }); @@ -542,13 +540,25 @@ mod tests { assert!(matches!( &actions[0], devo_protocol::parse_command::ParsedCommand::Read { name, .. 
} - if name == "chatwidget.rs" + if name == "mod.rs" )); } other => panic!("unexpected metadata: {other:?}"), } } + #[test] + fn read_tool_call_without_path_does_not_emit_empty_explored_metadata() { + let item = TurnItem::ToolCall(ToolCallItem { + tool_call_id: "call-1".to_string(), + tool_name: "read".to_string(), + input: serde_json::json!({ "limit": 20 }), + }); + + let history_item = history_item_from_turn_item(&item).expect("history item"); + assert_eq!(history_item.metadata, None); + } + #[test] fn update_plan_tool_result_emits_plan_metadata() { let item = TurnItem::ToolResult(ToolResultItem { diff --git a/crates/server/src/runtime/turn_exec.rs b/crates/server/src/runtime/turn_exec.rs index f6b1fe39..1eb3da1e 100644 --- a/crates/server/src/runtime/turn_exec.rs +++ b/crates/server/src/runtime/turn_exec.rs @@ -141,28 +141,12 @@ fn command_actions_from_tool_input( input: &serde_json::Value, ) -> Vec { match tool_name { - "read" => { - let path = input - .get("filePath") - .or_else(|| input.get("path")) - .and_then(serde_json::Value::as_str) - .unwrap_or_default(); - let name = std::path::Path::new(path) - .file_name() - .map(|name| name.to_string_lossy().to_string()) - .unwrap_or_else(|| path.to_string()); - vec![devo_protocol::parse_command::ParsedCommand::Read { - cmd: command.to_string(), - name, - path: std::path::PathBuf::from(path), - }] - } + "read" => crate::tool_actions::read_action_from_tool_input(command, input) + .into_iter() + .collect(), "glob" => vec![devo_protocol::parse_command::ParsedCommand::ListFiles { cmd: command.to_string(), - path: input - .get("path") - .and_then(serde_json::Value::as_str) - .map(ToOwned::to_owned), + path: glob_display_from_input(input), }], "grep" => vec![devo_protocol::parse_command::ParsedCommand::Search { cmd: command.to_string(), @@ -179,6 +163,36 @@ fn command_actions_from_tool_input( } } +fn glob_display_from_input(input: &serde_json::Value) -> Option { + let pattern = input + .get("pattern") + 
.and_then(serde_json::Value::as_str) + .filter(|pattern| !pattern.is_empty())?; + let path = input.get("path").and_then(serde_json::Value::as_str); + Some(match path.filter(|path| !path.is_empty()) { + Some(path) => format!("{pattern} in {path}"), + None => pattern.to_string(), + }) +} + +fn command_actions_from_tool_result( + tool_name: &str, + command: &str, + input: &serde_json::Value, + summary: &str, +) -> Vec { + let actions = command_actions_from_tool_input(tool_name, command, input); + if !actions.is_empty() { + return actions; + } + match tool_name { + "read" => crate::tool_actions::read_action_from_tool_summary(summary) + .into_iter() + .collect(), + _ => actions, + } +} + fn command_execution_item_id_for_progress( pending_tool_calls: &HashMap, tool_use_id: &str, @@ -467,15 +481,30 @@ impl ServerRuntime { } QueryEvent::ToolResult { tool_use_id, + tool_name: final_tool_name, + input: final_input, content, display_content, is_error, summary, } => { - let tool_name = tool_names_by_id.get(&tool_use_id).cloned(); + let tool_name = if final_tool_name.is_empty() { + tool_names_by_id.get(&tool_use_id).cloned() + } else { + Some(final_tool_name) + }; // First complete the pending ToolCall item so its item/completed // arrives before the ToolResult item/completed. 
if let Some(mut pending) = pending_tool_calls.remove(&tool_use_id) { + if !final_input.is_null() { + pending.command = tool_name + .as_deref() + .map(|tool_name| { + command_display_from_input(tool_name, &final_input) + }) + .unwrap_or_default(); + pending.input = final_input; + } if pending.item_id.is_none() || pending.item_seq.is_none() { let started_payload = if let Some(tool_name) = tool_name.clone() { let item_kind = @@ -500,10 +529,11 @@ impl ServerRuntime { tool_call_id: tool_use_id.clone(), tool_name: tool_name.clone(), parameters: pending.input.clone(), - command_actions: command_actions_from_tool_input( + command_actions: command_actions_from_tool_result( &tool_name, &pending.command, &pending.input, + &summary, ), }) .expect("serialize tool call payload") @@ -522,10 +552,11 @@ impl ServerRuntime { command: pending.command.clone(), source: devo_protocol::protocol::ExecCommandSource::Agent, - command_actions: command_actions_from_tool_input( + command_actions: command_actions_from_tool_result( &tool_name, &pending.command, &pending.input, + &summary, ), output: None, is_error: false, @@ -541,10 +572,11 @@ impl ServerRuntime { tool_call_id: tool_use_id.clone(), tool_name: tool_name.clone(), parameters: pending.input.clone(), - command_actions: command_actions_from_tool_input( + command_actions: command_actions_from_tool_result( &tool_name, &pending.command, &pending.input, + &summary, ), }) .expect("serialize tool call payload") @@ -777,10 +809,11 @@ impl ServerRuntime { tool_name: tool_name.clone(), command: pending.command.clone(), source: devo_protocol::protocol::ExecCommandSource::Agent, - command_actions: command_actions_from_tool_input( + command_actions: command_actions_from_tool_result( &tool_name, &pending.command, &pending.input, + &summary, ), output: Some(output.clone()), is_error, @@ -810,10 +843,11 @@ impl ServerRuntime { tool_call_id: tool_use_id.clone(), tool_name: tool_name.clone().unwrap_or_default(), parameters: pending.input.clone(), - 
command_actions: command_actions_from_tool_input( + command_actions: command_actions_from_tool_result( tool_name.clone().unwrap_or_default().as_str(), &pending.command, &pending.input, + &summary, ), }) .expect("serialize tool call payload"); @@ -1544,17 +1578,42 @@ mod tests { fn command_actions_from_read_tool_input_builds_read_action() { let actions = command_actions_from_tool_input( "read", - "read crates/tui/src/chatwidget.rs", + "read crates/tui/src/mod.rs", &serde_json::json!({ - "filePath": "crates/tui/src/chatwidget.rs" + "filePath": "crates/tui/src/mod.rs" }), ); assert_eq!( actions, vec![devo_protocol::parse_command::ParsedCommand::Read { - cmd: "read crates/tui/src/chatwidget.rs".to_string(), - name: "chatwidget.rs".to_string(), - path: std::path::PathBuf::from("crates/tui/src/chatwidget.rs"), + cmd: "read crates/tui/src/mod.rs".to_string(), + name: "mod.rs".to_string(), + path: std::path::PathBuf::from("crates/tui/src/mod.rs"), + }] + ); + } + + #[test] + fn command_actions_from_read_tool_input_without_path_is_empty() { + let actions = + command_actions_from_tool_input("read", "read", &serde_json::json!({ "limit": 10 })); + assert_eq!(actions, Vec::new()); + } + + #[test] + fn command_actions_from_read_tool_result_summary_recovers_final_path() { + let actions = command_actions_from_tool_result( + "read", + "read ", + &serde_json::json!({}), + "read: crates/tui/src/mod.rs", + ); + assert_eq!( + actions, + vec![devo_protocol::parse_command::ParsedCommand::Read { + cmd: "read crates/tui/src/mod.rs".to_string(), + name: "mod.rs".to_string(), + path: std::path::PathBuf::from("crates/tui/src/mod.rs"), }] ); } @@ -1577,4 +1636,23 @@ mod tests { && path.as_deref() == Some("crates/tui/src") )); } + + #[test] + fn command_actions_from_glob_tool_input_include_pattern_and_path() { + let actions = command_actions_from_tool_input( + "glob", + "glob **/Cargo.toml in crates", + &serde_json::json!({ + "pattern": "**/Cargo.toml", + "path": "crates" + }), + ); + 
assert_eq!( + actions, + vec![devo_protocol::parse_command::ParsedCommand::ListFiles { + cmd: "glob **/Cargo.toml in crates".to_string(), + path: Some("**/Cargo.toml in crates".to_string()), + }] + ); + } } diff --git a/crates/server/src/tool_actions.rs b/crates/server/src/tool_actions.rs new file mode 100644 index 00000000..56b30465 --- /dev/null +++ b/crates/server/src/tool_actions.rs @@ -0,0 +1,48 @@ +use std::path::Path; +use std::path::PathBuf; + +use devo_protocol::parse_command::ParsedCommand; + +pub(crate) fn read_action_from_tool_input( + command: &str, + input: &serde_json::Value, +) -> Option { + let path = input + .get("filePath") + .or_else(|| input.get("path")) + .and_then(serde_json::Value::as_str)? + .trim(); + if path.is_empty() { + return None; + } + + let name = Path::new(path) + .file_name() + .map(|name| name.to_string_lossy().to_string()) + .unwrap_or_else(|| path.to_string()); + + Some(ParsedCommand::Read { + cmd: command.to_string(), + name, + path: PathBuf::from(path), + }) +} + +pub(crate) fn read_action_from_tool_summary(summary: &str) -> Option { + let path = summary + .strip_prefix("read: ") + .or_else(|| summary.strip_prefix("read ")) + .unwrap_or_default() + .trim(); + let path = path + .split_once(" (offset:") + .or_else(|| path.split_once(" (limit:")) + .map_or(path, |(path, _)| path) + .trim(); + if path.is_empty() { + return None; + } + + let input = serde_json::json!({ "filePath": path }); + read_action_from_tool_input(&summary.replacen(": ", " ", 1), &input) +} diff --git a/crates/server/tests/end_to_end.rs b/crates/server/tests/end_to_end.rs index 200c1c70..603a08c8 100644 --- a/crates/server/tests/end_to_end.rs +++ b/crates/server/tests/end_to_end.rs @@ -1,6 +1,8 @@ use std::net::TcpListener as StdTcpListener; use std::path::PathBuf; use std::sync::Arc; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering; use std::time::Duration; use anyhow::Context; @@ -24,7 +26,10 @@ use devo_core::PresetModelCatalog; use 
devo_core::SkillsConfig; use devo_protocol::ModelRequest; use devo_protocol::ModelResponse; +use devo_protocol::ResponseContent; +use devo_protocol::StopReason; use devo_protocol::StreamEvent; +use devo_protocol::Usage; use devo_provider::ModelProviderSDK; use devo_server::ServerRuntime; use devo_server::ServerRuntimeDependencies; @@ -84,6 +89,108 @@ impl ModelProviderSDK for PendingProvider { } } +struct StreamingToolProvider { + requests: AtomicUsize, + workspace: PathBuf, +} + +impl StreamingToolProvider { + fn new(workspace: PathBuf) -> Self { + Self { + requests: AtomicUsize::new(0), + workspace, + } + } +} + +#[async_trait] +impl ModelProviderSDK for StreamingToolProvider { + async fn completion(&self, _request: ModelRequest) -> Result { + anyhow::bail!("test provider does not support completion") + } + + async fn completion_stream( + &self, + _request: ModelRequest, + ) -> Result> + Send>>> { + let request_number = self.requests.fetch_add(1, Ordering::SeqCst); + let read_input = serde_json::json!({ + "filePath": self.workspace.join("README.md").to_string_lossy().to_string() + }); + let glob_input = serde_json::json!({ + "pattern": "**/Cargo.toml", + "path": "crates" + }); + + let events = if request_number == 0 { + vec![ + Ok(StreamEvent::ToolCallStart { + index: 0, + id: "read-1".to_string(), + name: "read".to_string(), + input: serde_json::json!({}), + }), + Ok(StreamEvent::ToolCallStart { + index: 1, + id: "glob-1".to_string(), + name: "glob".to_string(), + input: serde_json::json!({}), + }), + Ok(StreamEvent::ToolCallInputDelta { + index: 0, + partial_json: read_input.to_string(), + }), + Ok(StreamEvent::ToolCallInputDelta { + index: 1, + partial_json: glob_input.to_string(), + }), + Ok(StreamEvent::MessageDone { + response: ModelResponse { + id: "resp-tools".to_string(), + content: vec![ + ResponseContent::ToolUse { + id: "read-1".to_string(), + name: "read".to_string(), + input: serde_json::json!({}), + }, + ResponseContent::ToolUse { + id: 
"glob-1".to_string(), + name: "glob".to_string(), + input: serde_json::json!({}), + }, + ], + stop_reason: Some(StopReason::ToolUse), + usage: Usage::default(), + metadata: Default::default(), + }, + }), + ] + } else { + vec![ + Ok(StreamEvent::TextDelta { + index: 0, + text: "done".to_string(), + }), + Ok(StreamEvent::MessageDone { + response: ModelResponse { + id: "resp-done".to_string(), + content: vec![ResponseContent::Text("done".to_string())], + stop_reason: Some(StopReason::EndTurn), + usage: Usage::default(), + metadata: Default::default(), + }, + }), + ] + }; + + Ok(Box::pin(stream::iter(events))) + } + + fn name(&self) -> &str { + "streaming-tool-test-provider" + } +} + #[tokio::test] async fn stdio_server_process_supports_handshake_and_session_start() -> Result<()> { let home_dir = TempDir::new()?; @@ -393,6 +500,173 @@ async fn websocket_listener_supports_handshake_subscription_and_turn_lifecycle() Ok(()) } +#[tokio::test] +async fn websocket_turn_streams_final_tool_metadata_for_read_and_glob() -> Result<()> { + let workspace = TempDir::new()?; + std::fs::write(workspace.path().join("README.md"), "# Test\n")?; + std::fs::create_dir_all(workspace.path().join("crates/tools"))?; + std::fs::write( + workspace.path().join("crates/tools/Cargo.toml"), + "[package]\nname = \"tools\"\n", + )?; + + let port = { + let listener = StdTcpListener::bind("127.0.0.1:0")?; + let port = listener.local_addr()?.port(); + drop(listener); + port + }; + let bind_address = format!("127.0.0.1:{port}"); + let db_dir = TempDir::new()?; + let db = Arc::new(devo_server::db::Database::open( + db_dir.path().join("e2e.db"), + )?); + let runtime = ServerRuntime::new( + workspace.path().to_path_buf(), + ServerRuntimeDependencies::new( + Arc::new(StreamingToolProvider::new(workspace.path().to_path_buf())), + Arc::new(devo_tools::create_default_tool_registry()), + "test-model".to_string(), + Arc::new(PresetModelCatalog::default()), + None, + 
Box::new(FileSystemSkillCatalog::new(SkillsConfig::default())), + devo_core::AgentsMdConfig::default(), + db, + workspace.path().join("config.toml"), + ), + ); + let listen = vec![format!("ws://{bind_address}")]; + let listener_task = + tokio::spawn( + async move { devo_server::run_listeners(Arc::clone(&runtime), &listen).await }, + ); + + tokio::time::sleep(Duration::from_millis(200)).await; + + let (mut socket, _) = connect_async(format!("ws://{bind_address}")).await?; + socket + .send(Message::Text( + serde_json::to_string(&initialize_request("web_socket"))?.into(), + )) + .await?; + let initialize_response = read_websocket_json(&mut socket).await?; + assert_eq!(initialize_response["id"], serde_json::json!(1)); + + socket + .send(Message::Text( + serde_json::json!({ "method": "initialized" }) + .to_string() + .into(), + )) + .await?; + socket + .send(Message::Text( + serde_json::json!({ + "id": 2, + "method": "session/start", + "params": { + "cwd": workspace.path().to_string_lossy(), + "ephemeral": false, + "title": null, + "model": "test-model" + } + }) + .to_string() + .into(), + )) + .await?; + + let session_start_messages = read_n_websocket_json(&mut socket, 2).await?; + let session_response = session_start_messages + .iter() + .find(|value| value.get("id") == Some(&serde_json::json!(2))) + .context("find session/start response")?; + let session_id = session_response["result"]["session"]["session_id"] + .as_str() + .context("extract session id")? 
+ .to_string(); + + socket + .send(Message::Text( + serde_json::json!({ + "id": 3, + "method": "turn/start", + "params": { + "session_id": session_id, + "input": [{ "type": "text", "text": "read and glob" }], + "model": null, + "sandbox": null, + "approval_policy": null, + "cwd": null + } + }) + .to_string() + .into(), + )) + .await?; + + let messages = read_until_websocket_json( + &mut socket, + |messages| { + messages + .iter() + .any(|value| value.get("method") == Some(&serde_json::json!("turn/completed"))) + }, + 80, + ) + .await + .context("read turn lifecycle messages")?; + + let completed_tool_calls = messages + .iter() + .filter(|value| { + value.get("method") == Some(&serde_json::json!("item/completed")) + && value["params"]["item"]["item_kind"] == serde_json::json!("tool_call") + }) + .collect::>(); + assert_eq!( + completed_tool_calls.len(), + 2, + "expected completed ToolCall items: {messages:#?}" + ); + + let read_call = completed_tool_calls + .iter() + .find(|value| value["params"]["item"]["payload"]["tool_name"] == serde_json::json!("read")) + .context("find read tool call")?; + assert_eq!( + read_call["params"]["item"]["payload"]["parameters"]["filePath"], + serde_json::json!( + workspace + .path() + .join("README.md") + .to_string_lossy() + .to_string() + ) + ); + assert_eq!( + read_call["params"]["item"]["payload"]["command_actions"][0]["name"], + serde_json::json!("README.md") + ); + + let glob_call = completed_tool_calls + .iter() + .find(|value| value["params"]["item"]["payload"]["tool_name"] == serde_json::json!("glob")) + .context("find glob tool call")?; + assert_eq!( + glob_call["params"]["item"]["payload"]["parameters"]["pattern"], + serde_json::json!("**/Cargo.toml") + ); + assert_eq!( + glob_call["params"]["item"]["payload"]["command_actions"][0]["path"], + serde_json::json!("**/Cargo.toml in crates") + ); + + listener_task.abort(); + let _ = listener_task.await; + Ok(()) +} + fn devo_command() -> Result { if let Some(binary_path) = 
std::env::var_os("CARGO_BIN_EXE_devo").map(PathBuf::from) && binary_path.is_file() diff --git a/crates/tools/src/tool_summary.rs b/crates/tools/src/tool_summary.rs index 290f47a6..ead91f47 100644 --- a/crates/tools/src/tool_summary.rs +++ b/crates/tools/src/tool_summary.rs @@ -52,7 +52,11 @@ pub fn tool_summary(name: &str, input: &serde_json::Value, cwd: &Path) -> String format!("exec: {cmd}") } "read" => { - let path = input["filePath"].as_str().unwrap_or(""); + let path = input + .get("filePath") + .or_else(|| input.get("path")) + .and_then(|value| value.as_str()) + .unwrap_or(""); let rel = make_relative(cwd, path); let mut s = format!("read: {rel}"); let offset = input["offset"].as_u64(); diff --git a/crates/tui/README.md b/crates/tui/README.md index b065a28e..78587f1a 100644 --- a/crates/tui/README.md +++ b/crates/tui/README.md @@ -279,10 +279,6 @@ Overlay behavior pager_overlay.rs, host_overlay.rs Most bugs become easier to reason about once the layer is clear. -``` - -One typo to fix in your intended wording: use **“fundamental knowledge”**, not “foundmental knowledge,” and **“familiar”**, not “familar.” - ## Module Map ### Entry Points diff --git a/crates/tui/src/chatwidget.rs b/crates/tui/src/chatwidget.rs index ed4ab25e..6849b4d4 100644 --- a/crates/tui/src/chatwidget.rs +++ b/crates/tui/src/chatwidget.rs @@ -683,7 +683,7 @@ impl ChatWidget { Self::format_compact_token_count(self.total_input_tokens) )); parts.push(format!( - "↺{} {}%", + "(cached {} {}%)", Self::format_compact_token_count(self.total_cache_read_tokens), cached_input_percent )); @@ -1754,6 +1754,38 @@ impl ChatWidget { self.frame_requester.schedule_frame(); self.set_status_message("Tool started"); } + WorkerEvent::ToolCallUpdated { + tool_use_id, + summary, + parsed_commands, + } => { + if let Some(tool_call) = self.active_tool_calls.get_mut(&tool_use_id) { + tool_call.title = summary.clone(); + tool_call.exec_like = true; + } + let command = 
crate::exec_command::split_command_string(&summary); + if let Some(cell) = self + .active_cell + .as_mut() + .and_then(|cell| cell.as_any_mut().downcast_mut::()) + && cell.update_call(&tool_use_id, command.clone(), parsed_commands.clone()) + { + self.active_cell_revision = self.active_cell_revision.wrapping_add(1); + self.frame_requester.schedule_frame(); + self.set_status_message("Tool updated"); + return; + } + if self.history.iter_mut().rev().any(|cell| { + cell.as_any_mut() + .downcast_mut::() + .is_some_and(|cell| { + cell.update_call(&tool_use_id, command.clone(), parsed_commands.clone()) + }) + }) { + self.frame_requester.schedule_frame(); + self.set_status_message("Tool updated"); + } + } WorkerEvent::ToolOutputDelta { tool_use_id, delta } => { if let Some(tool_call) = self.active_tool_calls.get_mut(&tool_use_id) { if tool_call.exec_like { diff --git a/crates/tui/src/chatwidget_tests.rs b/crates/tui/src/chatwidget_tests.rs index bbddfcf7..dfb7e300 100644 --- a/crates/tui/src/chatwidget_tests.rs +++ b/crates/tui/src/chatwidget_tests.rs @@ -3087,7 +3087,7 @@ fn status_summary_uses_last_turn_total_when_idle_and_live_estimate_while_busy() let idle_summary = widget.status_summary_text(); assert!(idle_summary.contains("↑12")); - assert!(idle_summary.contains("↺4 33%")); + assert!(idle_summary.contains("cached 4 33%")); assert!(idle_summary.contains("↓18")); assert!(idle_summary.contains("42/190k")); @@ -3107,7 +3107,7 @@ fn status_summary_uses_last_turn_total_when_idle_and_live_estimate_while_busy() let busy_summary = widget.status_summary_text(); assert!(busy_summary.contains("↑7")); - assert!(busy_summary.contains("↺6 86%")); + assert!(busy_summary.contains("cached 6 86%")); assert!(busy_summary.contains("7/190k")); widget.handle_worker_event(crate::events::WorkerEvent::TurnFinished { @@ -3123,7 +3123,7 @@ fn status_summary_uses_last_turn_total_when_idle_and_live_estimate_while_busy() let finished_summary = widget.status_summary_text(); 
assert!(finished_summary.contains("↑19")); - assert!(finished_summary.contains("↺6 32%")); + assert!(finished_summary.contains("cached 6 32%")); assert!(finished_summary.contains("7/190k")); } @@ -3214,7 +3214,7 @@ fn new_session_prepared_appends_header_after_existing_history_and_resets_status( let summary = widget.status_summary_text(); assert!(summary.contains("↑0")); - assert!(summary.contains("↺0 0%")); + assert!(summary.contains("cached 0 0%")); assert!(summary.contains("↓0")); assert!(summary.contains("0/190k")); @@ -3256,7 +3256,7 @@ fn new_session_prepared_does_not_duplicate_startup_header_without_history() { let rows = rendered_rows(&widget, 80, 16); assert_eq!(rows.iter().filter(|row| row.contains("Devo")).count(), 1); - assert!(widget.status_summary_text().contains("↺0 0%")); + assert!(widget.status_summary_text().contains("cached 0 0%")); } #[test] @@ -3784,6 +3784,214 @@ fn read_tool_call_renders_as_explored_group_in_viewport() { assert!(display.contains("▌ Explored") || display.contains("▌ Exploring")); } +#[test] +fn read_tool_call_falls_back_to_path_when_read_name_is_empty() { + let model = Model { + slug: "test-model".to_string(), + display_name: "Test Model".to_string(), + ..Model::default() + }; + let (mut widget, _app_event_rx) = widget_with_model(model, PathBuf::from(".")); + + widget.handle_worker_event(crate::events::WorkerEvent::ToolCall { + tool_use_id: "tool-1".to_string(), + summary: "read crates/tui/src/mod.rs".to_string(), + preparing: false, + parsed_commands: Some(vec![devo_protocol::parse_command::ParsedCommand::Read { + cmd: "read crates/tui/src/mod.rs".to_string(), + name: String::new(), + path: PathBuf::from("crates/tui/src/mod.rs"), + }]), + }); + widget.handle_worker_event(crate::events::WorkerEvent::ToolResult { + tool_use_id: "tool-1".to_string(), + title: "read crates/tui/src/mod.rs".to_string(), + preview: "mod tui;".to_string(), + is_error: false, + truncated: false, + }); + + let display = widget + 
.active_cell_display_lines_for_test(80) + .into_iter() + .map(|line| { + line.spans + .into_iter() + .map(|span| span.content.to_string()) + .collect::() + }) + .collect::>() + .join("\n"); + + assert!( + display.contains("Read mod.rs"), + "expected read summary fallback in explored viewport: {display}" + ); + assert!( + !display.contains(" └ Read\n"), + "read summary should not be bare Read: {display}" + ); +} + +#[test] +fn read_tool_call_updates_placeholder_from_completed_tool_call_metadata() { + let model = Model { + slug: "test-model".to_string(), + display_name: "Test Model".to_string(), + ..Model::default() + }; + let (mut widget, _app_event_rx) = widget_with_model(model, PathBuf::from(".")); + + widget.handle_worker_event(crate::events::WorkerEvent::ToolCall { + tool_use_id: "tool-1".to_string(), + summary: "read {}".to_string(), + preparing: false, + parsed_commands: Some(vec![devo_protocol::parse_command::ParsedCommand::Read { + cmd: String::new(), + name: String::new(), + path: PathBuf::new(), + }]), + }); + + let initial_display = widget + .active_cell_display_lines_for_test(80) + .into_iter() + .map(|line| { + line.spans + .into_iter() + .map(|span| span.content.to_string()) + .collect::() + }) + .collect::>() + .join("\n"); + + assert!( + initial_display.contains("Explored") || initial_display.contains("Exploring"), + "expected read start to render as explored cell: {initial_display}" + ); + assert!( + initial_display.contains("Read"), + "expected placeholder read line: {initial_display}" + ); + assert!( + !initial_display.contains("Running read {}"), + "read placeholder should not render as a generic running tool: {initial_display}" + ); + + widget.handle_worker_event(crate::events::WorkerEvent::ToolCallUpdated { + tool_use_id: "tool-1".to_string(), + summary: "read crates/tui/src/mod.rs".to_string(), + parsed_commands: vec![devo_protocol::parse_command::ParsedCommand::Read { + cmd: "read crates/tui/src/mod.rs".to_string(), + name: 
"mod.rs".to_string(), + path: PathBuf::from("crates/tui/src/mod.rs"), + }], + }); + + let updated_display = widget + .active_cell_display_lines_for_test(80) + .into_iter() + .map(|line| { + line.spans + .into_iter() + .map(|span| span.content.to_string()) + .collect::() + }) + .collect::>() + .join("\n"); + + assert!( + updated_display.contains("Read mod.rs"), + "expected read placeholder to update in place: {updated_display}" + ); + + widget.handle_worker_event(crate::events::WorkerEvent::ToolResult { + tool_use_id: "tool-1".to_string(), + title: "read crates/tui/src/mod.rs".to_string(), + preview: "mod tui;".to_string(), + is_error: false, + truncated: false, + }); + + let completed_display = widget + .active_cell_display_lines_for_test(80) + .into_iter() + .map(|line| { + line.spans + .into_iter() + .map(|span| span.content.to_string()) + .collect::() + }) + .collect::>() + .join("\n"); + + assert!( + completed_display.contains("Read mod.rs"), + "expected completed read to remain explored: {completed_display}" + ); + assert!( + !completed_display.contains("Ran read"), + "matching result should not create generic ran cell: {completed_display}" + ); +} + +#[test] +fn consecutive_read_tool_calls_render_on_one_line_with_spaces() { + let model = Model { + slug: "test-model".to_string(), + display_name: "Test Model".to_string(), + ..Model::default() + }; + let (mut widget, _app_event_rx) = widget_with_model(model, PathBuf::from(".")); + + for name in ["mod.rs", "lib.rs", "file1.rs", "file2.rs"] { + let tool_use_id = format!("tool-{name}"); + widget.handle_worker_event(crate::events::WorkerEvent::ToolCall { + tool_use_id: tool_use_id.clone(), + summary: "read {}".to_string(), + preparing: false, + parsed_commands: Some(vec![devo_protocol::parse_command::ParsedCommand::Read { + cmd: String::new(), + name: String::new(), + path: PathBuf::new(), + }]), + }); + widget.handle_worker_event(crate::events::WorkerEvent::ToolCallUpdated { + tool_use_id: tool_use_id.clone(), + 
summary: format!("read crates/tui/src/{name}"), + parsed_commands: vec![devo_protocol::parse_command::ParsedCommand::Read { + cmd: format!("read crates/tui/src/{name}"), + name: name.to_string(), + path: PathBuf::from(format!("crates/tui/src/{name}")), + }], + }); + widget.handle_worker_event(crate::events::WorkerEvent::ToolResult { + tool_use_id, + title: format!("read crates/tui/src/{name}"), + preview: String::new(), + is_error: false, + truncated: false, + }); + } + + let display = widget + .active_cell_display_lines_for_test(120) + .into_iter() + .map(|line| { + line.spans + .into_iter() + .map(|span| span.content.to_string()) + .collect::() + }) + .collect::>() + .join("\n"); + + assert!( + display.contains("Read mod.rs lib.rs file1.rs file2.rs"), + "expected consecutive reads to render space-separated: {display}" + ); +} + #[test] fn glob_tool_call_renders_as_explored_group_in_viewport() { let model = Model { diff --git a/crates/tui/src/events.rs b/crates/tui/src/events.rs index 923e3e2d..85a42ad0 100644 --- a/crates/tui/src/events.rs +++ b/crates/tui/src/events.rs @@ -110,6 +110,15 @@ pub(crate) enum WorkerEvent { /// Optional parsed command semantics for command-like and exploration-like tools. parsed_commands: Option>, }, + /// Updated metadata for a previously started tool call. + ToolCallUpdated { + /// Stable identifier matching the original tool call. + tool_use_id: String, + /// Updated human-readable summary line. + summary: String, + /// Parsed command semantics derived from finalized tool metadata. + parsed_commands: Vec, + }, /// Incremental output delta from a running tool. ToolOutputDelta { /// Stable identifier matching the corresponding tool call. 
diff --git a/crates/tui/src/exec_cell/model.rs b/crates/tui/src/exec_cell/model.rs index 4fd873bf..c6b2c462 100644 --- a/crates/tui/src/exec_cell/model.rs +++ b/crates/tui/src/exec_cell/model.rs @@ -94,6 +94,20 @@ impl ExecCell { true } + pub(crate) fn update_call( + &mut self, + call_id: &str, + command: Vec, + parsed: Vec, + ) -> bool { + let Some(call) = self.calls.iter_mut().rev().find(|c| c.call_id == call_id) else { + return false; + }; + call.command = command; + call.parsed = parsed; + true + } + pub(crate) fn should_flush(&self) -> bool { !self.is_exploring_cell() && self.calls.iter().all(|c| c.output.is_some()) } diff --git a/crates/tui/src/exec_cell/render.rs b/crates/tui/src/exec_cell/render.rs index 5559f4bc..d2016f2d 100644 --- a/crates/tui/src/exec_cell/render.rs +++ b/crates/tui/src/exec_cell/render.rs @@ -39,6 +39,20 @@ pub(crate) struct OutputLinesParams { pub(crate) include_prefix: bool, } +fn read_display_name(name: &str, path: &std::path::Path, cmd: &str) -> String { + if !name.is_empty() { + return name.to_string(); + } + if let Some(file_name) = path.file_name() { + return file_name.to_string_lossy().to_string(); + } + let path = path.to_string_lossy(); + if !path.is_empty() { + return path.to_string(); + } + cmd.to_string() +} + pub(crate) fn new_active_exec_command( call_id: String, command: Vec, @@ -352,20 +366,22 @@ impl ExecCell { .parsed .iter() .map(|parsed| match parsed { - ParsedCommand::Read { name, .. } => name.clone(), + ParsedCommand::Read { cmd, name, path } => { + read_display_name(name, path, cmd) + } _ => unreachable!(), }) .unique(); vec![( "Read", - Itertools::intersperse(names.into_iter().map(Into::into), ", ".dim()).collect(), + Itertools::intersperse(names.into_iter().map(Into::into), " ".dim()).collect(), )] } else { let mut lines = Vec::new(); for parsed in &call.parsed { match parsed { - ParsedCommand::Read { name, .. 
} => { - lines.push(("Read", vec![name.clone().into()])); + ParsedCommand::Read { cmd, name, path } => { + lines.push(("Read", vec![read_display_name(name, path, cmd).into()])); } ParsedCommand::ListFiles { cmd, path } => { lines.push(("List", vec![path.clone().unwrap_or(cmd.clone()).into()])); diff --git a/crates/tui/src/host.rs b/crates/tui/src/host.rs index 23598e1d..7d566e42 100644 --- a/crates/tui/src/host.rs +++ b/crates/tui/src/host.rs @@ -661,6 +661,7 @@ fn handle_worker_event( | WorkerEvent::AssistantMessageCompleted(_) | WorkerEvent::ReasoningCompleted(_) | WorkerEvent::ToolCall { .. } + | WorkerEvent::ToolCallUpdated { .. } | WorkerEvent::ToolResult { .. } | WorkerEvent::PatchApplied { .. } | WorkerEvent::PlanUpdated { .. } diff --git a/crates/tui/src/lib.rs b/crates/tui/src/lib.rs index 0dc47a2a..bf79ef23 100644 --- a/crates/tui/src/lib.rs +++ b/crates/tui/src/lib.rs @@ -45,6 +45,8 @@ mod terminal_palette; mod test_backend; mod text_formatting; mod theme; +#[cfg(test)] +mod tool_rendering_e2e_tests; mod tool_result_cell; mod tui; mod ui_consts; diff --git a/crates/tui/src/tool_rendering_e2e_tests.rs b/crates/tui/src/tool_rendering_e2e_tests.rs new file mode 100644 index 00000000..84625a6d --- /dev/null +++ b/crates/tui/src/tool_rendering_e2e_tests.rs @@ -0,0 +1,146 @@ +use std::path::PathBuf; + +use devo_protocol::Model; +use pretty_assertions::assert_eq; +use tokio::sync::mpsc; + +use crate::app_event::AppEvent; +use crate::app_event_sender::AppEventSender; +use crate::chatwidget::ChatWidget; +use crate::chatwidget::ChatWidgetInit; +use crate::chatwidget::TuiSessionState; +use crate::events::WorkerEvent; +use crate::tui::frame_requester::FrameRequester; + +fn widget_with_model( + model: Model, + cwd: PathBuf, +) -> (ChatWidget, mpsc::UnboundedReceiver) { + let (app_event_tx, app_event_rx) = mpsc::unbounded_channel(); + let widget = ChatWidget::new_with_app_event(ChatWidgetInit { + frame_requester: FrameRequester::test_dummy(), + app_event_tx: 
AppEventSender::new(app_event_tx), + initial_session: TuiSessionState::new(cwd, Some(model)), + initial_thinking_selection: None, + initial_permission_preset: devo_protocol::PermissionPreset::Default, + initial_user_message: None, + enhanced_keys_supported: true, + is_first_run: false, + available_models: Vec::new(), + saved_model_slugs: Vec::new(), + show_model_onboarding: false, + startup_tooltip_override: None, + initial_theme_name: None, + }); + (widget, app_event_rx) +} + +fn active_display(widget: &ChatWidget) -> String { + widget + .active_cell_display_lines_for_test(100) + .into_iter() + .map(|line| { + line.spans + .into_iter() + .map(|span| span.content.to_string()) + .collect::() + }) + .collect::>() + .join("\n") +} + +#[test] +fn streaming_read_and_glob_updates_render_in_one_explored_cell() { + let model = Model { + slug: "test-model".to_string(), + display_name: "Test Model".to_string(), + ..Model::default() + }; + let (mut widget, _app_event_rx) = widget_with_model(model, PathBuf::from(".")); + + widget.handle_worker_event(WorkerEvent::ToolCall { + tool_use_id: "read-1".to_string(), + summary: "read {}".to_string(), + preparing: false, + parsed_commands: Some(vec![devo_protocol::parse_command::ParsedCommand::Read { + cmd: String::new(), + name: String::new(), + path: PathBuf::new(), + }]), + }); + assert_eq!( + active_display(&widget).contains("Running read {}"), + false, + "read start must render as explored placeholder" + ); + + widget.handle_worker_event(WorkerEvent::ToolCallUpdated { + tool_use_id: "read-1".to_string(), + summary: "read README.md".to_string(), + parsed_commands: vec![devo_protocol::parse_command::ParsedCommand::Read { + cmd: "read README.md".to_string(), + name: "README.md".to_string(), + path: PathBuf::from("README.md"), + }], + }); + widget.handle_worker_event(WorkerEvent::ToolResult { + tool_use_id: "read-1".to_string(), + title: "read README.md".to_string(), + preview: "# Devo".to_string(), + is_error: false, + truncated: 
false, + }); + + widget.handle_worker_event(WorkerEvent::ToolCall { + tool_use_id: "glob-1".to_string(), + summary: "glob {}".to_string(), + preparing: false, + parsed_commands: Some(vec![ + devo_protocol::parse_command::ParsedCommand::ListFiles { + cmd: "glob".to_string(), + path: Some("glob".to_string()), + }, + ]), + }); + widget.handle_worker_event(WorkerEvent::ToolCallUpdated { + tool_use_id: "glob-1".to_string(), + summary: "glob **/Cargo.toml in crates".to_string(), + parsed_commands: vec![devo_protocol::parse_command::ParsedCommand::ListFiles { + cmd: "glob **/Cargo.toml in crates".to_string(), + path: Some("**/Cargo.toml in crates".to_string()), + }], + }); + widget.handle_worker_event(WorkerEvent::ToolResult { + tool_use_id: "glob-1".to_string(), + title: "glob **/Cargo.toml in crates".to_string(), + preview: "crates/tools/Cargo.toml".to_string(), + is_error: false, + truncated: false, + }); + + let display = active_display(&widget); + assert!( + display.contains("Explored"), + "expected explored group:\n{display}" + ); + assert!( + display.contains("Read README.md"), + "expected final read file name:\n{display}" + ); + assert!( + display.contains("List **/Cargo.toml in crates"), + "expected final glob parameters:\n{display}" + ); + assert!( + !display.contains("Running read {}"), + "read must not render as generic running tool:\n{display}" + ); + assert!( + !display.contains("Ran read"), + "read result must not create a generic ran cell:\n{display}" + ); + assert!( + !display.contains("List glob"), + "glob placeholder must be replaced in place:\n{display}" + ); +} diff --git a/crates/tui/src/worker.rs b/crates/tui/src/worker.rs index a6517ae0..3303fd21 100644 --- a/crates/tui/src/worker.rs +++ b/crates/tui/src/worker.rs @@ -1267,11 +1267,13 @@ async fn run_worker_inner( ) { let summary = summarize_tool_call(&payload); + let parsed_commands = + tool_call_started_actions(&payload); let _ = event_tx.send(WorkerEvent::ToolCall { tool_use_id: 
payload.tool_call_id.clone(), summary, preparing: payload.tool_name == "write", - parsed_commands: Some(payload.command_actions), + parsed_commands: Some(parsed_commands), }); } } @@ -1691,9 +1693,18 @@ fn handle_completed_item(payload: ItemEventPayload, event_tx: &mpsc::UnboundedSe payload, .. } => { - // ToolCall is now handled via item/started; skip duplicate emission from - // item/completed since it arrives later (after the tool actually finishes). - let _ = payload; + let Ok(payload) = serde_json::from_value::(payload) else { + return; + }; + let summary = summarize_tool_call_update(&payload); + let parsed_commands = tool_call_updated_actions(&payload, &summary); + if !parsed_commands.is_empty() { + let _ = event_tx.send(WorkerEvent::ToolCallUpdated { + tool_use_id: payload.tool_call_id, + summary, + parsed_commands, + }); + } } ItemEnvelope { item_kind: ItemKind::FileChange, @@ -1948,6 +1959,133 @@ fn summarize_tool_call(payload: &ToolCallPayload) -> String { } } +fn summarize_tool_call_update(payload: &ToolCallPayload) -> String { + let summary = summarize_tool_call(payload); + if payload.tool_name == "read" + && summary == "read {}" + && let Some(cmd) = payload + .command_actions + .iter() + .find_map(|action| match action { + devo_protocol::parse_command::ParsedCommand::Read { cmd, .. } + if !cmd.is_empty() => + { + Some(cmd.clone()) + } + _ => None, + }) + { + return cmd; + } + if payload.tool_name == "glob" + && summary == "glob {}" + && let Some(cmd) = payload + .command_actions + .iter() + .find_map(|action| match action { + devo_protocol::parse_command::ParsedCommand::ListFiles { cmd, .. } + if !cmd.is_empty() => + { + Some(cmd.clone()) + } + _ => None, + }) + { + return cmd; + } + summary +} + +fn read_command_action_from_parameters( + command: &str, + input: &serde_json::Value, +) -> Option { + let path = input + .get("filePath") + .or_else(|| input.get("path")) + .and_then(serde_json::Value::as_str)? 
+ .trim(); + if path.is_empty() { + return None; + } + let name = Path::new(path) + .file_name() + .map(|name| name.to_string_lossy().to_string()) + .unwrap_or_else(|| path.to_string()); + Some(devo_protocol::parse_command::ParsedCommand::Read { + cmd: command.to_string(), + name, + path: PathBuf::from(path), + }) +} + +fn glob_command_action_from_parameters( + command: &str, + input: &serde_json::Value, +) -> Option { + let pattern = input + .get("pattern") + .and_then(serde_json::Value::as_str) + .filter(|pattern| !pattern.is_empty())?; + let path = input.get("path").and_then(serde_json::Value::as_str); + let display = match path.filter(|path| !path.is_empty()) { + Some(path) => format!("{pattern} in {path}"), + None => pattern.to_string(), + }; + Some(devo_protocol::parse_command::ParsedCommand::ListFiles { + cmd: command.to_string(), + path: Some(display), + }) +} + +fn tool_call_started_actions( + payload: &ToolCallPayload, +) -> Vec { + if !payload.command_actions.is_empty() { + return payload.command_actions.clone(); + } + if payload.tool_name == "read" { + return vec![ + read_command_action_from_parameters("read", &payload.parameters).unwrap_or_else(|| { + devo_protocol::parse_command::ParsedCommand::Read { + cmd: String::new(), + name: String::new(), + path: PathBuf::new(), + } + }), + ]; + } + if payload.tool_name == "glob" { + return vec![ + glob_command_action_from_parameters("glob", &payload.parameters).unwrap_or_else(|| { + devo_protocol::parse_command::ParsedCommand::ListFiles { + cmd: "glob".to_string(), + path: Some("glob".to_string()), + } + }), + ]; + } + Vec::new() +} + +fn tool_call_updated_actions( + payload: &ToolCallPayload, + summary: &str, +) -> Vec { + if !payload.command_actions.is_empty() { + return payload.command_actions.clone(); + } + match payload.tool_name.as_str() { + "read" => read_command_action_from_parameters(summary, &payload.parameters) + .into_iter() + .collect(), + "glob" => glob_command_action_from_parameters(summary, 
&payload.parameters) + .into_iter() + .collect(), + _ => Vec::new(), + } +} + fn make_path_relative(path: &str) -> String { let p = std::path::PathBuf::from(path); if p.is_absolute() @@ -2344,6 +2482,7 @@ mod tests { use super::normalize_display_output; use super::project_history_items; use super::summarize_tool_call; + use super::tool_call_started_actions; use super::truncate_tool_output; use crate::events::PlanStep; use crate::events::PlanStepStatus; @@ -2434,6 +2573,111 @@ mod tests { ); } + #[test] + fn read_tool_call_start_with_empty_parameters_emits_placeholder_action() { + let payload = ToolCallPayload { + tool_call_id: "call-1".to_string(), + tool_name: "read".to_string(), + parameters: serde_json::json!({}), + command_actions: Vec::new(), + }; + + assert_eq!( + tool_call_started_actions(&payload), + vec![devo_protocol::parse_command::ParsedCommand::Read { + cmd: String::new(), + name: String::new(), + path: PathBuf::new(), + }] + ); + } + + #[test] + fn completed_read_tool_call_emits_update_event() { + let (event_tx, mut event_rx) = tokio::sync::mpsc::unbounded_channel(); + handle_completed_item( + ItemEventPayload { + context: devo_server::EventContext { + session_id: SessionId::new(), + turn_id: None, + item_id: None, + seq: 1, + }, + item: ItemEnvelope { + item_id: ItemId::new(), + item_kind: ItemKind::ToolCall, + payload: serde_json::to_value(ToolCallPayload { + tool_call_id: "call-1".to_string(), + tool_name: "read".to_string(), + parameters: serde_json::json!({}), + command_actions: vec![devo_protocol::parse_command::ParsedCommand::Read { + cmd: "read crates/tui/src/mod.rs".to_string(), + name: "mod.rs".to_string(), + path: PathBuf::from("crates/tui/src/mod.rs"), + }], + }) + .expect("serialize tool call payload"), + }, + }, + &event_tx, + ); + + assert_eq!( + event_rx.try_recv().expect("worker event"), + WorkerEvent::ToolCallUpdated { + tool_use_id: "call-1".to_string(), + summary: "read crates/tui/src/mod.rs".to_string(), + parsed_commands: 
vec![devo_protocol::parse_command::ParsedCommand::Read { + cmd: "read crates/tui/src/mod.rs".to_string(), + name: "mod.rs".to_string(), + path: PathBuf::from("crates/tui/src/mod.rs"), + }], + } + ); + } + + #[test] + fn completed_glob_tool_call_emits_update_with_pattern_and_path() { + let (event_tx, mut event_rx) = tokio::sync::mpsc::unbounded_channel(); + handle_completed_item( + ItemEventPayload { + context: devo_server::EventContext { + session_id: SessionId::new(), + turn_id: None, + item_id: None, + seq: 1, + }, + item: ItemEnvelope { + item_id: ItemId::new(), + item_kind: ItemKind::ToolCall, + payload: serde_json::to_value(ToolCallPayload { + tool_call_id: "call-1".to_string(), + tool_name: "glob".to_string(), + parameters: serde_json::json!({ + "pattern": "**/Cargo.toml", + "path": "crates" + }), + command_actions: Vec::new(), + }) + .expect("serialize tool call payload"), + }, + }, + &event_tx, + ); + + assert_eq!( + event_rx.try_recv().expect("worker event"), + WorkerEvent::ToolCallUpdated { + tool_use_id: "call-1".to_string(), + summary: "glob **/Cargo.toml in crates".to_string(), + parsed_commands: vec![devo_protocol::parse_command::ParsedCommand::ListFiles { + cmd: "glob **/Cargo.toml in crates".to_string(), + path: Some("**/Cargo.toml in crates".to_string()), + }], + } + ); + } + #[test] fn completed_tool_result_falls_back_to_content_preview() { let (event_tx, mut event_rx) = tokio::sync::mpsc::unbounded_channel(); diff --git a/docs/spec-interactive-tui-v2.md b/docs/spec-interactive-tui.md similarity index 99% rename from docs/spec-interactive-tui-v2.md rename to docs/spec-interactive-tui.md index 2b668560..0ae73a88 100644 --- a/docs/spec-interactive-tui-v2.md +++ b/docs/spec-interactive-tui.md @@ -529,12 +529,12 @@ PS C:\Users\lenovo\Desktop\devo> - separated from content above by **three blank lines** - one blank line above the input line, one below - `┃` at the left edge of the input line in the theme's accent color -- status line below shows: model 
name, effort, cumulative token usage for the session (`↑` total input / `↺` cached input / `↓` total output), a context-window bar, and the last completed query's input usage against the model's effective context window +- status line below shows: model name, effort, cumulative token usage for the session (`↑` total input (cached input) / `↓` total output), a context-window bar, and the last completed query's input usage against the model's effective context window The status line token counters must obey these rules: - `↑` is the session's cumulative input token total across all completed queries in the current session -- `↺` is the session's cumulative cached-input token total across all completed queries in the current session +- cached input is the session's cumulative cached-input token total across all completed queries in the current session - `↓` is the session's cumulative output token total across all completed queries in the current session - the context-window bar is based on the last completed query's input token usage, not the cumulative session total - the context-window denominator must use the model's effective usable window, not the raw model context window diff --git a/specs/AGENTS.md b/specs/AGENTS.md new file mode 100644 index 00000000..ae35c3c8 --- /dev/null +++ b/specs/AGENTS.md @@ -0,0 +1,303 @@ +# Project Methodology: Specification-Driven Development + +This project follows a specification-driven development methodology inspired by systems engineering. + +The methodology decomposes high-level requirements into progressively more concrete engineering artifacts, maintains explicit traceability between those artifacts, and ensures that every implemented behavior can be verified through tests. + +Project specifications are stored under the `specs/` directory. 
+ +## Specification Hierarchy + +Project specifications are organized into three levels: `L1 / L2 / L3` + +The `L1 / L2 / L3` specification hierarchy is the collaboration space between humans and assistant. + +These levels represent a refinement hierarchy from abstract business intent to concrete implementation detail. + +### L1 — User and Business Requirements + +L1 contains high-level requirements from the user or business perspective. + +L1 specifications are primarily human-authored or human-approved. Assistant may help clarify, organize, or detect inconsistencies in L1 documents, but they must not change the business meaning without human approval. + +L1 specifications must follow the approved L1 requirement template. The template is required for L1 only. The template file is at `/specs/templates/spec-l1-requirement.md`. + +Typical L1 content defines business intent, user requirements, business workflows, system capabilities, functional requirements, non-functional requirements, and acceptance criteria at the product or system level. + +L1 specifications answer the question: `What must the system do, and why?` + +### L2 — Technical Requirements and High-Level Design + +L2 refines L1 content into technical requirements and high-level design decisions. + +L2 specifications are co-designed by humans and assistant. + +L2 specifications translate L1 content into technical requirements, system architecture, component boundaries, technical framework and programming language selection, standards, API contracts, data models, infrastructure assumptions, high-level implementation constraints, and other technical decisions. + +L2 specifications are free-form design documents. + +L2 specifications answer the question: `How should the system be designed to satisfy the requirements?` + +### L3 — Detailed Design and Implementation Specifications + +L3 contains concrete design and implementation-level specifications. 
+ +L3 specifications are co-designed by humans and assistant. L3 specifications are expected to be directly actionable by assistant. + +L3 documents define the detailed behavior required to implement the L2 design: concrete behavior such as algorithms, state transitions, retry policies, timeout rules, error-handling behavior, edge cases, and unit-level implementation requirements. + +L3 specifications are free-form detailed behavior documents. + +L3 specifications answer the question: `How exactly should the implementation behave?` + +### Ownership Model + +L1: Human-authored or human-approved. Assistant should not change without human approval. +L2: Human-assistant co-designed. Assistant may update when technical design changes or gaps are discovered. +L3: Human-assistant co-specified and assistant-actionable. Assistant may update directly when refining concrete implementation behavior. +Implementation: Assistant-produced +Tests: Assistant-produced + +Assistant may update L2 directly only for local technical clarifications that do not change public APIs, data models, security posture, infrastructure cost assumptions, or externally visible behavior. + +If an L2 change affects architecture, component boundaries, API contracts, data compatibility, deployment assumptions, or security constraints, assistant must propose the change for human approval before implementation. + +## Traceability + +The project uses explicit traceability links between requirements, designs, implementations, and tests. + +The primary traceability chain is: `L1 -> L2 -> L3` + +The design should be kept as decoupled as possible. A highly coupled traceability graph may indicate that the requirements or design should be decomposed further. + +Tests are verification artifacts and must be traceable to the specification artifacts they verify. + +In general, tests should primarily trace to L3 artifacts because L3 defines concrete, implementable, and verifiable behavior. 
+ +Coverage for L1 and L2 is usually derived transitively through the traceability chain: `Test -> L3 -> L2 -> L1`. + +If the unit test verifies an L3 behavior, and that L3 behavior is already traced to L2 and L1, then the unit test contributes to L1 and L2 coverage through the traceability matrix. + +The recommended mapping is: +``` +Unit Test → L3 Detailed Design / Behavior +Integration Test → L3 and/or L2 Technical Design +End-to-End Test → L1 Requirement and/or L2 Workflow Design +``` + +## Specification Status + +Each specification document must include a status field: + +- Draft +- Proposed +- Approved +- Implemented +- Deprecated +- Superseded + +Approved and Implemented specifications are baselined artifacts. Later semantic changes should be made as a new revision rather than by silently overwriting the existing baseline. + +Status transitions: + +- Draft → Proposed: Assistant may propose. +- Proposed → Approved: Human approval required. +- Approved → Implemented: Assistant may update after the corresponding implementation and tests are completed. +- Approved / Implemented → Draft: Allowed only by opening a new revision for proposed changes. The previous Approved or Implemented revision must be retained as the historical baseline and remains the active implementation authority until the new revision is approved or implemented. +- Approved / Implemented → Deprecated: Human approval required unless explicitly part of an approved replacement. +- Deprecated → Superseded: Must reference the replacing specification. + +Example: + +```text +L3-BEH-AUTH-003 Rev 1 +Status: Implemented +Active Baseline: yes + +L3-BEH-AUTH-003 Rev 2 +Status: Draft +Active Baseline: no +Supersedes: Rev 1 after approval +``` + +In this example, `Rev 1` remains the active implementation authority while `Rev 2` is still Draft. Existing implementation, tests, and traceability continue to reference `Rev 1`. 
+ +After `Rev 2` is approved, implemented, and verified, the baseline may move forward: + +```text +L3-BEH-AUTH-003 Rev 1 +Status: Superseded +Superseded-By: Rev 2 + +L3-BEH-AUTH-003 Rev 2 +Status: Implemented +Active Baseline: yes +Supersedes: Rev 1 +``` + +Assistant must not mark L1 or L2 specifications as Approved unless explicitly instructed by a human. + +Assistant should implement only against the active Approved or Implemented revision unless explicitly instructed otherwise. Deprecated or Superseded specifications must not be used as implementation authority. + +Each specification document should include the following metadata: + +- Artifact ID +- Revision +- Status +- Active Baseline +- Supersedes, if applicable +- Superseded-By, if applicable + +## Folder Hierarchy + +The hierarchy is organized as: + +``` +specs/ + L1/ + L2/ + L3/ + traceability/ +``` + +L1, L2, and L3 may contain subdirectories grouped by domain, module, feature, or subsystem. Each specification item should be stored as a separate Markdown file. + +Each specification item should have a stable artifact identifier. If only one revision exists, the file name may include only the artifact identifier. If multiple revisions are retained, the file name should also include the revision number. Example: + +If only one revision exists: + +```text +specs/L1/business-flow/L1-REQ-AUTH-001-login.md +specs/L2/auth/L2-DES-AUTH-001-authentication-architecture.md +specs/L3/auth/L3-BEH-AUTH-003-retry-policy.md +``` + +If multiple revisions are retained: +``` +specs/L3/auth/L3-BEH-AUTH-003-retry-policy.rev1.md +specs/L3/auth/L3-BEH-AUTH-003-retry-policy.rev2.md +``` + +The artifact identifier remains stable across revisions. The revision marker belongs to the file name and document metadata, not to the artifact identifier. + +The `traceability/` directory is the single source of truth for traceability relationships between specification artifacts, implementation files, and tests. 
+ +All specifications should use stable identifiers. Identifiers must be stable, unique within their scope, human-readable, and suitable for use in traceability matrices. + +Recommended identifier format: + +``` +L1-REQ-<DOMAIN>-<NNN> +L2-DES-<DOMAIN>-<NNN> +L3-BEH-<DOMAIN>-<NNN> +``` + +Implementation artifacts are referenced by file path and symbol or module name. +Test artifacts are identified by their test location and test function path. Test type is recorded separately as Unit, Integration, or End-to-End. + +Identifier meanings: + +``` +REQ = Requirement +DES = Design +BEH = Detailed Behavior +``` + +## Traceability Matrix + +Traceability relationships are maintained in the following dedicated files under `specs/traceability/`: + +``` +specs/traceability/l1_to_l2.md +specs/traceability/l2_to_l3.md +specs/traceability/l3_to_impl.md +specs/traceability/verification.md +``` + +The standard relationship types are: +``` +L1 → L2: refined-by +L2 → L3: specified-by +L3 → Implementation: realized-by +Test → Specification: verifies +``` + +Example: +``` +# L1 to L2 Traceability Matrix + +| Source ID | Source Path | Target ID | Target Path | Relationship | Rationale | +|---|---|---|---|---|---| +| L1-REQ-AUTH-001 | L1/business-flow/L1-REQ-AUTH-001-login.md | L2-DES-AUTH-001 | L2/auth/L2-DES-AUTH-001-authentication-architecture.md | refined-by | The authentication architecture refines the login requirement. | +``` + +``` +# L2 to L3 Traceability Matrix + +| Source ID | Source Path | Target ID | Target Path | Relationship | Rationale | +|---|---|---|---|---|---| +| L2-DES-AUTH-001 | L2/auth/L2-DES-AUTH-001-authentication-architecture.md | L3-BEH-AUTH-003 | L3/auth/L3-BEH-AUTH-003-retry-policy.md | specified-by | The retry policy specifies concrete behavior required by the authentication architecture. 
| +``` + +``` +# L3 to Implementation Traceability Matrix + +| Spec ID | Revision | Spec Path | Implementation Path | Symbol / Module | Relationship | Notes | +|---|---:|---|---|---|---|---| +| L3-BEH-AUTH-003 | 1 | L3/auth/L3-BEH-AUTH-003-retry-policy.rev1.md | crates/auth/src/retry.rs | auth::retry | realized-by | Implements retry policy for transient authentication failures. | +``` + +``` +# Verification Traceability Matrix + +| Test Reference | Test Type | Test Location | Directly Verifies | Verified Revision | Derived Coverage | Notes | +|---|---|---|---|---:|---|---| +| auth::retry::tests::retries_transient_failure_three_times | Unit | crates/auth/src/retry.rs | L3-BEH-AUTH-003 | 1 | L2-DES-AUTH-001, L1-REQ-AUTH-001 | Verifies retry count and stop condition. | +``` + +The traceability matrices provide the foundation for requirement coverage analysis, test coverage analysis, impact analysis, change management, design review, implementation review, verification, and validation. + + +## Test Trace Comment + +Tests should declare traceability metadata using structured comments immediately above the test function. + +The preferred format is: + +```rust +/// Trace: <SPEC-ID>[, <SPEC-ID> ...] +/// Verifies: <short description of the verified behavior> +#[test] +fn test_name() { + ... +} +``` + +The `Trace` line is mandatory for tests that verify behavior derived from the specification hierarchy. + +The `Verifies` line is recommended because it helps both humans and Assistant understand why the test exists. + +A test may reference multiple specification artifacts when it verifies behavior that spans multiple requirements, design elements, or detailed behaviors. + +## Guidance for Assistant + +When adding or modifying behavior, Assistant must identify the relevant L3 specification first. + +Non-behavioral changes such as formatting, lint-only edits, dead-code removal, internal refactors that preserve observable behavior, CI maintenance, and test utilities do not require new L3 behavior specifications. 
When possible, such changes should reference the closest relevant existing specification. If no relevant specification exists, the change should be described as engineering maintenance in the implementation summary. + +When adding or modifying tests, Assistant must identify the behavior being verified and attach traceability metadata to the test. + +Assistant may update specification documents when implementation work reveals that the existing specification is incomplete, ambiguous, inconsistent, or insufficiently actionable. The assistant’s authority depends on the specification level. + +The preferred workflow is: +``` +1. Identify the relevant L1 / L2 / L3 artifacts. +2. Select the L3 artifact that defines the concrete behavior. +3. If the L3 artifact is missing, ambiguous, incomplete, or inconsistent with implementation needs, update or create the L3 artifact before implementing code. +4. If the issue affects architecture, component boundaries, API contracts, data models, or technical constraints, update the corresponding L2 artifact as well. +5. Implement the behavior in the corresponding source file. +6. Add or update tests for the implemented behavior. +7. Add traceability metadata to the tests. +8. Update the traceability matrix if new artifacts or relationships are introduced. +9. Ensure that every implemented behavior is traceable to at least one specification artifact. +``` diff --git a/specs/L1/L1-REQ-AGENT-001-execution-workflow.md b/specs/L1/L1-REQ-AGENT-001-execution-workflow.md new file mode 100644 index 00000000..20b058cb --- /dev/null +++ b/specs/L1/L1-REQ-AGENT-001-execution-workflow.md @@ -0,0 +1,77 @@ +--- +artifact_id: L1-REQ-AGENT-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-AGENT-001 — Execution Workflow + +## Purpose + +Define the end-to-end user-visible workflow for an agentic coding task. 
+ +## Why This Matters + +Users need the program to carry work from intent to verified outcome, not merely produce suggestions. A clear execution workflow prevents ambiguous task state and makes the program accountable for what it changed, checked, or could not complete. + +## Background / Context + +The program is expected to behave as a coding agent that can carry work from user intent through tool use, code changes, verification, and final reporting. This requirement keeps that product behavior separate from internal runtime design. + +## User / Business Requirement + +The program must support a complete task execution workflow from user request to final outcome, while making task state and important progress visible to the user. + +## Real User Scenarios + +- A user asks the program to implement a feature; the program inspects the workspace, plans the change, edits files, runs verification, and reports the result. +- A user asks the program to debug a failure; the program gathers evidence, executes targeted tools, identifies the cause, and explains whether the fix is complete. + +## Functional Requirements + +- The program must understand the user request in the context of the current session and workspace. +- The program must plan when the task is complex, risky, or explicitly requires planning. +- The program must execute required tool calls and report important progress, blockers, and failures. +- The program must produce a final response that states what was completed, what changed, how it was verified, and what remains unresolved. + +## Non-Functional Requirements + +- Task state must be understandable without reading internal logs. +- Execution history must be durable enough to support later review and recovery. + +## Acceptance Criteria + +- Given a multi-step coding task, when the program starts execution, then the user can identify whether the task is running, waiting, completed, failed, or interrupted. 
+- Given a task that changes files, when the program finishes, then the final response includes the change scope and verification result. +- Given a task cannot be completed, when the program stops, then the final response identifies the blocker, the completed work, and the next practical action. +- Given the program asks for user input during execution, when the user responds, then the workflow continues without losing the prior task state. + +## Out of Scope + +- The program does not define internal runtime state machines, protocol payloads, scheduling algorithms, or retry algorithms in this L1 requirement. +- This requirement does not guarantee that every task can be completed autonomously. + +## Open Questions + +- Which classes of tasks should require an explicit visible plan before execution? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | Defines the server-side execution engine that carries accepted user input through context assembly, model invocation, tool dispatch, and terminal turn outcome. | +| related-to | L2-DES-AGENT-002 | 1 | specs/L2/agent/L2-DES-AGENT-002-interrupt-resume-control.md | Interrupt and resume control operates on the execution workflow. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Protocol requests and notifications expose execution state to clients. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Durable session records preserve turn execution history. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-22 | Human | Traceability | Linked the requirement to the L2 agent execution engine design. 
| diff --git a/specs/L1/L1-REQ-AGENT-002-interrupt-resume.md b/specs/L1/L1-REQ-AGENT-002-interrupt-resume.md new file mode 100644 index 00000000..b7345831 --- /dev/null +++ b/specs/L1/L1-REQ-AGENT-002-interrupt-resume.md @@ -0,0 +1,83 @@ +--- +artifact_id: L1-REQ-AGENT-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-AGENT-002 — Interrupt and Resume + +## Purpose + +Ensure that users can control long-running or misdirected work. + +## Why This Matters + +Agentic work can take time, call tools, and modify files. Users must be able to regain control quickly when the work is wrong, risky, too expensive, or no longer useful. + +## Background / Context + +Agentic work may include model generation, command execution, background processes, file edits, and delegated work. Users need predictable control over these activities. + +## User / Business Requirement + +The program must let the user interrupt, cancel, inspect, and resume work where recovery is possible. + +## Real User Scenarios + +- A user notices that the program is editing the wrong module and interrupts the task before more files change. +- A user stops model generation after noticing that the response is going in the wrong direction or spending unnecessary tokens. +- A user stops a long-running command, reviews the partial output, and resumes the task with new instructions. + +## Functional Requirements + +- The user must be able to interrupt current model generation. +- The user must be able to stop or cancel running tools and background tasks where safe. +- The user must be able to inspect current background processes before deciding whether to stop them. +- The program must preserve completed steps, outputs, and file-change state after interruption. +- The program must support resuming an interrupted task when enough context remains available. 
+ +## Non-Functional Requirements + +- Interruption feedback must be visible and timely. +- The program must not silently leave background work running after a user cancellation. + +## Acceptance Criteria + +- Given an active turn, when the user interrupts execution, then the program shows that execution has stopped or is stopping. +- Given an interrupted task with recoverable context, when the user resumes, then the program continues with awareness of prior progress rather than treating it as a new task. +- Given a running tool cannot be stopped immediately, when the user cancels, then the program reports that cleanup is pending or explains the remaining process state. +- Given background processes started by the program are still running, when the user inspects active work, then the program exposes those processes before the user chooses whether to stop them. +- Given file changes exist after interruption, when the user reviews the task, then the program identifies which changes were produced before interruption. + +## Out of Scope + +- The program does not define platform-specific process termination signals, process-group handling, or UI keybindings in this L1 requirement. +- This requirement does not guarantee that every external process can be safely resumed after cancellation. + +## Open Questions + +- Should the program distinguish interrupt, cancel, abort, and pause as separate user actions? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-TOOL-005 | 1 | specs/L1/L1-REQ-TOOL-005-background-process-management.md | Background process management defines the process list and manual stop behavior used for background task control. | +| refined-by | L2-DES-AGENT-002 | 1 | specs/L2/agent/L2-DES-AGENT-002-interrupt-resume-control.md | Defines server-owned interrupt, active work inspection, cleanup, and resume behavior. 
| +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | Interrupt and resume operate on active execution engine state. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Protocol requests and notifications expose interrupt and resume controls. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Durable records preserve interrupted and resumed turn state. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added inspection of current background processes before manual stop decisions. | +| 1 | 2026-05-21 | Human | Refinement | Added a real user scenario for stopping model generation. | +| 1 | 2026-05-22 | Human | Traceability | Linked the requirement to the L2 interrupt and resume control design. | diff --git a/specs/L1/L1-REQ-AGENT-003-task-planning.md b/specs/L1/L1-REQ-AGENT-003-task-planning.md new file mode 100644 index 00000000..ca731c0b --- /dev/null +++ b/specs/L1/L1-REQ-AGENT-003-task-planning.md @@ -0,0 +1,83 @@ +--- +artifact_id: L1-REQ-AGENT-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-AGENT-003 — Task Planning + +## Purpose + +Provide a shared plan between the user and the agent for multi-step work. + +## Why This Matters + +Plans make agentic work legible. They let the user see the intended route, correct direction early, and distinguish genuine progress from hidden model activity. + +## Background / Context + +For complex tasks, users need a visible representation of intended steps and current execution state. A plan is user-facing task state, not hidden reasoning. 
+ +Plan Mode is a stricter planning interaction where the agent analyzes and produces a strategic plan without modifying files. + +## User / Business Requirement + +The program must support visible task planning with status updates during execution. + +## Real User Scenarios + +- A user asks for a refactor across several modules and wants to approve the approach before edits begin. +- A user watches a long task progress through investigation, implementation, tests, and cleanup steps. + +## Functional Requirements + +- The user must be able to request a plan before execution. +- The user must be able to enter Plan Mode for analysis and strategic planning without file modification. +- The program may create a plan when task complexity or risk justifies it. +- The plan must represent pending, in-progress, completed, and blocked states. +- The program must update the plan when execution status or user constraints change. + +## Non-Functional Requirements + +- Plan state must remain consistent with actual execution state. +- The plan must not expose private model reasoning as if it were program state. + +## Acceptance Criteria + +- Given a planned task, when a step begins, then the plan marks that step as in progress. +- Given Plan Mode is active, when the user asks for planning, then the program produces a strategic plan without applying file changes. +- Given a blocked step, when the program cannot proceed, then the plan reflects the blocker rather than marking the step complete. +- Given the user changes the objective, when the plan is still active, then the program updates the plan or explains why the old plan no longer applies. +- Given parallel work is delegated, when more than one step is active, then the plan makes the parallelism explicit. + +## Out of Scope + +- The program does not define internal plan data structures, plan-generation algorithms, or UI rendering details in this L1 requirement. 
+- This requirement does not make a visible plan mandatory for every trivial task. + +## Open Questions + +- Which task types should automatically create a visible plan? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-AGENT-005 | 1 | specs/L1/L1-REQ-AGENT-005-plan-mode.md | Plan Mode defines the planning-only interaction behavior and no-file-modification rule. | +| refined-by | L2-DES-TOOL-001 | 1 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | Defines the plan tool as visible to-do state for task planning and execution progress. | +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | The execution engine updates visible plan state as work proceeds. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Protocol events expose plan updates to clients. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Durable records preserve plan state for replay and recovery. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added Plan Mode as a planning-only interaction without file modification. | +| 1 | 2026-05-22 | Human | Traceability | Linked task planning to the L2 built-in tool system and plan tool design. 
| diff --git a/specs/L1/L1-REQ-AGENT-004-subagents.md b/specs/L1/L1-REQ-AGENT-004-subagents.md new file mode 100644 index 00000000..77e1c822 --- /dev/null +++ b/specs/L1/L1-REQ-AGENT-004-subagents.md @@ -0,0 +1,77 @@ +--- +artifact_id: L1-REQ-AGENT-004 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-AGENT-004 — Subagents + +## Purpose + +Allow bounded work to be delegated to parallel agents while keeping the main workflow coherent. + +## Why This Matters + +Parallel delegation can shorten large investigations, but it can also create duplicate work or conflicting edits. Users need subagents to be scoped, visible, and integrated by the main workflow. + +## Background / Context + +Some tasks benefit from parallel exploration, implementation, or verification. Delegation must remain visible, scoped, and safe from the user perspective. Subagents may need to inherit the current conversation context, and session forking can provide that context without modifying the parent session. + +## User / Business Requirement + +The program must support subagents for delegated, bounded work and must let the user inspect their status and results. + +## Real User Scenarios + +- A user asks the main agent to investigate two independent subsystems in parallel and compare the findings. +- A user asks one subagent to implement a bounded patch while another subagent checks related tests. + +## Functional Requirements + +- The user must be able to request creation of a subagent. +- Each subagent must have a clear task, scope, and expected output. +- The user must be able to inspect subagent status and final results. +- The main agent must integrate subagent findings, patches, or verification results into the main workflow. +- Subagents should be able to start from a forked session or equivalent forked context when delegated work requires existing conversation history. 
+- When a subagent uses a forked session or forked context, the relationship to the parent session must remain visible. + +## Non-Functional Requirements + +- Subagents must respect the same safety, permission, and workspace boundaries as the main session. +- The program must reduce duplicate work and conflicting edits when multiple subagents run concurrently. + +## Acceptance Criteria + +- Given a delegated task, when a subagent starts, then the user can see what work was delegated. +- Given a completed subagent, when the main agent reports final status, then the subagent result is summarized or integrated in the main session. +- Given a subagent modifies files, when the main workflow reports results, then the changed files and ownership of the work are visible. +- Given a subagent fails or is canceled, when the user checks status, then the failure or cancellation is visible without hiding the main task state. +- Given a subagent starts from a forked session or forked context, when the user inspects the subagent, then the parent session relationship is visible. + +## Out of Scope + +- The program does not define subagent scheduling, workspace forking, merge mechanics, or communication protocols in this L1 requirement. +- This requirement does not allow subagents to bypass safety, approval, or workspace boundaries. + +## Open Questions + +- Can a subagent request user approval directly, or must approval route through the main agent? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/agent/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added subagent use of forked session context and parent-session visibility. 
| diff --git a/specs/L1/L1-REQ-AGENT-005-plan-mode.md b/specs/L1/L1-REQ-AGENT-005-plan-mode.md new file mode 100644 index 00000000..df6e9778 --- /dev/null +++ b/specs/L1/L1-REQ-AGENT-005-plan-mode.md @@ -0,0 +1,91 @@ +--- +artifact_id: L1-REQ-AGENT-005 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-AGENT-005 — Plan Mode + +## Purpose + +Allow users to request analysis and strategic planning without allowing the agent to modify files during that mode. + +## Background / Context + +Some tasks require careful codebase analysis before implementation. Users may want the agent to inspect the repository, reason about constraints, and produce a plan without making changes. Plan Mode provides that behavior. + +Plan Mode is not a session-level agent mode such as Coding Mode or Security Mode. It is a session-local agent interaction mode that may be entered during a session. Normal Mode is the ordinary non-Plan agent interaction mode. + +Because asking the user a question can interrupt active execution, the dedicated question tool is reserved for Plan Mode. In Normal Mode, the agent must not invoke the question tool. + +## User / Business Requirement + +The program must support Plan Mode, where the agent can analyze the codebase and produce a strategic plan while being prohibited from modifying files. + +## Real User Scenarios + +- A user asks the agent to inspect a complex subsystem and propose an implementation plan before any files are changed. +- A user enters Plan Mode because they want clarification and design discussion before committing to edits. +- A user expects Normal Mode execution to continue without being interrupted by the question tool. + +## Functional Requirements + +- The program must support Plan Mode as a session-local agent interaction mode. +- In Plan Mode, the agent must not create, edit, delete, rename, or otherwise modify files. 
+- In Plan Mode, the agent may read files, search the codebase, inspect project context, and use other non-mutating analysis capabilities where permitted. +- In Plan Mode, the agent must produce a strategic plan based on user input and codebase analysis. +- In Plan Mode, if the agent needs clarification from the user, it may use the question tool. +- The question tool must be available only in Plan Mode. +- In Normal Mode, the agent must not invoke the question tool. +- Entering or leaving Plan Mode must not change the session-level agent mode such as Coding Mode or Security Mode. + +## Non-Functional Requirements + +- Plan Mode must provide strong protection against accidental file modification. +- Plan Mode output must be actionable enough for the user to decide whether to proceed with implementation. +- The restriction on the question tool must be clear enough that Normal Mode execution is not disrupted by unexpected user-question prompts. +- Plan Mode status must be visible to the user when active. + +## Acceptance Criteria + +- Given Plan Mode is active, when the agent analyzes a task, then it does not modify files. +- Given Plan Mode is active, when the agent needs clarification, then it may ask the user through the question tool. +- Given Normal Mode is active, when the agent needs to continue work, then it does not invoke the question tool. +- Given Plan Mode is active, when the agent completes analysis, then it provides a strategic plan rather than applying changes. +- Given the TUI enters Plan Mode, when the user inspects the session-level agent mode, then Coding Mode or Security Mode remains unchanged. +- Given a mutating tool is requested while Plan Mode is active, when the program evaluates the request, then the program blocks the mutation or reports that Plan Mode prohibits file modification. + +## Out of Scope + +- This requirement does not define the exact command, keybinding, label, or visual design used to enter or leave Plan Mode. 
+- This requirement does not define the internal implementation of the question tool. +- This requirement does not prohibit approval prompts or safety prompts that are separate from the question tool. +- This requirement does not require Plan Mode to produce an implementation patch. + +## Open Questions + +- Should Plan Mode allow non-file side effects such as running commands, network requests, or subagents? +- Should Plan Mode end automatically after a plan is produced, or remain active until the user exits it? +- What exact user action should convert a Plan Mode plan into Normal Mode implementation work? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-TUI-009 | 1 | specs/L1/L1-REQ-TUI-009-session-input-modes.md | The TUI exposes Plan Mode as a session-local input mode and status-line label. | +| related-to | L1-REQ-AGENT-003 | 1 | specs/L1/L1-REQ-AGENT-003-task-planning.md | Task planning defines visible plan state refined by Plan Mode behavior. | +| related-to | L1-REQ-TOOL-002 | 1 | specs/L1/L1-REQ-TOOL-002-tools.md | Built-in tools include user-question capability, which Plan Mode restricts. | +| related-to | L2-DES-TOOL-001 | 1 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | Tool mode gating enforces Plan Mode restrictions for mutating tools and the question tool. | +| refined-by | TBD | TBD | specs/L2/agent/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved Plan Mode requirement. | +| 1 | 2026-05-22 | Human | Traceability | Linked Plan Mode restrictions to the L2 tool system design. 
| diff --git a/specs/L1/L1-REQ-APP-001-client-server-arch.md b/specs/L1/L1-REQ-APP-001-client-server-arch.md new file mode 100644 index 00000000..ded974d1 --- /dev/null +++ b/specs/L1/L1-REQ-APP-001-client-server-arch.md @@ -0,0 +1,84 @@ +--- +artifact_id: L1-REQ-APP-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-APP-001 — Client Server Architecture + +## Purpose + +Ensure that the program separates user-facing clients from the agent runtime capability they control. + +## Why This Matters + +Users should not get different task semantics depending on which client they use. A shared runtime makes sessions, approvals, task state, and history consistent across TUI, desktop, IDE, and future clients. + +## Background / Context + +The program may expose multiple user interfaces over time. The initial client surface is the TUI, but agent execution, session state, tool execution, safety decisions, and model interactions should not be owned by one client implementation. + +Users should experience consistent core behavior whether they interact through the TUI or a future client surface. + +For example, the TUI, a desktop client, and an IDE extension client may all connect to the same agent runtime. When the user starts a task, approves an action, interrupts work, or resumes a session from one client, the other connected clients should observe the same underlying session and task state. + +## User / Business Requirement + +The program must provide a client/server product architecture where clients present and control work, while the server-side agent capability owns shared execution behavior. + +## Real User Scenarios + +- A user starts a task in the TUI, opens a desktop client, and sees the same running session state. +- A user approves a tool action from one client and expects the approval decision to be reflected in other connected clients. 
+ +## Functional Requirements + +- The program must provide a server-side agent capability that can be used by client surfaces. +- The initial product must provide a TUI client surface. +- Client surfaces must be able to start, observe, interrupt, and resume agent work through the shared agent capability. +- Multiple connected client surfaces must observe the same underlying session, turn, approval, and task state when they are connected to the same agent runtime. +- Core behaviors such as sessions, turns, model calls, tool execution, approvals, safety checks, and persistence must be shared rather than reimplemented independently by each client. +- Client-specific UI behavior may differ, but it must not change the meaning of core agent execution. + +## Non-Functional Requirements + +- Core agent behavior must remain client-neutral where possible. +- A future client should be able to reuse the same core capabilities without changing existing TUI behavior. +- Client/server boundaries must preserve user-visible consistency for task state, history, approvals, and errors. + +## Acceptance Criteria + +- Given the TUI client starts a task, when the task runs, then the shared agent capability owns execution rather than TUI-only logic. +- Given a future client surface, when it starts or resumes a session, then it observes the same session and turn semantics as the TUI. +- Given the TUI, desktop client, and IDE extension are connected to the same agent runtime, when the user performs an operation in one client, then the other clients observe the same updated session or task state. +- Given a tool approval is required, when any client surface is active, then the approval represents the same safety decision in the shared agent capability. +- Given an error occurs during model or tool execution, when any client surface reports it, then the user receives the same underlying failure state. 
+- Given a client reconnects to an active session, when the session is still running, then the client can observe the current state instead of creating a conflicting duplicate session. + +## Out of Scope + +- The program does not define transport protocols, wire payloads, crate boundaries, deployment topology, or process layout in this L1 requirement. +- This requirement does not require every client to have identical UI layout or interaction shortcuts. +- Tool extensibility is covered by separate tool and integration requirements. + +## Open Questions + +- Which client surfaces after the TUI should be considered first-class? +- Should local and remote clients share the same user-facing capability guarantees? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | L2 defines protocol, transport, and server instance ownership for shared clients. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-APP-002-persistence.md b/specs/L1/L1-REQ-APP-002-persistence.md new file mode 100644 index 00000000..45e5cc27 --- /dev/null +++ b/specs/L1/L1-REQ-APP-002-persistence.md @@ -0,0 +1,108 @@ +--- +artifact_id: L1-REQ-APP-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-APP-002 — Persistence + +## Purpose + +Preserve user work and conversation history across application restarts. + +## Why This Matters + +Users rely on the program for long-running work and later review. Persistent history prevents completed decisions, tool output, approvals, and task state from disappearing when the application exits.
+ +## Background / Context + +Coding-agent sessions often span many turns and may include tool outputs, file edits, approvals, queued user messages, steering interventions, fork relationships, and decisions that must remain available later. + +Users may exit the application or leave a task before the active work has completed. In that case, the program must preserve enough active-task state to resume that task on the next launch and show the user what pending input or intervention was restored. + +## User / Business Requirement + +The program must persist conversation and execution history so users can resume and inspect prior work. + +If a user exits while a task is active or incomplete, the program must automatically resume that task on the next launch. + +## Real User Scenarios + +- A user closes the application after a long debugging session and later reopens it to continue from the previous context. +- A user reviews yesterday's tool output to understand why a file was changed. +- A user exits while a task is active, then relaunches the program and sees the task resumed automatically. +- A user had pending `steer` and `queue` items when the application exited, then relaunches and sees those restored items displayed in the client interface. + +## Functional Requirements + +- The program must save conversation history durably. +- The program must load prior conversation history when the application starts. +- The program must preserve tool calls, tool outputs, approvals, and final responses as part of history. +- The program must support finding or selecting prior sessions. +- The program must persist active incomplete task state when the user exits. +- The program must automatically resume an active incomplete task when the user next launches the program. +- The program must persist all tasks or messages currently held in both `steer` and `queue` queues. +- The program must restore `steer` and `queue` queue contents on next launch. 
+- The client interface must display restored `steer` and `queue` queue contents so the user can understand what will influence or follow the resumed task. +- The program must persist fork relationships between child sessions, parent sessions, and fork turns. +- Forked session persistence must allow inherited history to remain viewable without requiring a deep copy of the entire parent session history. +- Forked session persistence must not require the parent session file to remain accessible after the parent is deleted, unless the fork itself is also deleted by an explicit cascade policy. + +## Non-Functional Requirements + +- Persistence must avoid silent data loss. +- Stored history must remain usable after normal application restart. +- Active-task restoration must be reliable enough that users can trust exit and relaunch workflows. +- Restored `steer` and `queue` items must be visible without being confused with already-executed transcript items. +- Forked session persistence must avoid unnecessary storage growth while preserving user-visible inherited history. + +## Acceptance Criteria + +- Given an existing session, when the user reopens the application, then the user can access that session history. +- Given a completed tool call, when the session is reloaded, then the relevant tool call record remains available for review. +- Given a turn ended with an approval decision, when the session is reopened, then the decision remains visible in history. +- Given a user exits while a task is active or incomplete, when the program launches next time, then that task is automatically resumed. +- Given `steer` queue items existed at exit time, when the program launches next time, then those items are restored and displayed in the client interface. +- Given `queue` queue items existed at exit time, when the program launches next time, then those items are restored and displayed in the client interface. 
+- Given restored `steer` or `queue` items are displayed, when the user inspects them, then the user can distinguish pending restored items from already-completed transcript history. +- Given a forked session is restored after restart, when the user opens it, then the inherited history and parent-session relationship remain visible. +- Given a forked session is persisted, when the storage representation is created, then the program avoids a full deep copy of the parent session history records. +- Given a parent session has been deleted, when a surviving forked session is restored, then the fork's inherited history remains viewable without opening the deleted parent session file. +- Given persistence fails, when the user continues working, then the program reports the risk of unsaved history. + +## Out of Scope + +- The program does not define storage backend, file format, database schema, or retention policy in this L1 requirement. +- The program does not define the internal data model for `steer` and `queue` queues in this L1 requirement. +- The program does not define the internal shallow-copy or reference mechanism for forked session history in this L1 requirement. +- This requirement does not guarantee indefinite retention of all historical data without user-configured limits. + +## Open Questions + +- Should session deletion support a recovery window? +- Should automatic active-task resume continue execution immediately, or restore into a waiting state when the next pending action is risky? +- Beyond the immediately previous eligible message, should restored `steer` and `queue` items be editable or cancelable before resumed execution continues? +- Which storage strategy should be preferred for inherited fork history: protected shared segments, materialized fork segments, or protected retained source records? 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | L2 defines durable JSONL session records, replay, and recovery. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | L2 defines reconnect and catch-up behavior over the shared server protocol. | +| related-to | L1-REQ-CONV-005 | 1 | specs/L1/L1-REQ-CONV-005-immediate-message-editing.md | Immediate message editing requires persisted edit records and replay behavior. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added automatic active-task resume and restored `steer` / `queue` queue display requirements. | +| 1 | 2026-05-21 | Human | Refinement | Added fork relationship persistence and shallow-copy inherited history requirements. | +| 1 | 2026-05-22 | Human | Refinement | Narrowed restored-message edit open question after adding immediate previous message editing. | +| 1 | 2026-05-22 | Human | Refinement | Required surviving forks to replay inherited history without relying on deleted parent session files. | diff --git a/specs/L1/L1-REQ-APP-003-safety.md b/specs/L1/L1-REQ-APP-003-safety.md new file mode 100644 index 00000000..98de1192 --- /dev/null +++ b/specs/L1/L1-REQ-APP-003-safety.md @@ -0,0 +1,77 @@ +--- +artifact_id: L1-REQ-APP-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-APP-003 — Application Safety + +## Purpose + +Protect user systems, files, credentials, and decision authority while the program performs agentic work. + +## Why This Matters + +The program can run commands, read files, edit code, and contact external services. 
Users must remain in control of risky actions and must be able to trust that boundaries are enforced. + +## Background / Context + +The program can execute tools, read files, modify code, access networks, and interact with external services. Safety must be a program guarantee. Some modes may add stricter safety rules for specialized work. + +## User / Business Requirement + +The program must enforce permissions, sandboxing, and user approval for actions that can affect user data, systems, or external resources. + +## Real User Scenarios + +- A command wants to write outside the workspace, and the user is asked for explicit approval before it runs. +- A network request is blocked by the current policy, and the program explains what permission would be needed. + +## Functional Requirements + +- The program must support permission modes for tool and resource access. +- The program must support sandboxing for risky execution where available. +- The program must enforce mode-specific safety rules where an active mode defines stricter behavior. +- The program must request explicit user approval for actions outside the current permission boundary. +- The program must record approval and denial outcomes in user-visible history. + +## Non-Functional Requirements + +- Safety decisions must be explainable to the user. +- The program must fail closed when permission state is ambiguous. + +## Acceptance Criteria + +- Given an action that exceeds current permissions, when the program attempts it, then the user receives an approval request before execution. +- Given a denied approval request, when the program continues, then it must not perform the denied action. +- Given permission state is unclear, when a risky action is requested, then the program refuses or asks for clarification instead of guessing. +- Given a user grants scoped approval, when later unrelated work requests broader access, then the earlier approval is not treated as unlimited permission. 
+- Given an active mode defines stricter safety behavior, when an action conflicts with that mode's safety rules, then the program blocks or escalates the action according to that mode. + +## Out of Scope + +- The program does not define sandbox implementation details or policy engine internals in this L1 requirement. +- This requirement does not promise that all operating systems provide identical sandbox strength. + +## Open Questions + +- Which permission modes should be exposed directly to users? +- Which mode-specific safety rules should be user-configurable, and which should be mandatory? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/app/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added mode-specific safety behavior. | diff --git a/specs/L1/L1-REQ-APP-004-observability.md b/specs/L1/L1-REQ-APP-004-observability.md new file mode 100644 index 00000000..2a3d833a --- /dev/null +++ b/specs/L1/L1-REQ-APP-004-observability.md @@ -0,0 +1,80 @@ +--- +artifact_id: L1-REQ-APP-004 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-APP-004 — Observability + +## Purpose + +Make system behavior diagnosable for users and maintainers. + +## Why This Matters + +Agentic failures often cross model, tool, runtime, and UI boundaries. Useful observability lets users and maintainers locate the failing stage without guessing or relying on hidden state. + +## Background / Context + +Agentic workflows cross model calls, tools, clients, servers, and external integrations. Failures must be diagnosable without relying on guesswork. 
+ +## User / Business Requirement + +The program must provide observability across the client, server, user interface, model calls, and tool execution paths. + +## Real User Scenarios + +- A model call fails and the user needs to know whether the failure came from provider credentials, network access, or model availability. +- A model response streams incorrectly and the user enables trace logging to inspect recorded stream events. +- A tool appears slow and the user wants to see whether the program is waiting on the command, model, approval, or rendering. + +## Functional Requirements + +- The program must support structured logs for important lifecycle events. +- The program must support configurable log levels such as trace, debug, info, warn, and error. +- Trace logging must support diagnostic records for large language model streaming response events where model calls stream. +- The program must expose user-relevant diagnostics such as current model, token usage, tool timing, and waiting state. +- The program may support optional telemetry when the user enables it. + +## Non-Functional Requirements + +- Logs and telemetry must not expose secrets. +- Trace logs that include model stream data must respect privacy, secret-handling, and configured retention controls. +- Diagnostics must be actionable rather than generic. + +## Acceptance Criteria + +- Given a failed tool call, when the user inspects diagnostics, then the user can identify the failing tool and failure phase. +- Given telemetry is disabled, when the program runs, then it does not send telemetry events. +- Given a turn is waiting, when the user inspects status, then the program identifies whether it is waiting for model output, tool output, approval, or user input. +- Given logs are collected, when they include task identifiers, then related events can be correlated without exposing secrets. 
+- Given trace logging is enabled during a streaming model response, when logs are collected, then the streaming response events are available for diagnostic inspection. + +## Out of Scope + +- The program does not define telemetry server design, metrics backend, or log storage format in this L1 requirement. +- This requirement does not require every diagnostic event to be shown directly in the primary UI. + +## Open Questions + +- Should telemetry be disabled by default or explicitly chosen during onboarding? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-LLM-003 | 1 | specs/L1/L1-REQ-LLM-003-observability.md | Model usage observability defines trace-mode recording of streaming response events. | +| refined-by | L2-DES-APP-004 | 1 | specs/L2/app/L2-DES-APP-004-observability-architecture.md | Defines structured logs, user-facing diagnostics, trace-mode controls, correlation, redaction, retention, and telemetry boundaries. | +| related-to | L2-DES-LLM-003 | 1 | specs/L2/llm/L2-DES-LLM-003-model-usage-observability.md | Defines the model-specific usage and streaming trace records used by application observability. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added trace logging support for large language model streaming response events. 
| diff --git a/specs/L1/L1-REQ-APP-005-lightweight.md b/specs/L1/L1-REQ-APP-005-lightweight.md new file mode 100644 index 00000000..e8437048 --- /dev/null +++ b/specs/L1/L1-REQ-APP-005-lightweight.md @@ -0,0 +1,73 @@ +--- +artifact_id: L1-REQ-APP-005 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-APP-005 — Lightweight Operation + +## Purpose + +Keep the program efficient enough for everyday local development use. + +## Why This Matters + +The program runs in developer workflows where latency, memory growth, and unnecessary CPU load directly affect day-to-day work. A coding agent that becomes heavy during long sessions will stop being practical. + +## Background / Context + +The program may run for long sessions, process large transcripts, and operate inside developer machines with limited resources. + +## User / Business Requirement + +The program must avoid unnecessary memory, CPU, and startup overhead while preserving required functionality. + +## Real User Scenarios + +- A user keeps a long session open while working in a large repository and expects the program to remain responsive. +- A user streams a large tool output and expects the program to bound rendering, storage, and context usage. + +## Functional Requirements + +- The program must avoid retaining unnecessary data in memory after it is no longer needed. +- The program must remain responsive during normal conversation, search, streaming, and tool execution workflows. +- The program must make resource-heavy operations visible when they affect user experience. + +## Non-Functional Requirements + +- Memory usage must be treated as a program constraint. +- Performance targets should be measurable in L2 or L3 specifications. + +## Acceptance Criteria + +- Given a long session, when the user continues working, then the program remains usable without obvious avoidable memory growth. 
+- Given a large output or transcript, when the program renders or stores it, then it applies bounded behavior rather than allowing unbounded resource use. +- Given the program starts in a normal workspace, when initialization completes, then startup overhead does not include unnecessary indexing or loading of unrelated data. +- Given a resource-heavy operation is running, when it affects responsiveness, then the program exposes enough status for the user to understand the delay. + +## Out of Scope + +- The program does not define allocator choice, memory layout rules, or low-level optimization techniques in this L1 requirement. +- This requirement does not require sacrificing correctness, safety, or recoverability solely to reduce resource use. + +## Open Questions + +- What concrete memory and startup responsiveness targets should be used for the first release? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-APP-001 | 1 | specs/L2/app/L2-DES-APP-001-memory-efficient-rust-data-models.md | Defines technical design principles for memory-efficient Rust data models. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Traceability | Linked lightweight operation to memory-efficient Rust data model design. | diff --git a/specs/L1/L1-REQ-APP-006-fuzzysearch.md b/specs/L1/L1-REQ-APP-006-fuzzysearch.md new file mode 100644 index 00000000..4b101618 --- /dev/null +++ b/specs/L1/L1-REQ-APP-006-fuzzysearch.md @@ -0,0 +1,73 @@ +--- +artifact_id: L1-REQ-APP-006 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-APP-006 — Fuzzy Search + +## Purpose + +Help users quickly find capabilities and project information without exact names.
+ +## Why This Matters + +Users often remember partial names, rough concepts, or fragments from prior work rather than exact paths or command names. Fuzzy search reduces friction when navigating project files, sessions, skills, MCP capabilities, and commands. + +## Background / Context + +Users need to navigate skills, MCP capabilities, project files, commands, and prior context during agentic work. + +## User / Business Requirement + +The program must support fuzzy search across important user-facing entities. + +## Real User Scenarios + +- A user remembers part of a filename and uses fuzzy search to open the relevant project file. +- A user searches for an available skill or MCP tool without remembering its exact name. + +## Functional Requirements + +- The program must support fuzzy search for project files. +- The program must support fuzzy search for skills. +- The program must support fuzzy search for MCP servers, tools, resources, or templates where available. +- The program should support fuzzy search for sessions, transcript entries, and commands. + +## Non-Functional Requirements + +- Search results must be fast enough for interactive use. +- Search must respect workspace, privacy, and permission boundaries. + +## Acceptance Criteria + +- Given a partial file name, when the user searches, then matching project files are returned. +- Given configured skills or MCP capabilities, when the user searches by partial name, then relevant entries are discoverable. +- Given search results include different entity types, when results are shown, then the user can distinguish files, sessions, commands, skills, and MCP capabilities. +- Given a search crosses workspace data, when permissions restrict access, then restricted entries are omitted or clearly unavailable. + +## Out of Scope + +- The program does not define indexing algorithms, ranking formulas, or UI layout in this L1 requirement. 
+- This requirement does not require fuzzy search to expose private or permission-restricted data. + +## Open Questions + +- Should transcript and session search be included in the initial fuzzy-search scope? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/app/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-APP-007-tui.md b/specs/L1/L1-REQ-APP-007-tui.md new file mode 100644 index 00000000..9362eda2 --- /dev/null +++ b/specs/L1/L1-REQ-APP-007-tui.md @@ -0,0 +1,77 @@ +--- +artifact_id: L1-REQ-APP-007 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-APP-007 — Terminal User Interface + +## Purpose + +Define the high-level user experience expected from the terminal client. + +## Why This Matters + +The TUI is the initial way users interact with the program. It must make agent work understandable in a terminal while preserving terminal workflows such as scrollback, keyboard input, and safe exit behavior. + +## Background / Context + +The initial client surface is a terminal interface. It must support interactive agent work while preserving useful terminal behavior. + +## User / Business Requirement + +The program must provide a terminal user interface that supports interactive sessions, visible execution state, transcript review, and efficient input. + +## Real User Scenarios + +- A user runs the TUI inside an existing terminal and wants prior terminal scrollback to remain useful after exit. +- A user watches a running turn and needs to see session status, transcript updates, and the composer without switching tools. 
+ +## Functional Requirements + +- The TUI must support an inline mode that preserves terminal scrollback where appropriate. +- The TUI should support an alternate full-screen mode where appropriate. +- The TUI must expose a header or status area, transcript area, and composer area. +- The TUI must support onboarding, command discovery, and visible state for active work. + +## Non-Functional Requirements + +- The TUI must be usable in common terminal environments. +- The TUI must avoid corrupting terminal state after exit. + +## Acceptance Criteria + +- Given an interactive session, when the user opens the TUI, then the user can identify the current session state and input area. +- Given inline mode, when the user exits, then useful terminal scrollback remains available. +- Given a turn is running, when the user looks at the TUI, then active work state is visible without requiring log inspection. +- Given the TUI exits normally, when control returns to the shell, then terminal state is not visibly corrupted. + +## Out of Scope + +- The program does not define detailed widget layout, keybinding mapping, color theme implementation, or rendering algorithms in this L1 requirement. +- This requirement does not require all terminal clients to render identically. + +## Open Questions + +- Which terminal environments are part of the required support matrix? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-TUI-002 | 1 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | Defines the modern terminal shell, core regions, inline/fullscreen consistency, responsive layout, and visible active-work UX. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Defines the composer, command discovery, and input-mode behavior used by the TUI shell. 
| +| related-to | L2-DES-TUI-004 | 1 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | Defines streaming transcript and active state rendering. | +| related-to | L2-DES-TUI-005 | 1 | specs/L2/tui/L2-DES-TUI-005-terminal-lifecycle-safety.md | Defines inline scrollback preservation and terminal-safe exit behavior. | +| related-to | L2-DES-TUI-001 | 1 | specs/L2/tui/L2-DES-TUI-001-onboarding-ui-flow.md | Defines the onboarding flow required by the TUI. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-APP-008-mcp.md b/specs/L1/L1-REQ-APP-008-mcp.md new file mode 100644 index 00000000..6d3a7d4c --- /dev/null +++ b/specs/L1/L1-REQ-APP-008-mcp.md @@ -0,0 +1,73 @@ +--- +artifact_id: L1-REQ-APP-008 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-APP-008 — MCP Capability Integration + +## Purpose + +Allow users to extend the program with external MCP-provided capabilities. + +## Why This Matters + +MCP integrations can add powerful external tools and resources, but users need to know what capabilities were added, whether they are healthy, and whether they follow the same safety rules as built-in tools. + +## Background / Context + +MCP can provide tools, resources, and templates from external servers. Users need discovery, status, safety, and error handling around these capabilities. + +## User / Business Requirement + +The program must support user-configured MCP integrations as discoverable and controllable product capabilities. + +## Real User Scenarios + +- A user configures an MCP server and expects its tools and resources to appear as available capabilities. +- A configured MCP server fails to start, and the user needs a clear status message rather than silent missing tools. 
+ +## Functional Requirements + +- The user must be able to configure MCP servers. +- The user must be able to discover MCP-provided tools, resources, and resource templates. +- The program must show MCP server status and startup errors. +- MCP-provided capabilities must participate in the same safety and approval model as built-in capabilities. + +## Non-Functional Requirements + +- MCP failures must not make unrelated built-in capabilities unusable. +- MCP capability names and descriptions must be understandable to users before use. + +## Acceptance Criteria + +- Given a configured MCP server, when discovery succeeds, then the user can see the capabilities it provides. +- Given an MCP server fails to start, when the user inspects integrations, then the failure is visible and actionable. +- Given an MCP-provided tool requires access outside current permissions, when it is requested, then the normal approval and safety flow applies. +- Given an MCP capability disappears after refresh, when the user inspects status, then the program indicates that the capability is no longer available. + +## Out of Scope + +- The program does not define MCP transport details, protocol implementation, or server lifecycle internals in this L1 requirement. +- This requirement does not guarantee that every third-party MCP server is trustworthy, available, or compatible. + +## Open Questions + +- Should MCP servers be enabled globally, per workspace, or both? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-MCP-001 | 1 | specs/L2/mcp/L2-DES-MCP-001-mcp-integration-architecture.md | Defines MCP configuration, lifecycle, capability discovery, status, safety, and failure behavior. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. 
| diff --git a/specs/L1/L1-REQ-APP-009-skills.md b/specs/L1/L1-REQ-APP-009-skills.md new file mode 100644 index 00000000..ca337bcb --- /dev/null +++ b/specs/L1/L1-REQ-APP-009-skills.md @@ -0,0 +1,73 @@ +--- +artifact_id: L1-REQ-APP-009 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-APP-009 — Skills + +## Purpose + +Let users extend agent behavior with reusable instruction bundles. + +## Why This Matters + +Skills let users reuse specialized workflows without re-explaining them in every prompt. They must remain discoverable and subordinate to the user's current intent and safety constraints. + +## Background / Context + +Skills capture specialized workflows, domain instructions, and reusable procedures. They must be discoverable and used intentionally. + +## User / Business Requirement + +The program must support skills as user-visible reusable capability packages. + +## Real User Scenarios + +- A user asks the program to use a known skill for a document, frontend, or repository-specific workflow. +- A workspace provides a skill, and the user wants to know whether it was discovered and applied. + +## Functional Requirements + +- The user must be able to discover available skills. +- The user must be able to reference or request a skill for a task. +- The program must explain when a skill is being used and why it is relevant. +- The program must handle missing, invalid, or unavailable skills clearly. + +## Non-Functional Requirements + +- Skill use must not override higher-priority safety or user instructions. +- Skill discovery must respect configured roots and workspace boundaries. + +## Acceptance Criteria + +- Given an available skill, when the user requests it, then the program applies the skill or explains why it cannot. +- Given a missing skill, when the user requests it, then the program reports that it is unavailable without failing the whole session. 
+- Given a skill is applied, when the program starts the task, then the user can see that the skill is being used. +- Given a skill conflicts with higher-priority instructions, when the task runs, then higher-priority instructions win. + +## Out of Scope + +- The program does not define skill file format, installation workflow, or runtime injection mechanics in this L1 requirement. +- This requirement does not allow skills to override user approval, safety, or privacy boundaries. + +## Open Questions + +- Should skills be automatically selected, explicitly selected, or both? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-SKILLS-001 | 1 | specs/L2/skills/L2-DES-SKILLS-001-agent-skills-architecture.md | Defines skill package discovery, activation, context integration, trust, and visibility behavior. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-APP-010-configuration.md b/specs/L1/L1-REQ-APP-010-configuration.md new file mode 100644 index 00000000..b95d9856 --- /dev/null +++ b/specs/L1/L1-REQ-APP-010-configuration.md @@ -0,0 +1,121 @@ +--- +artifact_id: L1-REQ-APP-010 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-25 +--- + +# L1-REQ-APP-010 — Configuration + +## Purpose + +Let users control program defaults and preferences across sessions and clients. + +## Why This Matters + +Configuration turns repeated preferences into durable behavior. Users need predictable defaults for models, permissions, tools, integrations, interface behavior, and telemetry without restating them every session. + +## Background / Context + +Users need control over modes, permissions, tools, integrations, interface behavior, logging, and telemetry. 
Some tools require their own execution configuration, including web search. + +Model and reasoning defaults are configured through onboarding and supported model-selection workflows, rather than requiring a separate generic settings screen for reasoning effort changes. + +Configuration is loaded from a user-scoped configuration file and, when present, a project-scoped configuration file. The project-scoped configuration file at `project_directory/.dev/config.toml` takes precedence over the user-scoped configuration file. The user-scoped configuration file is `C:\Users\username\.devo\config.toml` on Windows and `~/.devo/config.toml` on macOS or Linux. + +## User / Business Requirement + +The program must provide persistent application-level configuration for core user-facing behavior. + +## Real User Scenarios + +- A user sets a preferred model and expects future sessions to use it by default. +- A user changes pending model or reasoning before the first message and expects that selection to become the default for future sessions. +- A user changes reasoning during an active session and expects later turns in that session to keep using the new reasoning effort. +- A user exits the server after changing reasoning in the current session and expects that reasoning effort to be restored as the default on the next launch. +- A user selects Security Mode before sending the first message for an authorized engagement and expects that pending selection to persist as the default mode for future sessions. +- A user disables telemetry and expects that setting to apply across restarts and client surfaces. +- A user chooses whether web search should use a cloud-based provider search service or a locally configured search path. + +## Functional Requirements + +- The user must be able to configure default model and reasoning settings through onboarding and supported model-selection workflows. 
+- Configuration information entered during onboarding must be persistently saved to a configuration file. +- Onboarding-created model, provider, provider-specific model name, invocation method, and reasoning effort configuration must be restorable in later launches. +- The program must support a project-scoped configuration file at `project_directory/.dev/config.toml`. +- The program must support a user-scoped configuration file at `C:\Users\username\.devo\config.toml` on Windows. +- The program must support a user-scoped configuration file at `~/.devo/config.toml` on macOS and Linux. +- When both project-scoped and user-scoped configuration files exist, project-scoped configuration must take precedence over user-scoped configuration for overlapping settings. +- When configuration is persisted, the program must make the persistence target deterministic so the user can understand whether the saved value is project-scoped or user-scoped. +- The program must not require a separate generic settings screen as the post-onboarding path for changing model reasoning effort. +- Before the first user message is sent, changing the pending model or reasoning selection must automatically persist that selection as the default model configuration where supported. +- After the first user message is sent, changing model or reasoning selection must update the current session selection and continue to apply to later turns in that session. +- When the server exits gracefully, the program must persist the current active session reasoning effort as the default reasoning configuration for future sessions. +- The user must be able to configure or inspect the default operating mode where mode defaults are supported. +- Before the first user message is sent, changing the pending mode selection must automatically persist the selected mode as the default mode configuration. 
+- After the first user message is sent, the user must not be able to change the active mode of that session through configuration. +- The user must be able to configure the default permission policy and sandbox behavior. +- The user must be able to configure tools, skills, MCP sources, theme, keybindings, logging, and telemetry preferences. +- The user must be able to configure tool execution options where tools require them, including the effective web search execution path. +- The user must be able to inspect the currently effective configuration. + +## Non-Functional Requirements + +- Configuration errors must be actionable. +- Configuration must be durable across normal application restarts. + +## Acceptance Criteria + +- Given a changed configuration value, when the user starts a later session, then the new value is applied. +- Given onboarding completes with model provider information, when the program restarts, then the onboarding-entered configuration is loaded from persistent configuration without requiring the same setup again. +- Given both `project_directory/.dev/config.toml` and the user-scoped configuration file define an overlapping setting, when the program computes effective configuration, then the project-scoped value takes precedence. +- Given no project-scoped configuration file exists, when the user-scoped configuration file exists, then the program can load applicable settings from the user-scoped configuration file. +- Given configuration is persisted from onboarding or model selection, when the user inspects effective configuration, then the user can understand the scope or source of the saved value where that distinction affects behavior. +- Given the first user message has not been sent, when the user changes pending model or reasoning selection, then the selected value is persisted as the default model configuration where supported. 
+- Given the first user message has been sent, when the user changes model or reasoning selection in that session, then later turns in the same session use the changed selection. +- Given the server exits gracefully after the current active session reasoning effort changed, when the program next starts, then that reasoning effort is available as the default reasoning configuration. +- Given an invalid configuration, when the program loads it, then the user receives a specific error and recovery path. +- Given a setting is overridden for one turn, when a later turn starts, then the user can distinguish temporary override from persistent configuration. +- Given multiple client surfaces are used, when the effective configuration is inspected, then the same shared defaults are visible where applicable. +- Given web search has configurable execution paths, when the user inspects configuration, then the active web search path is visible. +- Given a mode is active or configured as default, when the user inspects effective configuration, then the mode and mode-specific effects are visible. +- Given the first user message has not been sent, when the user changes the pending mode selection, then the selected mode is persisted as the default mode configuration. +- Given the first user message has been sent, when configuration changes the default mode, then the existing session's active mode is unchanged. + +## Out of Scope + +- The program does not define the full TOML schema, field-level merge algorithm, or configuration UI layout in this L1 requirement beyond the required configuration file locations and project-over-user precedence. +- This requirement does not define exact conflict behavior when multiple active sessions have different reasoning efforts at server exit. +- This requirement does not require every setting to be configurable from every client surface. + +## Open Questions + +- Which settings should be allowed as per-turn overrides? 
+- Should default mode be global, workspace-specific, or selected explicitly before the first user message? +- Which tool-specific settings should be global, workspace-specific, session-specific, or per-turn overridable? +- If multiple sessions are active when the server exits, which session's reasoning effort should become the persisted default? +- Should the user be able to override the default onboarding persistence target when both user-scoped and project-scoped configuration files are writable? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-APP-002 | 1 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | L2 defines configuration source precedence, effective configuration resolution, and onboarding persistence. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added tool-specific configuration requirements including web search execution path. | +| 1 | 2026-05-21 | Human | Refinement | Added operating mode configuration visibility. | +| 1 | 2026-05-21 | Human | Refinement | Clarified that mode defaults do not change the active mode of existing sessions. | +| 1 | 2026-05-21 | Human | Refinement | Added pre-first-message mode changes and automatic default-mode persistence. | +| 1 | 2026-05-21 | Human | Refinement | Clarified that pre-first-message mode changes apply to pending mode selection because the session is created by the first user message. | +| 1 | 2026-05-22 | Human | Refinement | Clarified model and reasoning default behavior: onboarding and model-selection workflows configure defaults, active-session changes are sticky, and graceful server exit persists the current active session reasoning effort. 
| +| 1 | 2026-05-22 | Human | Refinement | Added onboarding configuration persistence and project-over-user configuration file precedence. | +| 1 | 2026-05-25 | Human | Refinement | Renamed configurable approval posture to permission policy and kept sandbox behavior as a separate configuration concern. | diff --git a/specs/L1/L1-REQ-APP-011-error-recovery.md b/specs/L1/L1-REQ-APP-011-error-recovery.md new file mode 100644 index 00000000..4e0a4222 --- /dev/null +++ b/specs/L1/L1-REQ-APP-011-error-recovery.md @@ -0,0 +1,90 @@ +--- +artifact_id: L1-REQ-APP-011 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-APP-011 — Error Recovery + +## Purpose + +Help users recover from failures without losing work or understanding. + +## Why This Matters + +Failures are normal in agentic workflows. The product must turn them into understandable recovery paths instead of leaving users with partial state and vague errors. + +## Background / Context + +Model calls, tools, configuration, network access, permissions, persistence, and clients may fail. The program must turn those failures into actionable product behavior. + +Large language model invocations can fail because of transient network issues, provider-side HTTP errors, rate limits, authentication failures, or malformed provider responses. Users need retry behavior for retryable failures and enough provider-returned detail to understand what happened. + +## User / Business Requirement + +The program must provide clear, user-visible error handling and recovery paths. + +## Real User Scenarios + +- A provider call fails because credentials are invalid, and the user receives a configuration-focused recovery path. +- A tool writes partial output and exits with an error, and the user can see what happened before deciding whether to retry. 
+- A model invocation fails with a retryable network error, and the program retries with increasing delay instead of failing immediately. +- A model provider returns an HTTP error response, and the client shows the provider-returned error details in a refined, readable UI rather than only showing a generic exception type. + +## Functional Requirements + +- The program must identify the phase where a failure occurred. +- The program must preserve completed history and outputs after partial failure. +- The program must suggest practical next steps when retry, configuration, input change, or approval can resolve the failure. +- The program must warn the user when a failure may have left partial file changes or inconsistent state. +- The program must retry retryable large language model network errors using an exponential backoff strategy. +- The program must expose the specific error details returned by model invocations, including HTTP error responses where available, instead of only exposing generic exception classes or failure labels. +- The client interface must present model invocation error details in a refined, readable, non-jarring way. + +## Non-Functional Requirements + +- Error messages must be actionable rather than generic. +- The program must avoid silent data loss. +- Retry behavior must be bounded so it does not leave users waiting indefinitely. +- Detailed provider errors must be visible without overwhelming the main task flow. +- Error presentation must preserve usability and visual polish even when the underlying provider response is verbose or technical. + +## Acceptance Criteria + +- Given a provider failure, when the turn fails, then the user can identify that the provider or model call failed. +- Given a retryable network error occurs during a model invocation, when the program can safely retry, then it retries using exponential backoff before reporting final failure. 
+- Given a model provider returns an HTTP error response, when the client reports the failure, then the user can inspect the provider-returned error details. +- Given model invocation error details are displayed, when the user views them in the client interface, then they are presented in a readable and non-jarring UI treatment rather than as an unstyled raw exception dump. +- Given a tool failure after partial output, when the user reviews the transcript, then the partial output and failure summary remain visible. +- Given a failure leaves possible partial file changes, when the task stops, then the program warns the user to inspect the affected files. +- Given a retry is possible, when the program reports the error, then it explains the condition that must change before retrying. + +## Out of Scope + +- The program does not define error enum design, error codes, exact retry limits, backoff timing parameters, or crash-recovery implementation in this L1 requirement. +- This requirement does not define the exact visual design of error cards, panels, colors, typography, or disclosure controls. +- This requirement does not guarantee automatic recovery from all external service or system failures. + +## Open Questions + +- Which model invocation failures are considered retryable? +- What maximum retry count and maximum backoff delay should be used? +- Which provider error fields should be shown by default, and which should be hidden behind disclosure? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/app/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added model network retry, provider error detail display, and refined error UI requirements. 
| diff --git a/specs/L1/L1-REQ-APP-012-privacy-data-ownership.md b/specs/L1/L1-REQ-APP-012-privacy-data-ownership.md new file mode 100644 index 00000000..4918d3b4 --- /dev/null +++ b/specs/L1/L1-REQ-APP-012-privacy-data-ownership.md @@ -0,0 +1,93 @@ +--- +artifact_id: L1-REQ-APP-012 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-APP-012 — Privacy and Data Ownership + +## Purpose + +Make ownership and movement of user data explicit. + +## Why This Matters + +The program handles project files, conversation history, logs, credentials, and external service calls. Users need to know what stays local, what may leave the machine, and how they can control stored data. + +## Background / Context + +The program handles conversation history, core-maintained persistent memory, file contents, tool output, credentials, model prompts, logs, and optional telemetry. Users need clear control over user-visible stored data and external data sharing boundaries. + +Credential data may need to pass through a client interface when the user is explicitly configuring or managing provider access. The program should distinguish those explicit credential flows from ordinary model selection, provider status, transcript, logging, and model-context flows. + +## User / Business Requirement + +The program must protect user data, explain external data sharing, and provide control over stored data. + +## Real User Scenarios + +- A user asks whether a task will send file content to a model provider before approving the work. +- A user enters an API key during provider setup and expects ordinary status views to show whether it is configured without displaying the plaintext key by default. +- A user wants to delete a session and expects stored history and cached artifacts for that session to be removed or clearly retained by policy. 
+- A user deletes a session and expects the program to remove or retain user-visible session data according to the deletion policy without requiring manual persistent-memory management. + +## Functional Requirements + +- The program must treat session history, core-maintained persistent memory (when model-visible), file contents, tool output, configuration, and local cache as user data. +- The program must make external data sharing boundaries visible for model providers, MCP servers, web services, and telemetry services. +- The program must prevent secrets and credentials from being exposed as ordinary model context. +- The program may allow client interfaces to handle credential material during explicit user-initiated credential setup, update, repair, or user-authorized reveal flows. +- The program must avoid exposing plaintext credential values in routine client views such as model lists, model switchers, provider status displays, transcripts, logs, or telemetry by default. +- The user must be able to export and delete persistent user data. +- Persistent memory is generated and maintained by the core agent runtime, not by routine client-side memory management flows. +- Client interfaces are not required to inspect, list, delete, export, or subscribe to individual persistent memory entries. +- When the user deletes a session, the core may update, unlink, retain, or remove internal memory according to internal memory policy without requiring per-memory user decisions in the client. + +## Non-Functional Requirements + +- Telemetry must be user-controllable. +- Logs and tool output must not intentionally preserve plaintext secrets. + +## Acceptance Criteria + +- Given telemetry is disabled, when the program runs, then telemetry data is not sent. +- Given stored session history, when the user requests deletion, then the program removes it or reports why it cannot. 
+- Given content is sent to an external provider, when the user reviews privacy-relevant state, then the program can identify the type of data involved. +- Given a user provides credential material through an explicit setup or update flow, when the program receives it, then that flow is treated as credential handling rather than ordinary transcript, model context, logging, or telemetry data. +- Given a routine client view needs to show provider or model credential state, when the view is rendered, then it uses status information rather than plaintext credential values by default. +- Given a secret is detected in tool output, when the output is recorded, then plaintext secret exposure is avoided where the safety policy requires it. +- Given persistent memory is generated by the core, when a client renders ordinary session or transcript views, then the client is not required to show persistent-memory records or memory-change notifications. +- Given persistent memory contributes to model-visible context, when safety or privacy controls are applied, then it is treated as model-visible user data. + +## Out of Scope + +- The program does not define secret-detection rules, credential-store backend, or telemetry protocol in this L1 requirement. +- The program does not define exact credential reveal, rotation, masking, or redaction controls in this L1 requirement. +- The program does not define persistent memory ranking, retrieval, summarization, extraction, retention, or internal deletion algorithms in this L1 requirement. +- This requirement does not claim that all sensitive data can be detected perfectly. + +## Open Questions + +- Should telemetry default to disabled or require an onboarding decision? +- Should users have a future advanced diagnostics mode for inspecting core-maintained persistent memory? 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-MEM-001 | 1 | specs/L1/L1-REQ-MEM-001-persistent-memory.md | Persistent memory is specified as core-maintained internal state. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | L2 defines user-visible deletion and export protocol behavior without memory management methods. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-22 | Human | Refinement | Added explicit credential-flow requirements and clarified that routine client, transcript, logging, telemetry, and model-context paths should not expose plaintext credentials by default. | +| 1 | 2026-05-22 | Human | Refinement | Added persistent memory ownership and session-deletion impact requirements. | +| 1 | 2026-05-22 | Human | Refinement | Reframed persistent memory as core-maintained state outside routine client management. | diff --git a/specs/L1/L1-REQ-APP-013-agent-modes.md b/specs/L1/L1-REQ-APP-013-agent-modes.md new file mode 100644 index 00000000..e50feb20 --- /dev/null +++ b/specs/L1/L1-REQ-APP-013-agent-modes.md @@ -0,0 +1,96 @@ +--- +artifact_id: L1-REQ-APP-013 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-APP-013 — Agent Modes + +## Purpose + +Allow the program to support distinct operating modes without splitting the core agent runtime into separate programs. + +## Background / Context + +The program is primarily a coding agent, but users may also need a security-focused operating mode for authorized security work. 
These modes should share the same session, tool, model, context, safety, persistence, and client architecture while allowing mode-specific instructions, tool defaults, permission expectations, reporting expectations, and safety rules. + +The initial mode set should include Coding Mode and Security Mode. A mode is a user-visible operating profile, not a separate program and not divided into smaller formal categories. + +Mode is session-scoped after a session exists. Before the first user message is sent, no session has been created yet; the user is editing a pending mode selection initialized from the persisted default mode where a default is configured. When the first user message is sent, the program creates the session using the current pending mode selection and locks that mode for the session. + +Session-level agent modes are distinct from TUI session-local input modes such as Shell Mode and Plan Mode. TUI input modes may change how composer input is interpreted within a session, but they must not change the session-level agent mode. + +## User / Business Requirement + +The program must support user-visible agent modes that configure behavior for different work contexts while preserving common platform guarantees. + +## Functional Requirements + +- The program must support Coding Mode as an operating mode for software development and coding-agent workflows. +- The program must support Security Mode as an operating mode for authorized security work. +- The active session mode must be visible to the user. +- Before a session exists, the program must initialize the pending mode selection from the persisted default mode where a default mode is configured. +- Before the first user message is sent, the user must be able to select or change the pending mode selection where mode selection is supported. 
+- When the user changes the pending mode selection before sending the first user message, the program must automatically persist that selected mode as the default mode configuration. +- When the first user message is sent, the program must create the session using the current pending mode selection. +- The user must be able to inspect the active session mode. +- Once the first user message has been sent, the program must not allow the user to change that session's active mode. +- If the user needs a different mode, the program should direct the user to create or fork a session with the desired mode where supported. +- A mode may configure base instructions, tool defaults, skills, MCP integrations, permission posture, safety rules, and reporting expectations. +- Mode-specific behavior must be represented in model context where relevant. +- Modes must not create separate, incompatible session semantics for history, persistence, approvals, or tool visibility. +- Modes must remain distinct from client-local input modes such as Shell Mode and Plan Mode. + +## Non-Functional Requirements + +- Mode behavior must be predictable and auditable. +- Mode selection must not weaken safety, privacy, permission, or workspace boundaries. +- Mode-specific configuration must remain understandable to users. +- Shared platform behavior should remain consistent across modes unless a mode explicitly changes user-visible behavior. + +## Acceptance Criteria + +- Given the program supports multiple modes, when the user inspects the session state, then the active mode is visible. +- Given a default mode is configured, when no session has been created yet, then the pending mode selection is initialized with that default mode. +- Given the first user message has not been sent, when the user changes the pending mode selection, then that mode is automatically persisted as the default mode configuration. 
+- Given the first user message is sent, when the program creates the session, then the session's active mode is set from the current pending mode selection. +- Given Coding Mode is active, when the user performs ordinary software development work, then the program uses coding-oriented defaults and reporting expectations. +- Given Security Mode is active, when the user performs authorized security work, then the program uses security-oriented instructions, tools, safety rules, and reporting expectations. +- Given the first user message has been sent, when the user attempts to change that session's active mode, then the program refuses the change and preserves the existing session mode. +- Given the user needs a different mode after a session has started, when mode selection is required, then the program can direct the user to create or fork a session with the desired mode where supported. +- Given a mode changes tool availability or permission posture, when the user inspects effective configuration, then the mode-specific effect is visible. +- Given the TUI enters Shell Mode or Plan Mode, when the user inspects the session-level agent mode, then Coding Mode or Security Mode remains unchanged. + +## Out of Scope + +- This requirement does not define the exact session-creation command, configuration file format, client UI design, or internal prompt layout. +- This requirement does not define formal subdivisions inside Security Mode. +- This requirement does not allow any mode to bypass safety, approval, privacy, permission, or workspace boundaries. + +## Open Questions + +- Which mode should be the default for a new session? +- Should mode defaults be global, workspace-specific, or selected explicitly for every new session? +- Which mode-specific settings should be persistent defaults, workspace defaults, or fixed session-level settings? 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-TUI-009 | 1 | specs/L1/L1-REQ-TUI-009-session-input-modes.md | Distinguishes session-level agent modes from session-local TUI input modes. | +| refined-by | TBD | TBD | specs/L2/app/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved user requirement. | +| 1 | 2026-05-21 | Human | Refinement | Made mode session-scoped and disallowed mode changes during a session. | +| 1 | 2026-05-21 | Human | Refinement | Allowed mode changes before the first user message and persisted that choice as default mode configuration. | +| 1 | 2026-05-21 | Human | Refinement | Clarified that no session exists before the first user message and that the pending mode selection is used at session creation. | +| 1 | 2026-05-21 | Human | Refinement | Clarified that session-level agent modes are distinct from TUI session-local input modes. | diff --git a/specs/L1/L1-REQ-AUTO-001-automations-and-reminders.md b/specs/L1/L1-REQ-AUTO-001-automations-and-reminders.md new file mode 100644 index 00000000..49a455b5 --- /dev/null +++ b/specs/L1/L1-REQ-AUTO-001-automations-and-reminders.md @@ -0,0 +1,79 @@ +--- +artifact_id: L1-REQ-AUTO-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-AUTO-001 — Automations and Reminders + +## Purpose + +Allow users to ask the program to continue, remind, monitor, or repeat work at a later time. + +## Why This Matters + +Some work depends on time, external changes, or a future follow-up. Automations let users delegate delayed work while preserving visibility and control over what will run. 
+ +## Background / Context + +Some agent workflows are not completed in a single immediate turn. Users may want the program to check back later, continue a thread after a short delay, run a recurring verification, monitor an external condition, or remind them about unfinished work. + +These workflows must be explicit and inspectable so users understand what will run, when it will run, and what authority it has. + +## User / Business Requirement + +The program must support user-controlled automations and reminders for delayed, recurring, or follow-up work. + +## Real User Scenarios + +- A user asks the program to remind them later to continue a paused thread. +- A user schedules a recurring check that runs verification and reports failures. + +## Functional Requirements + +- The user must be able to create a reminder or automation with a clear task and schedule. +- The user must be able to view, update, pause, resume, and delete automations. +- The program must distinguish one-time follow-ups from recurring automations. +- The program must show what context, workspace, permissions, and goal an automation will use. +- The program must report automation results or failures in a user-visible place. + +## Non-Functional Requirements + +- Automations must be explicit and user-controlled. +- Automations must respect safety, privacy, permission, and workspace boundaries. +- Automations must not silently run broad or destructive work without appropriate user intent. + +## Acceptance Criteria + +- Given a user creates a reminder, when the scheduled time arrives, then the program surfaces the requested follow-up. +- Given a user creates a recurring automation, when the user views automations, then the schedule, task, status, and last result are visible. +- Given an automation would require permissions beyond its current scope, when it runs, then it follows the approval and safety model instead of silently escalating. 
+- Given an automation fails, when the user views its status, then the failure reason and last attempted run are visible. +- Given a user pauses an automation, when the schedule would otherwise trigger, then the automation does not run until resumed. + +## Out of Scope + +- This L1 requirement does not define scheduling engine implementation, recurrence rule syntax, notification transport, or background execution architecture. +- This requirement does not allow automations to perform destructive or broad work without user intent and permission. + +## Open Questions + +- Should automations be bound to sessions, workspaces, goals, or a combination of these? +- Which automation types are required for the first milestone: reminders, thread follow-ups, recurring jobs, or monitors? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/auto/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-CHANGE-001-rollback-and-recovery.md b/specs/L1/L1-REQ-CHANGE-001-rollback-and-recovery.md new file mode 100644 index 00000000..d6f349a7 --- /dev/null +++ b/specs/L1/L1-REQ-CHANGE-001-rollback-and-recovery.md @@ -0,0 +1,82 @@ +--- +artifact_id: L1-REQ-CHANGE-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-CHANGE-001 — Rollback and Recovery + +## Purpose + +Ensure that users can understand and recover from risky or unwanted file changes made during agent work.
+ +## Background / Context + +Git-oriented change management helps in repositories, but the program may also work in non-git workspaces or encounter failed edits, partial writes, interrupted turns, and destructive operations. Users need a product-level way to understand what changed and how to undo or recover from it. + +Rollback and recovery must protect user trust, especially when edits are broad, destructive, partially applied, or not yet verified. + +## User / Business Requirement + +The program must help users recover from risky, failed, or unwanted changes, including changes outside normal git workflows. + +## Functional Requirements + +- The program must make user-visible file changes attributable to a task where possible. +- The program must warn when an operation may be destructive, broad, or difficult to undo. +- The program must preserve enough information for users to understand what changed. +- The program must provide or explain a recovery path when edits fail, are interrupted, or produce unwanted results. +- The program must support rollback guidance for non-git workspaces where automatic rollback is not available. +- The program must not silently discard user-created changes while attempting recovery. +- When immediate message editing supersedes the latest turn, the program must attempt to restore file changes attributable to that superseded turn before the replacement turn runs. +- If a file changed by the superseded turn has diverged after that turn, the program must skip automatic restoration for that file and preserve the current file state unless the user explicitly chooses a destructive reset policy. +- The program should prefer structured per-tool restoration data for known file-editing tools and may use workspace-level checkpoints for changes that are otherwise difficult to attribute. + +## Non-Functional Requirements + +- Recovery behavior must prioritize preserving user data over convenience. 
+- Rollback guidance must be clear enough for users to act on without inspecting internal implementation details. +- Recovery mechanisms must respect workspace, safety, and permission boundaries. +- Automatic rollback must not hide failures or make additional risky changes without clear user intent. + +## Acceptance Criteria + +- Given the program changes files, when the task finishes, fails, or is interrupted, then the user can identify the changed files where possible. +- Given a potentially destructive operation is requested, when the program is about to act, then the user receives an appropriate warning or approval path. +- Given an edit fails after partial changes, when the program reports failure, then it explains the partial state and recovery options. +- Given the workspace is not managed by git, when the user asks how to undo changes, then the program provides the best available recovery guidance instead of assuming git is available. +- Given user-created changes exist before recovery, when the program attempts rollback, then it avoids overwriting those changes without explicit user intent. +- Given immediate message editing supersedes a turn that changed files, when restoration runs, then the program reports which files were restored, skipped, or unsupported. +- Given a superseded turn changed files through a shell command and no reliable checkpoint exists, when restoration runs, then the program does not pretend those shell changes were restored. + +## Out of Scope + +- Specific snapshot mechanisms, backup storage formats, patch inversion algorithms, and git command implementation are not specified here. +- This requirement does not guarantee automatic rollback for every operation or every workspace type. +- This requirement does not replace the separate git change management requirement for repository-specific workflows. + +## Open Questions + +- Which file operations require a pre-change snapshot or explicit rollback plan? 
+- Should automatic rollback be opt-in, opt-out, or only used after explicit user approval? +- How long should recovery artifacts be retained? +- Should git-based turn checkpoints be mandatory in git workspaces, or should per-tool inverse records remain the primary mechanism? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-CONV-005 | 1 | specs/L1/L1-REQ-CONV-005-immediate-message-editing.md | Immediate message editing requires rollback of file changes from the superseded turn where safe. | +| refined-by | TBD | TBD | specs/L2/change/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft approved for L1 expansion. | +| 1 | 2026-05-22 | Human | Refinement | Added immediate-message-edit restoration requirements and checkpoint considerations. | diff --git a/specs/L1/L1-REQ-CI-001-ci.md b/specs/L1/L1-REQ-CI-001-ci.md new file mode 100644 index 00000000..3ac3d894 --- /dev/null +++ b/specs/L1/L1-REQ-CI-001-ci.md @@ -0,0 +1,73 @@ +--- +artifact_id: L1-REQ-CI-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-CI-001 — Continuous Integration + +## Purpose + +Define the baseline quality checks required for the project. + +## Why This Matters + +CI protects the project from regressions that local development may miss. A shared quality gate gives users and contributors a clear signal for whether code is formatted, builds, passes tests, and satisfies lint rules. + +## Background / Context + +The program is a Rust-based project. Contributors and agents need a shared quality gate for formatting, compilation, tests, and linting. 
+ +## User / Business Requirement + +The project must provide a reliable CI quality gate for formatting, checking, testing, and linting. + +## Real User Scenarios + +- A contributor opens a pull request and expects CI to catch formatting, test, compile, and lint failures. +- A maintainer wants local verification commands to match the checks that will run in CI. + +## Functional Requirements + +- CI must run formatting checks. +- CI must run workspace tests. +- CI must run workspace compilation checks for all targets. +- CI must run clippy with warnings treated as errors. + +## Non-Functional Requirements + +- CI failures must be visible and actionable. +- The local verification commands should match CI expectations where practical. + +## Acceptance Criteria + +- Given a pull request, when CI runs, then formatting, tests, check, and clippy are executed. +- Given a lint warning, when CI runs clippy, then the CI job fails. +- Given a CI job fails, when the user inspects the result, then the failing command or check category is visible. +- Given local verification passes with the documented commands, when CI runs in the same supported environment, then CI should not fail because of mismatched baseline commands. + +## Out of Scope + +- This L1 requirement does not define CI provider configuration, caching strategy, or release automation. +- This requirement does not guarantee that every platform-specific issue is caught by a single CI configuration. + +## Open Questions + +- Should CI include platform-specific jobs for macOS, Linux, and Windows? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/ci/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement.
| diff --git a/specs/L1/L1-REQ-CLIENT-001-localization-readiness.md b/specs/L1/L1-REQ-CLIENT-001-localization-readiness.md new file mode 100644 index 00000000..4dc50191 --- /dev/null +++ b/specs/L1/L1-REQ-CLIENT-001-localization-readiness.md @@ -0,0 +1,77 @@ +--- +artifact_id: L1-REQ-CLIENT-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-CLIENT-001 — Localization Readiness + +## Purpose + +Ensure that client interfaces are usable with non-English user content and can support full UI localization in the future. + +## Background / Context + +The initial product may use English UI text, but users may write prompts, file paths, tool output, provider messages, and transcript content in many languages. Client interfaces must not fail on Unicode input, IME composition, CJK display width, non-ASCII paths, or localized external output. + +Full translation of every UI string is useful, but it does not need to block the first milestone if the clients are structured so localization can be added later. + +## User / Business Requirement + +The program must be locale-safe for user content and localization-ready for future translated client interfaces. + +## Functional Requirements + +- Client interfaces must correctly accept and display Unicode user input. +- Client interfaces must support IME composition in supported environments. +- Client interfaces must preserve non-ASCII file paths, command output, provider responses, and transcript content. +- Client interfaces must handle CJK and other wide-character text without corrupting input, cursor behavior, transcript layout, or visible state. +- User-visible client strings should be centralized or structured so they can be translated in a future localization milestone. +- The initial product may ship with English UI text only. 
+ +## Non-Functional Requirements + +- Locale-safe behavior is required even when full UI translation is not yet implemented. +- Localization readiness must not compromise safety, error clarity, or command discoverability. +- Future UI translation should not require rewriting core client workflows. +- Display behavior must remain readable when localized or non-ASCII content is longer than equivalent English text. + +## Acceptance Criteria + +- Given the user enters non-English text, when the client accepts input, then the submitted message preserves that text. +- Given the user enters text through an IME in a supported environment, when composition completes, then the client preserves the composed text. +- Given a file path contains non-ASCII characters, when the client displays or references the path, then the path remains readable and intact. +- Given provider or tool output contains localized or non-ASCII text, when it appears in the transcript, then the client renders it without corrupting layout. +- Given full UI localization is not implemented, when developers add or review client UI strings, then the code structure does not make future localization unnecessarily difficult. + +## Out of Scope + +- Complete translated UI catalogs are not required by this L1 requirement. +- Locale detection, translation file formats, pluralization rules, string extraction tooling, and i18n library choice are not specified here. +- This requirement does not guarantee identical IME or wide-character behavior in unsupported terminal or client environments. + +## Open Questions + +- Which UI languages should be supported after the initial English-only milestone? +- Which client environments are required to support IME input? +- What minimum layout guarantees should apply to CJK and other wide-character text? 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-CLIENT-001 | 1 | specs/L2/client/L2-DES-CLIENT-001-localization-readiness.md | Defines Unicode-safe input, IME composition, display-width aware rendering, non-ASCII path handling, diagnostics, and future UI string translation structure. | +| related-to | L2-DES-TUI-002 | 1 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | TUI layout must account for Unicode and localized display width. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Composer and input mode handling must preserve Unicode and IME input. | +| related-to | L2-DES-TUI-004 | 1 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | Transcript and streaming output must preserve localized and non-ASCII content. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft approved for L1 expansion. | diff --git a/specs/L1/L1-REQ-CLIENT-002-session-rendering-consistency.md b/specs/L1/L1-REQ-CLIENT-002-session-rendering-consistency.md new file mode 100644 index 00000000..d07a5b39 --- /dev/null +++ b/specs/L1/L1-REQ-CLIENT-002-session-rendering-consistency.md @@ -0,0 +1,73 @@ +--- +artifact_id: L1-REQ-CLIENT-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-CLIENT-002 — Session Rendering Consistency + +## Purpose + +Ensure that clients render active sessions and restored session history with a consistent visual and stylistic language. + +## Background / Context + +Users may interact with a session while it is active, close the application, and later resume the same session from persisted history. From the client perspective, restored history should not feel like a separate or degraded rendering mode. 
+ +Live-only states such as active streaming, temporary progress indicators, and animations may naturally differ from restored static history, but the overall visual language, content hierarchy, spacing, typography, color usage, and item treatment should remain consistent. + +## User / Business Requirement + +The client interface must render active chat sessions and resumed chat history in a visually and stylistically consistent way. + +## Functional Requirements + +- Client interfaces must use consistent visual treatment for equivalent session items during active use and after history restoration. +- Restored user messages, assistant messages, tool calls, tool outputs, approvals, errors, and turn summaries must remain recognizable as the same kinds of items that appeared during active use. +- Restored history must preserve enough display metadata for clients to render item state, role, outcome, and hierarchy consistently. +- Live-only affordances may be omitted or converted to stable completed-state affordances after restoration. +- If restored content cannot be rendered exactly as it appeared live, the client must still present it in the same design language and make any state differences understandable. + +## Non-Functional Requirements + +- Restored history must not appear visually broken, raw, or stylistically unrelated to active-session rendering. +- Styling consistency must not hide important differences between active, completed, failed, interrupted, restored, or pending states. +- The consistency requirement must hold across normal application close and relaunch workflows. +- Client rendering consistency must not require persisting unnecessary implementation-only UI state. + +## Acceptance Criteria + +- Given a user views a message during an active session, when the same session is restored after relaunch, then the restored message uses a visually consistent item style. 
+- Given a tool call completes during an active session, when the session is restored from history, then the tool call remains recognizable as the same type of transcript item. +- Given a turn summary is shown live, when the session is restored, then the restored summary preserves the same content hierarchy and visual role. +- Given live streaming animations are not present after restoration, when the client renders restored history, then the completed-state rendering still fits the same design language. +- Given restored pending items such as `steer` or `queue` entries are displayed, when the user inspects them, then they are visually consistent with client UI while remaining distinguishable from completed transcript history. + +## Out of Scope + +- This requirement does not define exact colors, fonts, spacing values, component implementations, animation behavior, or per-client rendering primitives. +- This requirement does not require persisting every transient live-rendering frame or animation state. +- This requirement does not require all clients to use identical layouts, only consistent treatment within each client. + +## Open Questions + +- Which display metadata must be persisted to support consistent restored rendering? +- Which live-only states should have explicit restored equivalents? +- Should each client define its own rendering consistency checklist in L2? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/client/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved user requirement. 
| diff --git a/specs/L1/L1-REQ-CLIENT-003-tool-activity-presentation.md b/specs/L1/L1-REQ-CLIENT-003-tool-activity-presentation.md new file mode 100644 index 00000000..272f55ad --- /dev/null +++ b/specs/L1/L1-REQ-CLIENT-003-tool-activity-presentation.md @@ -0,0 +1,80 @@ +--- +artifact_id: L1-REQ-CLIENT-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-CLIENT-003 — Tool Activity Presentation + +## Purpose + +Ensure that clients present tool activity as readable user-facing work groups instead of noisy raw tool-call streams. + +## Background / Context + +Agent work often contains repeated tool calls that belong to the same user-visible activity. Reading files and searching the project are exploratory work. Writing files and applying patches are file-update work. Clients should group these activities so users can scan what the agent did without losing access to important details. + +The TUI client has immediate presentation requirements. A future desktop client should use the same product concepts while choosing a richer presentation that fits desktop UI conventions. + +## User / Business Requirement + +The program must provide client-facing tool activity presentation that groups related tool calls, labels them by user-visible intent, and exposes file-change details when files are created or edited. + +## Functional Requirements + +- In the TUI client, consecutive `read` tool calls must be grouped together on a single line and labeled `read`. +- In the TUI client, consecutive `glob` and `grep` tool calls must be grouped together on a single line and labeled `search`. +- In the TUI client, `read` and `search` activity must be nested under an `Explore` group. +- In the TUI client, model reasoning content and the model response must mark the beginning and end of an `Explore` group. +- In the TUI client, `write` and `apply_patch` activity must be nested under a `File Update` group. 
+- In the TUI client, a `write` operation may be labeled `Created`. +- In the TUI client, an `apply_patch` operation must be labeled `Edited`. +- The server-side agent capability must transmit file-change details for `write` and `apply_patch` operations so clients can render the specific changes. +- A future desktop client should group consecutive tool calls together and make the group collapsible. +- A future desktop client should summarize grouped tool activity with user-visible counts, such as `Explored 1 file, 1 search, ran 2 commands`. + +## Non-Functional Requirements + +- Grouping must improve readability without hiding important tool outcomes, errors, or file-change details. +- Tool activity labels must describe user-visible intent rather than raw implementation details where possible. +- Grouped activity must remain auditable from the transcript or an expanded detail view. +- Client-specific presentation may differ, but the semantic grouping of exploration and file updates should remain consistent. + +## Acceptance Criteria + +- Given consecutive `read` calls occur in the TUI client, when they are rendered, then they appear as one grouped `read` line. +- Given consecutive `glob` or `grep` calls occur in the TUI client, when they are rendered, then they appear as one grouped `search` line. +- Given `read` or `search` activity occurs, when the TUI renders it, then it is shown within an `Explore` group. +- Given `write` activity occurs, when the TUI renders it, then it is shown within a `File Update` group and may be labeled `Created`. +- Given `apply_patch` activity occurs, when the TUI renders it, then it is shown within a `File Update` group and labeled `Edited`. +- Given a file is created or edited by `write` or `apply_patch`, when a client renders the event, then the client has enough file-change details from the server-side agent capability to display the specific change. 
+- Given a future desktop client renders consecutive tool calls, when the group is collapsed, then it can summarize activity counts such as files explored, searches performed, and commands run. + +## Out of Scope + +- This requirement does not define exact TUI line layout, icons, colors, collapse controls, animation, or desktop component design. +- This requirement does not define the server event schema or exact file-diff payload format. +- This requirement does not define all possible tool grouping categories beyond the `Explore` and `File Update` groups described here. + +## Open Questions + +- Which model response event precisely closes an `Explore` group when exploration and answering interleave? +- Should command execution have its own top-level group or remain summarized alongside exploration in desktop views? +- What minimum file-change detail must be transmitted for created, edited, deleted, and renamed files? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/client/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved user requirement. | diff --git a/specs/L1/L1-REQ-CLIENT-004-prefixed-input-actions.md b/specs/L1/L1-REQ-CLIENT-004-prefixed-input-actions.md new file mode 100644 index 00000000..34b17cee --- /dev/null +++ b/specs/L1/L1-REQ-CLIENT-004-prefixed-input-actions.md @@ -0,0 +1,80 @@ +--- +artifact_id: L1-REQ-CLIENT-004 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-CLIENT-004 — Fuzzy Search Prefix + +## Purpose + +Define client-side behavior for the `@` input prefix that triggers fuzzy search. 
+ +## Background / Context + +Users need fast ways to reference project or capability entities without leaving the client input flow. Prefix-based input actions provide a compact interaction model: + +- `@` starts fuzzy search and selection. + +The `@` prefix must be visible and predictable so users understand whether they are sending a normal chat message or selecting a referenced entity. + +Terminal-command prefix behavior is specific to the TUI and is specified separately. + +## User / Business Requirement + +The client interface must recognize `@` at the beginning of input and route it to the fuzzy search workflow. + +## Functional Requirements + +- If client input begins with `@`, the client must initiate fuzzy search rather than submitting the text as a normal chat message. +- When `@` fuzzy search starts, the client must show a popup window immediately. +- The fuzzy search popup must update results in real time based on the string following the `@` symbol. +- Fuzzy search results must be grouped or ordered by type in this order: skills, MCP entries, then files in the current working directory. +- Pressing Enter while a fuzzy search result is selected must confirm that selection. +- The client must make it clear whether input is in normal chat mode or fuzzy search mode. + +## Non-Functional Requirements + +- Prefix behavior must be predictable and must not silently treat ambiguous input as anything other than normal chat input. +- The fuzzy search popup must remain responsive enough for interactive typing. +- Search behavior must respect workspace, safety, privacy, and permission boundaries. +- The popup must present result type and selection state clearly enough to avoid accidental selection. + +## Acceptance Criteria + +- Given the user enters input beginning with `@`, when the prefix is typed, then the client immediately opens a fuzzy search popup. +- Given the user continues typing after `@`, when the query changes, then the popup updates matching results in real time.
+- Given fuzzy search returns skills, MCP entries, and current-working-directory files, when the popup renders them, then skills appear before MCP entries and MCP entries appear before files. +- Given a fuzzy search result is selected, when the user presses Enter, then the client confirms the selected result. +- Given a search action would exceed permissions, when the action is invoked, then the program follows the applicable safety and approval behavior. + +## Out of Scope + +- This requirement does not define TUI-only terminal-command prefix behavior. +- This requirement does not define fuzzy matching algorithms, scoring, indexing, popup layout, or keyboard navigation beyond Enter confirmation. +- This requirement does not define how selected fuzzy search results are represented in model context or transcript history. + +## Open Questions + +- Should whitespace before `@` still trigger prefixed input behavior? +- Should fuzzy search include sessions, transcript entries, or commands in addition to skills, MCP entries, and current-working-directory files? +- How should users escape a leading `@` when they intend to send a normal chat message? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-TUI-008 | 1 | specs/L1/L1-REQ-TUI-008-terminal-command-prefix.md | TUI-only terminal-command prefix behavior is specified separately from general client fuzzy search. | +| refined-by | TBD | TBD | specs/L2/client/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved user requirement. | +| 1 | 2026-05-21 | Human | Refinement | Moved TUI-only terminal-command prefix behavior into a TUI requirement and scoped this requirement to `@` fuzzy search. 
| diff --git a/specs/L1/L1-REQ-CONTEXT-001-management.md b/specs/L1/L1-REQ-CONTEXT-001-management.md new file mode 100644 index 00000000..63a37af5 --- /dev/null +++ b/specs/L1/L1-REQ-CONTEXT-001-management.md @@ -0,0 +1,79 @@ +--- +artifact_id: L1-REQ-CONTEXT-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-CONTEXT-001 — Context Management + +## Purpose + +Ensure the program can maintain useful working context across long-running sessions. + +## Why This Matters + +The model can only use the context it is given. Context management determines whether the program remembers goals, instructions, permissions, recent work, and tool results while staying within model limits. + +## Background / Context + +Agentic work requires model-visible context such as system instructions, active mode, project instruction files, environment, permissions, tools, user messages, model output, tool calls, and tool results. Model context windows are finite. + +## User / Business Requirement + +The program must manage model context so long-running work remains coherent while respecting model limits. + +## Real User Scenarios + +- A user resumes a long task after many turns and expects the program to remember the current objective and important decisions. +- A user changes permission mode and expects future model calls to receive the updated constraint. + +## Functional Requirements + +- The program must include required startup context such as instructions, active mode, environment, permissions, persona, tools, skills, and MCP capabilities where applicable. +- The program must include relevant discovered project instruction files in context where applicable and permitted. +- The program must include relevant conversation items such as user messages, model responses, reasoning summaries, tool inputs, and tool outputs. +- The program must keep context structurally valid across turns. 
+- The program must support context reduction when needed to stay within model limits. + +## Non-Functional Requirements + +- Context management must preserve recent and relevant task information. +- Context reduction must not corrupt tool-call or conversation structure. + +## Acceptance Criteria + +- Given a long session, when the context approaches model limits, then the program reduces context rather than failing unnecessarily. +- Given a future model call, when context is assembled, then required instructions, active mode, permissions, and available capabilities are represented. +- Given recognized project instruction files were discovered, when context is assembled for workspace-dependent work, then relevant instructions from those files are represented subject to context limits and instruction hierarchy. +- Given context is rebuilt after tool use, when the next model call starts, then relevant tool inputs and outputs remain coherent. +- Given user instructions conflict with obsolete summarized context, when context is assembled, then the current user instruction is preserved as authoritative. + +## Out of Scope + +- This L1 requirement does not define token estimator implementation, compaction algorithm, or prompt serialization format. +- This requirement does not guarantee that every historical detail remains model-visible forever. + +## Open Questions + +- Which context items are mandatory for every model invocation? +- Should active mode be represented in every model invocation or only when mode-specific behavior applies? +- Which discovered project instruction files should be mandatory context for workspace-dependent model invocations? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | L2 defines active context snapshots as references into metadata and transcript records.
| + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added discovered project instruction files as workspace-dependent model context. | +| 1 | 2026-05-21 | Human | Refinement | Added active mode as model context. | diff --git a/specs/L1/L1-REQ-CONTEXT-002-normalize.md b/specs/L1/L1-REQ-CONTEXT-002-normalize.md new file mode 100644 index 00000000..e299e457 --- /dev/null +++ b/specs/L1/L1-REQ-CONTEXT-002-normalize.md @@ -0,0 +1,88 @@ +--- +artifact_id: L1-REQ-CONTEXT-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-CONTEXT-002 — Context Normalization + +## Purpose + +Keep model context well-formed, bounded, and compatible with the currently selected model. + +## Why This Matters + +Malformed or oversized context can make the model misunderstand the task, lose tool-call structure, or exceed provider limits. Normalization keeps context safe to send and understandable when content is omitted. + +## Background / Context + +Conversation history can include large messages, tool outputs, structured tool-call pairs, and multimodal content such as text, images, and video. Invalid, oversized, or modality-incompatible context can harm model behavior and reliability. + +Because users can switch models during a conversation, context normalization must account for the capabilities of the currently selected model before each request. + +## User / Business Requirement + +The program must normalize context items before they are used for model calls. + +## Real User Scenarios + +- A command produces thousands of lines of output, and the program includes a bounded representation instead of flooding the next model call. +- A tool call and result are preserved together so the model does not see an orphaned tool output. 
+- A user switches from a multimodal model to a text-only model, and unsupported image or video context is removed before the next model request. + +## Functional Requirements + +- The program must bound individual context item size. +- The program must truncate oversized items in a visible and structured way. +- The program must preserve tool input and output pairing. +- The program must avoid model context states with orphaned tool calls or orphaned tool outputs. +- The program must normalize context against the currently selected model's supported modalities before each model request. +- The program must remove modality content unsupported by the current model before sending the request, unless it can be converted into an approved supported representation. +- The program must make modality-based omission visible or explainable to the user when it affects task context. + +## Non-Functional Requirements + +- Normalization must preserve enough information for the model and user to understand what was omitted. +- Normalization must be deterministic enough for debugging and replay. +- Modality filtering must prevent unsupported modality payloads from being sent to model providers. +- Context normalization must remain valid when the user switches models mid-conversation. + +## Acceptance Criteria + +- Given an oversized tool output, when context is prepared, then the output is truncated instead of consuming unbounded context. +- Given a tool call record, when context is prepared, then the corresponding input and output relationship remains valid. +- Given an item is truncated, when the user or model sees the context representation, then the truncation is indicated rather than hidden. +- Given multiple item types have different risk profiles, when normalization runs, then each item type can use an appropriate bounded representation. 
+- Given the selected model does not support a modality present in context, when context is prepared, then content in that unsupported modality is removed or converted before the model request is sent. +- Given the user switches models mid-conversation, when the next context is prepared, then context is normalized for the newly selected model's modality capabilities. +- Given unsupported modality content is removed, when the user needs to understand model behavior, then the omission is visible or explainable. + +## Out of Scope + +- This L1 requirement does not define exact size limits, truncation format, or serialization schema. +- This L1 requirement does not define modality conversion algorithms, OCR, video transcription, or provider-specific payload formats. +- This requirement does not require preserving full raw content inside model context when it exceeds limits. + +## Open Questions + +- What item types require different maximum-size policies? +- Which modality conversions are acceptable before removing unsupported context content entirely? +- How should the client display modality-based omissions from model context? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/context/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added model-switching and unsupported modality normalization requirements.
| diff --git a/specs/L1/L1-REQ-CONTEXT-003-compress.md b/specs/L1/L1-REQ-CONTEXT-003-compress.md new file mode 100644 index 00000000..6cb6c027 --- /dev/null +++ b/specs/L1/L1-REQ-CONTEXT-003-compress.md @@ -0,0 +1,73 @@ +--- +artifact_id: L1-REQ-CONTEXT-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-CONTEXT-003 — Context Compression + +## Purpose + +Support long conversations by replacing older detail with useful summaries when context grows too large. + +## Why This Matters + +Without compression, long sessions eventually fail or lose useful continuity. Compression lets the program keep moving while preserving the important intent, decisions, and task state from older history. + +## Background / Context + +The program must continue working across long sessions even when full raw history no longer fits in the model context window. + +## User / Business Requirement + +The program must compress older context when context usage reaches a configured threshold. + +## Real User Scenarios + +- A user works through a long implementation session and expects older decisions to survive as summary when raw history no longer fits. +- A user resumes after compression and expects the current objective, changed files, blockers, and verification status to remain available. + +## Functional Requirements + +- The program must detect when context usage approaches a threshold. +- The program must summarize eligible older history into a compact representation. +- The program must preserve recent conversation turns without unnecessary compression. +- The program must combine summary history with recent turns for future model calls. + +## Non-Functional Requirements + +- Compression must preserve task continuity and important decisions. +- Compression must avoid losing recoverability of the true historical record. 
+ +## Acceptance Criteria + +- Given a session that exceeds the context threshold, when compression runs, then future context contains a summary plus recent turns. +- Given compressed history, when the user resumes work, then the agent retains the major goals, decisions, and constraints from earlier work. +- Given compression omits raw detail, when the user inspects history, then the original historical record remains recoverable outside the compressed model context where persistence allows it. +- Given recent turns are still active task context, when compression runs, then those recent turns are preserved rather than summarized away prematurely. + +## Out of Scope + +- This L1 requirement does not define summary prompt design, threshold formulas, or storage representation. +- This requirement does not require summaries to preserve every historical token or low-value output detail. + +## Open Questions + +- How many recent turns should remain uncompressed by default? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/context/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-CONV-001-session-lifecycle.md b/specs/L1/L1-REQ-CONV-001-session-lifecycle.md new file mode 100644 index 00000000..5d4a42fe --- /dev/null +++ b/specs/L1/L1-REQ-CONV-001-session-lifecycle.md @@ -0,0 +1,79 @@ +--- +artifact_id: L1-REQ-CONV-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-CONV-001 — Session Lifecycle + +## Purpose + +Define how users manage durable conversations with the program. + +## Why This Matters + +Sessions are the user's unit of work.
Clear lifecycle behavior lets users start, resume, fork, archive, and delete work without losing track of which history or workspace they are using. + +## Background / Context + +A session is the main unit of ongoing collaboration. Users need to create, resume, search, fork, archive, and delete sessions without losing clarity. A new session is created when the first user message for that session is submitted. + +## User / Business Requirement + +The program must support a complete user-visible session lifecycle. + +## Real User Scenarios + +- A user resumes a previous session to continue a feature implementation. +- A user forks a session to try a different approach without changing the original conversation history. + +## Functional Requirements + +- The user must be able to create a new session. +- The program must create a new session when the first user message for that session is submitted. +- The user must be able to resume an existing session. +- The user must be able to find and inspect prior sessions. +- The user must be able to fork, archive, or delete sessions where supported. +- Session forking must preserve a visible relationship to the parent session and fork turn. + +## Non-Functional Requirements + +- Session history must survive normal application restarts. +- Session operations must not silently corrupt or overwrite existing history. + +## Acceptance Criteria + +- Given an existing session, when the application restarts, then the user can find and resume that session. +- Given no session exists yet for a new conversation, when the user sends the first message, then the program creates the session. +- Given a forked session, when the user continues in the fork, then the original session history remains unchanged. +- Given a forked session is inspected, when parent session data remains available, then the user can identify and navigate to the parent session and fork turn. 
+- Given a session is archived, when the user views active sessions, then the archived session no longer appears as active but remains recoverable if supported. +- Given a session is deleted, when deletion completes, then the program reports whether associated persisted data was removed or retained by policy. + +## Out of Scope + +- This L1 requirement does not define session ID format, storage layout, or session-list UI. +- This requirement does not require every session operation to be reversible. + +## Open Questions + +- Should every session be bound to one workspace, many workspaces, or no workspace? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | L2 defines the durable JSONL session data model. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added explicit parent-session and fork-turn traceability for session forking. | +| 1 | 2026-05-21 | Human | Refinement | Clarified that a new session is created when the first user message is submitted. | diff --git a/specs/L1/L1-REQ-CONV-002-turn-lifecycle.md b/specs/L1/L1-REQ-CONV-002-turn-lifecycle.md new file mode 100644 index 00000000..e3dbd76a --- /dev/null +++ b/specs/L1/L1-REQ-CONV-002-turn-lifecycle.md @@ -0,0 +1,75 @@ +--- +artifact_id: L1-REQ-CONV-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-CONV-002 — Turn Lifecycle + +## Purpose + +Make each user-to-agent execution cycle visible and auditable. + +## Why This Matters + +Turns are how users understand what happened after each input.
A clear turn lifecycle prevents confusion between active work, waiting state, completed output, interrupted work, and failed execution. + +## Background / Context + +A turn may include user input, model output, reasoning summaries, tool calls, tool outputs, approvals, Plan Mode questions, and final response state. + +## User / Business Requirement + +The program must expose a clear lifecycle for each turn in a session. + +## Real User Scenarios + +- A user submits a request and watches the turn move from running, to waiting for approval, to completed. +- A user reviews a prior failed turn and needs to know which model output, tool call, or approval step failed. + +## Functional Requirements + +- A turn must begin when the user submits input for agent execution. +- A turn must represent running, waiting, completed, failed, and interrupted states. +- A turn must preserve relevant items in session history after it ends. +- A turn should expose token usage, model information, and key execution results where available. + +## Non-Functional Requirements + +- Turn status must be understandable to users in real time. +- Turn history must support later audit and recovery. + +## Acceptance Criteria + +- Given an active turn, when the program waits for approval or a Plan Mode question, then the user can see why it is waiting. +- Given an interrupted turn, when the user reviews history, then the interruption state is preserved. +- Given a turn completes successfully, when the user reviews history, then the final response and relevant execution items are associated with that turn. +- Given a turn fails before producing a final answer, when the user reviews it, then the failure state is visible and not confused with completion. + +## Out of Scope + +- This L1 requirement does not define internal item types, server event names, or token accounting precision.
+- This requirement does not require every internal event to be displayed as a separate user-facing transcript item. + +## Open Questions + +- Can a single Plan Mode turn ask the user multiple questions? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-AGENT-005 | 1 | specs/L1/L1-REQ-AGENT-005-plan-mode.md | Plan Mode defines when the question tool may be used during a turn. | +| refined-by | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | L2 defines turn and item structures for durable session history. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Clarified that turn questions are Plan Mode questions under the question-tool restriction. | diff --git a/specs/L1/L1-REQ-CONV-003-active-turn-message-handling.md b/specs/L1/L1-REQ-CONV-003-active-turn-message-handling.md new file mode 100644 index 00000000..dc329114 --- /dev/null +++ b/specs/L1/L1-REQ-CONV-003-active-turn-message-handling.md @@ -0,0 +1,93 @@ +--- +artifact_id: L1-REQ-CONV-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-CONV-003 — Active Turn Message Handling + +## Purpose + +Define what happens when the user sends another message while a turn is already active. + +## Why This Matters + +Users often notice corrections, constraints, or follow-up instructions while the program is still working. The product must avoid ambiguous behavior where a message is silently ignored, accidentally interrupts the active task, or is mistaken for completed transcript history.
+ +## Background / Context + +A session may have an active turn that is generating model output, running tools, waiting for approval, waiting for a user answer, or processing delegated work. During that time, the user may want to guide the active work immediately or send a message that should run after the active work completes. + +The product model distinguishes two user-facing choices: + +- `steer`: a guided intervention intended to influence the currently active turn. +- `queue`: a message that waits in order until the active turn completes. + +## User / Business Requirement + +When a user sends a message during an active turn, the program must let the user choose whether the message should steer the current turn or be queued for later execution. + +## Real User Scenarios + +- A user sees the program taking the wrong approach and sends a `steer` message to correct the active task. +- A user thinks of a follow-up request during a long-running task and sends it to the `queue` so it runs after the current task finishes. +- A user relaunches the program after exiting mid-task and sees restored `steer` and `queue` items displayed in the client interface. + +## Functional Requirements + +- The client interface must distinguish messages sent while a turn is active from normal new-turn submissions. +- The user must be able to choose `steer` for guidance intended to affect the currently active turn. +- The user must be able to choose `queue` for a message intended to run after the active turn completes. +- `steer` messages must be visible as guided interventions rather than ordinary completed transcript items. +- `queue` messages must preserve user submission order until they are executed, canceled, or otherwise resolved. +- The client interface must show pending `steer` and `queue` messages so the user understands what is influencing or waiting behind active work. 
+- If a `steer` message cannot safely affect the active turn, the program must report that limitation and preserve or reclassify the message according to user intent. +- `steer` and `queue` messages must participate in persistence and restoration behavior when the user exits before the active task completes. + +## Non-Functional Requirements + +- Active-turn message handling must be predictable and visible to the user. +- The program must not silently drop messages sent during active work. +- The program must not confuse pending `steer` or `queue` messages with completed assistant output or already-executed user turns. +- The program must preserve safety, approval, and workspace boundaries when applying `steer` or executing queued messages. + +## Acceptance Criteria + +- Given a turn is active, when the user sends a new message, then the client offers or applies a clear `steer` versus `queue` handling mode. +- Given the user chooses `steer`, when the message is accepted, then the client displays it as guidance for the active turn. +- Given the user chooses `queue`, when the active turn is still running, then the message is retained as pending follow-up work. +- Given multiple messages are queued, when the active turn completes, then queued messages are processed in the user-visible order unless the user changes that order. +- Given pending `steer` or `queue` messages exist when the user exits, when the program launches next time, then those messages are restored and displayed in the client interface. +- Given a `steer` message cannot be applied to the current active state, when the program reports the issue, then the user can understand whether it was queued, rejected, or needs another action. + +## Out of Scope + +- This requirement does not define internal queue data structures, server event names, concurrency model, or exact client UI controls. +- This requirement does not define whether `steer` can modify an already-running tool invocation. 
+- This requirement does not define detailed conflict handling between queued messages, active goals, and subagents. + +## Open Questions + +- Should `steer` be allowed during every active state, or only during model generation and planning phases? +- Beyond the immediately previous eligible message, should users be able to edit or reorder queued messages before execution? +- Should the client require an explicit choice every time, or use a default mode with an affordance to switch? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | L2 defines protocol behavior for steer and queue submissions during active turns. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | L2 defines durable turn and item records that preserve steer and queue messages. | +| related-to | L1-REQ-CONV-005 | 1 | specs/L1/L1-REQ-CONV-005-immediate-message-editing.md | Immediate message editing covers the narrow case of editing the latest eligible queued message. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft approved for L1 expansion. | +| 1 | 2026-05-22 | Human | Refinement | Narrowed the queued-message edit open question after adding immediate previous message editing. 
| diff --git a/specs/L1/L1-REQ-CONV-004-session-forking.md b/specs/L1/L1-REQ-CONV-004-session-forking.md new file mode 100644 index 00000000..a3e3ae30 --- /dev/null +++ b/specs/L1/L1-REQ-CONV-004-session-forking.md @@ -0,0 +1,100 @@ +--- +artifact_id: L1-REQ-CONV-004 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-CONV-004 — Session Forking + +## Purpose + +Allow users and subagents to start a new session from the context of an existing session at a specific conversational turn. + +## Background / Context + +Users may want to explore a different approach without changing the original session. Subagents also benefit from preserving current context when delegated work starts, because a forked session can carry the relevant conversation state into a bounded child workflow. + +A forked session should behave like a new session for future work, while preserving a clear relationship to the parent session and fork turn. From the client perspective, inherited chat history should remain fully viewable. From the persistence perspective, the program should avoid a literal deep copy of the full history because that would consume unnecessary disk space. + +Deleting, archiving, or exporting a parent session must not silently corrupt forked sessions that depend on inherited parent history. + +Fork persistence must distinguish the fork's origin metadata from the inherited history needed to render and continue the fork. The parent session link is provenance and navigation metadata. It must not be the only way to replay the fork's inherited history, because the parent session may later be deleted or become unavailable. + +After parent deletion, origin fields such as `parent_session_id` and `fork_turn_id` may remain only as non-dereferenceable provenance or tombstone metadata. The fork must not require opening the deleted parent session file to recover the inherited transcript. 
+ +## User / Business Requirement + +The program must support forking a session from a specific conversational turn, preserving user-visible context while representing inherited history efficiently. + +## Functional Requirements + +- The user must be able to fork a session from a specific conversational turn where session forking is supported. +- A forked session must start a new session whose future turns do not mutate the parent session history. +- The forked session must preserve enough inherited context for the user or subagent to continue work from the selected turn. +- The client interface must allow the user to view the inherited chat history in a forked session. +- The program must represent forked session history using shallow-copy or reference-based behavior rather than requiring a literal deep copy of all parent history records. +- A forked session must persist or reference an inherited-history segment that remains replayable for the fork even if the parent session record is later deleted. +- A forked session must retain enough fork-origin display metadata, such as parent label, fork-turn label, and fork-turn digest, for the user to understand the origin even when parent navigation is unavailable. +- The client interface must clearly indicate that a session was forked from a parent session. +- The fork indicator must identify the relevant parent session and conversational turn. +- The fork indicator must allow the user to navigate back to the original parent session where the parent session remains available. +- Deleting a parent session must not make an existing forked session unusable. +- If a parent session is deleted or unavailable while a forked session remains, the forked session must preserve the inherited history needed for the fork without requiring the deleted parent session to be opened. 
+- If the parent session is deleted or unavailable, the fork indicator must preserve origin metadata and clearly indicate that parent navigation is unavailable. +- When a user deletes a session that has fork descendants, the program must report the impact on those forks and must not delete forked sessions unless the user explicitly requests cascade deletion where supported. +- Subagent creation should be able to use session forking when delegated work needs existing conversation context. + +## Non-Functional Requirements + +- Forking must avoid unnecessary disk growth from duplicating full chat histories. +- Forked sessions must preserve parent-child traceability for audit and navigation. +- Forked session rendering must remain understandable after application restart. +- Forked session replay must remain valid when the parent session is deleted, provided the fork itself is not deleted. +- Parent and forked session history must remain isolated from accidental cross-session mutation. + +## Acceptance Criteria + +- Given a session has multiple turns, when the user forks from a selected turn, then the program creates a new session whose inherited context corresponds to that turn. +- Given a forked session continues with new turns, when the user reviews the parent session, then the parent session history remains unchanged. +- Given the user opens a forked session, when the client renders the transcript, then inherited chat history is viewable without appearing as missing context. +- Given a forked session is displayed, when the user inspects the fork indicator, then the user can identify the parent session and fork turn. +- Given the parent session remains available, when the user activates the fork indicator, then the client can navigate to the original parent session. 
+- Given the parent session has been deleted or is unavailable, when the user opens a forked session, then the forked session remains usable, inherited history remains viewable, and the fork indicator reports that the parent is unavailable. +- Given the parent session has been deleted or is unavailable, when the user inspects the fork indicator, then the indicator still shows retained origin metadata without requiring the parent session link to resolve. +- Given the parent session storage has been removed, when the forked session is replayed from durable records, then replay does not require opening the deleted parent session file. +- Given a user deletes a session with fork descendants, when deletion is requested, then the program reports whether descendants will be preserved, deleted by explicit cascade, or blocked by policy. +- Given a session is forked from a long history, when persistence stores the fork, then the program avoids a full deep copy of the parent history records. +- Given a subagent is created with existing conversation context, when session forking is used, then the subagent receives the relevant inherited context without modifying the parent session. + +## Out of Scope + +- This requirement does not define session ID format, storage schema, reference-counting mechanics, or database implementation details. +- This requirement does not define exact client visual design for fork indicators or navigation controls. +- This requirement does not require fork navigation to succeed when the parent session has been deleted or is unavailable by policy. + +## Open Questions + +- Should users be able to fork from an active in-progress turn, or only from completed turns? +- Should forked sessions inherit all permissions and configuration from the parent, or snapshot only the context visible at the fork turn? +- Should subagent forked sessions be visible in the normal session list by default? 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | L2 defines fork references and retention behavior in the session data model. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | L2 defines session fork, delete, and broadcast protocol behavior. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved user requirement. | +| 1 | 2026-05-22 | Human | Refinement | Clarified parent deletion behavior for forked sessions. | +| 1 | 2026-05-22 | Human | Refinement | Distinguished fork origin metadata from replayable inherited-history storage. | +| 1 | 2026-05-22 | Human | Refinement | Clarified that parent origin links may become non-dereferenceable tombstone metadata after deletion. | diff --git a/specs/L1/L1-REQ-CONV-005-immediate-message-editing.md b/specs/L1/L1-REQ-CONV-005-immediate-message-editing.md new file mode 100644 index 00000000..5a2d0e8f --- /dev/null +++ b/specs/L1/L1-REQ-CONV-005-immediate-message-editing.md @@ -0,0 +1,115 @@ +--- +artifact_id: L1-REQ-CONV-005 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-CONV-005 — Immediate Message Editing + +## Purpose + +Define how the user can edit the immediately preceding message in a session without corrupting durable history. + +## Why This Matters + +Users often notice a typo, missing constraint, wrong file mention, or incorrect instruction immediately after submitting a message. They need a fast correction path, while the transcript still needs to remain auditable and recoverable. + +## Background / Context + +The current conversation requirements preserve transcript history as an append-only audit trail. 
They do not define editing of arbitrary historical messages. + +Editing older historical messages in place would conflict with append-only persistence, model-context auditability, and tool side-effect visibility. For older corrections, session forking is the safer general mechanism. A narrower edit feature is still useful for the immediately preceding user-authored message in the current session branch. + +When the immediately preceding message produced file changes, editing that message semantically supersedes the latest turn. The replacement turn should run against the workspace state that existed before the superseded turn where that state can be restored safely. Tool-driven file changes such as `write` and `apply_patch` can usually be reverted from captured before/after content. Shell commands may also modify files, but their file effects are harder to attribute unless the program captures a turn-level workspace checkpoint. + +## User / Business Requirement + +The program must support editing the immediately preceding eligible user-authored message in the current session branch, and must attempt to restore file changes made by the superseded latest turn before continuing from the edited message. + +## Real User Scenarios + +- A user submits a message and immediately notices the wrong file mention. +- A user asks a question, receives an answer, then edits the immediately previous user message to correct the request and regenerate from that corrected input. +- A user edits the immediately previous coding request and expects files modified by that superseded turn to be restored before the corrected request runs. +- A user manually edits a file after the superseded turn, then edits the previous message; the program keeps the user's current file content for that file instead of overwriting it during restoration. +- A user has a queued follow-up message that has not started yet and wants to correct it before execution. 
+- A user tries to edit an older transcript message and is directed to fork from that point instead of mutating history. + +## Functional Requirements + +- The program must identify the immediately preceding eligible user-authored message in the current session branch. +- The user must be able to replace that message's content and mentions through an explicit edit action. +- The edit must be represented as a new durable event or revision record rather than in-place mutation of the original transcript record. +- If the target message already produced a completed, failed, or interrupted turn, the original turn and its outputs must remain recoverable for audit. +- After accepting an edit for a completed previous turn, the program should restore the workspace changes attributable to that superseded turn where safely possible, then continue from the edited message by creating a replacement continuation path or replacement turn. +- The program must record enough file-change or checkpoint data during a turn to attempt restoration if that turn is later superseded by immediate message editing. +- For file changes made by tools with structured edit semantics, such as `write` and `apply_patch`, the program should capture before/after file state or an equivalent inverse operation. +- For file changes made by shell commands, the program should restore them only when they are captured by a turn-level workspace checkpoint or otherwise attributable with sufficient confidence. +- If a file has been manually changed after the superseded turn or cannot be restored safely, the program must skip restoration for that file, preserve the current file state, and record that the file was not restored. +- The program may use a git-based turn checkpoint or hidden ghost commit as one possible restoration mechanism, but it must keep that mechanism separate from user-visible commits and branch history unless the user explicitly asks otherwise. 
+- If the target message is a queued message that has not started, the edit may update the queued message's effective content while preserving the original queued revision for audit. +- If the target message belongs to an active running turn, the program must not silently mutate the already-started model or tool execution. It must either require interruption, convert the change to `steer`, or reject the edit with a clear explanation. +- Older historical messages must not be edited in place. The program should direct the user to fork from the relevant turn when they need to revise older history. +- All connected clients subscribed to the session must observe accepted edits and resulting turn state changes. + +## Non-Functional Requirements + +- Message editing must preserve an auditable record of the original message and the edited revision. +- Message editing must not hide tool side effects that already occurred before the edit. +- Message editing must not silently discard user-created file changes made after the superseded turn. +- Workspace restoration for immediate message editing must be transparent enough for the user to see which files were restored, skipped, or unsupported. +- Message editing must remain compatible with append-only session persistence and stable context-prefix behavior. +- The current model context after an accepted edit should use the edited message for the replacement continuation path rather than treating both original and edited text as ordinary user intent. + +## Acceptance Criteria + +- Given the latest completed turn was created from a user message, when the user edits that immediately previous message, then the program records the edit and starts or prepares a replacement continuation from the edited content. 
+- Given the latest completed turn changed files through restorable file-editing tools, when the user edits the immediately previous message, then those files are restored to their pre-turn state before the replacement continuation runs. +- Given a file changed by the superseded turn has been modified again after that turn, when restoration is attempted, then restoration for that file is skipped and the current file state is preserved. +- Given the superseded turn changed files through a shell command, when those changes are not attributable or checkpointed, then the program reports that automatic restoration for those files is unsupported or skipped. +- Given restoration is partially skipped, when the replacement continuation starts, then the transcript or client state records which files were restored and which current file states were kept. +- Given the edited message replaces a previous completed turn, when the transcript is reviewed, then the original turn remains recoverable or visibly superseded rather than silently deleted. +- Given a queued message has not started, when the user edits it, then the queued message's effective content changes while the original revision remains auditable. +- Given the immediately previous message is part of an active running turn, when the user requests an edit, then the program explains whether interruption, `steer`, or rejection applies. +- Given the user attempts to edit an older historical message, when the edit is requested, then the program rejects direct editing and offers or indicates session forking as the appropriate path. +- Given one client edits the immediately previous message, when other clients are subscribed to the session, then they receive the edit and resulting turn updates in order. + +## Out of Scope + +- This requirement does not require arbitrary historical message editing. +- This requirement does not define exact client keybindings, popup layout, or transcript rendering details. 
+- This requirement does not automatically undo external API calls, network effects, process side effects, published git operations, or other non-file side effects produced by the superseded turn. +- This requirement does not require automatic restoration of shell-created file changes unless a turn-level checkpoint or equivalent attribution exists. +- This requirement does not define branch comparison UI between original and edited continuations. + +## Open Questions + +- Should accepting an edit automatically regenerate immediately, or should the edited message be staged until the user confirms execution? +- Should the default transcript view collapse superseded turns, or show them inline with a superseded marker? +- Should edits to restored `steer` messages be supported in the same feature, or handled by active-turn message controls? +- Should git-based turn checkpoints be required for git workspaces, optional, or controlled by a user setting? +- Should users be able to opt into destructive turn reset behavior that may discard post-turn manual edits, or should the default per-file skip behavior remain mandatory? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-CONV-003 | 1 | specs/L1/L1-REQ-CONV-003-active-turn-message-handling.md | Active running turns may require steer, queue, or interruption instead of direct mutation. | +| related-to | L1-REQ-CONV-004 | 1 | specs/L1/L1-REQ-CONV-004-session-forking.md | Older historical message revision should use session forking instead of in-place editing. | +| related-to | L1-REQ-CHANGE-001 | 1 | specs/L1/L1-REQ-CHANGE-001-rollback-and-recovery.md | Workspace restoration is a rollback and recovery behavior for superseded turns. | +| related-to | L1-REQ-EDIT-001 | 1 | specs/L1/L1-REQ-EDIT-001-file-editing-workflow.md | Structured file edit tools should capture enough data for restoration. 
| +| related-to | L1-REQ-GIT-001 | 1 | specs/L1/L1-REQ-GIT-001-change-management.md | Git checkpoints may be used as an internal restoration mechanism without publishing commits. | +| refined-by | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | L2 defines append-only edit records and replacement turn references. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | L2 defines edit request and broadcast protocol behavior. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial immediate previous message editing requirement. | +| 1 | 2026-05-22 | Human | Refinement | Added turn file restoration behavior when immediate message editing supersedes the latest turn. | diff --git a/specs/L1/L1-REQ-EDIT-001-file-editing-workflow.md b/specs/L1/L1-REQ-EDIT-001-file-editing-workflow.md new file mode 100644 index 00000000..1cef6bfb --- /dev/null +++ b/specs/L1/L1-REQ-EDIT-001-file-editing-workflow.md @@ -0,0 +1,83 @@ +--- +artifact_id: L1-REQ-EDIT-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-EDIT-001 — File Editing Workflow + +## Purpose + +Ensure that file changes made by the program are intentional, reviewable, and recoverable from the user's perspective. + +## Why This Matters + +File edits are the point where agent work becomes durable project change. Users need confidence that edits are scoped to the request, do not overwrite unrelated work, and can be reviewed after they are applied. + +## Background / Context + +The program may read, write, and edit project files while performing coding tasks. Users need to understand what changed, why it changed, whether the change was partial, and how it relates to the requested work. + +File editing is broader than a tool capability. 
It includes change planning, safe application, review, failure handling, and final reporting. + +## User / Business Requirement + +The program must provide a file editing workflow that makes proposed and applied changes understandable and safe to review. + +## Real User Scenarios + +- A user asks for a focused bug fix and expects the program to modify only the relevant files. +- A user has existing local changes and expects the program to preserve them while applying a separate task edit. + +## Functional Requirements + +- The program must explain the intended file-editing scope when the task requires non-trivial changes. +- The program must apply file changes only within the relevant workspace and task scope unless the user approves otherwise. +- The program must preserve unrelated user changes and avoid overwriting them silently. +- The program must report which files were changed and summarize the purpose of the changes. +- The program must detect and report partial edit failures or conflicts that require user attention. +- Structured file-editing tools should capture enough before/after state, diff data, or inverse operation data to support later restoration of the turn's file changes where safe. + +## Non-Functional Requirements + +- File editing behavior must be predictable and auditable. +- The program must avoid broad, unrelated rewrites when a smaller targeted edit satisfies the task. +- File edits must respect workspace safety and permission boundaries. + +## Acceptance Criteria + +- Given a requested code change, when the program edits files, then the final response lists the changed files and summarizes the change intent. +- Given unrelated existing changes in the same workspace, when the program applies edits, then those unrelated changes are not silently reverted or claimed as program-generated work. +- Given an edit cannot be applied cleanly, when the program reports status, then the user can see which file or change failed. 
+- Given a large edit is required, when the program reports the result, then the user can understand the scope and reason for the broader change. +- Given a file is generated or binary, when the program needs to modify it, then the program handles it intentionally or reports that the edit is unsupported. +- Given a file is changed by a structured file-editing tool, when the latest turn is later superseded by immediate message editing, then the recorded file-change data is sufficient to attempt safe restoration. + +## Out of Scope + +- This requirement does not define patch algorithms, diff rendering implementation, editor integration, or merge-conflict resolution mechanics. +- This requirement does not allow the program to silently rewrite unrelated files for convenience. + +## Open Questions + +- Should the program support a user approval step before applying large file edits? +- Which edit sizes or file types require special review behavior? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-CONV-005 | 1 | specs/L1/L1-REQ-CONV-005-immediate-message-editing.md | Immediate message editing uses structured file-edit records for superseded-turn restoration. | +| refined-by | TBD | TBD | specs/L2/edit/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-22 | Human | Refinement | Added structured edit restoration data requirement for immediate message editing.
| diff --git a/specs/L1/L1-REQ-GIT-001-change-management.md b/specs/L1/L1-REQ-GIT-001-change-management.md new file mode 100644 index 00000000..865e938a --- /dev/null +++ b/specs/L1/L1-REQ-GIT-001-change-management.md @@ -0,0 +1,80 @@ +--- +artifact_id: L1-REQ-GIT-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-GIT-001 — Change Management + +## Purpose + +Help users manage code changes safely and intentionally. + +## Why This Matters + +Git operations can publish or preserve work. Users need the program to distinguish task changes from unrelated local changes and to avoid staging, committing, or pushing without clear intent. + +## Background / Context + +The program modifies code in working repositories. Users need to understand diffs, avoid unrelated changes, verify work, and create branches or commits on request. + +## User / Business Requirement + +The program must provide git-oriented change management for repository work. + +## Real User Scenarios + +- A user asks the program to commit only the files changed for the current task while unrelated local files are dirty. +- A user asks for a branch and pull request after verification passes. + +## Functional Requirements + +- The program must be able to show current branch and working-tree status. +- The program must distinguish task-related changes from unrelated pre-existing changes where possible. +- The program must support showing or summarizing diffs. +- The program must stage, commit, branch, push, or create pull requests only when requested or approved by the user. +- The program may use internal git objects, hidden refs, or ghost commits as implementation details for turn-level workspace checkpoints, provided they are not presented as user-authored commits and do not publish or rewrite visible history without explicit user intent. 
+ +## Non-Functional Requirements + +- The program must avoid including unrelated files in commits. +- Commit messages must describe the actual change. +- Internal git checkpoints must remain distinguishable from user-requested branches, commits, staging, pushes, and pull requests. + +## Acceptance Criteria + +- Given unrelated dirty files, when the user asks for a commit, then the program does not include those files without explicit intent. +- Given verification failures, when the program reports or commits changes, then the failure is disclosed. +- Given the user asks to stage changes, when task-related and unrelated files are both present, then the program stages only the intended files or asks for clarification. +- Given a push or pull request is requested, when the repository state prevents it, then the program explains the blocker. +- Given the program uses a hidden git checkpoint for restoration, when the user inspects normal git history, then the checkpoint is not confused with a user-authored commit. + +## Out of Scope + +- This requirement does not define git command implementation, hosting-provider integration, or merge-conflict algorithms. +- This requirement does not permit automatic publication of code changes without user intent. +- This requirement does not permit hidden checkpoint machinery to silently discard user changes unless a separate explicit destructive reset policy is chosen by the user. + +## Open Questions + +- Should the program create task branches automatically for certain workflows? +- Should internal git checkpoints be implemented as hidden refs, temporary commits, worktree snapshots, or another mechanism? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-CONV-005 | 1 | specs/L1/L1-REQ-CONV-005-immediate-message-editing.md | Immediate message editing may use hidden git checkpoints for superseded-turn restoration.
| +| refined-by | TBD | TBD | specs/L2/git/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-22 | Human | Refinement | Added internal git checkpoint constraints for turn-level restoration. | diff --git a/specs/L1/L1-REQ-GOAL-001-ralph-loop.md b/specs/L1/L1-REQ-GOAL-001-ralph-loop.md new file mode 100644 index 00000000..f57d3107 --- /dev/null +++ b/specs/L1/L1-REQ-GOAL-001-ralph-loop.md @@ -0,0 +1,84 @@ +--- +artifact_id: L1-REQ-GOAL-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-GOAL-001 — Ralph Loop + +## Purpose + +Let users run a bounded autonomous coding loop around a durable objective until the objective is verified, blocked, paused, canceled, or stopped by budget limits. + +## Why This Matters + +Ralph Loop style work is useful when a task is too large for one turn but still has a concrete completion condition. The loop keeps the program focused on the objective, forces repeated audit against the requested outcome, and prevents false completion based only on partial progress or proxy signals. + +## Background / Context + +In coding-agent practice, a Ralph Loop is an autonomous iteration pattern: the agent receives an objective, works on it, verifies the result, and continues looping while the objective is not satisfied and allowed budget remains. Effective Ralph Loop use depends on a clear objective, persisted task state, verification criteria, and explicit stop conditions. + +The program's goal feature is this Ralph Loop objective. It is not only a label for the session; it is the active contract that drives continuation, audit, status reporting, and completion decisions. 
+ +## User / Business Requirement + +The program must provide a Ralph Loop goal capability for bounded autonomous work toward a verifiable objective. + +## Real User Scenarios + +- A user sets a Ralph Loop goal to eliminate all failing tests in a package; the program keeps iterating until the tests pass or a blocker or budget limit is reached. +- A user sets a Ralph Loop goal to complete a migration; the program performs implementation, checks the migration against the objective, and does not stop merely because one command succeeded. +- A task fails verification, and the Ralph Loop goal remains active instead of being marked complete. + +## Functional Requirements + +- The user must be able to create, view, pause, resume, clear, and complete a Ralph Loop goal. +- A Ralph Loop goal must describe the active objective and the user-visible success condition. +- A Ralph Loop goal must expose status such as pursuing, paused, completed, blocked, canceled, or budget-limited. +- The program must continue work across turns while the Ralph Loop goal is active, not paused, not complete, and still within allowed budget. +- The program must audit actual completion before marking the Ralph Loop goal complete. +- The program must report progress, blockers, verification status, and remaining budget information where available. + +## Non-Functional Requirements + +- Ralph Loop state must survive across turns and recoverable session resumes. +- Completion must be based on actual satisfaction of the objective, not on attempted work, generated text, or a single weak proxy signal. +- Looping must be bounded by explicit budget, stop, pause, or cancellation controls. +- The user must be able to understand why the loop is continuing or why it stopped. + +## Acceptance Criteria + +- Given an active Ralph Loop goal, when the user asks for status, then the program reports the objective, current status, progress, blockers, and budget state where available. 
+- Given the objective has not been verified as complete, when a turn ends, then the Ralph Loop goal remains active or blocked rather than being incorrectly marked complete. +- Given verification fails, when the loop continues, then the failure is treated as feedback for the next iteration or reported as a blocker. +- Given the user pauses the Ralph Loop goal, when the current turn ends, then automatic continuation stops until the user resumes it. +- Given the budget or stop condition is reached, when the loop stops, then the program reports that the goal is not necessarily complete unless completion was actually verified. + +## Out of Scope + +- This requirement does not define Ralph Loop storage format, slash-command syntax, continuation prompt design, model-tool schema, or budget calculation. +- This requirement does not allow the program to run forever without explicit budget or stop controls. +- This requirement does not allow the program to mark a Ralph Loop goal complete based only on attempted work, generated text, or a single unverified signal. + +## Open Questions + +- Should a session allow more than one active Ralph Loop goal? +- Which statuses should be product-level states versus L2/L3 runtime details? +- What budget dimensions should be exposed to users: tokens, time, turns, tool calls, cost, or a combination? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/goal/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement.
| diff --git a/specs/L1/L1-REQ-INPUT-001-attachments-and-multimodal.md b/specs/L1/L1-REQ-INPUT-001-attachments-and-multimodal.md new file mode 100644 index 00000000..91a9d988 --- /dev/null +++ b/specs/L1/L1-REQ-INPUT-001-attachments-and-multimodal.md @@ -0,0 +1,79 @@ +--- +artifact_id: L1-REQ-INPUT-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-INPUT-001 — Attachments and Multimodal Input + +## Purpose + +Allow users to provide files, images, logs, and other artifacts as task context. + +## Why This Matters + +Many real tasks begin with screenshots, logs, documents, spreadsheets, or other artifacts. Users should not have to manually translate every artifact into plain text before the program can help. + +## Background / Context + +Coding and product work often depends on external artifacts such as screenshots, logs, documents, spreadsheets, design references, archives, or generated reports. Users should be able to attach or reference those artifacts without manually converting everything into plain text. + +Attachments and multimodal inputs must be handled safely, with clear limits and clear representation in session history. + +## User / Business Requirement + +The program must support user-provided attachments and multimodal inputs as first-class task context where the active model and tools allow it. + +## Real User Scenarios + +- A user attaches a screenshot of a UI bug and asks the program to diagnose the layout issue. +- A user provides a log file or document and asks the program to extract the relevant failure or requirement. + +## Functional Requirements + +- The user must be able to provide file attachments or local artifact references as part of a task. +- The program must identify the type, size, and availability of attached artifacts. +- The program must make attached artifacts visible in the session or turn context. 
+- The program must use appropriate processing for text files, images, logs, documents, spreadsheets, archives, and other supported artifact types. +- The program must explain when an attachment cannot be used because of format, size, permission, model capability, or safety constraints. + +## Non-Functional Requirements + +- Attachment handling must respect privacy, permission, and workspace boundaries. +- Large attachments must not cause unbounded memory or context usage. +- The program must distinguish raw artifact storage from model-visible summarized or extracted content. + +## Acceptance Criteria + +- Given a user attaches a supported file, when the task begins, then the program can reference that artifact as part of the task context. +- Given an unsupported or inaccessible attachment, when the program tries to use it, then the user receives a clear explanation. +- Given a large attachment, when the program processes it, then the program applies bounded behavior and reports any truncation or summarization. +- Given an attachment requires a model capability that is unavailable, when the task starts, then the program explains the limitation and offers an alternate path where possible. +- Given an attachment is outside the workspace or permission boundary, when the program attempts to access it, then the normal approval or denial behavior applies. + +## Out of Scope + +- This requirement does not define file parser implementation, OCR implementation, archive extraction details, or provider-specific multimodal payload formats. +- This requirement does not guarantee that every artifact format can be interpreted. + +## Open Questions + +- Which attachment types are required for the first product milestone? +- Should attachments be persisted with session history or referenced by path only?
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | L2 defines item content parts and mentions for multimodal input and artifact references. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-LLM-001-token-efficiency.md b/specs/L1/L1-REQ-LLM-001-token-efficiency.md new file mode 100644 index 00000000..cd746540 --- /dev/null +++ b/specs/L1/L1-REQ-LLM-001-token-efficiency.md @@ -0,0 +1,90 @@ +--- +artifact_id: L1-REQ-LLM-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-LLM-001 — Token Efficiency + +## Purpose + +Reduce unnecessary model cost and latency while preserving task quality. + +## Why This Matters + +Token use affects cost, speed, and model reliability. Efficient context construction helps the program stay responsive and take advantage of provider caching without sacrificing user intent. + +## Background / Context + +Model providers may support prompt or prefix caching. Context construction should avoid avoidable churn in stable prompt prefixes. A stable context prefix is the key mechanism for maximizing cache hit rates across repeated turns. + +Users may still change runtime configuration during a conversation, including access permissions, response persona, selected model, or generation state after an interruption. These changes must be represented without rewriting existing prefix content, because in-place updates to the context prefix can invalidate provider cache reuse. 
+ +## User / Business Requirement + +The program must consider token efficiency and provider cache friendliness when constructing model context, and it must preserve stable context prefixes by representing in-conversation configuration changes through appended context rather than in-place prefix mutation. + +## Real User Scenarios + +- A user runs many turns in one session and expects unchanged instructions and tool definitions not to be needlessly churned. +- A user checks token usage and wants to understand how much context is read, generated, or cached. +- A user changes permissions, persona, or model during a conversation and expects the change to affect future behavior without rewriting earlier stable context. +- A user interrupts generation and resumes work, and the program records the new state without mutating the existing context prefix. + +## Functional Requirements + +- The program should keep stable context prefixes stable where program behavior allows. +- The program should avoid unnecessary reordering or rewriting of unchanged context. +- The program must represent dynamic in-conversation changes by appending new context or state rather than performing in-place updates to existing context prefix content. +- Dynamic changes that must preserve the existing context prefix include access permission changes, response persona changes, model switches, and generation interruption state changes. +- The program must avoid rewriting stable instructions, tool definitions, prior messages, or previous configuration records solely to reflect a later configuration change. +- The program should expose token usage and cached-token information where available. +- The program should avoid sending irrelevant large context to the model. + +## Non-Functional Requirements + +- Token optimization must not compromise correctness, safety, or user intent. +- Provider-specific optimization should remain compatible with provider-independent behavior. 
+- Cache-friendly context construction must be deterministic enough to debug why a cache hit was or was not expected. +- Append-only handling of runtime changes must preserve an auditable history of configuration changes that affected model behavior. + +## Acceptance Criteria + +- Given unchanged instructions and capabilities, when multiple turns run, then stable prompt content remains stable where possible. +- Given the user changes access permissions during a conversation, when the next model context is assembled, then the permission change is represented as appended state and the existing context prefix is not rewritten. +- Given the user changes response persona during a conversation, when future model responses are prepared, then the new persona is appended or otherwise represented without in-place mutation of earlier prefix content. +- Given the user switches models during a conversation, when the next model request is prepared, then model-specific request handling may change but existing context prefix content is not rewritten merely because the model changed. +- Given generation is interrupted, when the session continues, then interruption state is appended or recorded after the existing prefix rather than editing earlier context content. +- Given token usage data is available, when the user inspects model usage, then read, write, and cached-read usage are visible. +- Given a large irrelevant artifact is available, when model context is assembled, then the program avoids sending it unless it is needed for the task. +- Given an optimization would change task meaning or omit required safety context, when context is assembled, then correctness and safety take priority over token savings. + +## Out of Scope + +- The program does not define provider-specific cache protocols, token-estimation algorithms, or prompt serialization in this L1 requirement. +- This requirement does not require token savings at the cost of correctness, safety, or user intent. 
+- This requirement does not require preserving a stable prefix when a higher-priority safety or correctness requirement makes prefix mutation unavoidable. + +## Open Questions + +- Which token metrics should be required in the initial user interface? +- Which context segments are considered part of the stable cacheable prefix for each provider or model family? +- How should the client explain cache-impacting changes when a stable prefix cannot be preserved? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/llm/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added stable prefix preservation and append-only runtime configuration change requirements. | diff --git a/specs/L1/L1-REQ-LLM-002-tools.md b/specs/L1/L1-REQ-LLM-002-tools.md new file mode 100644 index 00000000..d6929a45 --- /dev/null +++ b/specs/L1/L1-REQ-LLM-002-tools.md @@ -0,0 +1,76 @@ +--- +artifact_id: L1-REQ-LLM-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-LLM-002 — Model Tool Use + +## Purpose + +Allow the model to request external capabilities through controlled tools. + +## Why This Matters + +Tool use is how the model turns reasoning into action. The user needs those actions to be structured, validated, visible, and constrained by safety policy. + +## Background / Context + +The program relies on tools for file access, command execution, search, web access, planning, approvals, and other actions. Tool use must remain structured and safe. + +## User / Business Requirement + +The program must support model-requested tool use through a controlled tool lifecycle. 
+ +## Real User Scenarios + +- The model requests a file read to inspect code before proposing a fix. +- The model requests a command execution that requires approval because it writes outside the current permission boundary. + +## Functional Requirements + +- The model must be able to request available tools using structured inputs. +- The model should be able to request explicit parallel tool orchestration through `multi_tool_use` where enabled. +- The program must validate tool requests before execution. +- The program must apply safety and approval checks before risky tool execution. +- The program must return structured tool results to the model and record them in user-visible history. + +## Non-Functional Requirements + +- Tool behavior must be predictable and auditable. +- Tool outputs must be bounded and sanitized where necessary. + +## Acceptance Criteria + +- Given a model-requested tool call, when the tool is allowed, then the program executes it and records the result. +- Given a tool call that requires approval, when approval is denied, then the program does not execute the tool. +- Given the model invokes `multi_tool_use`, when the underlying tool calls are valid and allowed, then the program executes the listed calls as an explicit parallel group. +- Given tool input is invalid, when the model requests the tool, then the program rejects or normalizes the request before execution. +- Given tool output is produced, when the model continues, then the result is represented in a structured and bounded way. + +## Out of Scope + +- The program does not define individual tool schemas, provider wire formats, or execution backends in this L1 requirement. +- This requirement does not allow the model to bypass validation or approval by using a different tool path. + +## Open Questions + +- Which tools must be available in the first milestone? 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/llm/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added model-requested `multi_tool_use` parallel orchestration behavior. | diff --git a/specs/L1/L1-REQ-LLM-003-observability.md b/specs/L1/L1-REQ-LLM-003-observability.md new file mode 100644 index 00000000..98711fda --- /dev/null +++ b/specs/L1/L1-REQ-LLM-003-observability.md @@ -0,0 +1,82 @@ +--- +artifact_id: L1-REQ-LLM-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-LLM-003 — Model Usage Observability + +## Purpose + +Make model usage and cost-relevant information visible to users. + +## Why This Matters + +Model calls are often the most expensive and opaque part of the workflow. Usage observability lets users understand token pressure, caching behavior, and why context compression or model changes may be needed. + +## Background / Context + +Users need to understand model calls, token consumption, cached-token usage, context-window pressure, output generation, and streaming response behavior for debugging and cost control. + +## User / Business Requirement + +The program must expose model usage observability for user-facing and diagnostic workflows. + +## Real User Scenarios + +- A user asks why a long session was compressed and sees that context-window usage was near the limit. +- A user compares two turns and sees read, write, and cached-read token usage when the provider reports it. +- A user enables trace logging to diagnose streaming behavior and can inspect recorded model response stream events. 
+ +## Functional Requirements + +- The program must record input token usage where available. +- The program must record output token usage where available. +- The program must record cached input token usage where available. +- The program must expose current context-window usage, or an estimate of it, where available. +- When trace logging mode is enabled, the program must record streaming response data from large language model calls. +- Trace-mode streaming response records must preserve enough information to diagnose streaming behavior, such as response deltas, timing, ordering, and completion state where available. + +## Non-Functional Requirements + +- Usage reporting must clearly distinguish measured values from estimates. +- Usage reporting must not leak sensitive prompt content. +- Trace-mode streaming response logging must follow privacy, secret-handling, and configured log-retention controls. + +## Acceptance Criteria + +- Given a completed model call with usage data, when the user inspects usage, then read, write, and cached-read values are visible if provided. +- Given context-window pressure, when the program reports status, then the user can see that context usage is high or near a limit. +- Given usage values are estimated rather than provider-reported, when they are displayed, then the program labels them as estimates. +- Given usage data is unavailable from a provider, when the user inspects the turn, then the program reports that the value is unavailable rather than inventing it. +- Given trace logging mode is enabled, when a large language model response streams, then the program records the streaming response events for diagnostic inspection. +- Given trace logging mode is disabled, when a large language model response streams, then the program does not record response stream content solely for trace diagnostics. + +## Out of Scope + +- The program does not define provider-specific usage parsing or billing calculations in this L1 requirement. 
+- This requirement does not guarantee exact monetary cost reporting for every provider. + +## Open Questions + +- Should usage be displayed per turn, per session, or both? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-APP-004 | 1 | specs/L1/L1-REQ-APP-004-observability.md | Application observability defines trace logging behavior and diagnostic logging constraints. | +| refined-by | L2-DES-LLM-003 | 1 | specs/L2/llm/L2-DES-LLM-003-model-usage-observability.md | Defines usage metrics, context pressure, measured versus estimated values, unavailable values, and trace-mode stream records. | +| related-to | L2-DES-APP-004 | 1 | specs/L2/app/L2-DES-APP-004-observability-architecture.md | Defines the cross-system logging, diagnostics, trace-mode, privacy, and retention architecture used by model observability. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added trace-mode logging of large language model streaming response events. | diff --git a/specs/L1/L1-REQ-LLM-004-persona.md b/specs/L1/L1-REQ-LLM-004-persona.md new file mode 100644 index 00000000..9e6c6f12 --- /dev/null +++ b/specs/L1/L1-REQ-LLM-004-persona.md @@ -0,0 +1,73 @@ +--- +artifact_id: L1-REQ-LLM-004 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-LLM-004 — Persona and Communication Style + +## Purpose + +Allow users to control the communication style used by the model-facing agent. + +## Why This Matters + +Different tasks and users require different communication styles. Configurable style helps the program match user expectations while keeping safety, correctness, and instruction hierarchy intact. 
+ +## Background / Context + +Different users and tasks may require concise, detailed, formal, direct, or localized communication styles. + +## User / Business Requirement + +The program must support adjustable persona or communication style settings. + +## Real User Scenarios + +- A user selects a concise style for implementation tasks and expects shorter final reports. +- A user switches to a more explanatory style while learning unfamiliar code. + +## Functional Requirements + +- The user must be able to select or configure a communication style. +- The selected style must influence model-facing instructions for future responses. +- The program must make the active style understandable to the user. +- The program must allow style changes without changing unrelated safety or tool behavior. + +## Non-Functional Requirements + +- Persona settings must not override higher-priority user, safety, or system constraints. +- Style changes should be durable when configured as a preference. + +## Acceptance Criteria + +- Given a selected concise style, when the model responds, then responses are shorter where task requirements allow. +- Given a style change, when a later turn begins, then the new style is used unless overridden. +- Given a style asks for brevity, when safety or verification details are important, then the program still includes necessary information. +- Given a project or system instruction conflicts with style preference, when the model responds, then higher-priority instructions take precedence. + +## Out of Scope + +- The program does not define prompt template implementation or style taxonomy in this L1 requirement. +- This requirement does not allow persona settings to override safety, user intent, or factual accuracy. + +## Open Questions + +- Which built-in styles should be available by default? 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/llm/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-MEM-001-persistent-memory.md b/specs/L1/L1-REQ-MEM-001-persistent-memory.md new file mode 100644 index 00000000..b042f17d --- /dev/null +++ b/specs/L1/L1-REQ-MEM-001-persistent-memory.md @@ -0,0 +1,87 @@ +--- +artifact_id: L1-REQ-MEM-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-MEM-001 — Persistent Memory + +## Purpose + +Define persistent memory as agent-maintained core state rather than a client-managed user feature. + +## Why This Matters + +Persistent memory can help the agent carry useful preferences, project knowledge, and recurring decisions across sessions. However, requiring users to inspect, curate, export, or delete individual memory entries adds unnecessary client complexity and makes memory feel like a user-facing database rather than an internal agent capability. + +## Background / Context + +Persistent memory is distinct from session transcript history. The client interface operates on sessions, turns, items, configuration, approvals, and user-visible data controls. Persistent memory is generated and maintained by the core agent runtime for future context construction. + +The TUI, desktop client, IDE client, and other client surfaces do not need a persistent-memory management protocol. They should not list memory entries, subscribe to memory changes, or expose individual memory deletion/export controls unless a later requirement explicitly promotes memory management to a user-facing feature. 
+ +## User / Business Requirement + +The program may maintain persistent memory internally, but users are not required to manage persistent memory directly. + +## Real User Scenarios + +- A user continues work across sessions and benefits from agent-retained context without managing memory records. +- A user deletes or archives a session through normal session controls without needing to resolve individual memory entries. +- A client renders sessions and turns without knowing whether the core created, updated, or used persistent memory internally. + +## Functional Requirements + +- Persistent memory, where supported, must be generated and maintained by the core agent runtime. +- Persistent memory must not be part of the routine client-server protocol surface. +- Clients must not be required to list, inspect, edit, delete, export, or subscribe to individual persistent memory entries. +- Persistent memory may retain internal source provenance for debugging, safety, privacy, or context-quality purposes. +- Session deletion may cause the core to update, unlink, retain, or remove internal memory according to internal memory policy, but ordinary clients are not required to present per-memory decisions. +- Persistent memory used for model context must pass through the same safety, privacy, and context-construction controls as other model-visible context. +- If persistent memory is disabled or unavailable, normal session, turn, and client behavior must continue to work. + +## Non-Functional Requirements + +- Persistent memory behavior must remain deterministic enough for debugging and replay where it affects model-visible context. +- Persistent memory must not expose plaintext secrets in model context, logs, telemetry, or routine client projections. +- Persistent memory implementation details must not leak into ordinary session and transcript UI. 
+ +## Acceptance Criteria + +- Given persistent memory is supported, when the core derives memory from session activity, then no client-side memory management action is required. +- Given a client connects to the server, when it negotiates protocol capabilities, then it does not need persistent-memory list, delete, export, or change-notification methods. +- Given a session is deleted, when the core updates any internal memory linked to that session, then ordinary session deletion can complete without requiring the user to manage individual memory entries. +- Given persistent memory contributes to future model context, when context is assembled, then the memory is treated as core-maintained context rather than as a transcript item or client-managed record. +- Given persistent memory is disabled, when the user uses sessions and turns, then client behavior remains unchanged except for the absence of memory-derived context. + +## Out of Scope + +- This requirement does not define memory extraction, ranking, retrieval, summarization, storage, compaction, or model-context insertion algorithms. +- This requirement does not define a user-facing memory browser, editor, export flow, deletion flow, or notification stream. +- This requirement does not require persistent memory to be enabled by default. +- This requirement does not guarantee perfect provenance for every internal memory entry. + +## Open Questions + +- Should a future privacy or diagnostics mode expose internal persistent memory to advanced users? +- Should persistent memory have a global enable/disable setting, or be controlled only by agent mode and core policy? +- How long should internal persistent memory be retained by default? 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-APP-012 | 1 | specs/L1/L1-REQ-APP-012-privacy-data-ownership.md | Persistent memory remains user data when model-visible, but is not a routine client-managed resource. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | L2 describes internal memory provenance links in durable session records where needed. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial persistent memory ownership requirement. | +| 1 | 2026-05-22 | Human | Refinement | Reframed persistent memory as core-maintained internal state rather than a client-managed protocol feature. | diff --git a/specs/L1/L1-REQ-MODEL-001-config.md b/specs/L1/L1-REQ-MODEL-001-config.md new file mode 100644 index 00000000..48132d53 --- /dev/null +++ b/specs/L1/L1-REQ-MODEL-001-config.md @@ -0,0 +1,129 @@ +--- +artifact_id: L1-REQ-MODEL-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-MODEL-001 — Model Configuration + +## Purpose + +Let users configure supported models for use by the program. + +## Why This Matters + +Model behavior depends on capabilities such as context length, reasoning, thinking, supported modalities, and provider availability. Users need clear model configuration to choose a model that fits the task. + +## Background / Context + +Models differ in context length, reasoning support, thinking support, supported input modalities, and availability. The program includes a built-in supported-model list that defines model capabilities and default behavior. 
This built-in list is distinct from user-defined providers, user-provided provider details such as base URL and API key, and model-provider binding details such as provider-specific model name and invocation method. + +Initially, supported model definitions may exist even when no models have been configured for actual invocation. A model becomes invocable only after the user binds a supported model to a user-defined provider, provider-specific model name, invocation method, and required reasoning settings where applicable. + +Client interfaces may collect credential material during explicit model or provider configuration flows. Routine model selection, model listing, and model-switching views should represent credential state without requiring plaintext credential values. + +After onboarding, the primary client-side model change workflow is the TUI `/model` command. This workflow lets the user select a configured or configurable supported model and then choose a reasoning effort when the selected model supports reasoning. + +User-configured providers and model-provider bindings are durable configuration records. Their effective values follow the application configuration precedence rule: project-scoped configuration takes precedence over user-scoped configuration for overlapping settings. + +## User / Business Requirement + +The program must support built-in supported model definitions and user-configured invocable models derived from those supported definitions. + +## Real User Scenarios + +- A user chooses a model from the built-in supported-model list, selects or creates a provider, enters the provider-specific model name, and expects that binding to become available for invocation. +- A user enters an API key during an explicit setup flow and later expects the model switcher to show that credentials are configured without showing the plaintext API key. 
+- A user invokes `/model` in the TUI, selects a model, chooses a supported reasoning effort, and expects later turns in the same session to keep using that selection. +- A user chooses a reasoning-capable model for a complex task and expects the interface to show that capability. +- A user opens the model switcher and expects to see models that have been configured for actual use. + +## Functional Requirements + +- The program must include a built-in supported-model list. +- The built-in supported-model list must be defined by a single comprehensive configuration source, such as a JSON file. +- Built-in supported model definitions must include intrinsic model information such as base instructions, context window length, effective context window length, reasoning or thinking capabilities, and supported modalities. +- Built-in supported model definitions must not include user-specific provider invocation details such as provider name, base URL, API key, provider-specific model name, or invocation method. +- The program must distinguish supported model definitions from user-configured invocable models. +- The program must distinguish reusable user-defined providers from model-provider bindings. +- A user-defined provider must represent a reusable provider connection endpoint and credentials. +- A model-provider binding must represent an invocable model by linking a supported model, a user-defined provider, a provider-specific model name, an invocation method, and reasoning effort where applicable. +- Initially, no model is configured for actual invocation unless user configuration has been completed or restored from persistence. +- The user must be able to configure a model for invocation only when that model exists in the built-in supported-model list. +- When configuring a supported model for invocation, the user must select or create a provider, enter the model name expected by that provider, and choose an invocation method where applicable. 
+- Client interfaces may accept credential material when the user is explicitly creating, updating, or repairing model provider configuration. +- Routine client-side model listing, model selection, and model-switching data must expose credential or configuration status instead of plaintext credential values. +- A successfully configured model must become available for selection in client-side model switching interfaces such as the TUI model switcher. +- User-configured providers and model-provider bindings created during onboarding or model setup must be persistently saved to configuration. +- When both project-scoped and user-scoped configuration files define overlapping model provider or model-provider binding settings, the project-scoped configuration must take precedence. +- After onboarding, the TUI must provide a `/model` command for changing the current session model selection. +- The `/model` workflow must first let the user select a model. +- When the selected model supports reasoning, the `/model` workflow must let the user select a supported reasoning effort after model selection. +- A model and reasoning effort selected through `/model` during an active session must become the current session selection and continue to apply to later turns in that session. +- Model configuration must capture user-relevant capabilities such as context length, reasoning support, and supported modalities. +- Model configuration must represent whether a model accepts text, image, and video input where applicable. +- Model configuration must support persistence and onboarding. + +## Non-Functional Requirements + +- Invalid model configuration must produce actionable errors. +- Model configuration must be understandable to users selecting a model. + +## Acceptance Criteria + +- Given a model exists in the built-in supported-model list, when the user configures required provider details for it, then the model becomes invocable. 
+- Given a provider has already been configured, when the user configures another supported model through that provider, then the user can reuse that provider without re-entering base URL and API key. +- Given a supported model is exposed through a provider under a provider-specific model name, when the user configures the binding, then the user can enter that model name separately from the canonical supported model slug. +- Given the user enters credential material in an explicit model configuration flow, when the configuration is submitted, then the program can use that credential material to configure the model for invocation. +- Given a model does not exist in the built-in supported-model list, when the user attempts to configure it for invocation, then the program rejects the configuration with an actionable explanation. +- Given no user model configuration exists, when the user opens model selection, then the program does not present unconfigured supported models as ready for invocation. +- Given a configured model, when the user opens a client-side model switching interface, then that model is available for selection. +- Given a configured model appears in a routine client-side model switching interface, when the client displays it, then credential state is represented as status rather than as a plaintext credential value. +- Given onboarding or model setup creates a provider and model-provider binding, when the program restarts, then that invocable model remains available from persistent configuration. +- Given project-scoped and user-scoped configuration define overlapping model defaults or bindings, when the program computes available invocable models, then the project-scoped configuration takes precedence for those overlapping settings. +- Given the user invokes `/model` after onboarding, when the model selection opens, then the user can select a model before selecting reasoning effort. 
+- Given the selected model supports reasoning, when the user selects it through `/model`, then the user can choose one of that model's supported reasoning efforts. +- Given the user changes model or reasoning effort through `/model` during a session, when the next turn starts in that session, then the changed selection remains active. +- Given a first-time user, when onboarding requires model setup, then the user can configure or select a supported model. +- Given a model lacks a capability required by a task, when the user selects it, then the program reports the limitation before relying on that capability. +- Given a model has configured modality capabilities, when the program prepares a request, then those capabilities can be used to decide which context modalities are allowed. +- Given model configuration is persisted, when the program restarts, then configured models remain available. + +## Out of Scope + +- The program does not define exact model catalog schema, file path, provider request format, or client UI layout in this L1 requirement. +- The program does not define exact credential-entry controls, credential-reveal controls, or credential-store backend in this L1 requirement. +- This requirement does not define exact popup layout, search behavior, keyboard handling, or visual styling for the `/model` workflow. +- This requirement does not guarantee that every configured model supports every program feature. +- This requirement does not allow arbitrary models outside the built-in supported-model list to become invocable without first being added to that supported-model list. + +## Open Questions + +- Which model capabilities are mandatory for the initial configuration UI? +- Which modality capability fields are required for built-in supported model definitions? +- Who is allowed to update the built-in supported-model list, and through what review process? +- Should users be able to request support for a new model from the client interface? 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-MODEL-001 | 1 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | L2 defines supported models, user providers, and model-provider bindings. | +| related-to | L2-DES-APP-002 | 1 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | L2 defines configuration source precedence used by persisted model provider and binding records. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added explicit text, image, and video modality capability requirements. | +| 1 | 2026-05-21 | Human | Refinement | Distinguished built-in supported model definitions from user-configured invocable models. | +| 1 | 2026-05-22 | Human | Refinement | Clarified that explicit configuration flows may accept credentials while routine model selection data exposes credential status rather than plaintext credentials. | +| 1 | 2026-05-22 | Human | Refinement | Added `/model` as the post-onboarding TUI workflow for changing the current session model and supported reasoning effort. | +| 1 | 2026-05-22 | Human | Refinement | Split user-defined providers from model-provider bindings, moved invocation method to the binding, and removed tool support from supported model metadata. | +| 1 | 2026-05-22 | Human | Refinement | Added persistent storage and project-over-user precedence for configured providers and model-provider bindings. 
| diff --git a/specs/L1/L1-REQ-MODEL-002-provider.md b/specs/L1/L1-REQ-MODEL-002-provider.md new file mode 100644 index 00000000..bb28a5c0 --- /dev/null +++ b/specs/L1/L1-REQ-MODEL-002-provider.md @@ -0,0 +1,102 @@ +--- +artifact_id: L1-REQ-MODEL-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-MODEL-002 — Model Providers + +## Purpose + +Allow users to connect the program to cloud and local model providers. + +## Why This Matters + +Provider support determines which models users can access and what authentication, privacy, latency, and capability tradeoffs apply. Users need provider setup and failure states to be clear. + +## Background / Context + +Users may rely on hosted APIs, local models, or authenticated provider integrations. L1 should express the product capability without locking in protocol design. + +The program supports models first. Providers are user-defined connection entries rather than a fixed built-in provider enum. A provider entry can be reused by multiple model-provider bindings. + +Credential handling must distinguish explicit setup or management flows from routine provider status and model selection flows. Users may provide credentials through a client interface, but ordinary provider and model status views should not depend on returning plaintext credential values. + +Provider entries created during onboarding or model setup are durable configuration records. Their effective values follow the application configuration precedence rule: project-scoped configuration takes precedence over user-scoped configuration for overlapping settings. + +## User / Business Requirement + +The program must support configurable model providers for remote and local model access. + +## Real User Scenarios + +- A user adds a named API-key-based provider and later binds multiple supported models to that provider. 
+- A user inspects provider status after setup and sees whether credentials are configured or invalid without the plaintext API key being displayed by default. +- A user attempts to use a local provider and receives clear status if the local service is unavailable. + +## Functional Requirements + +- The user must be able to configure API-key-based model providers. +- The user must be able to provide a provider name when adding a provider. +- Provider identifiers must be generated by the program rather than entered by the user. +- Providers must be user-defined entries rather than limited to a fixed built-in provider enum. +- A configured provider should be reusable across multiple model-provider bindings. +- Provider entries must contain provider connection information such as provider name, base URL, and credential state. +- Provider entries created during onboarding or model setup must be persistently saved to configuration. +- When both project-scoped and user-scoped configuration files define overlapping provider settings, the project-scoped configuration must take precedence. +- Provider entries must not own model-specific invocation details such as provider-specific model name, invocation method, or reasoning effort. +- Client interfaces may handle credential material during explicit provider setup, update, repair, or user-authorized reveal flows. +- Routine provider status, model selection, and model listing data must represent credential state without returning plaintext credential values by default. +- The user should be able to configure local model providers where supported. +- The program must show provider availability and configuration errors. +- The program should support provider onboarding for common setup paths. + +## Non-Functional Requirements + +- Provider credentials must be handled safely. +- Provider failures must produce actionable user-facing diagnostics. 
+ +## Acceptance Criteria + +- Given valid provider credentials and a valid model-provider binding, when the user selects that binding, then the program can make a model call. +- Given the user adds a provider, when provider setup asks for provider information, then the user can enter a provider name. +- Given the user adds a provider, when the provider is saved, then the program generates a stable provider identifier. +- Given a configured provider exists, when the user configures another supported model, then the user can select that provider instead of creating a duplicate provider entry. +- Given the user creates a provider during onboarding or model setup, when the program restarts, then that provider remains available from persistent configuration. +- Given project-scoped and user-scoped configuration define overlapping provider settings, when the program computes effective providers, then the project-scoped configuration takes precedence. +- Given the user submits credential material through an explicit provider setup or update flow, when the credential is accepted, then the provider can become available for model invocation. +- Given the user inspects provider or model status through a routine client view, when credential information is needed, then the program shows credential state rather than the plaintext credential value. +- Given invalid provider credentials, when the program attempts provider access, then the user sees an actionable configuration error. +- Given a provider is unavailable, when a model call is requested, then the program reports provider unavailability rather than presenting it as a generic task failure. +- Given a provider's availability or connection state differs from that of other configured providers, when the user selects a model-provider binding using that provider, then the program reflects those differences.
+ +## Out of Scope + +- The program does not define specific provider protocols, authentication flows, generated provider ID format, or wire payloads in this L1 requirement. +- The program does not require routine client views to return plaintext credentials. +- This requirement does not require identical behavior or capabilities across all providers. + +## Open Questions + +- Which providers are required for the first milestone? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-MODEL-001 | 1 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | L2 defines user-defined providers and model-provider bindings. | +| related-to | L2-DES-APP-002 | 1 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | L2 defines configuration source precedence used by persisted provider records. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-22 | Human | Refinement | Clarified explicit credential setup and user-authorized reveal flows versus routine status views that expose credential state without plaintext credentials by default. | +| 1 | 2026-05-22 | Human | Refinement | Clarified that providers are user-defined reusable entries with generated identifiers and that invocation method belongs to the model-provider binding. | +| 1 | 2026-05-22 | Human | Refinement | Added persistent storage and project-over-user precedence for provider entries. 
| diff --git a/specs/L1/L1-REQ-MODEL-003-onboard.md b/specs/L1/L1-REQ-MODEL-003-onboard.md new file mode 100644 index 00000000..e0a3e447 --- /dev/null +++ b/specs/L1/L1-REQ-MODEL-003-onboard.md @@ -0,0 +1,102 @@ +--- +artifact_id: L1-REQ-MODEL-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-MODEL-003 — Onboarding + +## Purpose + +Help first-time users reach a working configuration. + +## Why This Matters + +Users cannot evaluate the program if they cannot reach a usable first session. Onboarding should make required setup clear without hiding important privacy, provider, or permission decisions. + +## Background / Context + +A new user may need to configure a model provider, credentials, default model, permissions, telemetry, and basic UI preferences before using the program effectively. + +The TUI onboarding model setup begins by showing supported model slugs. After the user selects a supported model slug, onboarding asks the user to select an existing provider or add a new provider. When adding a provider, onboarding collects provider name, base URL, and API key. After provider selection or creation, onboarding asks the user to enter the model name expected by that provider, select an invocation method, then choose reasoning effort when the selected model supports reasoning. + +Information entered during onboarding must be saved to persistent configuration so the user does not need to repeat the same setup on the next launch. + +## User / Business Requirement + +The program must provide an onboarding flow for first-time use and missing required configuration. + +## Real User Scenarios + +- A new user starts the program without model credentials and is guided to configure a provider or local model. 
+- A new TUI user selects a supported model slug from onboarding, selects an existing provider or adds a provider with provider name, base URL, and API key, enters the model name for that provider, chooses an invocation method such as OpenAI Chat Completions, OpenAI Responses, or Anthropic Messages, then chooses a reasoning effort when the model supports reasoning. +- A user skips optional setup and still reaches a usable session with clear limits. + +## Functional Requirements + +- The program must detect when onboarding is required. +- The onboarding flow must guide the user through required setup. +- The onboarding flow must support model or provider setup when required. +- TUI onboarding model setup must begin with supported model slug selection. +- After model slug selection, TUI onboarding must let the user select an existing provider or add a new provider. +- When adding a provider, TUI onboarding must collect provider name, base URL, and API key where applicable. +- After provider selection or creation, TUI onboarding must let the user enter the model name expected by that provider. +- After model name entry, TUI onboarding must let the user select an invocation method where applicable. +- Invocation method choices should include OpenAI Chat Completions, OpenAI Responses, and Anthropic Messages where available. +- When the selected onboarding model supports reasoning, TUI onboarding must let the user select a supported reasoning effort after invocation method selection. +- Each onboarding input field or selection popup must show a concise hint that describes the current value the user is expected to provide. +- After successful model onboarding, the program must persist the selected model slug, provider selection or new provider details, provider-specific model name, invocation method, and reasoning effort where applicable. +- The user must be able to complete onboarding and start a usable session. 
+ +## Non-Functional Requirements + +- Onboarding must avoid hiding important privacy or telemetry decisions. +- Onboarding must be recoverable if configuration fails. + +## Acceptance Criteria + +- Given a first-time user whose required configuration is missing, when the program starts, then onboarding is offered or started. +- Given TUI onboarding requires model setup, when the user begins model setup, then the user first selects from supported model slugs. +- Given the user selects a supported model slug during TUI onboarding, when provider selection is required, then the user can select an existing provider or choose to add a provider. +- Given the user chooses to add a provider, when provider details are required, then the user can provide provider name, base URL, and API key where applicable. +- Given a provider has been selected or created, when model name is required, then the user can enter the model name expected by that provider. +- Given model name entry is complete during TUI onboarding, when invocation method selection is required, then the user can choose a supported invocation method. +- Given the selected onboarding model supports reasoning, when invocation method selection is complete, then the user can select a supported reasoning effort. +- Given the user is entering or selecting an onboarding value, when the field or popup is active, then the UI shows a concise hint describing the value the user is expected to provide. +- Given completed onboarding, when the user starts a session, then required configuration is available. +- Given completed onboarding, when the program restarts, then the persisted onboarding configuration is available without requiring the user to repeat the same model/provider setup. +- Given onboarding fails because provider setup is invalid, when the user retries, then the program preserves completed setup steps where possible. +- Given a privacy or telemetry choice is part of onboarding, when the user completes setup, then the selected choice is persisted.
+ +## Out of Scope + +- The program does not define onboarding screen design, configuration storage format, or provider-specific setup details in this L1 requirement. +- This requirement does not define exact popup layout, search behavior, keyboard handling, validation timing, or visual styling for TUI onboarding controls. +- This requirement does not require all optional integrations to be configured during first run. + +## Open Questions + +- Which setup steps are mandatory versus optional in the first release? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-TUI-010 | 1 | specs/L1/L1-REQ-TUI-010-onboarding-ui.md | TUI onboarding UI defines the terminal client presentation of the model setup flow. | +| refined-by | L2-DES-MODEL-001 | 1 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | L2 defines the data model configured by onboarding. | +| related-to | L2-DES-APP-002 | 1 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | L2 defines how onboarding-created configuration is persisted and loaded. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-22 | Human | Refinement | Added the TUI onboarding model setup flow: supported model selection, provider details, and reasoning effort selection when supported. | +| 1 | 2026-05-22 | Human | Refinement | Clarified that onboarding selects a model slug, collects base URL and API key, then selects invocation method before reasoning effort. | +| 1 | 2026-05-22 | Human | Refinement | Clarified provider selection or creation, provider name entry, provider-specific model name entry, invocation method selection, and field-level hints. | +| 1 | 2026-05-22 | Human | Refinement | Added persistent storage of onboarding-entered model and provider configuration. 
| diff --git a/specs/L1/L1-REQ-MODEL-004-modality-compatibility.md b/specs/L1/L1-REQ-MODEL-004-modality-compatibility.md new file mode 100644 index 00000000..22aaac0b --- /dev/null +++ b/specs/L1/L1-REQ-MODEL-004-modality-compatibility.md @@ -0,0 +1,74 @@ +--- +artifact_id: L1-REQ-MODEL-004 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-MODEL-004 — Modality Compatibility + +## Purpose + +Ensure that the program remains model-agnostic while respecting the modality capabilities of the currently selected model. + +## Background / Context + +The program should support mainstream models rather than being tied to one provider or one model family. Mainstream models do not all accept the same input modalities. Some models accept text only, while others may accept images, video, or other multimodal inputs. + +Because users can switch models during a conversation, previously valid context may contain modalities that the newly selected model does not support. The program must handle that mismatch before sending a model request. + +## User / Business Requirement + +The program must track model modality capabilities and must only send model context in modalities supported by the current model. + +## Functional Requirements + +- The program must support mainstream models through a model-agnostic design. +- Model configuration must represent supported input modalities, including at least text, image, and video where applicable. +- The program must allow model switching during a conversation where the program's policy permits it. +- Before a model request is sent, the program must compare the prepared context with the current model's supported modalities. +- Any context information in a modality unsupported by the current model must be removed from the model request or converted to a supported representation when an approved conversion path exists. 
+- The user must be able to understand when context was omitted because the selected model does not support a required modality. + +## Non-Functional Requirements + +- Modality filtering must happen before provider request submission. +- Modality compatibility behavior must be deterministic and auditable. +- Removing unsupported modalities must preserve conversation structure where possible. +- Model switching must not silently send unsupported modality payloads to a provider. + +## Acceptance Criteria + +- Given a text-only model is selected, when the conversation context contains images, then image payloads are removed or replaced with an approved supported representation before the request is sent. +- Given a model that does not support video is selected, when context contains video input, then the video modality is not sent to that model. +- Given context content is omitted because of modality incompatibility, when the user reviews the turn context or error explanation, then the omission is visible or explainable. +- Given the user switches from a multimodal model to a less capable model mid-conversation, when the next request is prepared, then unsupported modalities from earlier context are removed or replaced with an approved supported representation before invocation. +- Given a model supports a modality, when context contains that modality and policy allows it, then the program may include it in the model request. + +## Out of Scope + +- This requirement does not define provider-specific multimodal payload formats. +- This requirement does not define image, video, OCR, transcription, or summarization conversion implementations. +- This requirement does not require every mainstream model to support every modality. + +## Open Questions + +- Which mainstream models are required for the first milestone? +- Which modality conversions are acceptable when the target model does not support the original modality?
+- Should omitted unsupported modality content remain visible in the client transcript even when excluded from the model request? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/model/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved user requirement. | diff --git a/specs/L1/L1-REQ-NOTIFY-001-attention-management.md b/specs/L1/L1-REQ-NOTIFY-001-attention-management.md new file mode 100644 index 00000000..3939e461 --- /dev/null +++ b/specs/L1/L1-REQ-NOTIFY-001-attention-management.md @@ -0,0 +1,74 @@ +--- +artifact_id: L1-REQ-NOTIFY-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-NOTIFY-001 — Attention Management + +## Purpose + +Ensure that the program brings important events to the user's attention without creating noise or interrupting flow unnecessarily. + +## Background / Context + +Agentic work may run for a long time, wait for approval, fail, become blocked, finish in the background, or require the user to make a decision. Automations and reminders cover scheduled work, but active work also needs attention rules so users know when they should act. + +The program should notify users about meaningful state changes while avoiding repeated or low-value interruptions. + +## User / Business Requirement + +The program must provide attention management for active and background work so users are informed when action or awareness is needed. + +## Functional Requirements + +- The program must surface when a long-running task completes. +- The program must surface when a task fails or becomes blocked. +- The program must surface when a task needs approval or a user answer. 
+- The program must surface when background or delegated work produces a result that affects the current session. +- The user must be able to understand why attention is requested. +- The program should avoid repeatedly notifying the user about the same unchanged state. + +## Non-Functional Requirements + +- Notifications must be useful, concise, and tied to actionable state. +- Notifications must not spam the user during normal streaming or frequent progress updates. +- Attention signals must respect privacy and avoid exposing sensitive content in places where it may be inappropriate. +- Attention behavior should be consistent across client surfaces where the relevant capability exists. + +## Acceptance Criteria + +- Given a long-running task completes while the user is not focused on it, when the result is available, then the program surfaces completion in a user-visible way. +- Given a task fails, when the failure is known, then the program draws attention to the failure and its summary. +- Given a task needs approval or a user answer, when progress is blocked on the user, then the program surfaces that action is required. +- Given the same blocked state remains unchanged, when time passes, then the program does not repeatedly notify the user without new information. +- Given a notification is shown, when the user inspects it, then the related session, turn, task, or automation is identifiable. + +## Out of Scope + +- Notification transport, operating-system notification APIs, badge behavior, sound behavior, and client-specific presentation are not specified here. +- This requirement does not define scheduling semantics for automations or reminders. +- This requirement does not require every minor progress update to generate a notification. + +## Open Questions + +- Which events should produce external notifications versus in-client attention indicators?
+- Should notification preferences be configured globally, per workspace, per session, or per automation? +- What quiet-hours or do-not-disturb behavior should be supported? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/notify/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft approved for L1 expansion. | diff --git a/specs/L1/L1-REQ-REVIEW-001-code-review.md b/specs/L1/L1-REQ-REVIEW-001-code-review.md new file mode 100644 index 00000000..494dae3f --- /dev/null +++ b/specs/L1/L1-REQ-REVIEW-001-code-review.md @@ -0,0 +1,79 @@ +--- +artifact_id: L1-REQ-REVIEW-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-REVIEW-001 — Code Review + +## Purpose + +Ensure that the program can act as a code reviewer when the user asks for review rather than implementation. + +## Why This Matters + +Review mode has a different contract from implementation mode. Users need prioritized risks, concrete evidence, and test gaps without the program quietly changing code or burying findings under summaries. + +## Background / Context + +Review work has a different user expectation from implementation work. The user expects bugs, regressions, risks, missing tests, and unclear behavior to be identified before summaries or praise. + +The program should treat review as a first-class product workflow with clear findings and actionable evidence. + +## User / Business Requirement + +The program must support code review workflows that prioritize concrete findings, severity, evidence, and test gaps. + +## Real User Scenarios + +- A user asks the program to review a diff before merging it and expects bugs or regressions to appear first. 
+- A user asks whether a change has test gaps and expects the program to identify missing verification without editing files. + +## Functional Requirements + +- The program must identify whether the user is asking for review rather than direct code changes. +- The program must inspect the relevant code, diff, branch, commit, or pull request context before producing findings. +- Review output must lead with findings ordered by severity. +- Each finding must include enough location and reasoning for the user to evaluate it. +- If no issues are found, the program must state that clearly and identify any remaining verification gaps or residual risk. + +## Non-Functional Requirements + +- Review output must avoid noise, unsupported claims, and broad style commentary unless it affects correctness or maintainability. +- Review findings must be grounded in observable code or behavior. +- The program must not modify code during a review unless the user explicitly requests fixes. + +## Acceptance Criteria + +- Given a user asks for a code review, when the program responds, then findings appear before summary information. +- Given a finding is reported, when the user inspects it, then the response includes a file or code location and an explanation of the risk. +- Given no findings are found, when the program responds, then it says so and mentions any relevant test or verification gaps. +- Given the review scope is ambiguous, when the program cannot infer the target, then it asks or states the reviewed scope before producing findings. +- Given the user asks only for review, when issues are found, then the program does not edit files unless the user asks for fixes. + +## Out of Scope + +- The program does not define pull request provider integration, inline comment publication, or automated security-scanning phases in this L1 requirement. +- This requirement does not make the program a substitute for human review or project ownership. 
+ +## Open Questions + +- Should review severity labels be standardized across all review workflows? +- Should the program support separate review modes for correctness, security, performance, and product behavior? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/review/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-SEC-001-security-mode.md b/specs/L1/L1-REQ-SEC-001-security-mode.md new file mode 100644 index 00000000..b0b7016f --- /dev/null +++ b/specs/L1/L1-REQ-SEC-001-security-mode.md @@ -0,0 +1,83 @@ +--- +artifact_id: L1-REQ-SEC-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-SEC-001 — Security Mode + +## Purpose + +Support authorized security engagements as a first-class operating mode of the program. + +## Background / Context + +Security Mode is intended for authorized penetration testing engagements, software reverse engineering, vulnerability validation, web application penetration testing, and malware analysis. These activities need different instructions, tools, safety expectations, evidence handling, and reporting expectations from ordinary coding work. + +Security Mode must remain bounded by authorization, scope, user control, and controlled execution requirements. Malware analysis is included in Security Mode, but suspected malware must not be dynamically executed on the host environment. Any behavior observation or execution of suspected malware must require a configured controlled environment such as a virtual machine, sandbox, isolated lab, or equivalent controlled analysis environment. 
+ +## User / Business Requirement + +The program must provide a Security Mode for authorized security work while enforcing scope, safety, permission, evidence, and controlled-environment requirements. + +## Functional Requirements + +- The program must support Security Mode as a user-visible operating mode. +- Security Mode must support authorized penetration testing engagements, software reverse engineering, vulnerability validation, web application penetration testing, and malware analysis under one mode contract. +- Security Mode must make authorization and engagement scope visible or request clarification when scope is missing or ambiguous. +- Security Mode must support security-oriented tools, skills, MCP integrations, instructions, and reporting expectations where configured. +- Security Mode must preserve ordinary tool validation, permission, approval, sandbox, privacy, and audit behavior. +- Security Mode must distinguish static analysis of suspicious artifacts from dynamic execution of suspected malware. +- Security Mode must prevent suspected malware from being dynamically executed on the host environment. +- Security Mode must require a configured controlled environment before behavior observation, detonation, or execution of suspected malware can occur. +- Security Mode must report when a requested security action cannot proceed because authorization, scope, permissions, tools, or controlled environment requirements are missing. +- Security Mode final responses should preserve security-relevant evidence, assumptions, limitations, findings, and remediation guidance where applicable. + +## Non-Functional Requirements + +- Security Mode must fail closed when authorization, scope, or controlled-environment state is ambiguous. +- Security Mode must keep safety decisions explainable to the user. +- Security Mode must keep evidence and tool activity auditable. +- Security Mode must not weaken the user's privacy, permission, or workspace boundaries. 
+- Controlled-environment requirements must be clear enough that the user can understand why a host-side action was blocked. + +## Acceptance Criteria + +- Given Security Mode is active, when the user inspects session state, then the client identifies that the session is in Security Mode. +- Given the user requests security work without clear authorization or scope, when the program needs that information to proceed safely, then it asks for clarification or reports the missing requirement. +- Given Security Mode uses configured security tools or integrations, when the user inspects effective configuration or tool activity, then the relevant security capabilities are visible. +- Given a suspected malware sample is provided, when the user requests static analysis that does not execute the sample, then the program may proceed within ordinary safety and permission boundaries. +- Given a suspected malware sample is provided, when the user requests dynamic execution or behavior observation, then the program blocks host execution unless a controlled environment is configured and active. +- Given no controlled environment is configured, when malware execution is requested, then the program reports the missing controlled environment instead of executing on the host. +- Given Security Mode produces findings or conclusions, when the final response is presented, then relevant evidence, assumptions, limitations, and remediation guidance are included where applicable. + +## Out of Scope + +- This requirement does not define specific penetration testing methodology, reverse engineering workflow, vulnerability scoring scheme, malware sandbox implementation, or controlled-environment provider. +- This requirement does not define formal subdivisions inside Security Mode. +- This requirement does not authorize activity outside user-approved scope or applicable policy. 
+- This requirement does not require suspected malware to be executed; static analysis and refusal to execute without a controlled environment are valid outcomes. + +## Open Questions + +- What minimum authorization and engagement-scope fields should Security Mode require before high-risk work begins? +- What controlled-environment signals are sufficient before suspected malware dynamic execution may proceed? +- Which security tools, skills, and MCP integrations should be included in the first Security Mode milestone? +- How should Security Mode evidence be persisted, exported, or redacted? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/sec/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved user requirement. | diff --git a/specs/L1/L1-REQ-TOOL-001-safety.md b/specs/L1/L1-REQ-TOOL-001-safety.md new file mode 100644 index 00000000..a6a51ca5 --- /dev/null +++ b/specs/L1/L1-REQ-TOOL-001-safety.md @@ -0,0 +1,73 @@ +--- +artifact_id: L1-REQ-TOOL-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-TOOL-001 — Tool Output Safety + +## Purpose + +Prevent tool outputs from exposing secrets or unsafe content to model context, logs, or users unintentionally. + +## Why This Matters + +Tools can surface arbitrary local or remote content. Without output safety, a useful tool call can accidentally leak credentials into model context, logs, transcripts, or external systems. + +## Background / Context + +Tools can read files, run commands, fetch web content, and return arbitrary output. Outputs may include credentials or sensitive data. 
+ +## User / Business Requirement + +The program must sanitize tool outputs when necessary before exposing them to model context or persistent records. + +## Real User Scenarios + +- A shell command prints an API key, and the program redacts it before sending output to the model. +- A fetched page contains sensitive tokens, and the program avoids storing or replaying them as ordinary context. + +## Functional Requirements + +- The program must detect likely secrets in tool output where feasible. +- The program must redact or withhold sensitive content before model exposure where required. +- The program must make redaction understandable to the user. +- The program must apply safety processing consistently across built-in and external tools where possible. + +## Non-Functional Requirements + +- Redaction must not rely solely on model judgment. +- Safety processing must avoid logging plaintext secrets. + +## Acceptance Criteria + +- Given tool output containing a likely API key, when the output is prepared for model context, then the secret is redacted or excluded. +- Given redacted output, when the user reviews the transcript, then the user can tell that redaction occurred. +- Given output safety removes content, when the model continues, then it receives a clear indication that data was withheld or redacted. +- Given a tool output is persisted, when safety rules apply, then the persisted representation avoids plaintext exposure where required. + +## Out of Scope + +- This L1 requirement does not define exact secret patterns, the redaction engine, or the policy implementation. +- This requirement does not guarantee perfect detection of all sensitive data. + +## Open Questions + +- Which secret classes must be detected in the first milestone? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/tool/TBD.md | L2 design has not been authored yet.
| + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-TOOL-002-tools.md b/specs/L1/L1-REQ-TOOL-002-tools.md new file mode 100644 index 00000000..41f0e1cd --- /dev/null +++ b/specs/L1/L1-REQ-TOOL-002-tools.md @@ -0,0 +1,94 @@ +--- +artifact_id: L1-REQ-TOOL-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-TOOL-002 — Built-In Tools + +## Purpose + +Define the baseline tool capabilities users expect from the program. + +## Why This Matters + +Tools are how the program inspects projects, changes files, runs commands, searches, asks users, and interacts with external content. A clear baseline tells users what work the program can perform. + +## Background / Context + +Coding-agent workflows require file operations, command execution, search, web access, planning, approval, Plan Mode clarification questions, and delegated work. + +## User / Business Requirement + +The program must provide a baseline set of built-in tools for coding-agent workflows. + +## Real User Scenarios + +- A user asks the program to inspect a bug, and it reads files, searches references, edits code, and runs tests. +- A user asks the program to interact with a long-running process and expects stdout, stderr, and stdin interaction to be handled visibly. + +## Functional Requirements + +- The program must support reading, writing, and editing files. +- The program must support command execution, background process execution, and process stdin interaction. +- The program must expose current background processes started by command execution tools and provide a manual stop path for those processes. +- The program must support file-name search and content search.
+- The program must support planning, approval requests, Plan Mode clarification questions, web fetch, web search, and subagent coordination where enabled. +- The question tool must be reserved for Plan Mode and must not be invoked during Normal Mode. +- The program should support explicit parallel tool orchestration through `multi_tool_use` where enabled. +- Tools that require user or environment configuration, including web search, must expose clear configuration and unavailable-state behavior. + +## Non-Functional Requirements + +- Tool use must be visible, auditable, and subject to safety policy. +- Tool outputs must be bounded and understandable. + +## Acceptance Criteria + +- Given a request to inspect project files, when the program uses built-in tools, then it can search and read relevant files. +- Given a request requiring user approval, when a tool exceeds permission boundaries, then the approval workflow is used before execution. +- Given a background command is launched, when it remains running, then the program exposes process state and output access. +- Given a background command remains running, when the user views current background processes, then the user can identify and manually stop that process. +- Given a tool is unavailable or disabled, when the model requests it, then the program reports the capability gap instead of fabricating a result. +- Given Normal Mode is active, when the model requests the question tool, then the program blocks or rejects that request. +- Given Plan Mode is active, when the agent needs clarification, then the question tool is available where enabled. +- Given a configured tool such as web search is enabled, when the program uses that tool, then the effective configuration path is respected. +- Given explicit parallel tool orchestration is enabled, when `multi_tool_use` is invoked, then the listed tool calls are treated as an explicit parallel group. 
+ +## Out of Scope + +- This L1 requirement does not define exact tool names, schemas, backend implementation, or provider tool-call mappings. +- This requirement does not require every possible external capability to be built in. + +## Open Questions + +- Which built-in tools are mandatory for the first usable milestone? +- Which built-in tools require explicit configuration before first use? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-TOOL-005 | 1 | specs/L1/L1-REQ-TOOL-005-background-process-management.md | Background process management refines user-visible control over command execution processes. | +| related-to | L1-REQ-AGENT-005 | 1 | specs/L1/L1-REQ-AGENT-005-plan-mode.md | Plan Mode restricts question-tool availability to planning-only clarification. | +| refined-by | L2-DES-TOOL-001 | 1 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | Defines built-in tool categories, lifecycle, registry behavior, mode gating, and the plan tool. | +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | The execution engine dispatches model-requested tools. | +| related-to | L2-DES-AGENT-002 | 1 | specs/L2/agent/L2-DES-AGENT-002-interrupt-resume-control.md | Interrupt and resume control active tool and background process work. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Protocol events expose tool calls, plan updates, and background process state. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Durable records preserve tool calls, tool results, and plan state. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement.
| +| 1 | 2026-05-21 | Human | Refinement | Added configurable-tool behavior for web search and other configured tools. | +| 1 | 2026-05-21 | Human | Refinement | Added explicit `multi_tool_use` parallel orchestration capability. | +| 1 | 2026-05-21 | Human | Refinement | Added current background process visibility and manual stop requirements. | +| 1 | 2026-05-21 | Human | Refinement | Reserved the question tool for Plan Mode and blocked it in Normal Mode. | +| 1 | 2026-05-22 | Human | Traceability | Linked built-in tools to the L2 tool system design. | diff --git a/specs/L1/L1-REQ-TOOL-003-web-search-configuration.md b/specs/L1/L1-REQ-TOOL-003-web-search-configuration.md new file mode 100644 index 00000000..9f7b53fb --- /dev/null +++ b/specs/L1/L1-REQ-TOOL-003-web-search-configuration.md @@ -0,0 +1,76 @@ +--- +artifact_id: L1-REQ-TOOL-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-TOOL-003 — Web Search Configuration + +## Purpose + +Ensure that web search is available as a critical tool capability and that users can configure how the program performs web search. + +## Background / Context + +Web search is a critical component for agentic work that depends on current external information, documentation, vendor behavior, ecosystem state, or public web references. + +Different execution paths may be appropriate for different users and environments. Some model providers offer cloud-based web search services. Other users may prefer or require a locally configured search path using services such as DuckDuckGo, Tavily, Google, or another search provider. The exact implementation details for local search paths are not settled in this L1 requirement, but the program capability should be prioritized. + +## User / Business Requirement + +The program must support configurable web search execution so the user can choose or understand which web search path is used. 
+ +## Functional Requirements + +- The program must treat web search as a first-class tool capability where enabled. +- The user must be able to configure how web search is executed. +- Web search configuration must support cloud-based provider search where available, such as search services exposed by model providers. +- Web search configuration should support local or independently configured search paths where available, such as DuckDuckGo, Tavily, Google, or another search provider. +- The program must make the currently effective web search configuration visible to the user. +- If web search is unavailable, disabled, or misconfigured, the program must report that state clearly instead of pretending search results exist. +- Web search execution must respect the same safety, permission, privacy, and observability requirements as other tools. + +## Non-Functional Requirements + +- Web search configuration must be durable across restarts where configured as a persistent preference. +- Web search behavior must be auditable enough for the user to understand which search path produced a result. +- Provider-specific search behavior must not prevent the program from supporting alternative search paths. +- Search configuration errors must be actionable. + +## Acceptance Criteria + +- Given web search is enabled through a cloud-based model provider search service, when the program needs current web information, then it can use that configured search path. +- Given web search is configured through a local or independently configured search provider, when the program needs current web information, then it can use that configured search path where available. +- Given multiple web search paths are available, when the user inspects configuration, then the user can identify which path is active. 
+- Given web search is disabled, unavailable, or missing required credentials, when a task requires web search, then the program reports the configuration gap rather than fabricating results. +- Given a web search result is used in a task, when the user reviews tool activity or diagnostics, then the program can identify the search path used. + +## Out of Scope + +- This requirement does not define exact web search provider protocols, ranking behavior, result schema, crawling behavior, or local search implementation details. +- This requirement does not require every possible search provider to be supported. +- This requirement does not require web search to bypass network, privacy, permission, or provider policy restrictions. + +## Open Questions + +- Which web search paths are mandatory for the first usable milestone? +- Which cloud-based model-provider search services should be supported first? +- Which local or independently configured search providers should be supported first? +- Should web search configuration be global, workspace-specific, session-specific, or overridable per turn? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/tool/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved user requirement. 
| diff --git a/specs/L1/L1-REQ-TOOL-004-parallel-tool-orchestration.md b/specs/L1/L1-REQ-TOOL-004-parallel-tool-orchestration.md new file mode 100644 index 00000000..341a205d --- /dev/null +++ b/specs/L1/L1-REQ-TOOL-004-parallel-tool-orchestration.md @@ -0,0 +1,78 @@ +--- +artifact_id: L1-REQ-TOOL-004 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-21 +--- + +# L1-REQ-TOOL-004 — Parallel Tool Orchestration + +## Purpose + +Define the user-visible behavior of explicit parallel tool orchestration. + +## Background / Context + +Some agent workflows benefit from invoking multiple independent tools at the same time. The program may expose a tool orchestration capability named `multi_tool_use` that lets the model request several tool calls as one explicitly parallel group. + +When `multi_tool_use` is invoked, the user's expectation is direct parallel execution of the listed tool calls. The program should not reinterpret the group as a request for the runtime to decide whether the listed calls are parallel-safe. However, `multi_tool_use` must not bypass the normal controls that apply to each underlying tool call. + +## User / Business Requirement + +The program must execute tool calls listed in `multi_tool_use` concurrently as requested, while still applying each tool's ordinary validation, permission, approval, sandbox, availability, and safety checks. + +## Functional Requirements + +- The program must support an explicit parallel tool orchestration capability where enabled. +- When the model invokes `multi_tool_use`, the program must schedule the listed tool calls for parallel execution. +- The program must not serialize, reorder, reject, or downgrade a `multi_tool_use` group solely because of additional runtime parallel-safety classification. 
+- Each underlying tool call inside `multi_tool_use` must still pass its ordinary schema validation, availability checks, permission checks, approval requirements, sandbox restrictions, and safety processing. +- `multi_tool_use` must not allow a tool call to bypass controls that would apply if that same tool were invoked directly. +- If an underlying tool call is blocked by ordinary validation, permission, approval, sandbox, availability, or safety behavior, that tool call must report the same kind of blocked or failed state it would report outside `multi_tool_use`. +- If a shell command is included in `multi_tool_use`, the shell command must be treated as an ordinary shell command invocation for validation and safety purposes, while still being scheduled in parallel as part of the group. +- The program must preserve user-visible results for each underlying tool call in the parallel group. + +## Non-Functional Requirements + +- Parallel orchestration behavior must be predictable: `multi_tool_use` means parallel execution, not runtime-selected serialization. +- Tool activity from a parallel group must remain auditable at the group level and at the individual tool-call level. +- Failures or blocked calls inside a parallel group must not hide successful sibling tool results. +- Parallel execution must not weaken existing safety, approval, permission, or sandbox guarantees. + +## Acceptance Criteria + +- Given `multi_tool_use` contains multiple valid and allowed tool calls, when it is invoked, then the program starts those tool calls concurrently. +- Given a tool call inside `multi_tool_use` requires approval, when approval is required by ordinary tool policy, then the tool call follows the normal approval behavior rather than bypassing it. +- Given a tool call inside `multi_tool_use` has invalid input, when the group is invoked, then that tool call is rejected or fails according to ordinary validation behavior. 
+- Given a shell command appears inside `multi_tool_use`, when the group is invoked, then the shell command is scheduled in parallel subject to ordinary shell command controls. +- Given one tool call in a parallel group fails or is blocked, when sibling calls complete successfully, then successful sibling results remain visible. +- Given the user reviews tool activity, when a parallel group was executed, then the user can identify both the group and each underlying tool result. + +## Out of Scope + +- This requirement does not define the exact wire format, tool schema, or provider mapping for `multi_tool_use`. +- This requirement does not require runtime read/write classification, dependency analysis, or resource-lock scheduling for `multi_tool_use`. +- This requirement does not define model prompting policy for when the model should choose `multi_tool_use`. +- This requirement does not make parallel execution a bypass around existing tool validation, safety, approval, permission, sandbox, or availability behavior. + +## Open Questions + +- Should partial failure in `multi_tool_use` be reported as a group-level failure, an item-level failure, or both? +- Should the client render `multi_tool_use` as a visible group separate from ordinary tool activity grouping? +- Should the program expose limits on the maximum number of tool calls allowed in one `multi_tool_use` request? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/tool/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved user requirement. 
| diff --git a/specs/L1/L1-REQ-TOOL-005-background-process-management.md b/specs/L1/L1-REQ-TOOL-005-background-process-management.md new file mode 100644 index 00000000..118018f9 --- /dev/null +++ b/specs/L1/L1-REQ-TOOL-005-background-process-management.md @@ -0,0 +1,81 @@ +--- +artifact_id: L1-REQ-TOOL-005 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-TOOL-005 — Background Process Management + +## Purpose + +Ensure that processes started by command execution tools remain visible and controllable when they continue running in the background. + +## Background / Context + +Command execution may start long-running or interactive processes. Some commands return an initial result while the underlying process continues running in the background. Users need to know which program-started processes are still active, inspect their state, and stop them manually when they are no longer needed. + +## User / Business Requirement + +The program must expose current background processes started by the program and let the user manually stop those processes from the client interface. + +## Functional Requirements + +- The program must track background processes started by command execution tools. +- The program must expose current background processes to the client interface while those processes remain active. +- The client interface must show enough process information for the user to identify the process, such as command label, process identifier, workspace or session association, runtime status, and recent output availability. +- The client interface must provide a manual stop action for a selected background process. +- When a user requests a background process stop, the program must update the client-visible process state to indicate whether the process is stopping, stopped, exited, or could not be stopped.
+- Background process output must remain accessible enough for the user to inspect what the process is doing or did before it stopped. +- A background process that continues after the originating turn completes must remain visible until it exits or is stopped by the user. + +## Non-Functional Requirements + +- Background process state must update promptly enough that users do not confuse an active process with a completed command. +- Stop controls must avoid terminating unrelated host processes. +- Process output display must be bounded so a noisy background process does not make the client unusable. +- The process list and stop action must be understandable without requiring users to inspect logs. + +## Acceptance Criteria + +- Given a command execution tool starts a long-running process, when the process remains active in the background, then the client interface shows it in the current background process list. +- Given a background process is shown, when the user inspects it, then the user can identify which command and workspace or session started it. +- Given a background process is producing output, when the user opens or inspects that process, then the user can access recent output or an available output view. +- Given the user manually stops a background process, when the stop request is accepted, then the client-visible state changes to stopping and eventually stopped or failed-to-stop. +- Given the originating turn has completed while a background process remains active, when the user views current background processes, then that process remains visible. +- Given a tracked background process exits on its own, when the client state updates, then the process is no longer presented as actively running. + +## Out of Scope + +- This requirement does not define platform-specific process termination signals, process-group behavior, or shell implementation details.
+- This requirement does not require the program to manage arbitrary host processes that it did not start. +- This requirement does not define the exact client layout, keybinding, command name, or visual design for the process list. +- This requirement does not guarantee that every external process can be stopped immediately or cleanly. + +## Open Questions + +- Should exited background processes remain visible for a short review window before disappearing from the current process list? +- Should background process state be restored after an application restart, or only while the process supervisor is still running? +- Should stopping a background process require confirmation when the process appears to own child processes or unsaved output? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-TOOL-002 | 1 | specs/L1/L1-REQ-TOOL-002-tools.md | Built-in command execution creates the background process lifecycle this requirement controls. | +| related-to | L1-REQ-AGENT-002 | 1 | specs/L1/L1-REQ-AGENT-002-interrupt-resume.md | Interrupt and resume behavior includes stopping running tools and background tasks. | +| related-to | L1-REQ-TUI-004 | 1 | specs/L1/L1-REQ-TUI-004-state-visibility.md | The TUI must expose current execution state, including current background process state. | +| related-to | L2-DES-AGENT-002 | 1 | specs/L2/agent/L2-DES-AGENT-002-interrupt-resume-control.md | Active work inspection and interruption include tracked background process state. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Protocol methods expose active work inspection and tracked background process stop requests. | +| refined-by | TBD | TBD | specs/L2/tool/TBD.md | L2 design has not been authored yet. 
| + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved user requirement. | +| 1 | 2026-05-22 | Human | Traceability | Linked background process management to agent interrupt/resume and protocol surfaces. | diff --git a/specs/L1/L1-REQ-TUI-001-composer.md b/specs/L1/L1-REQ-TUI-001-composer.md new file mode 100644 index 00000000..a4a7de6c --- /dev/null +++ b/specs/L1/L1-REQ-TUI-001-composer.md @@ -0,0 +1,78 @@ +--- +artifact_id: L1-REQ-TUI-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-TUI-001 — Composer + +## Purpose + +Define user expectations for the TUI input composer. + +## Why This Matters + +The composer is where users express tasks, corrections, and commands. Input behavior must be predictable so users do not accidentally submit when they intended to insert a newline or lose non-ASCII text. + +## Background / Context + +The composer is the primary place where users write prompts, commands, and multi-line task descriptions. + +## User / Business Requirement + +The TUI must provide a reliable and ergonomic composer for entering user input. + +## Real User Scenarios + +- A user writes a multi-line task description and submits it as one message. +- A user enters Chinese or other IME text and expects the composer to preserve it in supported terminals. + +## Functional Requirements + +- The composer must support normal text entry. +- The composer must support multi-line input. +- The composer must support submitting user input intentionally. +- The composer must support session-local input modes where input interpretation differs from normal chat input. +- The composer should support command entry and discovery where appropriate. + +## Non-Functional Requirements + +- Input behavior must be predictable across supported terminals. 
+- The composer must preserve non-ASCII and IME input where supported by the terminal. + +## Acceptance Criteria + +- Given the user enters multi-line text, when the user submits, then the full input is sent as one user message. +- Given the user uses non-ASCII input, when the terminal supports it, then the composer preserves the entered text. +- Given the user intends to insert a newline, when the required key sequence is supported, then the composer inserts a newline instead of submitting. +- Given the composer enters a non-default input mode, when the user submits input, then the TUI interprets the input according to that active mode. +- Given the user opens command entry, when command suggestions are available, then the composer makes them discoverable without replacing typed text unexpectedly. + +## Out of Scope + +- This L1 requirement does not define specific keybindings, terminal event handling, or composer rendering implementation. +- This requirement does not guarantee identical keyboard behavior in terminals that do not report the required input events. + +## Open Questions + +- Which submit and newline keybindings should be required across supported terminals? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-TUI-009 | 1 | specs/L1/L1-REQ-TUI-009-session-input-modes.md | Session input modes define how composer input interpretation changes during a session. | +| refined-by | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Defines composer layout, multi-line input, submission semantics, command discovery, Shell Mode, Plan Mode, and Unicode constraints. | +| related-to | L2-DES-CLIENT-001 | 1 | specs/L2/client/L2-DES-CLIENT-001-localization-readiness.md | Defines Unicode, IME, grapheme, and display-width constraints used by composer input.
| + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added support for session-local composer input modes. | diff --git a/specs/L1/L1-REQ-TUI-002-streaming.md b/specs/L1/L1-REQ-TUI-002-streaming.md new file mode 100644 index 00000000..bbcfb22e --- /dev/null +++ b/specs/L1/L1-REQ-TUI-002-streaming.md @@ -0,0 +1,73 @@ +--- +artifact_id: L1-REQ-TUI-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-TUI-002 — Streaming Rendering + +## Purpose + +Make live agent progress visible while a turn is running. + +## Why This Matters + +Streaming is how users know the program is actively working. Late or batch-only updates make tool execution and model output feel stuck even when work is progressing. + +## Background / Context + +Users need to see model text, reasoning summaries, tool starts, tool output deltas, and completion states as work progresses. + +## User / Business Requirement + +The TUI must render streaming model and tool progress in a timely, readable way. + +## Real User Scenarios + +- A user watches assistant text appear incrementally instead of waiting for the full response. +- A user sees a tool row appear when a tool starts, then sees output deltas before the tool completes. + +## Functional Requirements + +- The TUI must stream assistant text as it becomes available. +- The TUI must stream reasoning summaries where available and appropriate. +- The TUI must show tool calls when they start, update when output arrives, and complete when results are available. +- The TUI must render Markdown content in transcript and live output where supported. + +## Non-Functional Requirements + +- Streaming must feel responsive during normal operation. +- Streaming rendering must not corrupt transcript layout. 
+ +## Acceptance Criteria + +- Given streaming assistant text, when deltas arrive, then the TUI updates before the whole response completes. +- Given a running tool with output deltas, when output arrives, then the TUI shows progress before final completion. +- Given multiple tools run in parallel, when one tool starts or produces output, then that progress can appear before all parallel tools finish. +- Given streaming content includes Markdown, when it is displayed live, then the transcript remains readable and does not collapse into malformed layout. + +## Out of Scope + +- This L1 requirement does not define the frame scheduler, Markdown parser implementation, or internal event pipeline. +- This requirement does not require every provider to deliver streaming events with identical granularity. + +## Open Questions + +- What latency target should define acceptable streaming responsiveness? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-TUI-004 | 1 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | Defines live assistant, reasoning, tool, approval, question, background process, and Markdown streaming behavior. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-TUI-003-transcript.md b/specs/L1/L1-REQ-TUI-003-transcript.md new file mode 100644 index 00000000..a9faea37 --- /dev/null +++ b/specs/L1/L1-REQ-TUI-003-transcript.md @@ -0,0 +1,74 @@ +--- +artifact_id: L1-REQ-TUI-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-TUI-003 — Transcript + +## Purpose + +Provide a durable and readable record of the session inside the TUI. + +## Why This Matters + +The transcript is the user's audit trail.
It must make messages, tool work, approvals, errors, and final results reviewable without requiring the user to inspect raw logs. + +## Background / Context + +The transcript is how users review messages, reasoning summaries, tool calls, tool outputs, approvals, errors, and final results. + +## User / Business Requirement + +The TUI must provide a transcript that supports review, audit, and recovery of session activity. + +## Real User Scenarios + +- A user scrolls back to find the command output that explained a test failure. +- A user reviews a previous approval decision before allowing a similar action. + +## Functional Requirements + +- The transcript must display user messages and assistant responses. +- The transcript must display tool calls, tool outputs, approvals, questions, and errors where relevant. +- The transcript must preserve completed turn history after live rendering finishes. +- The transcript must support scrolling or review of previous content. + +## Non-Functional Requirements + +- Transcript layout must remain readable with long outputs and narrow terminal widths. + +## Acceptance Criteria + +- Given a completed tool call, when the user reviews the transcript, then the command or tool summary and result are visible. +- Given a long session, when the user scrolls back, then prior relevant messages remain reviewable. +- Given output is truncated or folded, when the user views the transcript, then the transcript indicates that not all content is shown inline. +- Given a turn fails, when the transcript is reviewed, then the error and last known task state are visible. + +## Out of Scope + +- This L1 requirement does not define exact cell rendering, folding behavior, or scroll implementation. +- This requirement does not require the transcript to display unlimited raw output inline. + +## Open Questions + +- Which transcript items should be collapsible by default?
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-TUI-004 | 1 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | Defines transcript cell types, durable/live reconciliation, scrolling review, folding, and failure display. | +| related-to | L2-DES-TUI-002 | 1 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | Defines transcript viewport placement in the modern TUI shell. | +| related-to | L2-DES-CLIENT-001 | 1 | specs/L2/client/L2-DES-CLIENT-001-localization-readiness.md | Defines Unicode and localized content preservation for transcript rendering. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-TUI-004-state-visibility.md b/specs/L1/L1-REQ-TUI-004-state-visibility.md new file mode 100644 index 00000000..787e8c22 --- /dev/null +++ b/specs/L1/L1-REQ-TUI-004-state-visibility.md @@ -0,0 +1,78 @@ +--- +artifact_id: L1-REQ-TUI-004 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-TUI-004 — State Visibility + +## Purpose + +Ensure that users can always understand what the TUI is currently doing. + +## Background / Context + +The TUI is the primary interactive surface for agent work. A turn may be idle, generating model output, running tools, waiting for approval, waiting for a user answer, interrupted, failed, or completed. If these states are not visible, users cannot decide whether to wait, interrupt, approve, retry, or inspect results. + +## User / Business Requirement + +The TUI must make the current execution state visible and understandable to the user. + +## Functional Requirements + +- The TUI must show when the program is idle and ready for input. 
+- The TUI must show when model output is being generated. +- The TUI must show when a tool is preparing, running, producing output, completed, failed, or waiting. +- The TUI must expose current background processes started by the program and provide access to their manual stop controls. +- The TUI must show active non-default session-local input modes such as Shell Mode and Plan Mode. +- The TUI must show when the program is waiting for approval or a user answer. +- The TUI must show when a turn has been interrupted, failed, or completed. +- The TUI must preserve important state transitions in the transcript where they are relevant for later review. + +## Non-Functional Requirements + +- State indicators must be concise enough to remain readable during normal work. +- State indicators must not obscure the composer, transcript, or active tool output. +- State transitions must be timely enough that users do not mistake active work for a frozen interface. + +## Acceptance Criteria + +- Given no turn is active, when the TUI is open, then the user can tell that input may be submitted. +- Given model output is streaming, when the turn is running, then the user can tell that generation is active. +- Given a tool is running, when the tool starts or produces output, then the user can tell which tool is active. +- Given a background process started by the program remains active, when the user views TUI state, then the user can identify that process and access the stop control. +- Given Shell Mode or Plan Mode is active, when the user views TUI state, then the active non-default input mode is visible. +- Given the program waits for approval or a user answer, when the user looks at the TUI, then the waiting reason is visible. +- Given a turn fails or is interrupted, when the user reviews the transcript, then the final state is visible. + +## Out of Scope + +- This requirement does not define exact symbols, colors, spinner frames, layout positions, or animation implementation. 
+- This requirement does not define the internal event model used to represent execution state. + +## Open Questions + +- Which states require persistent transcript entries, and which states should remain live-only? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-TOOL-005 | 1 | specs/L1/L1-REQ-TOOL-005-background-process-management.md | Background process management defines the current process state and manual stop behavior the TUI must expose. | +| related-to | L1-REQ-TUI-009 | 1 | specs/L1/L1-REQ-TUI-009-session-input-modes.md | Session input modes define Shell Mode and Plan Mode visibility in the TUI. | +| refined-by | L2-DES-TUI-004 | 1 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | Defines visible state mapping for idle, model generation, tool lifecycle, approvals, questions, interruptions, failures, completion, and background processes. | +| related-to | L2-DES-TUI-002 | 1 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | Defines shell regions that present current execution state. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Defines bottom status line labels for active input modes. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft approved for L1 expansion. | +| 1 | 2026-05-21 | Human | Refinement | Added current background process visibility and manual stop control requirements. | +| 1 | 2026-05-21 | Human | Refinement | Added visibility for active non-default session-local input modes. 
| diff --git a/specs/L1/L1-REQ-TUI-005-terminal-lifecycle-safety.md b/specs/L1/L1-REQ-TUI-005-terminal-lifecycle-safety.md new file mode 100644 index 00000000..1deb6f5a --- /dev/null +++ b/specs/L1/L1-REQ-TUI-005-terminal-lifecycle-safety.md @@ -0,0 +1,68 @@ +--- +artifact_id: L1-REQ-TUI-005 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-TUI-005 — Terminal Lifecycle Safety + +## Purpose + +Ensure that using and exiting the TUI does not leave the user's terminal in a broken or confusing state. + +## Background / Context + +The TUI may run inline or in an alternate-screen style. It may change terminal modes, render live regions, receive interrupts, and exit while work is active or recently completed. Users rely on the terminal scrollback and shell prompt after the TUI exits. + +## User / Business Requirement + +The TUI must enter, run, interrupt, and exit without corrupting terminal state or losing useful user-visible context. + +## Functional Requirements + +- The TUI must support safe startup from a normal terminal session. +- The TUI must restore terminal modes when it exits. +- The TUI must handle normal exit and interrupt-triggered exit consistently. +- Inline mode must preserve useful terminal scrollback where possible. +- The TUI must avoid leaving stale live-rendered regions that confuse the next shell prompt. +- If the TUI cannot restore or clean up terminal state completely, it must make the failure understandable to the user where possible. + +## Non-Functional Requirements + +- Terminal lifecycle behavior must be reliable across supported terminal environments. +- Exit behavior must prioritize terminal usability over decorative rendering. +- Safe cleanup must not depend on fragile shell prompt positioning assumptions. + +## Acceptance Criteria + +- Given the TUI exits normally, when control returns to the shell, then the terminal accepts input normally. 
+- Given the user interrupts the TUI, when cleanup completes, then terminal modes are restored. +- Given inline mode has displayed transcript content, when the TUI exits, then useful scrollback above the live region remains available where the terminal supports it. +- Given active live content exists at exit time, when the shell prompt returns, then stale TUI rendering does not obscure the prompt. + +## Out of Scope + +- This requirement does not define terminal escape sequences, prompt-row math, crossterm behavior, or alternate-screen implementation. +- This requirement does not guarantee identical behavior in unsupported terminal emulators. + +## Open Questions + +- Which terminal environments are included in the required support matrix? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-TUI-005 | 1 | specs/L2/tui/L2-DES-TUI-005-terminal-lifecycle-safety.md | Defines startup, inline mode, alternate-screen mode, interrupt handling, cleanup, terminal restore, stale-region prevention, and shell prompt handoff. | +| related-to | L2-DES-TUI-002 | 1 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | Defines the live TUI region that lifecycle cleanup must manage. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft approved for L1 expansion. 
| diff --git a/specs/L1/L1-REQ-TUI-006-command-discovery-control.md b/specs/L1/L1-REQ-TUI-006-command-discovery-control.md new file mode 100644 index 00000000..93e426a8 --- /dev/null +++ b/specs/L1/L1-REQ-TUI-006-command-discovery-control.md @@ -0,0 +1,73 @@ +--- +artifact_id: L1-REQ-TUI-006 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-TUI-006 — Command Discovery and Control + +## Purpose + +Ensure that users can discover and invoke TUI commands without memorizing hidden behavior. + +## Background / Context + +The TUI is expected to expose commands for session control, configuration, model selection, theme changes, goals, interrupts, approvals, and other product workflows. Users need a discoverable command surface that works during normal interactive use. + +The `/model` command is the post-onboarding TUI command for changing the current session model and reasoning effort where the selected model supports reasoning. + +## User / Business Requirement + +The TUI must provide a discoverable command interface for controlling product workflows from the terminal. + +## Functional Requirements + +- The user must be able to discover available commands from within the TUI. +- The user must be able to invoke commands intentionally from the composer or another visible command surface. +- Command discovery must include enough names or descriptions for users to choose the right command. +- Commands that are unavailable during active work must provide clear feedback instead of silently failing. +- Commands that affect goals, sessions, configuration, model selection, theme, approval, or interruption must be represented consistently with the related product requirements. +- The TUI must provide a `/model` command for the model-selection workflow. 
+- The `/model` command must open a selection flow that begins with model selection and then offers reasoning effort selection when the chosen model supports reasoning. + +## Non-Functional Requirements + +- Command discovery must not disrupt typed user input unexpectedly. +- Command feedback must be concise and visible in the TUI. +- The command surface must remain usable without requiring users to read implementation documentation. + +## Acceptance Criteria + +- Given the user opens command discovery, when commands are available, then the TUI lists relevant command options. +- Given the user invokes a command, when the command is valid, then the command takes effect or reports the next required step. +- Given the user invokes a command that is blocked during active generation, when the command cannot run, then the TUI explains why. +- Given the user starts typing a command, when suggestions are shown, then existing composer text is not lost unexpectedly. +- Given the user invokes `/model`, when the command opens, then the TUI presents the model-selection workflow. +- Given the user selects a reasoning-capable model through `/model`, when reasoning effort selection is needed, then the TUI presents supported reasoning effort choices for that model. + +## Out of Scope + +- Except for the required `/model` command, this requirement does not define exact command names, slash-command grammar, fuzzy matching algorithm, or keybindings. +- This requirement does not define the implementation of each command's underlying product workflow. + +## Open Questions + +- Which commands must be present in the first TUI milestone? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Defines slash-command trigger behavior, list height, keyboard navigation, row styling, and the initial command list. 
| + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft approved for L1 expansion. | +| 1 | 2026-05-22 | Human | Refinement | Added `/model` as the required post-onboarding TUI command for model and supported reasoning effort selection. | diff --git a/specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md b/specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md new file mode 100644 index 00000000..74df585b --- /dev/null +++ b/specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md @@ -0,0 +1,71 @@ +--- +artifact_id: L1-REQ-TUI-007 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-TUI-007 — Responsive Layout and Readability + +## Purpose + +Ensure that the TUI remains readable and usable across practical terminal sizes. + +## Background / Context + +Terminal users may run the program in narrow panes, full-screen terminals, split panes, or resized windows. The TUI includes header/status information, transcript content, live tool output, command suggestions, composer input, and a bottom status line. Layout failures can make the interface unusable even when the underlying agent is working correctly. + +## User / Business Requirement + +The TUI must adapt to practical terminal sizes without overlapping, truncating critical state incorrectly, or making input and output unreadable. + +## Functional Requirements + +- The TUI must preserve a usable composer area across supported terminal sizes. +- The TUI must keep transcript content readable with long lines, long outputs, and narrow widths. +- The TUI must avoid overlapping header, transcript, live output, command suggestions, composer content, and the bottom status line. +- The TUI must degrade gracefully when optional information does not fit. 
+- The TUI must make truncation, folding, or omission visible when important content cannot be shown inline. + +## Non-Functional Requirements + +- Layout behavior must remain stable during streaming updates and terminal resize events. +- Essential state must take priority over decorative or secondary information. +- The TUI must avoid visual jitter that prevents users from reading active content. + +## Acceptance Criteria + +- Given a narrow but supported terminal width, when the TUI renders, then composer input remains usable. +- Given long transcript output, when it is displayed, then it wraps, folds, or truncates in a way that remains understandable. +- Given the terminal is resized during a turn, when the TUI redraws, then visible regions do not overlap incoherently. +- Given optional header, status, or input-mode details do not fit, when the TUI renders, then essential task state remains visible. + +## Out of Scope + +- This requirement does not define exact breakpoints, layout algorithms, cell dimensions, or rendering primitives. +- This requirement does not require the TUI to support terminal dimensions too small for meaningful interaction. + +## Open Questions + +- What minimum terminal size should be considered supported? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-TUI-009 | 1 | specs/L1/L1-REQ-TUI-009-session-input-modes.md | Session input modes require a bottom status line below the composer. | +| refined-by | L2-DES-TUI-002 | 1 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | Defines responsive region priorities, narrow layout behavior, non-overlap rules, and graceful degradation. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Defines composer and bottom status line layout behavior. 
| +| related-to | L2-DES-TUI-004 | 1 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | Defines streaming and transcript rendering stability. | +| related-to | L2-DES-CLIENT-001 | 1 | specs/L2/client/L2-DES-CLIENT-001-localization-readiness.md | Defines display-width aware rendering for Unicode and localized text. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft approved for L1 expansion. | +| 1 | 2026-05-21 | Human | Refinement | Added bottom status line layout considerations for session input mode display. | diff --git a/specs/L1/L1-REQ-TUI-008-terminal-command-prefix.md b/specs/L1/L1-REQ-TUI-008-terminal-command-prefix.md new file mode 100644 index 00000000..995ce9bd --- /dev/null +++ b/specs/L1/L1-REQ-TUI-008-terminal-command-prefix.md @@ -0,0 +1,78 @@ +--- +artifact_id: L1-REQ-TUI-008 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-TUI-008 — Terminal Command Prefix + +## Purpose + +Define the TUI-only behavior for entering Shell Mode and executing terminal commands from composer input that begins with `!`. + +## Background / Context + +Terminal users need a fast way to run commands without leaving the TUI flow. In the TUI, a leading `!` is a compact terminal-oriented shortcut for entering Shell Mode and preparing command execution. This behavior is specific to the terminal client and should not be treated as a mandatory behavior for every client surface. + +## User / Business Requirement + +The TUI must recognize input beginning with `!` as a request to enter Shell Mode, execute the shell-mode input through the terminal command capability, and display the command result to the user. 
+ +## Real User Scenarios + +- A user types `!` in the TUI composer, sees Shell Mode become active, then enters a command and expects the command result to be returned in the TUI. +- A user wants to run a quick diagnostic command without switching away from the active TUI session. + +## Functional Requirements + +- In the TUI, if composer input begins with `!`, the TUI must enter Shell Mode rather than treating the input as a normal chat message. +- Shell Mode input must execute through the program's terminal command capability. +- The result of Shell Mode command execution must be returned and displayed in the TUI. +- The TUI must make it clear when Shell Mode is active. +- Terminal command execution from the TUI must respect workspace, safety, privacy, and permission boundaries. + +## Non-Functional Requirements + +- Prefix behavior must be predictable and must not silently execute commands from ambiguous input. +- Command output display must be bounded and readable in the TUI. +- The terminal command prefix must not make ordinary chat input fragile or surprising. + +## Acceptance Criteria + +- Given the TUI user enters input beginning with `!`, when the input is submitted, then the program executes the remaining text as a terminal command and returns the command result in the TUI. +- Given the TUI user enters `!` as the input prefix, when the composer renders the input state, then the user can tell Shell Mode is active rather than normal chat input. +- Given a `!` command would exceed permissions, when the action is invoked, then the program follows the applicable safety and approval behavior. +- Given a `!` command produces output, when the TUI displays the result, then the output is bounded enough to keep the TUI usable. + +## Out of Scope + +- This requirement does not require non-TUI clients to support leading `!` command execution. 
+- This requirement does not define the shell command execution backend, command parsing rules, quoting behavior, or process lifecycle implementation. +- This requirement does not define exact TUI layout, keybindings, colors, or rendering details. + +## Open Questions + +- Should whitespace before `!` still trigger TUI terminal command behavior? +- Should `!` commands require confirmation under restrictive permission modes? +- How should TUI users escape a leading `!` when they intend to send a normal chat message? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-CLIENT-004 | 1 | specs/L1/L1-REQ-CLIENT-004-prefixed-input-actions.md | Separates TUI-only terminal command prefix behavior from general client fuzzy-search prefix behavior. | +| related-to | L1-REQ-TUI-009 | 1 | specs/L1/L1-REQ-TUI-009-session-input-modes.md | Defines Shell Mode as a session-local TUI input mode and its status-line visibility. | +| related-to | L1-REQ-TOOL-002 | 1 | specs/L1/L1-REQ-TOOL-002-tools.md | Built-in command execution provides the underlying terminal command capability. | +| refined-by | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Defines leading `!` Shell Mode entry, escaping, one-shot command submission, result display, and safety boundaries. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved TUI-only terminal command prefix requirement. | +| 1 | 2026-05-21 | Human | Refinement | Clarified that entering the `!` prefix switches the TUI into Shell Mode. 
| diff --git a/specs/L1/L1-REQ-TUI-009-session-input-modes.md b/specs/L1/L1-REQ-TUI-009-session-input-modes.md new file mode 100644 index 00000000..ecece3d4 --- /dev/null +++ b/specs/L1/L1-REQ-TUI-009-session-input-modes.md @@ -0,0 +1,93 @@ +--- +artifact_id: L1-REQ-TUI-009 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-23 +--- + +# L1-REQ-TUI-009 — Session Input Modes + +## Purpose + +Define TUI session-local input modes and how active non-default input modes are shown to the user. + +## Background / Context + +The TUI composer may temporarily enter different input modes during a session. These input modes control how composer input is interpreted in the TUI. They are different from session-level agent modes such as Coding Mode and Security Mode, which are selected before or at session creation and remain locked for that session. + +The initial TUI input modes are Default Input Mode, Shell Mode, and Plan Mode. Default Input Mode is the normal chat input state. In the bottom status line, it is represented by the normal work-state label `Build` rather than a generic `Default` label. Shell Mode and Plan Mode are non-default input modes and must be visible while active. + +Plan Mode is not only a visual label. When active, it applies the agent-level Plan Mode behavior: the agent analyzes and plans without modifying files, and the question tool may be used for clarification only in this mode. + +## User / Business Requirement + +The TUI must support session-local input modes and display active non-default input modes in the bottom status line without confusing them with session-level agent modes. + +## Real User Scenarios + +- A user types `!` at the start of composer input and sees the TUI enter Shell Mode before submitting a terminal command. 
+- A user enters Plan Mode during a session and can tell from the bottom status line that composer input is currently plan-oriented rather than normal chat input. +- A user returns to Default Input Mode and sees the normal `Build` status label rather than an unnecessary `Default` label. + +## Functional Requirements + +- The TUI must support Default Input Mode as the normal composer input mode. +- The TUI must support Shell Mode for terminal-command input. +- The TUI must support Plan Mode for plan-oriented interaction governed by the agent-level Plan Mode behavior. +- Session-local input modes must be changeable during a session without changing the session-level agent mode. +- The TUI must provide a bottom status line below the bottom composer. +- When Shell Mode is active, the TUI must display the active mode label on the right side of the bottom status line. +- When Plan Mode is active, the TUI must display the active mode label on the right side of the bottom status line. +- When Default Input Mode is active, the TUI must display the normal `Build` status label and must not display a generic `Default` mode label. +- The TUI must avoid presenting Shell Mode or Plan Mode as Coding Mode, Security Mode, or any other session-level agent mode. +- The TUI must not present Plan Mode as permission to modify files. + +## Non-Functional Requirements + +- Input mode indicators must be concise and readable in normal terminal sizes. +- Input mode changes must be visible quickly enough that users can predict how submitted input will be handled. +- The bottom status line must not obscure composer input or transcript content. +- Mode labels must remain visually subordinate to the active composer while still being discoverable. + +## Acceptance Criteria + +- Given the TUI composer is in Default Input Mode, when the bottom status line renders, then the normal `Build` status label is shown and no generic `Default` mode label is shown. 
+- Given the user enters `!` as the input prefix, when the TUI recognizes the prefix, then the TUI enters Shell Mode. +- Given Shell Mode is active, when the bottom status line renders, then a Shell Mode label appears on the right side of the bottom status line. +- Given Plan Mode is active, when the bottom status line renders, then a Plan Mode label appears on the right side of the bottom status line. +- Given Plan Mode is active, when the agent responds to user input, then the Plan Mode file-modification prohibition applies. +- Given the session-level agent mode is Coding Mode or Security Mode, when the user switches TUI input mode, then the session-level agent mode remains unchanged. +- Given the bottom composer is visible, when the TUI renders the bottom status line, then the status line appears below the composer. + +## Out of Scope + +- This requirement does not define the exact commands, keybindings, labels, colors, or rendering implementation for entering or leaving Plan Mode. +- This requirement does not define the terminal command execution backend used by Shell Mode. +- This requirement does not allow TUI input modes to change the session-level agent mode. + +## Open Questions + +- What command or keybinding should enter and leave Plan Mode? +- Should Shell Mode exit automatically after a command is submitted? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-TUI-008 | 1 | specs/L1/L1-REQ-TUI-008-terminal-command-prefix.md | The `!` terminal command prefix enters Shell Mode. | +| related-to | L1-REQ-APP-013 | 1 | specs/L1/L1-REQ-APP-013-agent-modes.md | Session-local input modes must remain distinct from session-level agent modes. | +| related-to | L1-REQ-AGENT-005 | 1 | specs/L1/L1-REQ-AGENT-005-plan-mode.md | Agent Plan Mode defines planning-only behavior and question-tool restrictions. 
| +| refined-by | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Defines Default Input Mode, Shell Mode, Plan Mode, bottom status line labels, and Plan Mode submission behavior. | +| related-to | L2-DES-TUI-002 | 1 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | Defines the bottom status line region where non-default input modes are displayed. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved TUI session-local input mode requirement. | +| 1 | 2026-05-21 | Human | Refinement | Linked TUI Plan Mode to agent-level planning-only behavior and question-tool restrictions. | +| 1 | 2026-05-23 | Human | Refinement | Updated bottom status behavior to use `Build`, `Plan`, and `Shell` labels rather than hiding the normal work-state label. | diff --git a/specs/L1/L1-REQ-TUI-010-onboarding-ui.md b/specs/L1/L1-REQ-TUI-010-onboarding-ui.md new file mode 100644 index 00000000..adca08ae --- /dev/null +++ b/specs/L1/L1-REQ-TUI-010-onboarding-ui.md @@ -0,0 +1,115 @@ +--- +artifact_id: L1-REQ-TUI-010 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-TUI-010 — Onboarding UI + +## Purpose + +Ensure that first-time or incomplete model setup can be completed from the TUI without leaving the terminal workflow. + +## Why This Matters + +Users cannot start useful work until at least one supported model is configured for invocation. The TUI onboarding experience should make the required model setup path clear, searchable, recoverable, and compatible with credential handling expectations. + +## Background / Context + +The TUI is the initial client surface. When required model configuration is missing, the TUI should guide the user through model setup before normal session work begins. 
+ +The required TUI onboarding flow starts with supported model slug selection, then lets the user select an existing provider or add a provider. Adding a provider collects provider name, base URL, and API key. After provider selection or creation, the flow collects the model name expected by that provider, asks the user to select an invocation method, and finally asks for reasoning effort when the selected model supports reasoning. + +## User / Business Requirement + +The TUI must provide an onboarding UI for required model setup that guides the user through supported model selection, provider detail entry, invocation method selection, and reasoning effort selection where supported. + +## Real User Scenarios + +- A first-time user opens the TUI, searches supported model slugs, selects one, and then selects or adds a provider. +- A user adds a provider by entering provider name, base URL, and API key. +- A user enters the model name expected by the selected provider before choosing an invocation method. +- A user chooses the invocation method, such as OpenAI Chat Completions, OpenAI Responses, or Anthropic Messages, after entering the model name. +- A user selects a reasoning-capable model during onboarding and is asked to choose a supported reasoning effort before starting work. +- A user mistypes an API key or base URL and receives recoverable feedback without losing completed setup progress. + +## Functional Requirements + +- The TUI must start or offer onboarding when required model configuration is missing. +- TUI onboarding model setup must begin with supported model slug selection. +- The user must be able to search or filter supported model slugs during onboarding. +- After model slug selection, TUI onboarding must let the user select an existing provider or add a provider. +- When adding a provider, TUI onboarding must collect provider name, base URL, and API key where applicable. 
+- After provider selection or creation, TUI onboarding must collect the model name expected by that provider. +- After model name entry, TUI onboarding must let the user select an invocation method where applicable. +- Invocation method choices should include OpenAI Chat Completions, OpenAI Responses, and Anthropic Messages where available. +- When the selected model supports reasoning, TUI onboarding must let the user select a supported reasoning effort after invocation method selection. +- Every onboarding input field or selection popup must show a concise hint describing the current value the user is expected to provide. +- When the selected model does not support reasoning, TUI onboarding must continue without asking for reasoning effort. +- Credential entry must be treated as an explicit credential-handling flow rather than ordinary transcript input. +- The TUI must preserve completed onboarding fields when validation fails where it is safe and useful to do so. +- When required setup succeeds, the TUI must submit onboarding results for persistent configuration storage. +- The TUI must let the user complete onboarding and reach a usable session after required setup succeeds. + +## Non-Functional Requirements + +- Onboarding feedback must be concise and recoverable. +- The onboarding flow must not expose plaintext credentials in routine transcript, model list, model switcher, logging, or telemetry paths by default. +- The onboarding UI must remain usable with keyboard-driven terminal interaction. +- The onboarding flow should avoid making optional setup feel mandatory. + +## Acceptance Criteria + +- Given required model configuration is missing, when the TUI starts, then the TUI starts or offers onboarding before normal model invocation is attempted. +- Given TUI onboarding begins model setup, when supported models are available, then the user can search or filter supported model slugs. 
+- Given the user selects a supported model slug, when provider selection is required, then the TUI lets the user select an existing provider or add a provider. +- Given the user adds a provider, when provider details are required, then the TUI lets the user provide provider name, base URL, and API key where applicable. +- Given credential material is entered during onboarding, when the TUI handles it, then the credential entry is not treated as ordinary transcript input. +- Given provider selection or creation is complete, when model name is required, then the TUI lets the user enter the model name expected by that provider. +- Given model name entry is complete, when invocation method selection is required, then the user can choose a supported invocation method. +- Given the selected model supports reasoning, when invocation method selection is complete, then the user can choose a supported reasoning effort. +- Given the selected model does not support reasoning, when invocation method selection is complete, then the TUI continues without asking for reasoning effort. +- Given the user is entering or selecting an onboarding value, when that field or popup is active, then the TUI shows a concise hint describing the current value. +- Given validation fails during provider setup, when the user returns to setup, then previously completed safe fields remain available where possible. +- Given required onboarding setup succeeds, when the program restarts, then the model setup completed through the TUI is available from persistent configuration. +- Given required onboarding setup succeeds, when the user exits onboarding, then the TUI can start or continue a usable session. + +## Out of Scope + +- This requirement does not define exact popup layout, inline rendering style, vertical guide line rendering, colors, borders, keyboard shortcuts, focus order, validation timing, or final visual styling. 
+- This requirement does not define credential storage backend, provider-specific validation protocol, or provider request payloads. +- This requirement does not require all optional integrations, tools, telemetry choices, or preferences to be configured during TUI onboarding. + +## Open Questions + +- Should the TUI allow users to skip model setup when no invocable model is configured? +- Should the supported model list show unconfigured models, configured models, or both during onboarding? +- Which provider detail fields are mandatory for each supported invocation method? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| related-to | L1-REQ-MODEL-001 | 1 | specs/L1/L1-REQ-MODEL-001-config.md | Model configuration defines supported models, invocable models, credential status, and reasoning effort requirements. | +| related-to | L1-REQ-MODEL-002 | 1 | specs/L1/L1-REQ-MODEL-002-provider.md | Provider setup defines credential and provider availability behavior used by onboarding. | +| related-to | L1-REQ-MODEL-003 | 1 | specs/L1/L1-REQ-MODEL-003-onboard.md | Model onboarding defines the product-level setup requirement that this TUI UI presents. | +| related-to | L1-REQ-APP-010 | 1 | specs/L1/L1-REQ-APP-010-configuration.md | Configuration defines persistence and source precedence for onboarding results. | +| related-to | L1-REQ-APP-012 | 1 | specs/L1/L1-REQ-APP-012-privacy-data-ownership.md | Privacy and data ownership define credential-handling expectations. | +| refined-by | L2-DES-TUI-001 | 1 | specs/L2/tui/L2-DES-TUI-001-onboarding-ui-flow.md | L2 defines the concrete terminal UI flow, inline rendering, popup behavior, and ASCII layout. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial TUI onboarding UI requirement from approved onboarding model setup flow. 
| +| 1 | 2026-05-22 | Human | Refinement | Added an ASCII example that concretizes the supported-model popup, provider-detail form, and reasoning-effort popup sequence. | +| 1 | 2026-05-22 | Human | Refinement | Clarified that onboarding selects a model slug, closes the popup on Enter, and continues with inline model display, vertical separators, base URL, API key, and reasoning effort popup when supported. | +| 1 | 2026-05-22 | Human | Refinement | Added invocation method selection after API key entry, using the same searchable popup and close-on-confirm behavior as model and reasoning selection. | +| 1 | 2026-05-22 | Human | Refinement | Moved concrete popup, inline rendering, and ASCII layout design details to L2 while preserving the L1 user-facing onboarding contract. | +| 1 | 2026-05-22 | Human | Refinement | Changed onboarding to model-first, provider-select-or-add, provider-specific model name, invocation method, reasoning effort, with per-field hints. | +| 1 | 2026-05-22 | Human | Refinement | Added persistent configuration storage for successful TUI onboarding results. | diff --git a/specs/L1/L1-REQ-VERIFY-001-verification-workflow.md b/specs/L1/L1-REQ-VERIFY-001-verification-workflow.md new file mode 100644 index 00000000..6fd2b51a --- /dev/null +++ b/specs/L1/L1-REQ-VERIFY-001-verification-workflow.md @@ -0,0 +1,79 @@ +--- +artifact_id: L1-REQ-VERIFY-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-20 +--- + +# L1-REQ-VERIFY-001 — Verification Workflow + +## Purpose + +Ensure that work performed by the program is verified before it is presented as complete. + +## Why This Matters + +Verification is the difference between attempted work and trustworthy work. Users need to know what was checked, what failed, what was skipped, and what risk remains. + +## Background / Context + +Coding-agent work often changes files, runs tools, updates configuration, or diagnoses failures. 
Users need to know whether the result was actually checked, which checks were run, and what residual risk remains. + +Verification is a user-facing workflow, not only a CI concern. The program should connect implementation work with appropriate tests, checks, or explicit statements that verification was not possible. + +## User / Business Requirement + +The program must support a verification workflow for user-requested work and must report verification status clearly. + +## Real User Scenarios + +- A user asks for a bug fix and expects the relevant test or check to run before the program claims success. +- A user asks for a UI change where automated checks pass but manual visual verification is still required or explicitly skipped. + +## Functional Requirements + +- The program must identify relevant verification steps for tasks that change code, configuration, generated artifacts, or behavior. +- The program must run relevant checks when they are available and appropriate. +- The program must distinguish unit tests, integration tests, end-to-end tests, build checks, lint checks, static analysis, and manual verification requirements where relevant. +- The program must report checks that passed, failed, were skipped, or could not be run. +- The program must avoid claiming completion when required verification failed or was not performed. + +## Non-Functional Requirements + +- Verification reporting must be clear enough for the user to understand confidence and residual risk. +- Verification should prefer automated checks when available. +- Verification must not hide failures behind generic success messages. + +## Acceptance Criteria + +- Given a task that changes code, when the program finishes, then the final response states which verification steps were run and their results. +- Given a relevant check fails, when the program reports final status, then the failure is visible and the task is not represented as fully verified. 
+- Given verification cannot be run, when the program finishes, then the final response explains why and identifies the remaining risk. +- Given the user requested specific verification commands, when the program verifies the work, then those commands are run or the reason they were not run is stated. +- Given verification passes for only part of the change, when the program reports final status, then it distinguishes verified and unverified scope. + +## Out of Scope + +- This requirement does not define test framework selection, CI provider configuration, test discovery algorithms, or command execution implementation. +- This requirement does not guarantee that automated verification can prove every user-visible behavior. + +## Open Questions + +- Which task types require verification before the program may call the work complete? +- Should the program require user approval before running expensive or long verification commands? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/verify/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | diff --git a/specs/L1/L1-REQ-WORKSPACE-001-project-context.md b/specs/L1/L1-REQ-WORKSPACE-001-project-context.md new file mode 100644 index 00000000..960daffa --- /dev/null +++ b/specs/L1/L1-REQ-WORKSPACE-001-project-context.md @@ -0,0 +1,77 @@ +--- +artifact_id: L1-REQ-WORKSPACE-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-WORKSPACE-001 — Project Context + +## Purpose + +Ensure the program acts in the intended project and respects existing work. 
+ +## Why This Matters + +Most coding mistakes become expensive when the program works in the wrong directory, ignores local instructions, or overwrites user changes. Workspace awareness is the foundation for safe project work. + +## Background / Context + +Coding tasks depend on current workspace, repository status, local instructions, ignored paths, generated outputs, and user-created changes. Local instructions may be stored in recognized project instruction files discovered along the directory hierarchy from the project root to the current working directory. + +## User / Business Requirement + +The program must maintain and respect project context while performing coding work. + +## Real User Scenarios + +- A user runs the program in a repository with dirty files and expects unrelated changes to be preserved. +- A workspace contains local instructions, and the user expects the program to follow them during edits and verification. + +## Functional Requirements + +- The program must identify the current workspace or working directory. +- The program must consider local project instructions where present. +- The program must automatically discover and read recognized project instruction files where present and accessible. +- The program must inspect repository state before risky file or git operations. +- The program must distinguish relevant task changes from unrelated existing changes where possible. + +## Non-Functional Requirements + +- The program must avoid searching obvious generated or build-artifact paths during normal project search. +- Workspace boundary changes must be visible to the user. + +## Acceptance Criteria + +- Given a dirty repository, when the program edits files, then it does not claim unrelated pre-existing changes as its own. +- Given a task that requires access outside the workspace, when the program proceeds, then the user is informed of the boundary and reason. 
+- Given generated or build-output directories exist, when the program searches the project, then it avoids those paths unless they are relevant to the task. +- Given local project instructions exist, when the program begins work, then those instructions are considered as part of the workspace context. +- Given recognized project instruction files exist, when the program begins workspace-dependent work, then those files are discovered and read where accessible. + +## Out of Scope + +- This requirement does not define project-type detection, indexing implementation, or VCS abstraction. +- This requirement does not allow the program to ignore user-owned dirty changes for convenience. + +## Open Questions + +- Should one session support multiple workspaces? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/workspace/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-20 | Assistant | Initial | Initial draft with approved L1 refinement. | +| 1 | 2026-05-21 | Human | Refinement | Added automatic discovery and reading of recognized project instruction files. | +| 1 | 2026-05-22 | Human | Refinement | Removed hardcoded instruction-file list; discovery order is now defined by the project instruction file requirement. 
| diff --git a/specs/L1/L1-REQ-WORKSPACE-002-project-instruction-files.md b/specs/L1/L1-REQ-WORKSPACE-002-project-instruction-files.md new file mode 100644 index 00000000..a78afd22 --- /dev/null +++ b/specs/L1/L1-REQ-WORKSPACE-002-project-instruction-files.md @@ -0,0 +1,126 @@ +--- +artifact_id: L1-REQ-WORKSPACE-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Human +last_updated: 2026-05-22 +--- + +# L1-REQ-WORKSPACE-002 — Project Instruction Files + +## Purpose + +Ensure that the program automatically discovers and reads project instruction files along the directory hierarchy from the project root to the current working directory, without imposing arbitrary depth limits. + +## Background / Context + +Many coding projects include repository-local instruction files for agents or coding assistants. These files may describe coding style, safety expectations, verification commands, project structure, or workflow constraints. Users should not have to manually paste these instructions into every session. + +A project may organize its codebase into nested directories, each with its own local conventions. An instruction file in a parent directory should apply to work in child directories, while an instruction file closer to the current working directory should carry more specific, localized rules. + +The program should discover instruction files along the entire linear path from the project root to the current working directory. There is no artificial depth cap — if the current working directory is thirty levels deep, all thirty directories on the path should be checked. + +## User / Business Requirement + +The program must automatically discover, read, and apply recognized project instruction files across the entire directory hierarchy from the project root down to the current working directory. 
+ +## Functional Requirements + +### Discovery + +- The program must locate a project root by walking upward from the current working directory and stopping at the first ancestor that contains a recognized project-root marker. The default marker is a `.git` directory. The marker set should be configurable. +- Once the project root is found, the program must collect all directories on the linear path from the project root down to the current working directory, inclusive of both endpoints. +- For each directory on that path, the program must look for recognized instruction files. +- If the current working directory is not inside any project root (no marker found on any ancestor), the program must still check the current working directory itself for instruction files. +- The program must also check a user-level global directory (e.g., `~/.devo/`) for instruction files that apply across all projects. + +### Filename Priority Per Directory + +- In each directory, the program must check for instruction files in a fixed priority order: + 1. `AGENTS.override.md` + 2. `AGENTS.md` + 3. Additional configurable fallback filenames provided by the user. +- Only the highest-priority non-empty file found in a given directory is used. If a non-empty `AGENTS.override.md` exists, `AGENTS.md` and fallback filenames are not checked for that directory; an empty or whitespace-only file is treated as absent, so the next filename in priority order is checked instead. +- Configurable fallback filenames allow projects that already maintain instruction files for other assistants (e.g., `CLAUDE.md`, `PROMPT.md`) to work without duplication or migration. +- Fallback filenames are user-configured, not hardcoded. The program should provide a sensible default set. + +### Assembly And Ordering + +- Discovered instruction files must be assembled in order from the project root down to the current working directory: root-first, cwd-last. +- Global instruction files should appear before project-root instructions. 
+- Files discovered closer to the current working directory carry more localized rules and come later in the assembled context, nearer to the model's response generation. +- The total assembled content must be bounded to a configurable maximum size. Truncation must be indicated clearly rather than hidden. +- Instruction files that are empty or contain only whitespace should be treated as absent — they do not contribute to the assembled instructions and do not prevent lower-priority files from being discovered in the same directory. + +### Content Semantics + +- The assembled project instructions must be included in the instruction hierarchy used during model-context assembly. +- Project instruction files must never silently overwrite higher-priority instructions such as system-level safety constraints or explicit user-provided instructions in the current conversation. +- Project instruction files are instructions, not conversation turns. They belong to the instruction hierarchy, not the transcript. + +### Error And Unavailability Handling + +- If an instruction file cannot be read because it is missing, the program must treat this as normal — the file simply does not apply to that directory. +- If an instruction file cannot be read because it is inaccessible, too large, binary, or blocked by permissions, the program must produce a diagnostic that explains which file was affected and why, without exposing file contents that should remain private. +- If the total assembled instruction content exceeds the configured maximum size, the program must truncate and indicate truncation rather than silently dropping content. +- A single unreadable file on the path must not prevent discovery and reading of other instruction files. + +### Discovery Boundary + +- The program must not walk upward past the project root. 
+- The program must not walk into sibling directories, parent directories of the project root, or arbitrary filesystem locations outside the linear project-root-to-cwd path. +- If the project root cannot be determined (no marker found and no explicit root configured), the discovery scope is the current working directory only, plus the global user-level directory. + +### Refresh + +- When the current working directory changes during a session, the program must re-discover instruction files along the new path. +- When an instruction file on the active path is modified during a session, the program should detect the change and refresh the assembled instructions subject to reasonable detection latency. + +## Non-Functional Requirements + +- Instruction-file discovery must be predictable and auditable. The user must be able to understand which files were discovered, from which directories, and in what order. +- Discovery must not scan irrelevant directory trees, generated output directories, or large build-artifact paths. +- Large instruction files must not cause unbounded context growth. +- The discovery mechanism must be fast enough that it does not introduce noticeable latency during session startup or directory changes. + +## Acceptance Criteria + +- Given a workspace with a project root at level 0 and the current working directory at level 5, when the program assembles instructions, then instruction files from all six directories on the path are discovered in root-to-cwd order. +- Given a project root directory contains `AGENTS.md` and a child directory contains `AGENTS.override.md`, when the program assembles instructions, then the root contributes the `AGENTS.md` content and the child contributes the `AGENTS.override.md` content (the override replaces the default in that directory only, not the root). +- Given a directory contains both `AGENTS.override.md` and `AGENTS.md`, when the program checks that directory, then only `AGENTS.override.md` is used. 
+- Given a project has no `.git` directory or other project-root markers, when the program starts in a subdirectory, then only the current working directory and the global user-level directory are checked for instruction files. +- Given a configurable fallback filename is set to `CLAUDE.md`, when a directory contains `CLAUDE.md` but not `AGENTS.md` or `AGENTS.override.md`, then the `CLAUDE.md` content is used for that directory. +- Given a directory contains both `AGENTS.md` and `CLAUDE.md`, when the program checks that directory, then `AGENTS.md` is used and `CLAUDE.md` is not checked because a higher-priority file was found. +- Given an instruction file cannot be read due to a permission error, when the program assembles instructions, then a diagnostic is produced and discovery continues for remaining directories. +- Given the total assembled instruction content exceeds the configured maximum size, when the program assembles instructions, then content is truncated and the truncation is indicated. +- Given the user changes the current working directory during a session, when the next model context is assembled, then instruction files along the new path are discovered. +- Given the current working directory is thirty levels deep, when the program discovers instruction files, then all thirty directories on the path are checked without any depth-based cutoff. + +## Out of Scope + +- This requirement does not define exact file-size limits, truncation format, or serialization schema. +- This requirement does not define the exact mechanism for detecting file changes or refresh latency targets. +- This requirement does not require the program to follow instructions that conflict with higher-priority safety, system, or user instructions. +- This requirement does not require reading instruction files outside the linear project-root-to-cwd path or the configured permission boundary. 
+ +## Open Questions + +- What should the default set of configurable fallback filenames include (e.g., `CLAUDE.md`, `PROMPT.md`)? +- What is the appropriate default maximum size for assembled instruction content? + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | TBD | TBD | specs/L2/workspace/TBD.md | L2 design has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved user requirement. | +| 1 | 2026-05-22 | Human | Refinement | Replaced the "maximum depth of five levels" concept with linear ancestor-chain discovery. Defined per-directory filename priority (AGENTS.override.md, AGENTS.md, configurable fallbacks), first-match-per-directory behavior, root-to-cwd concatenation order, configurable fallback filenames for cross-assistant compatibility, global instruction file support, size bounding, and no artificial depth limit. | + diff --git a/specs/L2/agent/L2-DES-AGENT-001-execution-engine.md b/specs/L2/agent/L2-DES-AGENT-001-execution-engine.md new file mode 100644 index 00000000..52f2a5eb --- /dev/null +++ b/specs/L2/agent/L2-DES-AGENT-001-execution-engine.md @@ -0,0 +1,298 @@ +--- +artifact_id: L2-DES-AGENT-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-22 +--- + +# L2-DES-AGENT-001 — Agent Execution Engine + +## Purpose + +Define the server-side execution engine that carries an accepted `turn.submit` request through context assembly, model invocation, tool orchestration, durable recording, and terminal turn status. + +## Background / Context + +Existing L2 designs define important boundaries around the execution engine: + +- `L2-DES-APP-003` defines the client/server protocol envelope, request/response behavior, and live event delivery. 
+- `L2-DES-CONV-001` defines the durable session JSONL data model and replay records. +- `L2-DES-MODEL-001` defines model-provider binding and `ResolvedModelProfile` construction. + +Those designs do not define what actually happens inside the server after `turn.submit` is accepted and before `turn_completed`, `turn_failed`, or `turn_interrupted` is emitted. This document fills that gap. + +## Source Requirements + +- `L1-REQ-AGENT-001` requires a complete task execution workflow from user request to final outcome. +- `L1-REQ-CONV-002` requires observable and durable turn lifecycle behavior. +- `L1-REQ-CONV-003` requires explicit active-turn `steer` and `queue` handling. +- `L1-REQ-CONTEXT-001` requires useful model context management. +- `L1-REQ-CONTEXT-003` requires context compression near model limits. +- `L1-REQ-INPUT-001` requires attachments and multimodal input as first-class task context. +- `L1-REQ-LLM-001` requires token-efficient context construction. +- `L1-REQ-LLM-002` requires controlled model-requested tool use. +- `L1-REQ-LLM-003` requires model usage observability. +- `L1-REQ-LLM-004` requires persona and communication style handling. +- `L1-REQ-MODEL-001` requires model configuration and capability metadata. +- `L1-REQ-TOOL-002` requires baseline built-in tools for coding-agent workflows. +- `L1-REQ-TOOL-001` requires tool safety and redaction. +- `L1-REQ-GOAL-001` requires bounded autonomous Ralph Loop continuation around a durable objective. +- `L1-REQ-APP-002` requires persistence and recovery behavior. +- `L1-REQ-APP-011` requires actionable error recovery. +- `L2-DES-APP-003` defines protocol requests and notifications around the engine. +- `L2-DES-CONV-001` defines durable turn, item, context, and workspace change records. +- `L2-DES-MODEL-001` defines model-provider resolution used for invocation. +- `L2-DES-TOOL-001` defines the built-in tool registry, lifecycle, and plan tool. 
+ +## Design Requirement + +The server must own turn execution. Clients submit intent and observe canonical events, but the server is responsible for the execution state machine, context assembly, model calls, tool dispatch, persistence, and terminal outcome. + +The execution engine should be deterministic enough that durable records can explain what happened after replay, even though provider streams, external tools, and wall-clock timing are runtime effects. + +## Execution Boundary + +The execution engine starts after a client request has been accepted as an executable turn. + +Input boundary: + +- A session identifier or new-session request. +- Accepted user content parts and mentions. +- Current session metadata. +- Effective configuration and model selection. +- Active context state. +- Optional mode, permission, and reasoning overrides allowed by current policy. + +Output boundary: + +- Durable turn and item records. +- Provider usage records. +- Tool call and tool result records. +- Workspace change-set records where files changed. +- Context snapshot or compaction records where context changed. +- Server-client events for subscribed clients. +- Exactly one terminal turn outcome: completed, failed, or interrupted. + +The execution engine does not define the WebSocket transport, the exact JSONL wire format, provider-specific HTTP payloads, or individual tool schemas. Those are defined by adjacent L2/L3 designs. + +## Runtime Concepts + +Conceptual `TurnExecution` fields: + +- `turn_id` +- `session_id` +- `submitted_by_client_id` +- `submission_id` +- `status`: admitted, running, waiting, completed, failed, or interrupted. +- `phase`: admission, context_assembly, model_invocation, tool_dispatch, waiting_for_user, recording, finalization, or terminal. 
+- `user_item_id` +- `resolved_model_profile` +- `context_snapshot_id` +- `active_invocation_id` +- `active_tool_call_ids` +- `pending_approval_ids` +- `pending_question_ids` +- `workspace_change_set_id` +- `usage_accumulator` +- `interrupt_token` +- `created_at` +- `updated_at` + +Public client status should remain simpler than internal phase. For example, `context_assembly`, `model_invocation`, and `tool_dispatch` may all appear as `running`; approval and question waits appear as `waiting`; final outcomes appear as `completed`, `failed`, or `interrupted`. + +## Turn Admission + +The server should admit a submitted user message according to current session activity: + +- If the session has no active turn, the server creates a new `turn_started` record and starts execution. +- If a turn is active and the client submits `steer`, the server records a steer item associated with the active turn. +- If a turn is active and the client submits `queue`, the server records a queue item to execute after the active turn reaches a terminal state. +- If the client submits ordinary input while a turn is active without selecting an allowed mode, the server rejects or reclassifies according to `L1-REQ-CONV-003` and the protocol design. +- If an idempotent retry repeats a previously accepted client-generated message id, the server returns the original canonical ids instead of creating duplicate execution. + +Turn admission must persist the accepted input before model invocation begins. + +## Execution Flow + +The normal execution flow is: + +1. Persist accepted user input and `turn_started`. +2. Load or materialize the current session projection from durable state. +3. Resolve the active model binding into a `ResolvedModelProfile`. +4. Assemble model context from instructions, metadata, active context references, user content, mentions, attachments, and tool availability. +5. 
If context pressure requires compaction, perform or schedule context compression before the primary model request. +6. Start a model invocation using the resolved provider method. +7. Normalize provider stream events into internal runtime events. +8. Persist logical assistant, reasoning, tool-call, usage, and error records at durable granularity. +9. Broadcast coalesced server-client events for live display. +10. Validate model-requested tool calls. +11. Run approval, permission, and safety checks before risky tool execution. +12. Execute approved tool calls through the tool supervisor. +13. Record tool results, workspace changes, output redaction state, and safety notices. +14. Feed tool results back into the model context when the provider interaction continues. +15. Repeat model/tool cycles until the model produces a terminal assistant response or execution stops. +16. Persist terminal status and final usage/context state. +17. Start the next queued item if one is pending and policy permits. + +## Context Assembly + +Context assembly creates the model-visible request for one invocation. It should be explicit and auditable without treating all assembled content as transcript turns. + +Inputs may include: + +- Base instructions, active mode instructions, persona, and permission posture from session metadata. +- The active context object from `L2-DES-CONV-001`. +- Visible transcript items selected for the current context window. +- Summaries produced by context compaction. +- User content parts and mentions from the current turn. +- Attachment and multimodal content references. +- Tool schemas and tool availability. +- Internal persistent memory selected by core policy where supported. +- Current model capability and token-budget constraints. + +Context assembly should produce a context snapshot reference that can explain which durable records and metadata influenced the invocation. 
Provider-specific serialization, such as system, developer, user, assistant, or tool messages, is a request-building concern and does not convert metadata instructions into transcript turns. + +## Model Invocation + +The execution engine should treat provider calls as resumable runtime work around durable logical records: + +- Each model call receives an `invocation_id`. +- Provider request metadata should identify the `ResolvedModelProfile`, context snapshot, tool schema set, reasoning effort, and request options. +- Provider streaming deltas should be normalized into provider/core events first. +- The engine should coalesce provider deltas into durable item append records and live client updates according to `L2-DES-CONV-001` and `L2-DES-APP-003`. +- Usage received during or after the invocation should update turn and session usage records. +- Provider errors should become structured turn errors with enough recovery context for user-visible reporting. + +The engine should not expose provider-native event streams directly as the client protocol. + +## Tool Dispatch + +Tool dispatch is owned by the execution engine through a tool supervisor. + +For each model-requested tool call, the engine should: + +- Capture the model-provided command description for shell or command execution tools. +- Parse and validate structured tool arguments. +- Resolve the tool definition and capability policy. +- Apply permission, safety, approval, and redaction rules. +- Emit visible waiting state if user approval is required. +- Execute allowed tool calls with bounded output capture. +- Support explicit parallel tool groups where enabled. +- Record started, updated, completed, failed, denied, or canceled tool states. +- Return structured tool results with natural-language status summaries to the model when execution continues. 
+ +The plan tool is a normal server-owned tool from the execution engine's perspective, but its result updates visible plan state rather than external files or command output. + +Structured mutating tools such as `write` and `apply_patch` should report file changes into the core-owned per-turn workspace change set. Shell commands and background processes should report process state through the tool/process supervisor, with file-change attribution only when reliable checkpointing or attribution exists. + +## Progress Visibility + +The engine should make execution state visible through protocol events, not through client-side inference. + +Client-visible progress may include: + +- `turn_started` +- `turn_status_changed` +- `item_started` +- `item_content_update` +- `item_completed` +- `tool_call_started` +- `tool_call_updated` +- `tool_call_completed` +- `plan_updated` +- `turn_diff_updated` +- `usage_updated` +- `context_updated` +- `goal_updated` +- `goal_continuation_started` +- `error_reported` +- terminal turn status + +Durable records should be written before or atomically with corresponding canonical events where practical, so reconnecting clients can recover state after interruption or crash. + +## Goal Integration + +Goal-driven continuation turns use the same execution engine as user-submitted turns. The goal system may create hidden continuation input when the session is idle and the active goal is eligible, but once admitted, the turn follows normal context assembly, model invocation, tool dispatch, persistence, and terminal status rules. + +Rules: + +- The engine should expose usage, tool, and terminal-turn signals needed by the goal system for incremental budget accounting. +- Goal hidden context should be supplied by context assembly as metadata-derived model-visible context, not as a normal transcript item. +- The narrow model-facing goal update tool should be dispatched through the normal tool supervisor. 
+- If an active goal becomes paused, canceled, blocked, complete, or budget-limited during execution, the current turn may finish or interrupt according to runtime policy, but future autonomous continuation must stop. +- Plan Mode turns must not be launched as autonomous goal continuations. + +## Failure Handling + +The engine should classify failures by phase: + +- Admission failure. +- Context assembly failure. +- Model resolution failure. +- Provider invocation failure. +- Tool validation failure. +- Tool execution failure. +- Approval or question timeout. +- Persistence failure. +- Interruption. + +Recoverable failures should preserve completed records and expose a practical next action. Terminal failures should produce a `turn_failed` record rather than silently abandoning the turn. + +## Completion Semantics + +A turn is complete only after: + +- The final assistant-visible response item, if any, has been recorded. +- Required tool results have been recorded. +- Usage totals have been updated where known. +- Workspace change-set state has been finalized where file changes occurred. +- Context state has been updated or left unchanged explicitly. +- The terminal `turn_completed` record has been persisted. +- Subscribed clients have enough events to render the terminal state. + +The final user-facing response should summarize the outcome, changed files where relevant, verification performed, and unresolved work. + +## Invariants + +- At most one owner executes a given active turn. +- Accepted user input is durable before model invocation begins. +- A turn reaches exactly one terminal state. +- Clients observe server-confirmed state; they do not own execution state. +- Durable replay must recover completed, failed, and interrupted turn history. +- Tool calls cannot bypass validation, approval, or safety policy. +- File mutations are attributed to the turn when performed by structured tools or reliable checkpoints. 
+- Context assembly should avoid duplicating stable prefixes unnecessarily. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-AGENT-001 | 1 | specs/L1/L1-REQ-AGENT-001-execution-workflow.md | Defines the server-side execution engine that carries user intent to terminal task outcome. | +| related-to | L1-REQ-AGENT-002 | 1 | specs/L1/L1-REQ-AGENT-002-interrupt-resume.md | Interrupt and resume act on execution engine runtime state. | +| related-to | L1-REQ-AGENT-003 | 1 | specs/L1/L1-REQ-AGENT-003-task-planning.md | Visible plans are progress state layered on top of execution phases. | +| related-to | L1-REQ-AGENT-004 | 1 | specs/L1/L1-REQ-AGENT-004-subagents.md | Subagents depend on bounded delegated execution units. | +| related-to | L1-REQ-CONV-002 | 1 | specs/L1/L1-REQ-CONV-002-turn-lifecycle.md | Execution produces turn lifecycle states. | +| related-to | L1-REQ-CONV-003 | 1 | specs/L1/L1-REQ-CONV-003-active-turn-message-handling.md | Turn admission handles steer and queue behavior. | +| related-to | L1-REQ-CONTEXT-001 | 1 | specs/L1/L1-REQ-CONTEXT-001-management.md | Execution assembles active context for model invocation. | +| related-to | L1-REQ-LLM-002 | 1 | specs/L1/L1-REQ-LLM-002-tools.md | Execution validates and dispatches model-requested tools. | +| related-to | L1-REQ-TOOL-001 | 1 | specs/L1/L1-REQ-TOOL-001-safety.md | Tool dispatch applies safety and approval rules. | +| related-to | L1-REQ-TOOL-002 | 1 | specs/L1/L1-REQ-TOOL-002-tools.md | Execution dispatches built-in tools through the tool supervisor. | +| related-to | L1-REQ-GOAL-001 | 1 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | Goal-driven continuation turns execute through the normal engine and provide budget accounting signals. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Protocol requests and events expose execution state to clients. 
| +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Durable records persist execution state. | +| related-to | L2-DES-MODEL-001 | 1 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | Model resolution provides runtime invocation profiles. | +| related-to | L2-DES-TOOL-001 | 1 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | Defines tool registry, lifecycle, and plan tool behavior used by dispatch. | +| related-to | L2-DES-GOAL-001 | 1 | specs/L2/goal/L2-DES-GOAL-001-ralph-loop-goals.md | Defines autonomous goal continuation and model-facing goal update behavior layered on the engine. | +| specified-by | TBD | TBD | specs/L3/agent/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial server-side agent execution engine design. | +| 1 | 2026-05-22 | Human | Refinement | Linked execution tool dispatch to the built-in tool system and plan tool. | +| 1 | 2026-05-23 | Human | Refinement | Added goal-driven continuation integration and budget-accounting signal requirements. | diff --git a/specs/L2/agent/L2-DES-AGENT-002-interrupt-resume-control.md b/specs/L2/agent/L2-DES-AGENT-002-interrupt-resume-control.md new file mode 100644 index 00000000..0aa53a4f --- /dev/null +++ b/specs/L2/agent/L2-DES-AGENT-002-interrupt-resume-control.md @@ -0,0 +1,251 @@ +--- +artifact_id: L2-DES-AGENT-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-22 +--- + +# L2-DES-AGENT-002 — Interrupt And Resume Control + +## Purpose + +Define how the server interrupts, cancels, inspects, and resumes agent work running inside the execution engine. + +## Background / Context + +`L2-DES-AGENT-001` defines the normal execution path from accepted user input to terminal turn status. 
Long-running agent work may be in model generation, tool execution, approval waiting, question waiting, background process supervision, context compaction, or finalization when the user interrupts it. + +Interrupt and resume must be server-owned because multiple clients can observe or control the same session. A TUI, desktop client, or IDE client may initiate the interrupt, but the server owns canonical runtime state, tool cleanup, durable records, and resumed execution. + +## Source Requirements + +- `L1-REQ-AGENT-002` requires users to interrupt, cancel, inspect, and resume work where recovery is possible. +- `L1-REQ-AGENT-001` requires a complete execution workflow with visible task state. +- `L1-REQ-TOOL-005` requires background process inspection and manual stop behavior. +- `L1-REQ-CONV-002` requires observable and durable turn lifecycle behavior. +- `L1-REQ-APP-002` requires persistence and recovery behavior. +- `L1-REQ-APP-011` requires actionable error recovery. +- `L1-REQ-TOOL-001` requires safe tool execution and redaction. +- `L2-DES-AGENT-001` defines the execution engine being interrupted or resumed. +- `L2-DES-APP-003` defines client requests and server notifications. +- `L2-DES-CONV-001` defines durable turn and item records. + +## Design Requirement + +The server should provide timely user control over active execution while preserving completed work, partial outputs, file-change state, and enough context to resume when possible. + +Interrupting a turn should stop or transition active work into a safe state. It should not silently discard durable history, silently leave program-started background work running, or make clients guess whether execution is still active. + +## Control Actions + +The design distinguishes these conceptual actions: + +- `interrupt`: request that the active turn stop as soon as possible while preserving partial state. +- `cancel_tool`: request cancellation of one running tool call where the tool supports cancellation. 
+- `stop_background_process`: request termination of a tracked background process started by the program. +- `resume`: start a continuation from an interrupted turn with awareness of prior progress. +- `inspect_active_work`: return the active turn phase, running tool calls, pending prompts, and tracked background processes. + +The exact protocol method names are defined by `L2-DES-APP-003`, but the runtime semantics are owned by this design. + +## Interrupt Targets + +An interrupt request may target: + +- The active model invocation. +- The currently executing tool call or parallel tool group. +- The entire active turn. +- A tracked background process associated with the session or turn. +- A waiting approval or question prompt. + +The server should resolve the target into a runtime cancellation token, tool supervisor command, provider stream cancellation, or waiting-state transition. If the target is no longer active, the server should return an idempotent success or a structured stale-state error. + +## Interrupt Flow + +Conceptual interrupt flow: + +```text +Client sends interrupt request + ↓ +Server validates session, turn, target, and permissions + ↓ +Server records interrupt requested state + ↓ +Server signals provider/tool/process/waiting state + ↓ +Server drains or bounds partial output + ↓ +Server records final interrupted state + ↓ +Server broadcasts canonical turn/tool/process updates +``` + +The immediate client response confirms that interruption was accepted or rejected. It must not wait for every process cleanup action to finish. + +## Runtime State + +Conceptual interrupt state fields: + +- `interrupt_id` +- `session_id` +- `turn_id` +- `requested_by_client_id` +- `target_kind` +- `target_id` +- `interrupt_mode` +- `requested_at` +- `accepted_at` +- `status`: requested, stopping, interrupted, completed_before_interrupt, failed, or rejected. 
+- `cleanup_state` +- `message` + +Execution phases should map to interrupt behavior: + +| Phase | Expected Interrupt Behavior | +|---|---| +| Admission | Reject if no active execution exists or return stale success if already terminal. | +| Context assembly | Stop before provider invocation where possible and mark the turn interrupted. | +| Model invocation | Cancel or drop the provider stream, preserve partial content, and mark the turn interrupted. | +| Tool dispatch | Request tool cancellation where supported and record completed, failed, or canceled tool states. | +| Waiting for approval or question | Resolve the wait as interrupted or canceled and mark the turn interrupted. | +| Background process running after tool return | Keep process visible unless the user explicitly stops it or policy requires cleanup. | +| Finalization | If terminal status has already been persisted, return stale success with the existing terminal state. | + +## Provider Interruption + +When a model invocation is interrupted, the engine should: + +- Stop reading provider stream events where possible. +- Cancel the underlying HTTP request or provider stream where supported. +- Persist partial assistant or reasoning content already accepted by the engine. +- Record usage received before the interruption if available. +- Mark the active invocation as interrupted or canceled. +- Mark the turn as interrupted unless execution has already reached a terminal state. + +If the provider cannot be canceled cleanly, the server should stop forwarding additional output to the turn and record cleanup status separately. + +## Tool Interruption + +Tool interruption depends on tool capabilities: + +- Read-only or short-lived tools may complete before cancellation takes effect. +- Structured mutating tools should either complete atomically or report partial failure state. +- Command execution tools should attempt process-group or runtime-specific termination according to tool design. 
+- Background processes that outlive the originating tool call must remain visible until they exit or are stopped. +- Tool output already emitted before interruption remains part of the interrupted turn history. + +The engine should not claim a tool was stopped until the tool supervisor reports stopped, exited, failed-to-stop, or detached-visible state. + +## Active Work Inspection + +The server should expose enough active work state for clients to let users make informed stop decisions. + +Conceptual active work projection fields: + +- `session_id` +- `active_turn_id` +- `turn_status` +- `turn_phase` +- `active_invocation_id` +- `running_tool_calls` +- `pending_approvals` +- `pending_questions` +- `background_processes` +- `workspace_change_set_status` +- `last_event_sequence` + +This projection should be safe for client display and should not include plaintext secrets or unredacted sensitive tool output. + +## Durable Recording + +Interrupt behavior must be append-only from the durable session perspective. + +Durable records should preserve: + +- The interrupt request. +- The interrupted turn terminal state. +- Partial assistant, reasoning, tool call, and tool result items already accepted. +- Tool cancellation outcomes. +- Background process state or references. +- Workspace change-set state for file changes completed before interruption. +- Resume links if work is resumed later. + +The program should not remove partial records merely because a turn was interrupted. + +## Resume Semantics + +Resuming an interrupted task should create a continuation turn linked to the interrupted turn. The interrupted turn remains durable and terminal; the resume turn carries a `resume_of_turn_id` or equivalent provenance link. + +Resume context should include: + +- The original user request. +- Partial assistant output where useful. +- Completed tool calls and tool results. +- File-change summary or workspace change-set state. +- Background process state relevant to the task. 
+- Any user-provided resume instruction. +- Current session metadata and model selection. + +The resumed turn should use the normal execution engine from `L2-DES-AGENT-001`. It should not reinterpret already-executed model or tool work as if it had never happened. + +## Resume Eligibility + +A resume request should be accepted only when: + +- The target turn is interrupted or otherwise recoverable. +- The session still exists and can be opened. +- The workspace is available or the user has accepted degraded behavior. +- Required context records are available or a safe degraded context can be assembled. +- The requested resume does not conflict with an active turn unless it is queued or explicitly allowed. + +If context is missing, compacted, deleted, or unsafe to reuse, the server should reject the resume or start a new turn with an explicit warning that it cannot fully resume prior state. + +## Crash And Restart Recovery + +After process restart, durable replay should reconstruct completed and interrupted turn history. If replay finds a turn that was active without a terminal record when the server stopped, the program should mark it as interrupted, failed-recoverable, or recovery-required according to L3 policy before allowing resume. + +The server should not pretend that an in-flight provider stream or external process continued safely across a crash unless a supervisor can prove that state. + +## Client Behavior + +Clients may initiate interrupts, display active work, and initiate resume, but clients do not own the canonical state transition. + +Clients should: + +- Present immediate interrupt acknowledgement. +- Reconcile local display with server-confirmed status events. +- Show cleanup-pending state when tools or background processes have not stopped yet. +- Display partial work and file-change summaries after interruption. +- Initiate resume through server protocol rather than replaying local transcript content themselves. 
+ +## Invariants + +- Interrupt responses are timely and do not wait for all cleanup to finish. +- Completed records remain durable after interruption. +- A turn interrupted by the user reaches a visible terminal or cleanup-pending state. +- Background processes started by the program remain visible if they continue after interruption. +- Resume creates linked continuation state instead of mutating the interrupted turn in place. +- Resumed execution uses the normal execution engine and normal safety policy. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-AGENT-002 | 1 | specs/L1/L1-REQ-AGENT-002-interrupt-resume.md | Defines server-owned interrupt, active work inspection, and resume behavior. | +| related-to | L1-REQ-AGENT-001 | 1 | specs/L1/L1-REQ-AGENT-001-execution-workflow.md | Interrupt and resume act on the execution engine workflow. | +| related-to | L1-REQ-TOOL-005 | 1 | specs/L1/L1-REQ-TOOL-005-background-process-management.md | Background process inspection and stopping are part of interrupt control. | +| related-to | L1-REQ-CONV-002 | 1 | specs/L1/L1-REQ-CONV-002-turn-lifecycle.md | Interrupt and resume update visible turn lifecycle state. | +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | Defines the execution runtime being interrupted and resumed. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Protocol methods and events expose interrupt and resume control to clients. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Durable records preserve interrupted and resumed state. | +| specified-by | TBD | TBD | specs/L3/agent/TBD.md | L3 behavior has not been authored yet. 
| + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial interrupt and resume control design. | diff --git a/specs/L2/app/L2-DES-APP-001-memory-efficient-rust-data-models.md b/specs/L2/app/L2-DES-APP-001-memory-efficient-rust-data-models.md new file mode 100644 index 00000000..6f972916 --- /dev/null +++ b/specs/L2/app/L2-DES-APP-001-memory-efficient-rust-data-models.md @@ -0,0 +1,87 @@ +--- +artifact_id: L2-DES-APP-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-21 +--- + +# L2-DES-APP-001 — Memory-Efficient Rust Data Models + +## Purpose + +Refine the lightweight-operation requirement into technical design principles for Rust data models that may otherwise retain avoidable memory during large sessions or large deserialization workloads. + +## Background / Context + +Rust struct composition stores fields inline. For large composite structs, `Option` does not necessarily make the parent object small when the option is `None`; the parent may still reserve enough inline space for the large struct. This differs from reference-oriented object models where a nullable child object is usually represented by one pointer in the parent. + +The program may deserialize, retain, and traverse large collections of sparse objects such as session records, tool events, model metadata, workspace indexes, or cached external data. Sparse optional substructures can therefore create avoidable memory pressure even when most nested content is absent. + +## Design Requirement + +For large or frequently repeated Rust data structures, the program should avoid inline storage of sparse optional substructures when that storage creates meaningful memory overhead. 
When a nested structure is often absent or semantically empty, the design should consider `Option<Box<T>>` or another indirection strategy so the parent object stores only a pointer-sized optional value when no nested content exists. + +## Design Principles + +- Treat memory layout as part of data model design for large collections, long sessions, and deserialized external data. +- Prefer simple inline structs for small, dense, frequently accessed data where boxing would add unnecessary allocation or pointer chasing. +- Consider `Option<Box<T>>` for large nested structs that are often absent, all-default, or all-empty. +- When deserializing sparse nested structs, detect semantically empty values and avoid retaining boxed objects that carry no useful information. +- Implement emptiness checks as explicit domain behavior, such as an `is_empty` method, rather than relying on incidental serialization details. +- Keep serialization and deserialization behavior compatible with the public data contract when changing internal storage representation. +- Measure memory impact before and after optimization instead of relying only on `size_of::<T>()`. +- Treat extra CPU cost, heap fragmentation, and pointer-chasing overhead as tradeoffs that must be justified by memory savings. + +## Applicability + +This design applies when all of the following are true: + +- A Rust data type is stored many times, retained across turns, or loaded from a large data source. +- The type contains nested structs that are often semantically empty. +- Inline representation creates measurable or reasonably expected memory pressure. +- Boxing or another indirection strategy does not make the user-visible workflow slower or less reliable overall. + +This design does not require boxing every optional nested structure.
+ +## Serde Guidance + +When using Serde for sparse nested data, a custom deserializer may deserialize the nested value, check whether it is semantically empty, and store `None` instead of `Some(Box<T>)` when no useful data exists. + +Example shape: + +```rust +#[serde(default, deserialize_with = "deserialize_boxed_value")] +pub value: Option<Box<T>>; +``` + +The serializer should preserve the expected external representation and avoid exposing internal boxing decisions as a wire-format change unless an explicit data-contract change is approved. + +## Measurement and Verification + +Memory optimizations must be measurable when they are introduced for performance reasons. + +- Use targeted benchmarks or profiling scenarios that represent realistic large sessions or large data loads. +- Prefer allocator-level or process-level memory measurement for composite object graphs, because `size_of::<T>()` does not include heap allocations reachable through pointers. +- Keep memory profiling optional so normal builds do not require profiling allocators or extra runtime overhead. +- Record the before-and-after memory impact and the CPU or latency tradeoff in the implementation or verification notes. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-APP-005 | 1 | specs/L1/L1-REQ-APP-005-lightweight.md | Provides technical design guidance for avoiding unnecessary memory growth. | +| specified-by | TBD | TBD | specs/L3/app/TBD.md | L3 behavior has not been authored yet. | + +## References + +- https://dystroy.org/blog/box-to-save-memory/#about-rust-structs-and-memory + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-21 | Assistant | Initial | Initial draft from approved L2 memory-optimization discussion.
| diff --git a/specs/L2/app/L2-DES-APP-002-configuration-precedence.md b/specs/L2/app/L2-DES-APP-002-configuration-precedence.md new file mode 100644 index 00000000..33114af5 --- /dev/null +++ b/specs/L2/app/L2-DES-APP-002-configuration-precedence.md @@ -0,0 +1,168 @@ +--- +artifact_id: L2-DES-APP-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-APP-002 — Configuration Precedence + +## Purpose + +Refine configuration requirements into a source-precedence and persistence design for user-scoped and project-scoped configuration. + +## Background / Context + +The program has durable configuration at two scopes. User-scoped configuration carries personal defaults across projects. Project-scoped configuration carries settings that should apply when working inside a specific project directory. + +Onboarding creates durable model invocation configuration. That setup can include user providers, provider credential references, provider-specific model names, invocation methods, and reasoning effort defaults. Credential material such as API keys is saved in the companion `auth.json` file for the same source scope. These values must be saved before onboarding is considered complete. + +Post-onboarding model selection also interacts with configuration, but not every model selection should rewrite configuration files. The design must distinguish durable provider and binding records from current-session model selection and persisted default selection. + +The concrete `config.toml` and `auth.json` file schemas for these configuration sources are defined by `L2-DES-APP-005`. + +## Source Requirements + +- `L1-REQ-APP-010` requires persistent configuration, specific configuration file locations, and project-over-user precedence. +- `L1-REQ-MODEL-001` requires persisted invocable model configuration. +- `L1-REQ-MODEL-002` requires persisted provider configuration. 
+- `L1-REQ-MODEL-003` requires onboarding-created model and provider configuration to be restorable. +- `L1-REQ-TUI-010` requires the TUI to submit successful onboarding results for persistence. +- `L1-REQ-TUI-006` requires slash commands such as `/model` to provide user-facing control surfaces. +- `L1-REQ-APP-012` requires safe credential handling and routine client views that do not expose plaintext credentials by default. + +## Design Requirement + +The program should compute an effective configuration from available configuration sources while preserving source identity for diagnostics and inspection. + +Configuration source priority is: + +1. Project-scoped configuration: `project_directory/.dev/config.toml`, with credentials in `project_directory/.dev/auth.json` +2. User-scoped configuration: + - Windows: `C:\Users\username\.devo\config.toml` + - Windows credentials: `C:\Users\username\.devo\auth.json` + - macOS and Linux: `~/.devo/config.toml` + - macOS and Linux credentials: `~/.devo/auth.json` + +When both sources define overlapping settings, the project-scoped value takes precedence. + +## Effective Configuration + +Effective configuration is resolved conceptually as: + +```text +User config + + +Project config + ↓ +EffectiveConfig +``` + +Resolution rules: + +- Missing configuration sources are allowed. +- User-scoped configuration provides the base values. +- Project-scoped configuration overlays user-scoped configuration for overlapping settings. +- Non-overlapping settings from both sources may contribute to the effective configuration. +- Effective configuration should retain enough source metadata to explain which source supplied a value when users inspect configuration or when errors occur. +- Invalid higher-priority configuration should produce an actionable error instead of silently falling back to lower-priority configuration for the same setting. 
+ +For keyed collections such as providers or model-provider bindings: + +- Stable identifiers are used to detect overlapping records. +- A project-scoped record with the same stable identifier as a user-scoped record overrides the user-scoped record. +- User-scoped records that do not overlap project-scoped records remain available unless project-scoped configuration explicitly disables or replaces the relevant collection according to a later schema rule. + +For selected defaults: + +- A project-scoped default model binding overrides a user-scoped default model binding. +- A project-scoped default reasoning effort overrides a user-scoped default reasoning effort for the same effective binding. + +## Onboarding Persistence + +Successful onboarding model setup produces durable configuration data: + +- Selected supported model slug. +- Selected existing provider or newly created provider. +- Provider name, base URL, and credential reference when a provider is added. +- Credential material in `auth.json` when a provider credential is added or updated. +- Provider-specific model name. +- Invocation method. +- Reasoning effort when the selected supported model permits reasoning. +- Default binding or default reasoning selection where required by the onboarding flow. + +The program should persist onboarding output before normal model invocation begins. If persistence fails, onboarding should report a recoverable configuration error rather than allowing the user to believe setup is durable. + +Until a dedicated target selector is specified, the default persistence target should be deterministic: + +- If onboarding runs with an active project directory, persist to `project_directory/.dev/config.toml`. +- If onboarding runs without an active project directory, persist to the user-scoped configuration file for the current operating system. 
+ +When the persistence target affects visibility, sharing, or credential placement, the program should make the target understandable to the user through confirmation, inspection, or error output. In particular, project-scoped credential persistence means writing to `project_directory/.dev/auth.json`, not to `project_directory/.dev/config.toml`. + +## Model Selection Persistence + +The program should treat model selection state and durable model configuration as separate concerns: + +- Creating or updating a provider or model-provider binding is a configuration write. +- Selecting an already-configured binding for a running session is session state, not a provider or binding rewrite. +- Persisting a default selected binding or default reasoning effort is a default-selection write, not a provider or binding rewrite. + +Before the first user message in a session, changing the pending model or reasoning selection should persist the selected default where supported by `L1-REQ-APP-010`. That write should update only default-selection fields unless the workflow also created or repaired provider or binding records. + +After the first user message in a session, changing model or reasoning selection should update the current session selection and should not immediately rewrite provider records, binding records, or default-selection fields. Graceful server-exit persistence of active reasoning effort is an application lifecycle policy and should update only the relevant default reasoning field. + +When `/model` or another post-onboarding flow creates a provider or model-provider binding, the new or modified durable records must be persisted before the new binding is treated as configured for later launches. If the same flow also selects the binding for the current session, session selection should be applied only after the configuration write succeeds or after the user explicitly chooses a recoverable non-durable path in a later design. 
+ +## File Write Safety + +Configuration file persistence should be schema-aware and conservative: + +- Writes must preserve unrelated configuration keys and sections. +- Writes should avoid rewriting provider or binding records that did not change. +- Writes should validate the resulting configuration before replacing the effective file contents. +- Failed writes must not leave a partially written configuration file. +- Errors must identify the intended configuration target and affected setting or record without printing plaintext credentials by default. +- If parent directories or files must be created, creation should follow the same user-scoped or project-scoped target rules used for the write. +- Credential value writes must target `auth.json`, while non-secret configuration writes must target `config.toml`. +- A provider, binding, or MCP server config write must not leave `config.toml` referencing a missing credential id after a failed `auth.json` write. + +The TOML and auth JSON section and field schemas are defined by `L2-DES-APP-005`. Comment preservation behavior, locking strategy, atomic write mechanics, and concurrent edit handling belong in L3 design or implementation design. + +## Credential Handling + +Credential entry during onboarding is an explicit credential-handling flow. Persistent `config.toml` records store credential references only. Credential material is stored in the companion `auth.json` file for the selected source scope. + +Rules: + +- Routine client model lists, provider lists, and model switchers should show credential status rather than plaintext credential values by default. +- Errors should identify the affected provider and configuration source without printing plaintext credentials by default. +- The program should not recommend environment variables, OS keychains, or external credential stores as the durable credential persistence path. 
+- If credential material is stored in project-scoped `auth.json`, the program should make the project-scoped persistence target understandable to the user. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-APP-010 | 1 | specs/L1/L1-REQ-APP-010-configuration.md | Defines configuration sources, precedence, and persistence target behavior. | +| related-to | L1-REQ-MODEL-001 | 1 | specs/L1/L1-REQ-MODEL-001-config.md | Model provider bindings are durable configuration records. | +| related-to | L1-REQ-MODEL-002 | 1 | specs/L1/L1-REQ-MODEL-002-provider.md | Provider records are durable configuration records. | +| related-to | L1-REQ-MODEL-003 | 1 | specs/L1/L1-REQ-MODEL-003-onboard.md | Onboarding creates configuration that must be persisted. | +| related-to | L1-REQ-TUI-010 | 1 | specs/L1/L1-REQ-TUI-010-onboarding-ui.md | TUI onboarding submits setup results for persistence. | +| related-to | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Slash commands can trigger session selection or configuration writes. | +| related-to | L1-REQ-APP-012 | 1 | specs/L1/L1-REQ-APP-012-privacy-data-ownership.md | Credential persistence and projection behavior must follow privacy expectations. | +| related-to | L2-DES-APP-005 | 1 | specs/L2/app/L2-DES-APP-005-config-toml-schema.md | Defines the concrete `config.toml` and `auth.json` schemas resolved by this precedence design. | +| specified-by | TBD | TBD | specs/L3/app/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial configuration precedence and onboarding persistence design. | +| 1 | 2026-05-25 | Human | Refinement | Separated durable configuration writes from session/default model selection and added conservative config file write requirements. 
| +| 1 | 2026-05-25 | Human | Refinement | Linked configuration precedence to the concrete `config.toml` schema design. | +| 1 | 2026-05-25 | Human | Refinement | Moved durable credential material from configuration records into companion `auth.json` files. | diff --git a/specs/L2/app/L2-DES-APP-003-client-server-protocol.md b/specs/L2/app/L2-DES-APP-003-client-server-protocol.md new file mode 100644 index 00000000..9a638502 --- /dev/null +++ b/specs/L2/app/L2-DES-APP-003-client-server-protocol.md @@ -0,0 +1,466 @@ +--- +artifact_id: L2-DES-APP-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-APP-003 — Client Server Protocol + +## Purpose + +Refine the client/server architecture requirement into a protocol, transport, and process-ownership design that supports TUI, desktop, IDE, and future clients sharing the same agent runtime. + +## Background / Context + +The program has multiple potential client surfaces. Each client needs to start or resume sessions, submit turns, observe streaming output, answer approval or question prompts, and inspect shared state. + +If every client launches its own private server process over stdio, those clients cannot naturally share the same active sessions or runtime state. Shared state requires a discoverable server instance that multiple clients can connect to concurrently. + +## Source Requirements + +- `L1-REQ-APP-001` requires client surfaces to share server-side agent behavior. +- `L1-REQ-CONV-001` requires durable session lifecycle behavior. +- `L1-REQ-CONV-002` requires observable turn lifecycle behavior. +- `L1-REQ-CONV-003` requires explicit `steer` and `queue` handling during active turns. +- `L1-REQ-CONV-004` requires session forking from a specific turn and fork traceability. +- `L1-REQ-CONV-005` requires editing the immediately preceding eligible user-authored message without mutating durable history. 
+- `L1-REQ-APP-002` requires persistence and recovery behavior. +- `L1-REQ-AGENT-001` requires a complete execution workflow with visible task state. +- `L1-REQ-AGENT-002` requires interrupt, cancel, inspect, and resume behavior. +- `L1-REQ-AGENT-003` requires visible task planning with status updates. +- `L1-REQ-CHANGE-001` requires rollback and recovery behavior for file changes. +- `L1-REQ-EDIT-001` requires file edits to be reviewable and recoverable. +- `L1-REQ-GOAL-001` requires user-owned Ralph Loop goal controls and visible bounded autonomous continuation state. +- `L1-REQ-GIT-001` constrains git-oriented change management. +- `L1-REQ-APP-010` requires effective configuration inspection and model/reasoning updates. +- `L1-REQ-APP-011` requires actionable error recovery and provider error detail presentation. +- `L1-REQ-APP-012` requires user-data ownership, export, deletion, and credential-safe projections. +- `L1-REQ-AGENT-005` restricts the question tool to Plan Mode. +- `L1-REQ-TOOL-001` requires tool output safety and redaction visibility. +- `L1-REQ-TOOL-002` requires baseline built-in tools, including planning, approval, questions, search, command execution, web, and delegation tools. +- `L1-REQ-MEM-001` defines persistent memory as core-maintained internal state outside the routine client-server protocol surface. +- `L2-DES-TOOL-001` defines the built-in tool system and plan tool. +- `L2-DES-GOAL-001` defines Ralph Loop goal state, continuation, and budget behavior. +- `L2-DES-CONV-001` defines durable session JSONL events and distinguishes provider, server-client, and durable event planes. + +## Protocol Requirement + +The program should use JSON-RPC 2.0 as the logical client/server protocol envelope. + +JSON-RPC is suitable because it supports: + +- Request/response calls for commands that need results. +- Notifications for one-way events. +- Transport-independent message semantics. 
+- Reuse over WebSocket while preserving a simple method and notification model. + +The protocol should define program-specific method names and event payloads rather than exposing provider-specific SSE events directly to clients. + +## Transports + +The program should use JSON-RPC 2.0 over WebSocket as the client/server transport. + +The local server should bind a loopback WebSocket endpoint by default. TUI, desktop, IDE, and browser-capable clients should all connect to that endpoint as WebSocket clients. + +WebSocket is the required transport because it supports concurrent local clients and also fits browser-extension and desktop-client constraints. `stdio` should not be the shared-client transport because a stdio child process is normally owned by one parent client and cannot naturally be discovered and shared by TUI, desktop, and IDE clients at the same time. + +## Server Instance Ownership + +The default local architecture should use a single discoverable server instance per user profile. + +Conceptual startup flow: + +```text +Client starts + ↓ +Read server endpoint descriptor + ↓ +Try to connect to existing server + ↓ +If unavailable, acquire startup lock + ↓ +Start detached server + ↓ +Write endpoint descriptor + ↓ +Connect and authenticate +``` + +The endpoint descriptor should be stored in a user-scoped runtime location and include: + +- Server process identifier where available. +- WebSocket endpoint URL. +- Authentication token or credential reference. +- Server version. +- Started-at timestamp. + +The descriptor must be protected by user-only filesystem permissions where the operating system supports them. + +## Request Response Contract + +Every client request must receive a JSON-RPC response. The response confirms whether the server accepted the command, rejected it, or completed a read-only query. + +For long-running operations, the response must be immediate and must not wait for the full turn, tool call, or model invocation to finish. 
The response allocates canonical identifiers and sequence positions; subsequent progress is delivered through server notifications. + +Successful responses should include: + +- `accepted`: whether the command was accepted for execution. +- Canonical identifiers created or resolved by the server, such as `session_id`, `turn_id`, `item_id`, `subscription_id`, `approval_id`, or `question_id`. +- `latest_sequence` or `next_sequence` where ordering or catch-up matters. +- A projection or snapshot when the request is a read operation. +- A safe message or warning when the request succeeds with degraded behavior. +- An idempotency result when a repeated request uses a previously seen client-generated id. + +Rejected responses should use JSON-RPC error responses with: + +- `code`: stable machine-readable error code. +- `message`: concise user-facing explanation. +- `data`: structured recovery context, such as missing permission, invalid session, stale sequence, invalid model, or unavailable provider. + +## Client Requests + +Representative client-to-server JSON-RPC request methods and response results: + +| Method | Purpose | Important Params | Server Response Result | +|---|---|---|---| +| `server.initialize` | Register a client connection, authenticate it, and negotiate protocol compatibility. | `client_id`, `client_kind`, `protocol_version`, `auth_token`, `client_capabilities`, `workspace_root` where known. | `server_id`, `server_version`, `protocol_version`, `server_capabilities`, `latest_sequence`. | +| `server.shutdown` | Request a graceful server shutdown when the caller is authorized to do so. | `reason`, `client_id`, optional `force_after_timeout`. | `accepted`, `shutdown_state`, optional `message`. | +| `session.list` | Return session summaries for pickers, recent-session views, and restore flows. | `workspace_filter`, `include_archived`, `limit`, `cursor`, optional sort order. | `sessions`, `next_cursor`, `latest_sequence`. 
| +| `session.open` | Load a session and return a current projection without necessarily subscribing to future events. | `session_id`, `projection`, optional `from_sequence`. | `session_snapshot`, `latest_sequence`. | +| `session.create` | Create a new session record when the client explicitly starts a new session before submitting a turn. | `workspace_root`, `initial_metadata`, optional `client_generated_label`. | `session_id`, `session_snapshot`, `latest_sequence`. | +| `session.fork` | Create a child session that inherits visible history from a parent session without deep-copying all parent records. | `parent_session_id`, `fork_turn_id`, `workspace_root`, optional `fork_label`. | `session_id`, `parent_session_id`, `fork_turn_id`, `inherited_segment_id`, `session_snapshot`. | +| `session.archive` | Mark a session as archived so it leaves active-session views while remaining recoverable where policy allows. | `session_id`, `archive_reason`. | `session_id`, `archived`, `latest_sequence`. | +| `session.delete` | Delete or request deletion of a session while preserving explicit policy for forks and retained shared records. | `session_id`, `delete_mode`, `fork_policy`, `confirm_token` when required. | `accepted`, `session_id`, `delete_state`, `affected_forks`, `inherited_segment_actions`, `retained_records`, `latest_sequence`. | +| `session.export` | Export session history and allowed related data for user data portability. | `session_id`, `include_inherited_history`, `redaction_level`, `format`. | `export_id`, `accepted`, `status`, optional `download_ref`, `latest_sequence`. | +| `session.subscribe` | Start receiving ordered events for a session from a given sequence or from the current state. | `session_id`, `from_sequence`, `event_filter`, `projection`. | `subscription_id`, optional `session_snapshot`, `next_sequence`. | +| `session.unsubscribe` | Stop a previous session subscription. | `subscription_id`, `session_id`. | `subscription_id`, `closed`. 
| +| `turn.submit` | Submit user input, content parts, and mentions for agent execution. If a turn is active, the client must state whether the message is normal, steer, or queue. | `session_id` or `new_session`, `submission_mode`, `active_turn_id` where applicable, `content_parts`, `mentions`, `client_message_id`, optional `mode_overrides`. | `session_id`, `turn_id` or `queue_item_id` or `steer_item_id`, `accepted`, `classification`, `latest_sequence`. | +| `message.editPrevious` | Edit the immediately preceding eligible user-authored message in the current session branch. | `session_id`, `expected_target_message_id`, `edited_content_parts`, `edited_mentions`, `client_edit_id`, optional `edit_mode`, optional `workspace_restore_policy`. | `accepted`, `edit_id`, `target_message_id`, `replacement_message_id`, `superseded_turn_id` where applicable, `workspace_restore_state`, `new_turn_id` or `queue_item_id` where applicable, `edit_state`, `latest_sequence`. | +| `turn.interrupt` | Request interruption of active execution, including model generation, tool execution, pending prompts, or the whole turn. | `session_id`, `turn_id`, `reason`, optional `target_kind`, optional `target_id`, optional `interrupt_mode`. | `turn_id`, `interrupt_id`, `interrupt_state`, `cleanup_state`, `latest_sequence`. | +| `turn.resume` | Start a continuation turn linked to an interrupted or otherwise recoverable turn. | `session_id`, `interrupted_turn_id`, `client_resume_id`, optional `resume_content_parts`, optional `resume_mentions`, optional `resume_mode`. | `session_id`, `turn_id`, `resume_of_turn_id`, `accepted`, `resume_state`, `latest_sequence`. | +| `execution.inspect` | Return active execution state so a client can show running work and let the user choose what to stop. | `session_id`, optional `include_background_processes`, optional `include_recent_output`, optional `redaction_level`. 
| `active_turn`, `running_tool_calls`, `pending_approvals`, `pending_questions`, `background_processes`, `latest_sequence`. | +| `backgroundProcess.stop` | Request stop for a tracked background process started by the program. | `process_id`, optional `session_id`, optional `turn_id`, `reason`, optional `stop_mode`. | `process_id`, `stop_state`, `latest_sequence`. | +| `queue.cancel` | Cancel a queued message before it starts execution. | `session_id`, `queue_item_id`, `reason`. | `queue_item_id`, `canceled`, `latest_sequence`. | +| `goal.get` | Return the current Ralph Loop goal projection for a session. | `session_id`, optional `include_history`, optional `redaction_level`. | `goal`, `goal_history` where requested, `latest_sequence`. | +| `goal.create` | Create the first goal for a session or explicitly replace a non-terminal goal. | `session_id`, `objective`, optional `token_budget`, optional `time_budget_seconds`, optional `turn_budget`, optional `replace_existing`, optional `expected_goal_id`. | `accepted`, `goal`, `replaced_goal_id` where applicable, `latest_sequence`. | +| `goal.pause` | Pause autonomous continuation without discarding the goal. | `session_id`, `goal_id`, optional `expected_goal_id`, optional `reason`. | `accepted`, `goal`, `latest_sequence`. | +| `goal.resume` | Resume a paused or blocked goal and allow continuation after eligibility checks. | `session_id`, `goal_id`, optional `expected_goal_id`, optional `resume_reason`. | `accepted`, `goal`, `continuation_eligible`, `latest_sequence`. | +| `goal.complete` | Let the user mark the goal complete directly. | `session_id`, `goal_id`, optional `expected_goal_id`, `verification_summary`. | `accepted`, `goal`, `latest_sequence`. | +| `goal.cancel` | End the goal without marking it complete. | `session_id`, `goal_id`, optional `expected_goal_id`, `reason`. | `accepted`, `goal`, `latest_sequence`. | +| `goal.clear` | Remove the current goal from normal UI projections while preserving audit records. 
| `session_id`, `goal_id`, optional `expected_goal_id`. | `accepted`, `cleared_goal_id`, `latest_sequence`. | +| `approval.respond` | Answer a pending tool or permission approval request. | `session_id`, `turn_id`, `approval_id`, `decision`, optional `note`. | `approval_id`, `accepted`, `latest_sequence`. | +| `question.respond` | Answer a pending Plan Mode or question-tool prompt. | `session_id`, `turn_id`, `question_id`, `answers`, optional `freeform_text`. | `question_id`, `accepted`, `latest_sequence`. | +| `model.list` | Return supported and configured model projections suitable for selection UI. | `include_supported`, `configured_only`, optional `capability_filter`. | `models`, `current_model`; credential fields report status only, never plaintext values. | +| `model.select` | Change the session's active model binding and reasoning effort where allowed. | `session_id`, `model_binding_id`, optional `reasoning_effort`, optional `persist_default`. | `effective_model`, `metadata_update`, `latest_sequence`. | +| `config.inspect` | Return effective configuration and source information safe for client display. | `scope`, `include_sources`, optional `redaction_level`. | `effective_config_projection`, `sources`, `latest_sequence`. | +| `config.update` | Update supported configuration values where the client is authorized to do so. | `scope`, `updates`, `persistence_target`, `redaction_level`. | `accepted`, `changed_keys`, `effective_config_projection`, `latest_sequence`. | + +Request methods should return explicit success or structured error results. + +## Server Notifications + +Representative server-to-client JSON-RPC notification methods: + +| Method | Purpose | Payload | +|---|---|---| +| `server.statusChanged` | Tell clients that server availability, lifecycle state, or capabilities changed. | `server_id`, `status`, `capabilities`, `latest_sequence`, optional `message`. | +| `session.event` | Broadcast session-level changes to subscribed clients.
| `sequence`, `session_id`, `event_kind`, `event_payload`, `source_client_id` where applicable. | +| `session.subscriptionClosed` | Tell a client that a subscription ended or can no longer be continued. | `subscription_id`, `session_id`, `reason`, optional `resubscribe_hint`. | +| `turn.event` | Broadcast turn and item changes to clients subscribed to the session. | `sequence`, `session_id`, `turn_id`, `event_kind`, `event_payload`, `source_client_id` where applicable. | +| `approval.requested` | Ask connected clients to present an approval decision to the user. | `sequence`, `session_id`, `turn_id`, `approval_id`, `approval_kind`, `summary`, `details`, `expires_at` where applicable. | +| `question.requested` | Ask connected clients to present a question prompt to the user. | `sequence`, `session_id`, `turn_id`, `question_id`, `prompt`, `options`, `allows_freeform`, `expires_at` where applicable. | +| `config.changed` | Tell clients that effective configuration changed and dependent displays may need refresh. | `sequence`, `changed_scopes`, `changed_keys`, `source`, `safe_summary`. | +| `goal.updated` | Tell clients that the session goal was created, replaced, paused, resumed, blocked, completed, canceled, budget-limited, cleared, or updated with progress. | `sequence`, `session_id`, `goal`, `change_kind`, `source`, `source_turn_id` where applicable, `source_client_id` where applicable. | +| `goal.continuationStarted` | Tell clients that an autonomous continuation turn has started for an active goal. | `sequence`, `session_id`, `goal_id`, `turn_id`, `reason`, `budget_state`. | +| `goal.budgetLimited` | Tell clients that the goal stopped because a configured budget was reached. | `sequence`, `session_id`, `goal_id`, `budget_kind`, `budget_state`, `progress_summary`. | + +Notifications should include sequence numbers sufficient for clients to order events and request catch-up after reconnect. 
+ +## Sequencing And Catch-Up + +The server should assign a monotonically increasing `session_sequence` to the events of each session. Every `session.event` and `turn.event` for that session should carry this value in its `sequence` field. + +Rules: + +- Clients should pass `from_sequence` when subscribing or reconnecting. +- The server should deliver missed events after `from_sequence` when those events are still available. +- If the requested sequence is too old, unknown, or compacted away, the server should send a fresh `session_loaded` snapshot and continue from the snapshot's `latest_sequence`. +- Clients should treat events as idempotent by sequence and event identity. +- Client-generated request ids such as `client_message_id` should make retries safe after transient disconnects. +- Ordering is authoritative within a session. Cross-session global ordering is not required for normal client rendering. + +## Server-Client Event Payloads + +Server-client event payloads should be shaped for UI responsiveness and recovery, not for durable storage. + +Representative server-client event kinds: + +| Event Kind | Purpose | Payload Content | +|---|---|---| +| `session_loaded` | Provide a session projection after open, subscribe, or reconnect. | `session_id`, `metadata`, `visible_turns`, `pending_items`, `active_plan`, `active_goal`, `latest_sequence`. | +| `metadata_updated` | Report a change to session metadata such as model, reasoning, mode, persona, permission profile, workspace, or usage totals. | `session_id`, `metadata_patch`, `effective_metadata`, `source_client_id`, `sequence`. | +| `plan_updated` | Report creation or update of visible plan/to-do state from the plan tool. | `session_id`, `plan_id`, `operation`, `plan_status`, `items`, `changed_item_ids`, `source_turn_id`, `timestamp`. | +| `goal_updated` | Report creation, replacement, status transition, progress, blocker, verification, budget, or clear state for the current Ralph Loop goal.
| `session_id`, `goal_id`, `operation`, `status`, `objective_preview`, `progress_summary`, `blocker_summary`, `verification_summary`, `budget_state`, `source`, `source_turn_id`, `timestamp`. | +| `goal_continuation_started` | Report that the server started an autonomous continuation turn for an active goal. | `session_id`, `goal_id`, `turn_id`, `reason`, `budget_state`, `timestamp`. | +| `goal_budget_limited` | Report that goal continuation stopped because a configured budget was reached. | `session_id`, `goal_id`, `budget_kind`, `budget_state`, `progress_summary`, `timestamp`. | +| `turn_started` | Tell all subscribed clients that a new turn has begun. | `session_id`, `turn_id`, `status`, `submitted_by_client_id`, `user_item_id`, `started_at`. | +| `turn_resumed` | Tell subscribed clients that an interrupted or recoverable turn has been resumed by a linked continuation turn. | `session_id`, `interrupted_turn_id`, `resume_turn_id`, `resume_mode`, `submitted_by_client_id`, `timestamp`. | +| `turn_status_changed` | Report a turn moving between running, waiting, completed, failed, or interrupted states. | `session_id`, `turn_id`, `previous_status`, `status`, `reason`, `timestamp`. | +| `turn_diff_updated` | Report the current display diff for files changed by the turn. This is a client-display projection, not the authoritative restore record. | `session_id`, `turn_id`, `change_set_id`, `diff_format`, `diff_ref` or `inline_diff`, `changed_files`, `is_complete`, `timestamp`. | +| `item_started` | Create or display a logical transcript item. | `session_id`, `turn_id`, `item_id`, `kind`, `role`, `visibility`, `initial_content`, `mentions`, `created_at`. | +| `item_content_update` | Apply a live content update to an existing item. | `session_id`, `turn_id`, `item_id`, `content_part_index`, `operation`, `text` or `content_ref`, `is_coalesced`, `timestamp`. | +| `item_completed` | Mark an item complete and provide final display metadata. 
| `session_id`, `turn_id`, `item_id`, `final_status`, `content_hash`, `completed_at`. | +| `item_failed` | Mark an item failed while preserving any partial content already sent. | `session_id`, `turn_id`, `item_id`, `error`, `recoverable`, `timestamp`. | +| `message_edit_recorded` | Show that an immediately previous message edit was accepted. | `session_id`, `edit_id`, `target_message_id`, `replacement_message_id`, `edit_state`, `content_preview`, `mentions`, `timestamp`. | +| `turn_superseded` | Mark a previous turn as superseded by an edited message continuation while keeping it auditable. | `session_id`, `superseded_turn_id`, `replacement_turn_id`, `edit_id`, `reason`, `timestamp`. | +| `workspace_restore_started` | Show that the server is attempting to restore files changed by a superseded turn. | `session_id`, `edit_id`, `superseded_turn_id`, `checkpoint_id`, `candidate_files`, `restore_policy`, `timestamp`. | +| `workspace_restore_completed` | Report the outcome of restoring files changed by a superseded turn. | `session_id`, `edit_id`, `superseded_turn_id`, `restored_files`, `skipped_files`, `unsupported_files`, `failed_files`, `current_state_kept`, `timestamp`. | +| `steer_added` | Show that a steer message was accepted for an active turn. | `session_id`, `turn_id`, `steer_item_id`, `content_preview`, `application_state`, `timestamp`. | +| `steer_reclassified` | Report that a requested steer could not affect the active turn and was queued, rejected, or otherwise resolved. | `session_id`, `turn_id`, `steer_item_id`, `new_classification`, `reason`, `queue_item_id` where applicable. | +| `queue_item_added` | Show that a queued message was accepted. | `session_id`, `queue_item_id`, `position`, `content_preview`, `created_at`. | +| `queue_item_started` | Show that a queued message has become the next executing turn. | `session_id`, `queue_item_id`, `turn_id`, `started_at`. | +| `queue_item_canceled` | Show that a queued message was canceled before execution. 
| `session_id`, `queue_item_id`, `reason`, `timestamp`. | +| `tool_call_started` | Show that a tool call has begun or is awaiting approval. | `session_id`, `turn_id`, `item_id`, `tool_call_id`, `tool_name`, `command_description` for command tools, `arguments_preview`, `approval_state`, `safety_state`. | +| `tool_call_updated` | Update tool call progress, streaming output preview, or status. | `session_id`, `turn_id`, `tool_call_id`, `status`, `progress`, `output_preview`, `redaction_state`, `safety_notice`, `timestamp`. | +| `tool_call_completed` | Show final tool result state. | `session_id`, `turn_id`, `tool_call_id`, `status`, `result_summary`, `structured_status`, `output_ref`, `redaction_state`, `safety_notice`, `completed_at`. | +| `background_process_updated` | Show state for a tracked background process started by the program. | `process_id`, `session_id`, `turn_id`, `command_label`, `status`, `runtime`, `recent_output_ref`, `stop_state`, `timestamp`. | +| `approval_resolved` | Report the final state of an approval request to all subscribed clients. | `session_id`, `turn_id`, `approval_id`, `decision`, `resolved_by_client_id`, `resolved_at`. | +| `question_resolved` | Report the final state of a question request to all subscribed clients. | `session_id`, `turn_id`, `question_id`, `answer_summary`, `resolved_by_client_id`, `resolved_at`. | +| `usage_updated` | Update token and cost-related display information. | `session_id`, `turn_id`, `invocation_id`, `usage_delta`, `usage_totals`. | +| `context_updated` | Report active context changes, compaction, or token pressure. | `session_id`, `context_id`, `token_estimate`, `effective_context_limit`, `compaction_status`, `compaction_trigger_source` where applicable. | +| `session_deleted` | Report session deletion or tombstoning to subscribed or listing clients. | `session_id`, `delete_state`, `affected_forks`, `retained_records`, `timestamp`. 
| +| `session_export_ready` | Report that an export request completed or failed. | `export_id`, `status`, `download_ref`, `error` where applicable. | +| `error_reported` | Report recoverable or terminal errors tied to a session, turn, item, or server operation. | `scope`, `phase`, `session_id`, `turn_id`, `item_id`, `code`, `message`, `recoverable`, `retry_state`, `retry_after`, `provider_error_ref`, `partial_state`, `recovery_actions`, `details_ref`. | + +`item_content_update` is a live client event and may be coalesced or throttled. It is not the same as a durable JSONL storage event. + +## Cross-Client Broadcast Behavior + +When one client submits a user message, the server must persist the accepted user input and broadcast the resulting session and turn updates to every client subscribed to that session. + +All clients, including the client that initiated the request, should receive the canonical server events. Clients may optimistically render local input, but they must reconcile that display against the server-confirmed `turn_started`, `item_started`, and later item events. + +Examples: + +- If the TUI submits `turn.submit`, desktop and IDE clients subscribed to the same session receive the new user item and turn state. +- If the desktop client answers an approval request, the TUI and IDE clients receive the approval state update and any resumed turn events. +- If the server streams assistant output, every subscribed client receives ordered `item_content_update` events for the same item. +- If the agent updates the plan tool, every subscribed client receives `plan_updated` for the same active plan state. +- If one client creates, pauses, resumes, completes, cancels, or clears a goal, every subscribed client receives the canonical `goal_updated` event. +- If the server starts an autonomous goal continuation, every subscribed client receives `goal_continuation_started` and the normal turn lifecycle events for that continuation turn. 
+- If a turn changes files through structured mutating tools, subscribed clients may receive `turn_diff_updated` events for review display. +- If one client interrupts or resumes a turn, every subscribed client receives the canonical turn status and resume events. +- If one client edits the immediately previous message, every subscribed client receives the edit event and any superseded or replacement turn events. + +## Immediate Previous Message Editing + +The protocol supports editing only the immediately preceding eligible user-authored message in the current session branch. + +Eligibility rules: + +- The server is authoritative for identifying the current branch's immediately previous eligible message. +- `message.editPrevious` must reject stale requests when `expected_target_message_id` is not the current eligible message. +- Direct editing of older historical messages must be rejected with a structured error that points the client toward session forking. +- Accepted edits must be append-only from the protocol perspective: the server records an edit and broadcasts canonical replacement state instead of mutating earlier events in place. +- The server/core is authoritative for workspace restoration. Clients may choose an allowed `workspace_restore_policy`, but clients must not be required to apply inverse patches or mutate the workspace to make message editing correct. + +Execution rules: + +- If the target message belongs to a completed, failed, or interrupted latest turn, the server should attempt workspace restoration for files changed by that turn, create a replacement user item and a replacement continuation turn, then mark the original turn as superseded. +- Workspace restoration should run before the replacement turn begins unless the edit is staged for later execution. +- `workspace_restore_policy` should allow the client to request the default safe restore behavior, skip restoration, or use another explicitly supported policy. 
The default safe behavior preserves current file contents when divergence is detected. +- Workspace restoration should use core-owned per-turn change sets, inverse records, content snapshots, or internal checkpoints. Client-visible unified diffs may help users review changes, but they are not the authoritative restore state. +- For each file changed by the superseded turn, the server should restore the pre-turn state when the current file state still matches the expected post-turn state or satisfies another safe restore predicate. +- If a changed file has diverged after the superseded turn, the server must skip restoration for that file, preserve the current file state, and report the skip in `workspace_restore_completed`. +- File changes from structured tools such as `write` and `apply_patch` should be restored using captured before/after state or inverse operations. +- File changes from shell commands should be restored only when a reliable turn-level checkpoint or attribution record exists. +- A git-based hidden checkpoint or ghost commit may be used internally, but the protocol must expose restoration outcome rather than git implementation details. The server must not publish, stage, or rewrite user-visible git history unless the user explicitly requests that. +- If the target message is a queued message that has not started, the server may update the queue item's effective content through an edit record and preserve the original revision for audit. +- If the target message belongs to an active running turn, the server must not mutate the already-started model or tool execution. It must reject the edit or require an interruption-oriented `edit_mode`; clients may offer `steer` as the lower-friction alternative. +- If a superseded turn produced non-file tool side effects, those side effects remain visible in the superseded turn. Message editing does not imply rollback for external APIs, processes, network actions, published git operations, or other non-file effects.
+ +Broadcast rules: + +- Every accepted edit must emit `message_edit_recorded` to subscribed clients. +- If workspace restoration is attempted, subscribed clients must receive `workspace_restore_started` and `workspace_restore_completed`. +- If a completed, failed, or interrupted latest turn is replaced, subscribed clients must also receive `turn_superseded` and the normal events for the replacement turn. +- Clients may optimistically show an edit draft, but they must reconcile it against the server-confirmed `message_edit_recorded` event. +- `turn_diff_updated` events may be coalesced or replaced by later diff updates. Clients should treat them as display state and should not infer restore completion from them. + +## Interrupt And Resume Protocol Rules + +The server is authoritative for interrupt and resume state. Clients request control actions, but they must reconcile local UI against server-confirmed `turn_status_changed`, `tool_call_updated`, `background_process_updated`, and `turn_resumed` events. + +Rules: + +- `turn.interrupt` must return promptly after the server accepts or rejects the request. It must not wait for every provider stream, tool call, or background process cleanup action to finish. +- An accepted interruption should move the target turn into a stopping, interrupted, completed-before-interrupt, failed, or cleanup-pending state. +- If the target turn is already terminal, the server should return an idempotent terminal result or a structured stale-state error. +- `execution.inspect` should return enough active work state for clients to show running model invocation, running tools, pending prompts, and tracked background processes without exposing secrets. +- `backgroundProcess.stop` should only target processes started and tracked by the program. +- `turn.resume` should create a linked continuation turn rather than mutating the interrupted turn in place.
+- Resume requests should be rejected or degraded with a warning when required context, workspace state, or permission state is unavailable. +- Resumed turns must use the normal execution engine and normal safety policy. + +## Goal Protocol Rules + +The server is authoritative for Ralph Loop goal state. Clients request user-owned goal mutations, and clients render canonical `goal_updated` events. + +Rules: + +- Goal requests must return promptly after acceptance or rejection. They must not wait for an autonomous continuation turn to complete. +- `goal.create` should reject accidental replacement of an existing non-terminal goal unless `replace_existing` or an equivalent confirmation is explicit. +- Goal mutations should include `expected_goal_id` where the client has one. If the goal was replaced or reached a terminal state, stale mutations should fail safely or become no-ops with a structured stale-state result. +- Model-originated goal updates must not use client-owned mutation methods. They should flow through the narrow model-facing goal tool defined by `L2-DES-GOAL-001` and then be broadcast as `goal_updated`. +- If a user pauses, cancels, clears, or replaces a goal while a turn is active, that mutation affects future continuation and future context assembly. It must not rewrite a model request that has already started. +- If `goal.resume` makes a goal active, the server should report whether continuation is currently eligible. If it is not eligible, the reason should be visible in the response or later events. +- If the server starts a continuation, it should emit `goal_continuation_started` before or alongside the normal `turn_started` event. +- If a configured budget is reached, the server should emit `goal_budget_limited` and `goal_updated` so clients can explain that the goal stopped without implying verified completion. +- Plan Mode suppresses autonomous goal continuation. Goal state remains viewable and user-controllable while Plan Mode is active.
+ +## Tool And Plan Protocol Rules + +Tool calls are requested by the model and executed by the server-owned tool supervisor. Clients observe canonical tool and plan events; they do not decide whether a model-requested tool call is valid except when an approval or question response is explicitly requested. + +Rules: + +- Tool state should be reported through `tool_call_started`, `tool_call_updated`, and `tool_call_completed`. +- `tool_call_started` should include `command_description` for shell or command execution tools. +- A tool unavailable due to mode, permission, or missing configuration should complete with a structured blocked or unavailable result rather than disappearing. +- Tool completion should include a factual natural-language `result_summary` alongside structured status fields such as exit code, HTTP status, process id, or changed-file counts. +- Plan tool updates should be reported through `plan_updated`, not only as assistant text. +- `plan_updated` should carry the complete current plan projection or enough patch data for clients to reconstruct it from prior canonical plan state. +- The plan tool must not expose private model reasoning. Plan item text should be concise, user-visible task state. +- In Normal Mode, question-tool attempts must be rejected before `question.requested` is emitted. +- In Plan Mode, mutating tools must be blocked before execution and should produce a structured blocked result if requested. +- `multi_tool_use` child calls must still produce per-tool events and must not bypass validation, approval, or mode gates. + +## Approval And Question Resolution + +Approval and question requests are single-resolution prompts. Multiple clients may display the same prompt, but only the first accepted response should resolve it. + +Rules: + +- The server owns approval and question state. +- A successful `approval.respond` or `question.respond` resolves the prompt and broadcasts `approval_resolved` or `question_resolved`. 
+- If another client answers after the prompt has been resolved, the server must reject the request with a structured stale-state error such as `already_resolved`. +- `question.requested` must only be emitted when Plan Mode or another explicit requirement allows the question tool. +- In Normal Mode, the server must reject question-tool attempts before emitting `question.requested`. + +## Session Deletion And Forks + +Deleting a session must preserve user-visible consistency for forks. + +Rules: + +- `session.delete` must report fork descendants before destructive deletion when descendants exist. +- Deleting a parent session must not make surviving forked sessions unusable. +- If a fork survives parent deletion, inherited history required by that fork must remain available through a replayable inherited-history segment. That segment may be backed by protected shared records, materialized fork history, or another explicit retention mechanism. +- The parent session link in a fork is provenance and navigation metadata. It must not be the sole content pointer required to replay inherited history. +- After deletion, the parent session link may be non-dereferenceable. Clients must treat parent navigation failure as distinct from inherited-history loss. +- Before a parent session is made inaccessible, `session.delete` must either preserve the inherited segment for each surviving fork, materialize the inherited segment into the fork, or reject deletion until the user chooses another policy. +- Fork indicators should show parent deleted or unavailable when navigation to the parent can no longer work, while keeping origin metadata visible. +- Hard deletion of records still referenced by surviving forks must be blocked unless those forks first receive replayable inherited-history segments or the user explicitly requests cascade deletion of the dependent forks where supported. + +Persistent memory is core-maintained internal state. 
Session deletion may cause core memory maintenance internally, but the client-server protocol must not expose per-memory deletion decisions, linked memory lists, or memory-management prompts. + +## Provider Event Boundary + +Provider/core events are internal to agent execution. They may be more granular than both server-client events and durable JSONL events. + +Representative provider/core events: + +- `llm_request_started` +- `reasoning_started` +- `reasoning_delta` +- `reasoning_completed` +- `assistant_response_started` +- `assistant_response_delta` +- `assistant_response_completed` +- `tool_call_started` +- `tool_call_arguments_delta` +- `tool_call_completed` +- `usage_received` +- `llm_request_completed` +- `llm_request_failed` + +The server should normalize provider/core events into: + +- Durable JSONL events for persistence. +- Server-client events for live display. +- Runtime control events for orchestration. + +## Multi-Client State + +The server owns sessions, turns, approvals, model invocation, tool execution, context assembly, and persistence. + +Clients should not create independent server state when an existing local server is available. Multiple connected clients should subscribe to the same session and receive ordered event streams for the same underlying turns and items. + +If a client disconnects, the server continues owning active work subject to user-configured lifecycle policy. A reconnecting client should resubscribe and receive either missed server-client events or a fresh projection of the current durable session state. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-APP-001 | 1 | specs/L1/L1-REQ-APP-001-client-server-arch.md | Defines protocol, transport, and process ownership for shared clients. | +| related-to | L1-REQ-APP-002 | 1 | specs/L1/L1-REQ-APP-002-persistence.md | Reconnect and catch-up behavior depends on durable session state. 
| +| related-to | L1-REQ-APP-010 | 1 | specs/L1/L1-REQ-APP-010-configuration.md | Defines configuration inspection and update protocol behavior. | +| related-to | L1-REQ-APP-011 | 1 | specs/L1/L1-REQ-APP-011-error-recovery.md | Defines error and retry event payload requirements. | +| related-to | L1-REQ-APP-012 | 1 | specs/L1/L1-REQ-APP-012-privacy-data-ownership.md | Defines export, deletion, and credential-safe projection behavior. | +| related-to | L1-REQ-AGENT-001 | 1 | specs/L1/L1-REQ-AGENT-001-execution-workflow.md | Exposes execution lifecycle requests and events to clients. | +| related-to | L1-REQ-AGENT-002 | 1 | specs/L1/L1-REQ-AGENT-002-interrupt-resume.md | Defines interrupt, resume, active-work inspection, and background stop protocol surfaces. | +| related-to | L1-REQ-AGENT-003 | 1 | specs/L1/L1-REQ-AGENT-003-task-planning.md | Exposes plan tool updates as client-visible plan state. | +| related-to | L1-REQ-AGENT-005 | 1 | specs/L1/L1-REQ-AGENT-005-plan-mode.md | Enforces question notification constraints for Plan Mode. | +| related-to | L1-REQ-CHANGE-001 | 1 | specs/L1/L1-REQ-CHANGE-001-rollback-and-recovery.md | Defines display diff events plus server-owned restoration events and outcomes for superseded-turn file rollback. | +| related-to | L1-REQ-CONV-001 | 1 | specs/L1/L1-REQ-CONV-001-session-lifecycle.md | Clients open, subscribe to, and resume sessions through the protocol. | +| related-to | L1-REQ-CONV-002 | 1 | specs/L1/L1-REQ-CONV-002-turn-lifecycle.md | Clients observe turn lifecycle events through the protocol. | +| related-to | L1-REQ-CONV-003 | 1 | specs/L1/L1-REQ-CONV-003-active-turn-message-handling.md | Defines steer and queue request and event behavior. | +| related-to | L1-REQ-CONV-004 | 1 | specs/L1/L1-REQ-CONV-004-session-forking.md | Defines fork, delete, and parent-unavailable behavior. 
| +| related-to | L1-REQ-CONV-005 | 1 | specs/L1/L1-REQ-CONV-005-immediate-message-editing.md | Defines immediate previous message edit protocol behavior. | +| related-to | L1-REQ-EDIT-001 | 1 | specs/L1/L1-REQ-EDIT-001-file-editing-workflow.md | Structured file-editing tools provide restoration data for superseded turns. | +| related-to | L1-REQ-GOAL-001 | 1 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | Defines goal control requests, goal events, and autonomous continuation visibility. | +| related-to | L1-REQ-GIT-001 | 1 | specs/L1/L1-REQ-GIT-001-change-management.md | Git checkpoints may support restoration without user-visible git history changes. | +| related-to | L1-REQ-TOOL-001 | 1 | specs/L1/L1-REQ-TOOL-001-safety.md | Defines redaction and safety fields in tool events. | +| related-to | L1-REQ-TOOL-002 | 1 | specs/L1/L1-REQ-TOOL-002-tools.md | Exposes built-in tool lifecycle and plan tool updates to clients. | +| related-to | L1-REQ-TOOL-005 | 1 | specs/L1/L1-REQ-TOOL-005-background-process-management.md | Exposes tracked background process state and stop requests. | +| related-to | L1-REQ-MEM-001 | 1 | specs/L1/L1-REQ-MEM-001-persistent-memory.md | Excludes persistent memory from routine client-server protocol methods and notifications. | +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | The protocol exposes execution engine state to clients. | +| related-to | L2-DES-AGENT-002 | 1 | specs/L2/agent/L2-DES-AGENT-002-interrupt-resume-control.md | The protocol exposes interrupt and resume control actions. | +| related-to | L2-DES-TOOL-001 | 1 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | The protocol exposes tool and plan state from the built-in tool system. | +| related-to | L2-DES-GOAL-001 | 1 | specs/L2/goal/L2-DES-GOAL-001-ralph-loop-goals.md | The protocol exposes user-owned goal controls and canonical goal notifications. 
| +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Durable session events are distinct from live server-client protocol events. | +| related-to | L2-DES-APP-005 | 1 | specs/L2/app/L2-DES-APP-005-config-toml-schema.md | `config.inspect` and `config.update` operate on safe projections and updates derived from the `config.toml` and `auth.json` schemas. | +| specified-by | TBD | TBD | specs/L3/app/TBD.md | L3 behavior has not been authored yet. | + +## References + +- JSON-RPC 2.0 specification. +- Visual Studio Code Web Extensions documentation. +- Visual Studio Code Extension Host documentation. +- Visual Studio Code Language Server Extension Guide. +- `microsoft/vscode-languageserver-node` transport documentation. + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial client/server protocol, transport, provider-event boundary, and multi-client server ownership design. | +| 1 | 2026-05-25 | Human | Refinement | Linked configuration protocol methods to the concrete `config.toml` and `auth.json` schemas. | +| 1 | 2026-05-22 | Human | Refinement | Made WebSocket the concrete transport, removed the per-workspace caveat, expanded request and notification descriptions, and added cross-client broadcast behavior. | +| 1 | 2026-05-22 | Human | Refinement | Added steer and queue protocol behavior, deletion/export, fork deletion policy, sequencing, approval races, Plan Mode guards, error recovery fields, and tool safety fields. | +| 1 | 2026-05-22 | Human | Refinement | Added immediate previous message editing request, events, and branch-safe protocol rules. | +| 1 | 2026-05-22 | Human | Refinement | Clarified that surviving forks use replayable inherited-history segments rather than relying on parent-session links after deletion. 
| +| 1 | 2026-05-22 | Human | Refinement | Added deletion response visibility for inherited segment preservation actions and non-dereferenceable parent links. | +| 1 | 2026-05-22 | Human | Refinement | Added workspace restoration request fields and events for immediate message editing. | +| 1 | 2026-05-22 | Human | Refinement | Removed persistent-memory management methods and notifications from the client-server protocol. | +| 1 | 2026-05-22 | Human | Refinement | Clarified that turn diffs are client-display projections and workspace restoration remains server/core-owned. | +| 1 | 2026-05-22 | Human | Refinement | Added execution inspection, interrupt, resume, and background-process stop protocol surfaces. | +| 1 | 2026-05-22 | Human | Refinement | Added plan update events and tool protocol rules for the built-in tool system. | +| 1 | 2026-05-22 | Human | Refinement | Added command descriptions and natural-language result summaries to tool protocol projections. | +| 1 | 2026-05-23 | Human | Refinement | Added Ralph Loop goal requests, notifications, server-client events, and protocol rules. | +| 1 | 2026-05-25 | Human | Refinement | Added compaction trigger source to context update events for transcript-area compaction notices. | diff --git a/specs/L2/app/L2-DES-APP-004-observability-architecture.md b/specs/L2/app/L2-DES-APP-004-observability-architecture.md new file mode 100644 index 00000000..5612b85d --- /dev/null +++ b/specs/L2/app/L2-DES-APP-004-observability-architecture.md @@ -0,0 +1,274 @@ +--- +artifact_id: L2-DES-APP-004 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-23 +--- + +# L2-DES-APP-004 — Observability Architecture + +## Purpose + +Refine application observability into a technical design for structured logs, diagnostic projections, trace-mode records, correlation identifiers, and optional telemetry across clients, server, model calls, tools, and user interface paths. 
+ +## Background / Context + +Agent workflows cross multiple components before a user sees a result. A single failure may involve client submission, server turn admission, context assembly, model-provider resolution, provider streaming, tool execution, approval waiting, persistence, or rendering. Observability must expose enough state to identify the failing phase without exposing secrets or forcing users to inspect hidden implementation state. + +## Source Requirements + +- `L1-REQ-APP-004` requires structured logs, configurable log levels, actionable diagnostics, trace logging for large language model streaming response events, and optional telemetry. +- `L1-REQ-LLM-003` requires model usage and stream observability. +- `L1-REQ-APP-011` requires actionable error recovery. +- `L1-REQ-APP-012` requires privacy and data ownership controls. +- `L1-REQ-AGENT-001` requires a visible execution workflow. +- `L1-REQ-TOOL-002` requires observable tool execution paths. +- `L2-DES-APP-003` defines the client/server protocol projections used by clients. +- `L2-DES-AGENT-001` defines the execution engine phases that must be observable. +- `L2-DES-LLM-003` defines model usage and streaming trace records. +- `L2-DES-TOOL-001` defines tool lifecycle and result summaries. + +## Design Requirement + +The program should implement observability as a set of related but separate outputs: + +1. User-facing diagnostics for current status and troubleshooting. +2. Structured logs for maintainers and local debugging. +3. Trace-mode diagnostic records for high-detail model stream investigation. +4. Optional telemetry events only when the user enables telemetry. + +These outputs may be derived from the same runtime events, but they should not expose the same data by default. User-facing diagnostics must be concise and actionable. Structured logs should be machine-readable. Trace records may be more detailed and must be guarded by explicit privacy, secret-handling, and retention controls. 
+ +## Observability Planes + +### User-Facing Diagnostics + +User-facing diagnostics are safe projections that clients may show directly. + +Examples: + +- Current model and provider status. +- Current turn phase: model output, tool output, approval, question, persistence, or rendering. +- Token usage and context pressure. +- Tool name, status, timing, and result summary. +- Provider error phase and recovery hint. +- Whether a value is measured, estimated, unavailable, or redacted. + +User-facing diagnostics should avoid raw prompt text, raw credentials, full tool outputs by default, and provider-native error payloads that may contain secrets. + +### Structured Logs + +Structured logs are local diagnostic records emitted by the server and clients. + +Each structured log record should carry: + +- `timestamp` +- `level`: trace, debug, info, warn, or error. +- `component`: client, server, agent, model, provider, tool, persistence, protocol, or UI. +- `event_name` +- `phase` +- `status` +- `message` +- Correlation identifiers where available. +- Structured fields after redaction. +- `redaction_state` +- Optional `recovery_hint` + +Structured logs should be useful without relying on free-form text parsing. The `message` field is for humans; structured fields are for filtering and analysis. + +### Trace-Mode Records + +Trace-mode records are high-detail diagnostic records enabled only by explicit configuration or runtime option. + +Trace mode may record model stream event timing, ordering, completion state, event kinds, chunk sizes, normalized deltas, provider error metadata, and usage timing. Content-bearing stream deltas must follow the privacy and retention policy defined by `L2-DES-LLM-003`. + +Trace mode must not be required for normal operation. + +### Optional Telemetry + +Telemetry is outbound diagnostic data. It must be disabled unless the user enables it. + +Telemetry events must be redacted and aggregate-oriented. 
They should not include prompt content, response content, tool output content, API keys, local file contents, unredacted absolute local paths, or secrets. + +## Correlation Model + +Observability records should use stable correlation identifiers so a user or maintainer can connect related events across subsystems. + +Common identifiers: + +- `session_id` +- `turn_id` +- `item_id` +- `invocation_id` +- `tool_call_id` +- `approval_id` +- `question_id` +- `client_id` +- `subscription_id` +- `request_id` +- `trace_id` +- `workspace_change_set_id` + +Rules: + +- Every model invocation should have an `invocation_id`. +- Every tool call should have a `tool_call_id`. +- Every client request should have a request identifier from JSON-RPC or an explicit idempotency id. +- Logs for a single turn should be filterable by `session_id` and `turn_id`. +- Provider-stream trace records should be filterable by `invocation_id` and `trace_id`. + +## Log Levels + +The program should define log levels consistently: + +| Level | Intended Use | Content Policy | +|---|---|---| +| `error` | Terminal failures and failed operations requiring action. | Redacted structured error data and recovery hints. | +| `warn` | Degraded behavior, retries, skipped restore, unavailable provider fields. | Redacted summaries and phase data. | +| `info` | Important lifecycle transitions. | Safe identifiers, statuses, and summaries. | +| `debug` | Developer diagnostics for state transitions and decisions. | Redacted structured fields; no prompt or response content by default. | +| `trace` | High-detail timing and stream diagnostics. | May include sensitive trace records only when trace mode and retention policy allow it. | + +Changing log level should not change core behavior. It only changes what diagnostic records are emitted. + +## Diagnostic State Model + +The runtime should maintain a current diagnostic projection for active work.
+ +Conceptual active diagnostic fields: + +- `server_status` +- `active_session_id` +- `active_turn_id` +- `turn_status` +- `turn_phase` +- `waiting_reason`: model, tool, approval, question, persistence, rendering, or none. +- `current_model` +- `current_provider` +- `active_invocation_id` +- `running_tool_calls` +- `pending_approvals` +- `pending_questions` +- `usage_summary` +- `context_pressure` +- `last_error` +- `recovery_actions` + +This projection should be exposed to clients through existing protocol surfaces such as session snapshots, turn events, `execution.inspect`, `usage_updated`, `context_updated`, `tool_call_updated`, and `error_reported`. + +## Event Capture Points + +The server should emit observability records at the following points: + +- Client connection, initialization, subscription, reconnect, and disconnect. +- Turn admission, rejection, start, status change, completion, failure, interruption, and resume. +- Context assembly start, completion, compaction decision, and context-pressure update. +- Model resolution, model invocation start, stream start, stream completion, usage receipt, and provider failure. +- Tool validation, approval wait, execution start, progress update, completion, failure, and cancellation. +- Queue, steer, and message-edit state transitions. +- Persistence write, replay, recovery, and replay failure. +- Configuration changes that affect model, provider, log level, telemetry, or trace mode. + +The observability event should identify the phase and the affected object rather than only saying that something failed. + +## Client Visibility + +Clients should not infer runtime state from incomplete local UI state. The server should provide canonical diagnostic events and snapshots. + +Client-visible diagnostic behavior should include: + +- Showing whether a turn is waiting for model output, tool output, approval, question, or user input. +- Showing model usage and context pressure when available. 
+- Showing failed tool name, tool phase, and safe result summary. +- Showing provider error category such as authentication, network, rate limit, unavailable model, invalid request, or unknown. +- Showing recovery actions when known, such as update credentials, choose another model, retry later, reduce context, or inspect logs. + +Clients may provide local UI render diagnostics, but server-owned execution state remains authoritative. + +## Redaction And Privacy + +Observability must apply redaction before data crosses a boundary with lower trust or broader visibility. + +Rules: + +- API keys and credential material must never be logged in plaintext. +- Secret-looking environment variables, headers, provider request authorization fields, and tool outputs must be redacted. +- Prompt content, response content, and attachment content must not appear in normal logs. +- Trace-mode content logging must be explicitly enabled and governed by retention controls. +- User-facing diagnostics should prefer summaries, statuses, counts, and references over raw content. +- Redacted records should say that redaction occurred instead of silently omitting important context. + +## Retention + +The program should separate retention policy by data sensitivity: + +- User-facing session state follows session persistence rules. +- Structured logs follow configured local log retention. +- Trace-mode records follow stricter retention because they may include content-sensitive stream data. +- Telemetry follows telemetry opt-in and data minimization rules. + +Trace records should be easy to delete without corrupting durable session replay. They are diagnostic artifacts, not the source of truth for session state. + +## Error Diagnostics + +Errors should be classified by phase and recovery path. + +Conceptual error fields: + +- `error_id` +- `scope`: server, session, turn, invocation, tool, client, or persistence. 
+- `phase` +- `category` +- `message` +- `recoverable` +- `retryable` +- `retry_after` +- `provider_error_ref` where applicable. +- `tool_call_id` where applicable. +- `recovery_actions` +- `redaction_state` + +Examples: + +- Provider credentials invalid: category `authentication`, recovery action `update_provider_credentials`. +- Model unavailable: category `model_unavailable`, recovery action `choose_different_model`. +- Tool command timed out: category `tool_timeout`, recovery action `retry_or_interrupt`. +- Persistence write failed: category `persistence_failure`, recovery action `free_disk_space_or_check_permissions`. + +## Telemetry Boundary + +Telemetry is not required for local observability. Local logs and diagnostics must remain useful when telemetry is disabled. + +When telemetry is enabled, outbound telemetry should be limited to product-health events such as: + +- Feature usage counts. +- Error categories. +- Latency buckets. +- Provider class, not plaintext provider endpoint if sensitive. +- Tool category, not raw command text. + +Telemetry must not include model prompts, model responses, tool outputs, credentials, exact local file contents, or unredacted paths unless a later approved requirement explicitly allows them. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-APP-004 | 1 | specs/L1/L1-REQ-APP-004-observability.md | Defines cross-system observability architecture, log levels, diagnostics, trace mode, privacy, and telemetry boundaries. | +| related-to | L1-REQ-LLM-003 | 1 | specs/L1/L1-REQ-LLM-003-observability.md | Model usage and stream observability are specialized observability concerns. | +| related-to | L1-REQ-APP-011 | 1 | specs/L1/L1-REQ-APP-011-error-recovery.md | Actionable diagnostics drive recovery guidance. 
| +| related-to | L1-REQ-APP-012 | 1 | specs/L1/L1-REQ-APP-012-privacy-data-ownership.md | Observability records must obey privacy and redaction constraints. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Client/server events and snapshots expose safe diagnostics to clients. | +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | Execution phases provide the event capture points for observability. | +| related-to | L2-DES-LLM-003 | 1 | specs/L2/llm/L2-DES-LLM-003-model-usage-observability.md | Defines the model-specific usage and streaming trace data used by this architecture. | +| related-to | L2-DES-TOOL-001 | 1 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | Tool lifecycle events and result summaries feed user-facing diagnostics. | +| specified-by | TBD | TBD | specs/L3/app/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial cross-system observability architecture. | diff --git a/specs/L2/app/L2-DES-APP-005-config-toml-schema.md b/specs/L2/app/L2-DES-APP-005-config-toml-schema.md new file mode 100644 index 00000000..de3a8097 --- /dev/null +++ b/specs/L2/app/L2-DES-APP-005-config-toml-schema.md @@ -0,0 +1,578 @@ +--- +artifact_id: L2-DES-APP-005 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-APP-005 - Config TOML And Auth JSON Schema + +## Purpose + +Define the durable `config.toml` file shape used by user-scoped and project-scoped configuration, and the companion `auth.json` file shape used for credentials. 
+ +This design makes model/provider persistence, `/model` selection defaults, MCP server setup, skill discovery roots, and routine application preferences inspectable and mergeable without requiring clients to understand implementation-specific runtime state. It also keeps API keys and other credential material out of `config.toml`. + +## Background / Context + +`L2-DES-APP-002` defines where configuration files live and how project-scoped configuration takes precedence over user-scoped configuration. This document defines the TOML schema stored at those locations. + +The same schema is used for: + +- User-scoped configuration: + - Windows: `C:\Users\username\.devo\config.toml` + - Windows auth file: `C:\Users\username\.devo\auth.json` + - macOS and Linux: `~/.devo/config.toml` + - macOS and Linux auth file: `~/.devo/auth.json` +- Project-scoped configuration: + - `project_directory/.dev/config.toml` + - Project auth file: `project_directory/.dev/auth.json` + +The TOML schema stores durable preferences and durable setup records. It must not store transient session state, resolved runtime profiles, provider request payloads, routine client projections, API keys, or other secret values. + +The JSON auth schema stores credential material for the same source scopes. `config.toml` refers to auth entries by stable credential id. + +## Source Requirements + +- `L1-REQ-APP-010` requires persistent configuration, user/project file locations, project-over-user precedence, effective configuration inspection, and deterministic persistence targets. +- `L1-REQ-MODEL-001` requires persisted invocable model configuration. +- `L1-REQ-MODEL-002` requires persisted provider configuration and safe credential handling. +- `L1-REQ-MODEL-003` requires onboarding-created configuration to be restorable. +- `L1-REQ-TUI-010` requires onboarding to submit setup results for persistence. +- `L1-REQ-APP-008` requires MCP integrations to be user-configured and status-visible. 
+- `L1-REQ-APP-009` requires skills to be discoverable from configured sources with visible missing/unavailable states. +- `L1-REQ-APP-012` requires credential and privacy-safe projections. + +## Design Requirement + +Configuration should use a single TOML schema version with stable, keyed records. Credential material should use a companion `auth.json` schema version with stable credential ids. + +Keyed TOML tables are preferred for mergeable records because project-scoped configuration can replace, disable, or add individual records by stable identifier: + +- `[providers.<provider_id>]` +- `[model_bindings.<binding_id>]` +- `[mcp.servers.<server_id>]` +- `[skills.roots.<root_id>]` + +Arrays may be used only for ordered scalar settings where record-level override is not needed. + +The file should be readable and editable by users, but it is still a program-owned schema. The program should validate before persisting changes and should avoid inventing placeholder values when validation fails. + +`config.toml` is the durable configuration file. `auth.json` is the durable credential file. Environment variables and external secret stores are not the designed persistence mechanism for API keys or other credentials. + +## Top-Level Shape + +Every config file should declare the schema version: + +```toml +schema_version = 1 +``` + +Top-level sections: + +```text +[defaults] +[providers.<provider_id>] +[model_bindings.<binding_id>] +[tools.web_search] +[mcp.servers.<server_id>] +[skills] +[skills.roots.<root_id>] +[workspace.instructions] +[tui] +[telemetry] +[logging] +``` + +Sections may be absent. Missing sections mean no values are supplied by that source. + +Unknown top-level sections should be preserved on write when possible. Unknown keys under known sections should produce diagnostics unless they are under a documented extension namespace such as `[x.<name>]`. 
+ +## Complete Example + +```toml +schema_version = 1 + +[defaults] +model_binding = "gpt55-openrouter" +reasoning_effort = "medium" +mode = "default" +theme = "system" +permission_policy = "default" + +[providers.openrouter] +enabled = true +name = "OpenRouter" +base_url = "https://openrouter.ai/api/v1" +credential = "openrouter_api_key" + +[providers.openai] +enabled = true +name = "OpenAI" +base_url = "https://api.openai.com/v1" +credential = "openai_api_key" + +[model_bindings.gpt55-openrouter] +enabled = true +model_slug = "openai/gpt-5.5" +provider = "openrouter" +model_name = "openai/gpt-5.5" +invocation_method = "openai_responses" +default_reasoning_effort = "medium" + +[model_bindings.deepseek-v4-pro] +enabled = true +model_slug = "deepseek/deepseek-v4-pro" +provider = "openrouter" +model_name = "deepseek/deepseek-v4-pro" +invocation_method = "openai_chat_completions" +default_reasoning_effort = "high" + +[tools.web_search] +enabled = true +mode = "provider" +provider_search_binding = "gpt55-openrouter" + +[mcp.servers.github] +enabled = true +display_name = "GitHub" +transport = "stdio" +command = "github-mcp-server" +args = ["stdio"] +startup_policy = "lazy" +trust_policy = "user" +allowed_capabilities = ["tools", "resources"] +roots_policy = "workspace" + +[mcp.servers.github.env] +GITHUB_TOKEN = { credential = "github_token" } + +[skills] +enabled = true +model_catalog_enabled = true +auto_activate = false + +[skills.roots.user] +enabled = true +path = "~/.devo/skills" +trust_policy = "user" + +[skills.roots.interop] +enabled = true +path = "~/.agents/skills" +trust_policy = "user" + +[workspace.instructions] +fallback_filenames = ["CLAUDE.md", "PROMPT.md"] +max_bytes = 200000 + +[tui] +theme = "system" +vim_mode = false + +[telemetry] +enabled = false + +[logging] +level = "info" +``` + +Companion user-scoped `auth.json` example: + +```json +{ + "schema_version": 1, + "credentials": { + "openrouter_api_key": { + "kind": "api_key", + "value": 
"sk-or-example", + "created_at": "2026-05-25T00:00:00Z", + "updated_at": "2026-05-25T00:00:00Z" + }, + "openai_api_key": { + "kind": "api_key", + "value": "sk-example", + "created_at": "2026-05-25T00:00:00Z", + "updated_at": "2026-05-25T00:00:00Z" + }, + "github_token": { + "kind": "bearer_token", + "value": "ghp_example", + "created_at": "2026-05-25T00:00:00Z", + "updated_at": "2026-05-25T00:00:00Z" + } + } +} +``` + +## Defaults + +`[defaults]` stores durable default selections and user preferences. + +Fields: + +- `model_binding`: optional binding id from `[model_bindings]`. +- `reasoning_effort`: optional logical reasoning effort string for the default model selection. +- `mode`: optional default interaction mode id. +- `theme`: optional UI theme id. +- `permission_policy`: optional default permission posture. + +Rules: + +- `model_binding` must reference an enabled effective model binding. +- `reasoning_effort` must be allowed by the selected binding's supported model definition. +- Defaults are not active-session state. After the first user message, changing the active session model or reasoning effort does not rewrite `[defaults]` unless a workflow explicitly persists a default according to `L2-DES-APP-002`. +- If both a binding-level `default_reasoning_effort` and `[defaults].reasoning_effort` are present, `[defaults].reasoning_effort` is the default session selection for `[defaults].model_binding`. The binding-level value remains the default shown when that binding is selected outside the current default. + +## Permission Policy + +`[defaults].permission_policy` controls the default tool permission posture. + +Allowed values: + +- `default`: baseline permission behavior. Read-only inspection may proceed where otherwise allowed; mutating file operations, command execution, network access, external side effects, and privileged operations remain subject to normal validation, permission checks, sandbox checks, and user approval when required. 
+- `auto_review`: review-oriented permission behavior. The runtime should classify tool calls before execution and prefer automatic review/diagnostic feedback for risky or ambiguous operations. User approval is still required when the review cannot prove the action is within policy. +- `full_access`: broad permission behavior for trusted contexts. The runtime minimizes approval prompts for allowed tool calls, but it must still enforce validation, mode constraints, privacy rules, audit recording, and the configured sandbox. + +Rules: + +- The field name is `permission_policy`. +- `approve_policy` and `approval_policy` are not schema fields. If encountered, they should produce a migration or validation diagnostic rather than silently changing behavior. +- Permission policy does not define filesystem or network isolation. Sandbox policy is a separate execution restriction layer. +- `full_access` does not mean unrestricted host execution when sandbox restrictions are configured. + +## Sandbox Direction + +The durable sandbox schema is not finalized in this revision. The earlier placeholder value `sandbox = "workspace-write"` is intentionally not part of the current `config.toml` schema. + +The sandbox design target is to restrict the system calls available to tool execution processes, especially calls that open, read, write, create, delete, rename, or otherwise mutate filesystem objects. + +The sandbox should eventually control: + +- Directory read access. +- Directory write access. +- File creation, mutation, rename, and deletion. +- Process execution boundaries where supported. +- Network access, controlled at the domain level. + +Conceptual future policy dimensions: + +- Filesystem read roots and denied roots. +- Filesystem write roots and denied roots. +- Whether tool execution may spawn child processes. +- Whether network access is disabled, unrestricted, or restricted to allowed domains. +- Domain allowlists and denylists for network-capable tools and processes. 
+ +Sandbox policy is enforced by the host around tool execution. It is not a model-visible promise and not a substitute for permission policy, user approval, or tool validation. + +## Providers + +`[providers.<provider_id>]` stores reusable user-defined provider endpoints. + +Required fields for an enabled provider: + +- `enabled`: boolean. +- `name`: user-facing provider display name. +- `base_url`: provider API base URL. +- `credential`: credential id from effective `auth.json`. + +Optional fields: + +- `availability_status`: last known safe status such as `unknown`, `valid`, `auth_required`, or `unavailable`. +- `timeout_ms`: provider request timeout. +- `connect_timeout_ms`: provider connection timeout. + +Provider ids are stable program-generated identifiers. Changing `name` must not change the provider id. + +Provider records must not contain model-specific fields such as `model_name`, `model_slug`, `invocation_method`, or reasoning effort. + +## Credentials + +`config.toml` stores credential references. `auth.json` stores credential material. + +Provider records reference credentials by id: + +```toml +credential = "openrouter_api_key" +``` + +MCP process environment entries and HTTP auth entries also reference `auth.json` credentials by id: + +```toml +[mcp.servers.github.env] +GITHUB_TOKEN = { credential = "github_token" } + +[mcp.servers.linear] +auth = { credential = "linear_api_key", scheme = "bearer" } +``` + +`auth.json` has the following shape: + +```json +{ + "schema_version": 1, + "credentials": { + "openrouter_api_key": { + "kind": "api_key", + "value": "sk-or-example" + } + } +} +``` + +Rules: + +- `auth.json` is the only designed durable storage location for API keys and other credential material. +- Environment variables, OS keychains, external credential stores, and inline TOML secrets are not part of this design. +- Routine client projections must show credential status, not plaintext credential values. 
+- `config.toml` writes must not insert plaintext credential values. +- `auth.json` writes must update only the intended credential entries. +- Errors may name the provider, MCP server, and credential id, but must not print credential values by default. +- Project-scoped `auth.json` is valid only as an explicit user choice because the file may be shared with the project directory. +- When creating project-scoped `auth.json`, the program should make the persistence target visible and should recommend or apply project-local ignore behavior where supported by later implementation design. + +Auth records: + +- `kind`: `api_key`, `bearer_token`, or another program-known credential kind. +- `value`: secret value. +- `created_at`: optional timestamp. +- `updated_at`: optional timestamp. +- `description`: optional user-facing label. + +Credential ids are stable keys inside `auth.json`. Renaming a provider does not rename its credential id. + +## Model Bindings + +`[model_bindings.<binding_id>]` stores configured invocable models. + +Required fields for an enabled binding: + +- `enabled`: boolean. +- `model_slug`: canonical supported model slug. +- `provider`: provider id from `[providers]`. +- `model_name`: provider-specific model name used for API requests. +- `invocation_method`: program-known invocation method id. + +Optional fields: + +- `display_name`: client display override. +- `default_reasoning_effort`: logical reasoning effort selected by onboarding or default setup. +- `availability_status`: last known safe status. + +Allowed `invocation_method` values for the initial schema: + +- `openai_responses` +- `openai_chat_completions` +- `anthropic_messages` + +Rules: + +- `model_slug` must exist in the built-in supported model catalog. +- `provider` must reference an enabled effective provider. +- `invocation_method` must be supported by the program and valid for the provider/model combination. 
+- `default_reasoning_effort` must be absent when the supported model does not allow reasoning. +- `default_reasoning_effort` must be one of the supported model's logical effort values when present. +- The `/model` command's first list is populated from enabled effective model bindings. It may show the binding's model and provider together, but it must ask for reasoning effort as a separate step when the model supports reasoning. + +## MCP Servers + +`[mcp.servers.<server_id>]` stores configured MCP server connections. + +Common fields: + +- `enabled`: boolean. +- `display_name`: user-facing server name. +- `transport`: `stdio` or `http`. +- `startup_policy`: `eager`, `lazy`, or `manual`. +- `trust_policy`: `user`, `project`, or `untrusted`. +- `allowed_capabilities`: optional list containing `tools`, `resources`, `resource_templates`, `prompts`, `sampling`, or `elicitation`. +- `roots_policy`: `none`, `workspace`, or `configured`. + +Stdio fields: + +- `command` +- `args` +- `cwd` +- `[mcp.servers.<server_id>.env]` + +HTTP fields: + +- `base_url` +- `auth` + +Example HTTP credential reference: + +```toml +[mcp.servers.linear] +enabled = true +display_name = "Linear" +transport = "http" +base_url = "https://mcp.linear.app" +auth = { credential = "linear_api_key", scheme = "bearer" } +startup_policy = "lazy" +trust_policy = "user" +allowed_capabilities = ["tools", "resources"] +roots_policy = "none" +``` + +Rules: + +- Enabled stdio servers require `command`. +- Enabled HTTP servers require `base_url`. +- Secret-bearing process environment variables and HTTP auth values must reference `auth.json` credential ids. +- The runtime may inject an `auth.json` credential into a child process environment only for the configured server operation that requires it. That runtime injection does not make OS environment variables a credential persistence mechanism. 
+- Project-scoped MCP servers must be visible to the user before first use because they may start local processes or send workspace data to external services. + +## Skills + +`[skills]` controls global skill behavior for the configuration source. + +Fields: + +- `enabled`: whether skill discovery is enabled. +- `model_catalog_enabled`: whether a concise skill catalog may be offered to the model. +- `auto_activate`: whether model-selected skill activation is allowed without the user explicitly naming a skill. + +`[skills.roots.<root_id>]` stores skill discovery roots. + +Fields: + +- `enabled`: boolean. +- `path`: directory path. +- `trust_policy`: `user`, `workspace`, `plugin`, or `untrusted`. +- `max_depth`: optional scan depth for package discovery. + +Rules: + +- Skill roots are discovery roots, not instructions by themselves. +- Supporting files under a skill root are not loaded during configuration load. +- Workspace skill roots should be trust-visible before automatic activation. +- Duplicate skill names must be resolved deterministically or reported as conflicts by the skill catalog, not silently overwritten by configuration merge. + +## Tools + +Tool configuration should be grouped by tool family. + +Initial web search shape: + +```toml +[tools.web_search] +enabled = true +mode = "provider" +provider_search_binding = "gpt55-openrouter" +``` + +Rules: + +- `mode = "disabled"` means web search should be unavailable with a clear disabled-state message. +- `mode = "provider"` uses a cloud/provider-backed search path where the selected provider binding supports it. +- `mode = "local"` uses `local_provider` or later local search configuration. +- If the configured search path is invalid or unavailable, the runtime must report the configuration gap rather than fabricating results. + +## Workspace Instructions + +`[workspace.instructions]` stores project-instruction discovery preferences. 
+ +Fields: + +- `fallback_filenames`: ordered list of additional instruction filenames after native priority files. +- `max_bytes`: maximum assembled instruction bytes before truncation. + +Rules: + +- The native instruction priority remains owned by `L2-DES-WORKSPACE-001`. +- Additional filenames are configuration-driven compatibility fallbacks. +- Truncation must be visible to the user or diagnostic projection. + +## Record Merge Semantics + +The effective configuration is built from user config, then project config. + +For scalar settings: + +- Project value replaces user value. +- Missing project value leaves user value in effect. + +For keyed record collections: + +- Record identity is the TOML table key. +- A project record with the same key replaces the entire user record. +- A project record may disable a user record by setting only `enabled = false`. +- Other enabled replacement records must include all required fields for that record type. +- Project records with new keys are added to the effective set. +- User records with keys not mentioned by project config remain in the effective set. + +This avoids mixing a project provider endpoint with a user invocation method or model name by accident. + +Credential references resolve against effective auth data: + +- User `auth.json` provides the base credential set. +- Project `auth.json` overlays user `auth.json` by credential id. +- A project-scoped configuration record that references a user-scoped credential id is allowed only after the project-scoped configuration is visible to the user. +- Invalid or missing credential references produce actionable errors and must not silently fall back to another credential id. + +## Validation + +The program should validate configuration in two phases: + +1. Parse and schema validation for each source independently. +2. Effective configuration validation after precedence is applied. 
+ +Source validation catches malformed TOML, unsupported schema versions, wrong value types, missing required fields in enabled records, and unknown keys that cannot be preserved safely. + +Auth validation catches malformed JSON, unsupported auth schema versions, wrong value types, missing credential values, duplicate credential ids after source precedence, and credential kinds unsupported by the referring config field. + +Effective validation catches references to missing providers, disabled providers, missing model bindings, missing credentials, invalid supported model slugs, invalid invocation methods, unsupported reasoning efforts, invalid MCP transport combinations, and unavailable skill roots. + +Invalid higher-priority configuration must produce an actionable error instead of silently falling back to lower-priority values for the same setting. + +## Write Safety + +Configuration writes should be schema-aware: + +- Preserve unrelated sections and unknown extension sections where possible. +- Update only the target section or record. +- Never write placeholder model names, provider ids, invocation methods, or reasoning values after validation failure. +- Validate the full resulting file before replacing the previous file contents. +- Use an atomic write strategy in L3/implementation design. +- Report the target path and setting being changed without exposing plaintext credentials by default. + +Auth writes should be schema-aware: + +- Update `auth.json`, not `config.toml`, when only a credential value changes. +- Never log or show the prior or new credential value by default. +- Validate the resulting `auth.json` before replacing the previous file contents. +- Use an atomic write strategy in L3/implementation design. + +Workflows that create or modify providers, model bindings, MCP servers, skill roots, or credentials are persistence writes. Selecting an already-configured model binding for a running session is not a provider, binding, or credential rewrite. 
+ +When a setup flow writes both `config.toml` and `auth.json`, the program should avoid committing a final `config.toml` that references a credential id absent from the final `auth.json`. Exact two-file commit and recovery mechanics belong in L3 design. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-APP-010 | 1 | specs/L1/L1-REQ-APP-010-configuration.md | Defines the concrete `config.toml` and `auth.json` schemas for persistent configuration. | +| related-to | L1-REQ-MODEL-001 | 1 | specs/L1/L1-REQ-MODEL-001-config.md | Model bindings and defaults are persisted in `config.toml`, while credentials are referenced through `auth.json`. | +| related-to | L1-REQ-MODEL-002 | 1 | specs/L1/L1-REQ-MODEL-002-provider.md | User-defined providers are persisted in `config.toml`, with credential material in `auth.json`. | +| related-to | L1-REQ-MODEL-003 | 1 | specs/L1/L1-REQ-MODEL-003-onboard.md | Onboarding writes provider and model binding records into `config.toml` and credentials into `auth.json`. | +| related-to | L1-REQ-TUI-010 | 1 | specs/L1/L1-REQ-TUI-010-onboarding-ui.md | TUI onboarding collects values persisted by this schema. | +| related-to | L1-REQ-APP-008 | 1 | specs/L1/L1-REQ-APP-008-mcp.md | MCP servers are configured through `config.toml` and use `auth.json` credential references. | +| related-to | L1-REQ-APP-009 | 1 | specs/L1/L1-REQ-APP-009-skills.md | Skill roots and enablement are configured through `config.toml`. | +| related-to | L1-REQ-APP-012 | 1 | specs/L1/L1-REQ-APP-012-privacy-data-ownership.md | `auth.json` credential storage and redaction behavior protect secrets. | +| related-to | L2-DES-APP-002 | 1 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | Source precedence resolves this schema across user and project files. 
| +| related-to | L2-DES-MODEL-001 | 1 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | Provides concrete persistence fields for providers and model bindings. | +| related-to | L2-DES-MCP-001 | 1 | specs/L2/mcp/L2-DES-MCP-001-mcp-integration-architecture.md | Provides concrete persistence fields for MCP servers. | +| related-to | L2-DES-SKILLS-001 | 1 | specs/L2/skills/L2-DES-SKILLS-001-agent-skills-architecture.md | Provides concrete persistence fields for skill roots and enablement. | +| specified-by | TBD | TBD | specs/L3/app/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-25 | Assistant | Initial | Initial TOML schema design for durable user and project configuration. | +| 1 | 2026-05-25 | Human | Refinement | Moved API keys and other credential material into dedicated `auth.json` files and removed environment variables or external stores as the recommended credential persistence mechanism. | diff --git a/specs/L2/client/L2-DES-CLIENT-001-localization-readiness.md b/specs/L2/client/L2-DES-CLIENT-001-localization-readiness.md new file mode 100644 index 00000000..15dadbe1 --- /dev/null +++ b/specs/L2/client/L2-DES-CLIENT-001-localization-readiness.md @@ -0,0 +1,176 @@ +--- +artifact_id: L2-DES-CLIENT-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-23 +--- + +# L2-DES-CLIENT-001 — Localization Readiness + +## Purpose + +Refine client localization readiness into a technical design for Unicode-safe input, IME-safe editing, wide-character rendering, localized content preservation, and future UI string translation. + +## Background / Context + +The first product milestone may ship English UI strings, but clients must already handle multilingual user content. 
User prompts, file paths, provider messages, tool output, command output, and transcript records may contain Unicode and localized text. A client that corrupts non-ASCII input or CJK display width is not usable for many users even if the UI chrome remains English. + +## Source Requirements + +- `L1-REQ-CLIENT-001` requires client interfaces to accept and display Unicode input, support IME composition in supported environments, preserve non-ASCII paths/content, handle CJK and wide-character layout, and structure user-visible strings for future localization. +- `L1-REQ-TUI-001` requires the composer to preserve non-ASCII and IME input. +- `L1-REQ-TUI-003` requires readable transcript display. +- `L1-REQ-TUI-007` requires responsive layout and readability. +- `L2-DES-TUI-002` defines modern TUI layout constraints. +- `L2-DES-TUI-003` defines composer behavior. +- `L2-DES-TUI-004` defines transcript rendering. + +## Design Requirement + +Client interfaces should separate text content handling from UI string localization. + +The initial design must guarantee locale-safe handling for user and runtime content even before translated UI catalogs exist. Future localization should be enabled by structuring UI strings and formatting logic so translation can be added without rewriting workflows. + +## Text Categories + +The program should distinguish these text categories: + +| Category | Examples | Handling | +|---|---|---| +| User content | Prompts, pasted text, file mentions. | Preserve exactly as user-authored. | +| Runtime content | Tool output, provider responses, errors from external commands. | Preserve content, apply redaction and display bounds only where required. | +| UI chrome | Labels, buttons, status text, hints. | Centralize or structure for future translation. | +| Protocol identifiers | Method names, enum values, artifact ids. | Keep stable and not localized. | +| Diagnostic codes | Error codes, metric names, machine states. 
| Keep stable; localize accompanying messages later. | + +## Unicode Text Model + +Clients should use a Unicode-aware text model. + +Rules: + +- Store and transport text as UTF-8 or another explicitly Unicode-safe representation. +- Do not split visible text by raw bytes. +- Cursor movement should operate on grapheme clusters where editing is visible to users. +- Display width calculations should use terminal/display column width, not scalar count or byte length. +- CJK wide characters, emoji, combining marks, and zero-width joiner sequences should not corrupt cursor position or wrapping. +- Truncation should occur at grapheme boundaries and should account for display width. +- Redaction should preserve valid Unicode output. + +## IME Composition + +Clients that support editable text should treat IME composition as an input state. + +Rules: + +- In-progress composition text should not be submitted as final user input. +- Composition updates should not trigger command-prefix execution. +- The composer should preserve committed IME text exactly. +- Cursor and selection state should remain coherent after composition commits. +- If a terminal or environment cannot expose enough composition information, the client should degrade predictably and document the support limitation. + +## Terminal Width And Wrapping + +TUI rendering must account for display columns. + +Rules: + +- Wrapping should use display width rather than bytes. +- Cell borders, status labels, and truncation markers should not split wide characters. +- A line ending with a wide character should not overflow into the next region. +- Horizontal truncation should reserve display columns for an omission marker when used. +- Status lines should collapse optional metadata before truncating important mode or waiting-state labels. 
+ +Example: + +```text +Incorrect: +> 修复 parser 的 quote +| status overwrites the final wide char | + +Correct: +> 修复 parser 的 quote +-------------------------------------------------------------------------------- +ready Plan +``` + +## Non-ASCII Paths And Mentions + +File paths and mentions may contain non-ASCII text. + +Rules: + +- Display paths without lossy conversion. +- Preserve path text through client/server protocol calls. +- Avoid normalizing path strings in a way that changes filesystem meaning. +- When truncating long paths, keep enough leading or trailing context to identify the path. +- If a path cannot be displayed safely, show a safe escaped representation rather than corrupting layout. + +## UI String Structure + +The initial product may ship English UI strings only. Even so, client code should avoid scattering hard-coded user-facing strings through business logic. + +Recommended structure: + +- Keep user-facing strings near view or presentation layers. +- Use stable message keys or structured message constructors where practical. +- Keep diagnostic machine codes separate from human display text. +- Avoid concatenating translated fragments in ways that would block future localization. +- Allow labels to expand in future languages without breaking layout assumptions. +- Use formatted values through typed placeholders rather than manual string splicing. + +Example conceptual structure: + +```text +message_key: tui.status.waiting_for_tool +values: + tool_name: cargo test + elapsed: 00:04 +fallback_en: Waiting for tool: cargo test 00:04 +``` + +## Locale-Safe Diagnostics + +Diagnostic display should remain clear with localized external output. + +Rules: + +- Provider and tool output may be non-English and should be preserved. +- User-facing recovery hints may start as English but should be isolated for future translation. +- Error codes remain stable and not localized. +- Logs should preserve valid Unicode after redaction. 
+ +## Test Strategy + +L3 and implementation should include targeted tests for: + +- Non-ASCII composer input submission. +- IME committed text preservation where testable. +- CJK display width in composer, transcript, and status lines. +- Truncation at grapheme boundaries. +- Non-ASCII file path display. +- Tool output containing localized text. +- UI string structure that separates codes from display text. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-CLIENT-001 | 1 | specs/L1/L1-REQ-CLIENT-001-localization-readiness.md | Defines Unicode, IME, display-width, non-ASCII path, diagnostic, and future localization design. | +| related-to | L1-REQ-TUI-001 | 1 | specs/L1/L1-REQ-TUI-001-composer.md | Composer input must preserve Unicode and IME text. | +| related-to | L1-REQ-TUI-003 | 1 | specs/L1/L1-REQ-TUI-003-transcript.md | Transcript rendering must preserve localized and non-ASCII content. | +| related-to | L1-REQ-TUI-007 | 1 | specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md | Responsive layout depends on display-width aware rendering. | +| related-to | L2-DES-TUI-002 | 1 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | The shell layout must account for Unicode and localized text expansion. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Composer and input modes rely on Unicode-safe editing. | +| related-to | L2-DES-TUI-004 | 1 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | Transcript and streaming cells rely on Unicode-safe rendering. | +| specified-by | TBD | TBD | specs/L3/client/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial client localization-readiness design. 
| diff --git a/specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md b/specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md new file mode 100644 index 00000000..0159bb6c --- /dev/null +++ b/specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md @@ -0,0 +1,252 @@ +--- +artifact_id: L2-DES-CONTEXT-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-22 +--- + +# L2-DES-CONTEXT-001 — Context Assembly + +## Purpose + +Refine the context assembly step of the agent execution engine. Define how the immutable context prefix, metadata-derived instructions, interaction-mode prompt sets, and consolidated change-signal messages compose into model-visible context while preserving token efficiency and provider cache friendliness. + +## Background / Context + +`L2-DES-AGENT-001` defines context assembly as a phase of the execution engine but does not specify how metadata-derived content (persona, interaction mode, interrupt state) is ordered, deduplicated, or inserted relative to the user message. `L2-DES-CONV-001` defines session metadata fields including `persona`, `interaction_mode`, `instruction_set`, and `agent_mode`, but does not define the context assembly rules that translate those fields into model-visible content. + +Four L1 requirements converge on this problem: + +- Token efficiency (`L1-REQ-LLM-001`) requires a stable context prefix and append-only representation of runtime configuration changes. +- Persona (`L1-REQ-LLM-004`) requires persona instructions to influence model behavior without mutating the prefix. +- Plan Mode (`L1-REQ-AGENT-005`) requires mode-specific behavior and instruction sets that change during a session. +- Code Review (`L1-REQ-REVIEW-001`) requires a review-oriented instruction set that changes during a session. 
+ +The user has directed that Plan Mode and Review Mode share a single `interaction_mode` field with distinct prompt sets per mode, and that all pre-user-message signals (mode changes, persona changes, interruption) be consolidated into one message. + +## Source Requirements + +- `L1-REQ-LLM-001` requires stable context prefixes and append-only runtime configuration changes. +- `L1-REQ-LLM-004` requires persona instructions to influence model behavior. +- `L1-REQ-AGENT-005` requires Plan Mode as a session-local interaction mode with mode-specific behavior. +- `L1-REQ-GOAL-001` requires autonomous goal continuation without polluting the user-visible transcript. +- `L1-REQ-REVIEW-001` requires code review as a first-class workflow with review-specific behavior. +- `L1-REQ-CONTEXT-001` requires useful model context management across long-running sessions. +- `L1-REQ-AGENT-002` requires that interrupted work be visible as prior state when the user resumes. +- `L2-DES-CONV-001` defines the `persona`, `interaction_mode`, `agent_mode`, and `instruction_set` session metadata fields. +- `L2-DES-AGENT-001` defines the context assembly phase within the execution engine. +- `L2-DES-AGENT-002` defines interrupt and resume control, including the interrupt state used to assemble the resume signal. +- `L2-DES-GOAL-001` defines hidden goal context for Ralph Loop continuation. + +## Design Requirement + +The program should assemble model-visible context from four layers: + +1. **Immutable prefix**: Stable content that must not be rewritten in-place, including base instructions, tool definitions, and prior transcript turns. +2. **Metadata-derived content**: Persona instructions and interaction-mode instructions assembled from session metadata for every turn. +3. **Hidden goal context**: Active Ralph Loop goal context inserted only when goal continuation is eligible for the current turn. +4. 
**Consolidated change-signal message**: A single message inserted before the user input when persona, interaction mode, goal state, or interrupt state changed since the prior turn. + +These layers compose into the final model context. The immutable prefix preserves provider cache reuse. Metadata-derived content allows dynamic configuration without prefix mutation. Hidden goal context keeps Ralph Loop continuation out of the visible transcript. The consolidated signal avoids redundant messages while keeping the model informed of changed circumstances. + +## Interaction Mode + +The session carries an `interaction_mode` field that represents the current session-local interaction mode. Values are: + +| Value | Behavior | +|---|---| +| `normal` | Full agent capabilities. Question tool blocked. Mutating tools available subject to permission and safety policy. | +| `plan` | File mutation blocked. Question tool available. Agent produces strategic analysis and plans. Non-mutating inspection tools available. | +| `review` | File mutation blocked. Agent inspects code and produces prioritized findings. Code-location and reasoning required per finding. | + +`interaction_mode` is distinct from the session-level `agent_mode` field (Coding Mode, Security Mode) defined by `L2-DES-CONV-001`. The session-level agent mode is locked at session creation. `interaction_mode` may change during a session. + +Each `interaction_mode` value maps to a distinct prompt set that defines mode-specific instructions, constraints, and output expectations. The prompt set is metadata-owned and is not a user transcript item. + +The mode prompt set for `plan` should include: +- Prohibition on file creation, editing, deletion, renaming, or other mutation. +- Permission to read files, search the codebase, and inspect project context. +- Requirement to produce a strategic, actionable plan from user input and analysis. +- Permission to ask clarification questions through the question tool. 
+- Instruction that the output is a plan, not an implementation. + +The mode prompt set for `review` should include: +- Prohibition on file creation, editing, deletion, renaming, or other mutation. +- Requirement to inspect the relevant code, diff, branch, commit, or pull request context. +- Requirement to lead with findings ordered by severity. +- Requirement to include file or code location and risk reasoning per finding. +- Instruction to state clearly when no issues are found and identify remaining verification gaps. +- Instruction not to modify code unless the user explicitly requests fixes. + +The mode prompt set for `normal` should include: +- Standard agent capabilities without mode-specific additions. +- No question tool availability (blocked at the tool gate, not just instruction level). + +## Immutable Context Prefix + +The immutable context prefix is the portion of model-visible context that must not be rewritten in-place between turns. It includes: + +- Base instructions. +- Session-level agent mode instructions (e.g., Coding Mode or Security Mode base prompts). +- Tool definitions and schemas. +- Prior transcript turns and their items. +- Prior durable metadata-derived content that has already been sent to the model (see Change-Signal Rules below). + +The immutable prefix may grow as new instructions, tool definitions, or transcript turns are appended. It must not have existing content rewritten, reordered, or mutated when downstream configuration changes. + +When context compaction produces a summary, the summary replaces the compacted transcript range in future context snapshots. This is not a prefix mutation — it is a new context snapshot that begins after the stable prefix and references summaries instead of individual turns. The previously sent prefix content is not rewritten. + +## Metadata-Derived Content + +Each turn, the context assembler derives model-visible content from session metadata. 
This content is not a transcript turn and is not persisted as a user or assistant message in durable storage. + +The metadata-derived content includes: + +- **Persona instructions**: The instruction text associated with the current persona selection, such as concise style, detailed style, or other configured communication-style instructions. +- **Interaction-mode instructions**: The mode-specific prompt set for the current `interaction_mode` value. +- **Goal context**: The active Ralph Loop objective, status, budget, and progress summary when the current turn is eligible for goal continuation. + +These instructions are assembled from the `instruction_set` and related metadata fields defined by `L2-DES-CONV-001`. Persona and interaction-mode instructions are included in the model-visible context for every turn; goal context is included only on turns eligible for goal continuation. + +## Hidden Goal Context + +Hidden goal context is model-visible context derived from active goal state. It is not a user-visible transcript item. + +Rules: + +- It should be inserted only when the session has an active goal and the current turn is eligible for goal-guided execution. +- It should be suppressed in Plan Mode unless a later L3 design explicitly defines a read-only planning interaction with goals. +- It should include the untrusted user objective, budget state, progress, and completion-audit instructions. +- User-provided objective text must be escaped before being embedded in structured tags. +- The exact hidden goal context or a stable reference to it should be captured in the context snapshot or in a goal context snapshot record. + +## Consolidated Change-Signal Message + +When the state of persona, interaction mode, goal context, or interrupt condition changes between turns, the context assembler generates one consolidated change-signal message inserted before the user input. This message bundles all active changes into a single model-visible signal. 
+ +The change-signal message is generated when any of the following is true relative to the prior turn's context state: + +- `persona` has changed. +- `interaction_mode` has changed. +- Active goal state relevant to model-visible context has changed. +- The prior turn was interrupted by the user. + +If none of these changed, no change-signal message is generated. The metadata-derived persona, mode, and hidden goal context already reflect the current state where they are eligible. + +The change-signal message should be concise and factual: + +- State which persona is now active. +- State which interaction mode is now active. +- State that the active goal changed or was paused, resumed, completed, blocked, or stopped by budget, if applicable. +- State that the prior turn was interrupted, if applicable. + +Example shape: + +```text +[SYSTEM — change signal] + +The persona is now: concise. +The interaction mode is now: plan. +The active goal was paused by the user. +The previous turn was interrupted by the user. + +All subsequent responses should use the current persona, interaction mode, and goal state. +``` + +All active changes are stated in one message. The program must not emit separate messages for the persona change, the mode change, and the interruption. + +The change-signal message is metadata-derived content. It is not a transcript turn and is not persisted as a user or assistant message in durable storage. It is regenerated during context assembly when replaying or continuing a session. 
+ +### Ordering + +When a change-signal message is generated, the ordering within the model-visible context for the current turn is: + +```text +[Immutable prefix] +[Metadata-derived: persona instructions (current)] +[Metadata-derived: interaction-mode instructions (current)] +[Hidden goal context, if eligible] +[Consolidated change-signal message, if applicable] +[User input — the current turn's accepted user message] +``` + +The change-signal message appears after the metadata-derived instructions and any hidden goal context, and immediately before the user input. This ensures the model sees the current instructions, then the signal explaining what changed, then the user's request. + +### Interaction with Context Compaction + +When context compaction summarizes earlier turns, the change-signal message for those earlier turns may be summarized or omitted depending on whether the signal's information remains relevant. The current turn's context assembly is not affected by compaction of earlier change-signal messages — the assembler always generates the current turn's signal from current metadata. + +## Interrupt Signal + +When the prior turn was interrupted by the user, the change-signal message includes an interrupt statement. The statement should be factual and not imply the model's prior output was wrong. + +The interrupt signal does not carry the interrupted turn's partial content. The immutable prefix already includes the interrupted turn's durable records (partial assistant output, tool results, workspace change state). The signal only informs the model that the prior turn was interrupted. + +When a turn is interrupted and the user immediately submits a new message, the change-signal message bundles the interrupt notification with any concurrent persona or mode changes. If the user changes persona or mode before resubmitting, those changes appear in the same consolidated message. + +## Context Assembly Flow + +For each new turn, the context assembler should: + +1. 
Load the immutable prefix from the current context snapshot. +2. Detect whether persona, interaction mode, active goal context, or interrupt condition changed since the prior turn's assembled context. +3. Assemble the metadata-derived persona instructions from current session metadata. +4. Assemble the metadata-derived interaction-mode instructions from current session metadata. +5. Assemble hidden goal context when the current turn is eligible for goal-guided execution. +6. If a change is detected, generate one consolidated change-signal message. +7. Insert the accepted user input after the metadata-derived content, optional hidden goal context, and optional change signal. +8. Serialize the assembled context into provider-specific request messages (system, developer, user, assistant, tool messages). +9. Record a context snapshot reference for future turns. + +Step 8 is a provider-specific serialization concern and does not convert metadata-derived content, hidden goal context, or change-signal messages into transcript turns. The durable session record does not store the assembled context as transcript items. + +## Prefix Stability and Token Efficiency + +This design satisfies token-efficiency requirements because: + +- The immutable prefix is never rewritten in-place when configuration changes. +- Persona and mode changes are represented by appended metadata-derived content and an optional change-signal message, not by editing previously sent prefix content. +- Goal context is appended as hidden metadata-derived content when needed; it does not rewrite earlier transcript or instruction records. +- Interruption state is represented by the change-signal message, not by mutating the prior turn's durable records. +- The change-signal message is consolidated — one message regardless of the number of changes — avoiding redundant token consumption. +- When no metadata changes occur between turns, no change-signal message is generated at all. 
+ +Provider prefix caching should benefit from stable base instructions, tool definitions, and prior transcript content that remain byte-identical across turns. + +## Persona, Mode, and Review as Metadata + +This design treats persona, interaction mode, and review behavior as metadata-derived instructions, not as transcript items. This satisfies: + +- Persona changes influence model behavior without creating user-visible transcript turns. +- Plan Mode and Review Mode share the `interaction_mode` field with distinct prompt sets. +- Review findings and plan output are normal assistant response items within the transcript. The mode instructions that produce those findings are metadata-derived, not user-authored transcript content. +- Users can switch between normal, plan, and review modes within a session without creating synthetic transcript items just to represent the mode change. +- Active goal context can guide autonomous continuation without creating synthetic user transcript items. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-LLM-001 | 1 | specs/L1/L1-REQ-LLM-001-token-efficiency.md | Defines immutable prefix, append-only metadata changes, and consolidated change-signal for cache-friendly context. | +| refines | L1-REQ-LLM-004 | 1 | specs/L1/L1-REQ-LLM-004-persona.md | Defines persona as metadata-derived instruction with append-only change signaling. | +| refines | L1-REQ-AGENT-005 | 1 | specs/L1/L1-REQ-AGENT-005-plan-mode.md | Defines plan as an interaction_mode value with a mode-specific prompt set and consolidated change signal. | +| related-to | L1-REQ-GOAL-001 | 1 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | Defines hidden goal context as model-visible metadata-derived content rather than a user-visible transcript turn. 
| +| refines | L1-REQ-REVIEW-001 | 1 | specs/L1/L1-REQ-REVIEW-001-code-review.md | Defines review as an interaction_mode value with a mode-specific prompt set sharing the mode field. | +| related-to | L1-REQ-CONTEXT-001 | 1 | specs/L1/L1-REQ-CONTEXT-001-management.md | Context assembly produces the model-visible context managed by the context management system. | +| related-to | L1-REQ-AGENT-002 | 1 | specs/L1/L1-REQ-AGENT-002-interrupt-resume.md | Interrupt state informs the consolidated change-signal message before the next user input. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Session metadata fields (persona, interaction_mode, instruction_set) provide the source data for context assembly. | +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | Refines the context assembly phase of the execution engine. | +| related-to | L2-DES-AGENT-002 | 1 | specs/L2/agent/L2-DES-AGENT-002-interrupt-resume-control.md | Interrupt state feeds the consolidated change-signal when resuming after interruption. | +| related-to | L2-DES-GOAL-001 | 1 | specs/L2/goal/L2-DES-GOAL-001-ralph-loop-goals.md | Defines goal context content, eligibility, and persistence expectations. | +| specified-by | TBD | TBD | specs/L3/context/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial context assembly design refining token efficiency, persona, plan mode, and code review into immutable prefix, metadata-derived content, and consolidated change-signal message. | +| 1 | 2026-05-23 | Human | Refinement | Added hidden Ralph Loop goal context as metadata-derived model-visible content. 
| diff --git a/specs/L2/context/L2-DES-CONTEXT-002-context-compaction.md b/specs/L2/context/L2-DES-CONTEXT-002-context-compaction.md new file mode 100644 index 00000000..587c77fe --- /dev/null +++ b/specs/L2/context/L2-DES-CONTEXT-002-context-compaction.md @@ -0,0 +1,235 @@ +--- +artifact_id: L2-DES-CONTEXT-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-CONTEXT-002 — Context Compaction + +## Purpose + +Define how the program detects context pressure, selects eligible history for summarization, produces a durable compaction summary, and updates active context references so future model invocations use the compacted representation while the full transcript remains available for user review. + +## Background / Context + +`L2-DES-CONV-001` defines durable `context_compaction_started` and `context_compaction_completed` records, and states that compaction creates a summary record referenced by later context snapshots instead of older detailed transcript records. `L2-DES-AGENT-001` places compaction as an optional step before the primary model request when context pressure requires it. `L2-DES-CONTEXT-001` defines how metadata-derived content and change-signal messages compose into context, and notes that compacted change-signal messages from earlier turns may be summarized or omitted. + +This document defines when compaction triggers, what eligible history looks like, what the summary must preserve, how the active context snapshot changes after compaction, and how non-compacted transcript records remain available for user review. + +## Source Requirements + +- `L1-REQ-CONTEXT-003` requires compaction when context usage reaches a configured threshold, with summaries preserving task continuity, recent turns remaining uncompressed, and raw history remaining recoverable. +- `L1-REQ-CONTEXT-001` requires useful model context management across long-running sessions. 
+- `L1-REQ-LLM-001` requires stable context prefixes and append-only handling of configuration changes. +- `L2-DES-CONV-001` defines durable compaction records, context snapshots, and summary item references. +- `L2-DES-AGENT-001` defines the execution engine phase where compaction occurs. +- `L2-DES-CONTEXT-001` defines the immutable prefix and metadata-derived content that must remain valid after compaction. +- `L2-DES-TUI-004` defines transcript-area rendering for compaction lifecycle notices. + +## Design Requirement + +The program should compact older transcript history into a summary when the estimated model-visible context approaches the effective context limit. Compaction should produce a durable summary record that preserves task continuity (objectives, decisions, changed files, blockers, verification status) while allowing the full raw transcript to remain available outside the model context. The active context snapshot should reference the summary instead of individual compacted turns for future invocations. + +Compaction is append-only from the durable storage perspective. It creates new summary records and updated context snapshots. It does not delete, mutate, or rewrite existing transcript turns. + +## Trigger Condition + +Compaction should be considered before a model invocation when the estimated token count of the assembled context approaches the model's effective context limit. + +Conceptual trigger fields: + +- `current_token_estimate`: the token estimate for the current context snapshot before the pending invocation. +- `effective_context_limit`: the program-safe context window for the selected model, accounting for response budget where known. +- `compaction_threshold`: a ratio or byte count at which compaction is triggered, such as 80 % of the effective limit. +- `reserved_recent_turns`: the minimum number of most recent turns to preserve uncompressed, or a token budget reserved for recent uncompressed content. 
+ +The trigger should evaluate before each model invocation where context assembly has produced a token estimate. If the estimate exceeds the threshold, compaction should run before the model call proceeds. This threshold-driven path is **automatic compaction**. + +The user may also request compaction explicitly through a client command such as `/compact`. This user-requested path is **manual compaction**. Manual and automatic compaction share eligibility, summary, durable-recording, and context-snapshot behavior, but they differ in the user-visible started notice. + +Compaction may be skipped when: + +- There is insufficient eligible history to compact (only recent turns exist, or a prior compaction already summarized older history). +- Compaction would not produce meaningful token savings (the eligible range is too small). +- The current invocation is within a tool-call loop where the model is still completing tool work, and re-compacting the same prefix would add latency without new information. In this case, compaction may be deferred until the next user-initiated turn. +- A prior compaction for the same eligible range is still in progress or recently completed. + +When compaction is skipped despite threshold pressure, the program should log or record the reason so future context assembly decisions are auditable. + +## Eligibility + +Context is divided into two conceptual regions for compaction: + +1. **Preserved recent range**: The most recent N turns (or a token budget reserved for recent content) that are not compacted. These turns remain as full transcript items in the active context. +2. **Compaction-eligible range**: Older turns that are candidates for summarization. + +Eligibility rules: + +- Turns already summarized by a prior compaction are not eligible for re-compaction unless the summary itself has become stale and a new compaction of the same range would materially improve context quality. 
+- The current turn's user input and any in-progress tool work must not be compacted. +- Steer messages, queue items, and other active-turn records associated with uncompressed turns should be preserved with their parent turns or included in the summary where meaningful. +- A turn that was interrupted may be compacted, but the summary should note that work was interrupted. +- A turn that was superseded by message editing may be compacted; the summary should reflect the replacement branch content rather than the superseded turn. +- Fork-inherited history segments are eligible for compaction the same way as native transcript turns. + +## Summary Content + +The compaction summary is a durable record that replaces eligible transcript detail in future context snapshots. It is not a transcript item and does not appear in the user-visible conversation history as a user or assistant message. + +The summary must preserve enough information for the model to continue work without the compacted raw detail. Required summary content: + +- **Current objectives**: What the user asked for and what remains to be done, including any explicit task goals from the plan tool. +- **Key decisions**: Architectural choices, design tradeoffs, selected approaches, and rationale where recorded in the compacted turns. +- **Changed files**: Files created, modified, deleted, or renamed, grouped by turn where attribution is clear. Include enough path and change-kind information for the model to understand workspace state. +- **Blockers and unresolved work**: Any work that was blocked, deferred, interrupted, or requires follow-up. +- **Verification status**: Tests written, tests run, verification outcomes, and remaining test gaps where recorded in the compacted turns. +- **Error context**: Persistent errors, provider failures, or tool failures that may affect future work, with enough detail for the model to avoid repeating the same failure. 
+ +The summary may also include: + +- Persona and mode changes that occurred during the compacted range, if those changes are not already captured by the current metadata-derived content. +- Notable tool outputs that constitute durable task state (e.g., a resolved approval, a completed plan item, a confirmed file path). +- A compacted representation of earlier change-signal messages, reduced to the fact that a change occurred rather than the full signal text. + +The summary should omit: + +- Transient assistant reasoning or exploration that did not lead to decisions or changes. +- Redundant tool output, repeated search results, or low-value content that carries no durable task state. +- Full verbatim assistant responses unless the response text itself records a decision or finding that cannot be derived from other summary fields. + +## Compaction Flow + +Conceptual compaction flow: + +```text +Context assembly detects token estimate exceeds threshold + ↓ +Identify compaction-eligible turn range and preserved recent range + ↓ +Record durable context_compaction_started + ↓ +Emit transcript-area `Manual Compaction Started` or `Automatically Compaction Started` notice + ↓ +Extract summary content from eligible turns + ↓ +Build summary record (objectives, decisions, changed files, blockers, verification, errors) + ↓ +Record durable context_compaction_completed with summary reference + ↓ +Emit transcript-area `Compaction Done` notice + ↓ +Create updated context snapshot referencing summary plus preserved recent turns + ↓ +Proceed with model invocation using compacted context +``` + +Compaction must complete before the invocation that detected the threshold proceeds. If compaction fails, the program should record the failure and either proceed with uncompressed context (accepting provider-limit risk) or fail the turn with a structured context-overflow error. 
+ +## Durable Recording + +Compaction produces these durable records through `L2-DES-CONV-001`: + +- `context_compaction_started`: identifies the compaction event, the session, the trigger source (`manual` or `automatic`), the triggering invocation or command where applicable, the eligible turn range, the preserved recent range, and the compaction strategy. +- `context_compaction_completed`: references the compaction event, the produced summary record, the compacted turn range, the token estimate before and after compaction, and the new context snapshot reference. + +The summary record itself is a durable context record, not a transcript item. It is stored as a content-addressable or identified record referenced by the context snapshot. + +Conceptual summary record fields: + +- `summary_id` +- `session_id` +- `compaction_event_id` +- `trigger_source`: manual or automatic. +- `compacted_turn_range`: first and last turn id in the compacted range. +- `preserved_recent_range`: first and last turn id in the preserved range, for traceability. +- `objectives` +- `decisions` +- `changed_files`: a structured list of paths, change kinds, and source turns. +- `blockers_and_unresolved` +- `verification_status` +- `error_context` +- `notable_state_changes`: persona, mode, or other metadata changes during the compacted range. +- `created_at` +- `content_hash` + +## Active Context After Compaction + +The active context snapshot after compaction should reference: + +```text +[Immutable prefix — same as before compaction] +[Metadata-derived: persona instructions (current)] +[Metadata-derived: interaction-mode instructions (current)] +[Summary record — replacing compacted eligible turns] +[Preserved recent turns — uncompressed, as full transcript items] +[Consolidated change-signal message, if applicable] +[User input — current turn] +``` + +The immutable prefix is unchanged by compaction. 
The summary record is appended as a new context reference, not as an in-place mutation of earlier prefix content. The preserved recent turns remain as direct transcript references. This satisfies the token-efficiency requirement for stable prefixes: compaction does not rewrite earlier context bytes; it produces a new context snapshot that references different records. + +When compaction runs again later, the prior summary becomes part of the compaction-eligible range. The new summary should incorporate the prior summary's preserved content so task continuity is not lost across multiple compactions. + +## Replay and Recovery + +After replay, the program must be able to: + +- Reconstruct the compacted context snapshot from durable compaction records and summary records. +- Identify which transcript turns were compacted into which summary. +- Display the full raw transcript to the user for review, even when the active model context uses the summary. +- Detect whether a compaction event was recorded without a corresponding completion record (indicating a compaction that crashed) and either resume or restart compaction during the next context assembly. + +A crash during compaction should leave durable records in a recoverable state: either the pre-compaction context snapshot remains valid and the incomplete compaction event can be discarded, or the compaction event is restartable from the durable `context_compaction_started` record. + +## User Visibility + +The full transcript remains available for user review regardless of compaction. Compaction affects only the model-visible context, not the user-visible conversation history. + +The program should make compaction visible to the user through: + +- A context status indicator showing the current token estimate relative to the effective limit. +- Transcript-area lifecycle notices with exact labels: + - `Manual Compaction Started` when compaction was requested by the user. 
+ - `Automatically Compaction Started` when compaction was triggered by context pressure. + - `Compaction Done` when compaction completes successfully. +- An indication of how many turns were compacted and how many remain uncompressed. +- The ability to inspect a summary record to understand what was preserved from compacted history. + +The transcript-area notices are user-visible status cells. They are not user, assistant, or model-visible transcript messages, and they do not expose the summary content inline. Replay should be able to reconstruct the notices from durable compaction records. + +The user should not be required to approve compaction for it to proceed during normal operation. Compaction is a context-management operation, not a user-prompted workflow. + +## Invariants + +- Compaction is triggered before model invocation, not during or after. +- Compaction creates new durable records and updated context snapshots; it never deletes or mutates existing transcript turns. +- The immutable prefix is not rewritten by compaction. +- Recent turns remain uncompressed; only older eligible history is compacted. +- A compaction summary must preserve objectives, decisions, changed files, blockers, verification status, and error context. +- The full transcript remains available for user review outside the model context. +- Compaction failures must not leave the session in an unrecoverable context state. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---:|---|---|---| +| refines | L1-REQ-CONTEXT-003 | 1 | specs/L1/L1-REQ-CONTEXT-003-compress.md | Defines compaction triggers, eligibility, summary content, durable recording, and context snapshot updates. | +| related-to | L1-REQ-CONTEXT-001 | 1 | specs/L1/L1-REQ-CONTEXT-001-management.md | Compaction is the primary mechanism for managing context growth across long sessions. 
| +| related-to | L1-REQ-LLM-001 | 1 | specs/L1/L1-REQ-LLM-001-token-efficiency.md | Compaction uses append-only summary records to avoid prefix mutation while reducing token usage. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Defines durable compaction records, summary records, and context snapshot structure. | +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | Compaction runs before model invocation within the execution engine's context assembly phase. | +| related-to | L2-DES-CONTEXT-001 | 1 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | Compaction updates active context snapshots while preserving the immutable prefix and metadata-derived content structure. | +| specified-by | TBD | TBD | specs/L3/context/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial context compaction design covering triggers, eligibility, summary content, durable recording, active context update, and replay recovery. | +| 1 | 2026-05-25 | Human | Refinement | Added exact transcript-area lifecycle notices for manual and automatic compaction. 
| + diff --git a/specs/L2/context/L2-DES-CONTEXT-003-context-normalization.md b/specs/L2/context/L2-DES-CONTEXT-003-context-normalization.md new file mode 100644 index 00000000..72968b27 --- /dev/null +++ b/specs/L2/context/L2-DES-CONTEXT-003-context-normalization.md @@ -0,0 +1,265 @@ +--- +artifact_id: L2-DES-CONTEXT-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-22 +--- + +# L2-DES-CONTEXT-003 — Context Normalization + +## Purpose + +Define how the program normalizes assembled model context before provider serialization, bounding individual item sizes, preserving tool-call pairing integrity, filtering unsupported modalities against the current model's capabilities, and ensuring the total context fits within the model's effective token budget. + +## Background / Context + +`L2-DES-CONTEXT-001` defines context assembly — how instructions, metadata, transcript items, and change-signal messages compose into model-visible context. `L2-DES-CONTEXT-002` defines compaction — how older history is summarized when the context grows too large. Neither defines what happens when individual items are oversized, modality-incompatible, or when the total assembled context still exceeds the model's limits after compaction. + +Normalization is the final safety pass between context assembly and provider serialization. It ensures that the context sent to any model is well-formed, bounded, modality-compatible, and deterministic. + +## Source Requirements + +- `L1-REQ-CONTEXT-002` requires item size bounding, visible truncation, tool-call pairing integrity, modality filtering against the current model, and model-switching safety. +- `L1-REQ-CONTEXT-001` requires useful model context that stays within limits. +- `L1-REQ-LLM-001` requires token-efficient context construction. +- `L1-REQ-MODEL-001` requires model capability metadata used for modality filtering. 
+- `L1-REQ-INPUT-001` requires multimodal content parts as first-class context. +- `L2-DES-CONTEXT-001` defines the assembled context that normalization receives as input. +- `L2-DES-CONTEXT-002` defines compaction, which runs before normalization. +- `L2-DES-MODEL-001` defines the resolved model profile used for modality compatibility checks. +- `L2-DES-AGENT-001` defines the execution engine phase where normalization runs. + +## Design Requirement + +The program should normalize assembled context immediately before each model invocation. Normalization runs after context assembly and compaction, and before provider-specific serialization into request messages. + +Normalization applies three passes in order: + +1. **Modality filter**: Remove or convert content parts unsupported by the current model. +2. **Item size bound**: Truncate individual oversized items with visible truncation indicators. +3. **Token-budget bound**: If the total estimated token count still exceeds the model's effective budget, apply a budget-aware reduction that preserves the most recent and most important content. + +Normalization is deterministic: given the same assembled context and the same resolved model profile, it must produce the same normalized output. + +## Execution Phase + +Normalization runs between compaction and provider serialization in the execution engine's flow: + +```text +Context assembly (per L2-DES-CONTEXT-001) + ↓ +Compaction check (per L2-DES-CONTEXT-002) + ↓ +Normalization (this design) ← runs here + ↓ +Provider serialization (system/developer/user/assistant/tool messages) + ↓ +Model invocation +``` + +Normalization runs before every model invocation, including within tool-call loops where context carries new tool results. When the user switches models mid-session, normalization runs with the new model's capabilities. + +Normalization does not mutate durable transcript records. 
It produces a normalized projection of the assembled context for the current invocation only. The durable transcript and context snapshot remain unmodified. + +## Pass 1 — Modality Filter + +The modality filter removes or converts content parts that are incompatible with the current model's supported modalities. + +### Filtering Rules + +1. Resolve the current model's supported modalities from the `ResolvedModelProfile` → `SupportedModelDefinition.modalities`. +2. For each content part in the assembled context, check whether the part's modality is in the supported set. +3. If the modality is supported, pass the content part through unchanged. +4. If the modality is not supported: + - If a conversion to a supported modality exists and is enabled, convert the content part. + - Otherwise, remove the content part from the normalized output. +5. If removing a content part results in an item having no remaining content parts, insert a placeholder text part describing what was removed and why. The placeholder must identify the removed modality and the count of removed parts, but must not include the removed binary payload. + +### Modality Categories + +| Modality | Content Part Kind | Filter Behavior When Unsupported | +|---|---|---| +| `text` | `text` | Always supported. Pass through. | +| `image` | `image_ref` | Remove unless convertible. | +| `audio` | `audio_ref` | Remove unless convertible. | +| `video` | `video_ref` | Remove unless convertible. | +| `tool_call_json` | `tool_call_json` | Always supported (structural, not modality). Pass through. | +| `tool_result_text` | `tool_result_text` | Always supported. Pass through. | +| `provider_metadata` | `provider_metadata` | Remove (provider-specific and not model-visible). | + +Multimodal artifact references (`image_ref`, `audio_ref`, `video_ref`) that point to binary content should not have the binary payload included in the normalized context when the modality is unsupported. 
The content part is removed from the normalized output and a placeholder note is inserted. + +### Model-Switching Safety + +When the user switches models between turns, normalization must re-evaluate modality compatibility against the new model. Content parts that were valid for the previous model may become invalid and must be filtered. Content parts that were previously filtered may become valid with the new model and should be restored (since normalization reads from durable content parts, which are unchanged by previous filtering passes). + +### Placeholder Format + +When content parts are removed, the placeholder text should be concise: + +```text +[Unsupported content omitted: 2 image(s), 1 video(s). The current model supports text only.] +``` + +The placeholder must not include image data, URLs that resolve to the removed content, or other information that should remain in the durable record but outside the model request. + +## Pass 2 — Item Size Bound + +After modality filtering, individual items are bounded to prevent any single item from consuming a disproportionate share of the context window. 
+ +### Per-Item-Type Limits + +Different item types have different size policies because their content serves different purposes: + +| Item Kind | Limit Strategy | Default Limit | Truncation Note | +|---|---|---|---| +| `user_input` | Truncate tail | High (generous, near context-window fraction) | `[User message truncated: N bytes omitted]` | +| `assistant_text` | Truncate tail | Medium | `[Response truncated: N bytes omitted]` | +| `assistant_reasoning` | Truncate tail | Medium | `[Reasoning truncated: N bytes omitted]` | +| `tool_call` | Must fit entirely | Call arguments are bounded | Tool calls that exceed limit are marked invalid; the model must not see partial tool-call JSON | +| `tool_result` | Truncate tail | Medium | `[Tool output truncated: N bytes omitted from result of {tool_name}]` | +| `error` | Preserve entirely | Full | Errors are not truncated; they carry diagnostic value | +| `steer_message` | Truncate tail | Medium | `[Steer message truncated: N bytes omitted]` | +| `context_summary` | Truncate tail | Medium | `[Summary truncated: N bytes omitted]` | +| `approval_request` | Preserve entirely | Full | Approval context must be intact for user decisions | +| `question_request` | Preserve entirely | Full | Question context must be intact for user understanding | + +Limits are configurable per item kind. The default values should be sensible fractions of typical context windows. + +### Truncation Behavior + +By default, truncation removes the tail (end) of the content; content at the beginning is preserved because it typically carries more context-establishing information. Tool results are the one exception: for very large tool outputs where the relevant result is at the end, the program may offer a configurable truncation strategy (head, tail, or head-and-tail). + +Truncation indicators must be: +- Injected as structured text within the truncated content part. +- Visible to both the model and the user (via client display of normalized context). 
+- Distinct from the original content so replay does not confuse the indicator with authentic tool output. + +Truncation must not: +- Break the structural integrity of tool-call JSON. +- Remove the tool_call_id or tool_name from a tool result (pairing anchors are preserved). +- Remove error codes, error messages, or recovery hints from error items. + +### Tool Call Integrity + +A tool call from the model must arrive at normalization as a complete, valid JSON structure. If a tool call's serialized arguments exceed the item limit, the tool call must not be partially included — it is marked invalid and replaced with an error note: + +```text +[Tool call to {tool_name} omitted: arguments exceed the size limit.] +``` + +The corresponding tool result must also be omitted to preserve pairing. Orphaned tool calls (without results) and orphaned tool results (without their initiating call) must not appear in the normalized context. + +When a tool-call/tool-result pair spans a compaction boundary (the call was in compacted history but the result is recent), the result must still be paired. If the compacted summary does not include the tool call, the result should be treated as a standalone item with a note that its originating call was summarized. + +## Pass 3 — Token-Budget Bound + +After modality filtering and item size bounding, the total estimated token count may still exceed the model's effective context budget. The token-budget pass applies a budget-aware reduction. + +### Token Estimation + +The program maintains a token estimate for each content part and for the assembled context as a whole. Estimation may use: +- Provider-reported token counts from previous invocations. +- Character-based heuristics (e.g., characters ÷ N for the provider family). +- Cached token counts from prior normalization passes. + +The estimate must be conservative enough that normalization does not produce context that the provider then rejects. It does not need to be byte-exact. 
+ +### Budget-Aware Truncation + +If the total estimated tokens exceed the effective context budget: + +1. Identify the oldest items in the context that are eligible for further reduction. +2. Apply progressive truncation from oldest to newest: + - First, further truncate tool results in the eligible range. + - Second, further truncate assistant reasoning in the eligible range. + - Third, further truncate assistant text in the eligible range. + - Fourth, truncate user input in the eligible range. +3. Stop when the estimated token count fits within the budget. +4. Instruction content (system instructions, mode instructions, persona instructions, project instruction files) must not be truncated by the token-budget pass. Compaction is the mechanism for reducing instruction-level content. +5. The most recent user input and any in-progress tool work must not be truncated by the token-budget pass. + +If truncation cannot bring the context within budget without removing required instruction or current-turn content, the program must produce a structured error indicating that the context cannot be normalized for the current model. + +### Visibility + +When the token-budget pass truncates content, the program must: +- Emit a `context_updated` event so clients can show the updated token estimate. +- Mark the truncated items in client projections so the user can see what was reduced. +- Log the before-and-after token estimates for debugging. + +## Per-Item-Type Policies + +Different item types carry different types of content. Normalization should apply type-appropriate strategies: + +| Item Kind | Modality Check | Size Bound | Token-Budget Eligible | Notes | +|---|---|---|---|---| +| User input | Check attachments | High limit | Last resort (oldest first) | Preserve user intent. Attachments filtered by modality. | +| Assistant text | Text only | Medium limit | Yes (oldest first) | Preserve decisions and findings. 
| +| Assistant reasoning | Text only | Medium limit | Yes (oldest first) | Reduce before assistant text. | +| Tool call | Structural only | Must fit entirely | Yes (as part of pair) | Cannot be partially included. | +| Tool result | Text only | Medium limit | Yes (first candidate) | Preserve tool_call_id pairing. | +| Error | Text only | Full | No | Diagnostic value preserved. | +| Context summary | Text only | Medium limit | No | Already compacted; do not re-truncate. | +| Instructions | Text only | Full | No | Must not be truncated by normalization. | +| Change-signal | Text only | Full | No | Already minimal; do not truncate. | + +## Interaction With Compaction + +Compaction runs before normalization. Compaction reduces the number of transcript items by summarizing older history. Normalization handles what remains: +- Individual oversized items that survived compaction. +- Modality filtering for the current model. +- Token-budget overflow after compaction. + +Compaction is the primary mechanism for reducing long-history context. Normalization is the safety net for individual item size, modality compatibility, and total budget overflow. They do not overlap: compaction does not filter modalities; normalization does not summarize history. + +A compaction summary is a context item of kind `context_summary`. It receives the same normalization treatment as other items: modality filtering (text only, always supported), size bounding (medium limit), and exemption from token-budget truncation (already compacted). + +## Determinism + +Normalization must be deterministic for the same inputs: +- Same assembled context (identical items, content parts, and ordering). +- Same resolved model profile (same model slug, same modality set, same effective context window). + +Determinism enables: +- Reproducible debugging: when a model call fails with a context error, the normalized context can be reconstructed. 
+- Client-side context inspection: the client can request a projection of the normalized context and receive a consistent result. +- Replay verification: replaying a turn from durable records should produce the same normalized context for the same model. + +External factors such as provider-reported token counts from prior invocations may affect the token estimate, making the budget-bound pass dependent on invocation history. This is acceptable; the estimate is input to normalization, and normalization with the same estimate must produce the same output. + +## Invariants + +- Normalization does not mutate durable transcript records or context snapshots. +- Tool calls and their results must remain paired. Orphaned calls or results must not reach the model. +- Truncation must be visibly indicated; silent data loss is not allowed. +- Unsupported modality payloads must not be sent to model providers. +- Instruction content must not be truncated by normalization. +- Normalization produces deterministic output for the same assembled context and model profile. +- When the user switches models, normalization re-evaluates against the new model's capabilities. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---:|---|---|---| +| refines | L1-REQ-CONTEXT-002 | 1 | specs/L1/L1-REQ-CONTEXT-002-normalize.md | Defines the concrete normalization pipeline: modality filter, item size bounds, tool pairing integrity, token-budget reduction, and model-switching safety. | +| related-to | L1-REQ-CONTEXT-001 | 1 | specs/L1/L1-REQ-CONTEXT-001-management.md | Normalization ensures context is well-formed and bounded before model invocation. | +| related-to | L1-REQ-LLM-001 | 1 | specs/L1/L1-REQ-LLM-001-token-efficiency.md | Token-budget pass and size bounding prevent wasted tokens from oversized items. 
| +| related-to | L1-REQ-MODEL-001 | 1 | specs/L1/L1-REQ-MODEL-001-config.md | Model capability metadata drives modality filtering decisions. | +| related-to | L1-REQ-INPUT-001 | 1 | specs/L1/L1-REQ-INPUT-001-attachments-and-multimodal.md | Multimodal content parts are subject to modality filtering. | +| related-to | L2-DES-CONTEXT-001 | 1 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | Normalization receives the assembled context as input. | +| related-to | L2-DES-CONTEXT-002 | 1 | specs/L2/context/L2-DES-CONTEXT-002-context-compaction.md | Normalization runs after compaction and handles items that survive compaction. | +| related-to | L2-DES-MODEL-001 | 1 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | The resolved model profile provides modality capabilities and context-window limits. | +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | Normalization runs as a phase of the execution engine between compaction and provider serialization. | +| specified-by | TBD | TBD | specs/L3/context/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial context normalization design covering three-pass pipeline, per-item-type policies, modality filtering, tool pairing integrity, token-budget bound, and determinism. 
| + diff --git a/specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md b/specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md new file mode 100644 index 00000000..c667e6ff --- /dev/null +++ b/specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md @@ -0,0 +1,735 @@ +--- +artifact_id: L2-DES-CONV-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-CONV-001 — Session JSONL Data Model + +## Purpose + +Refine session, transcript, turn, item, and active-context requirements into a durable append-only data model suitable for server processing, client rendering, and crash recovery. + +## Background / Context + +The program needs session data in both server logic and client interfaces such as the TUI. Session data must also be persisted. JSONL is the preferred durable storage shape because each line can represent an append-only event or snapshot, which supports crash recovery and preserves stable context prefixes for token efficiency. + +A session has two major conceptual regions: + +- Metadata: durable session state, configuration references, workspace identity, active mode, persona, token usage, and active-context state. +- Transcript: user-visible and auditable conversational history made from turns and items. + +The full transcript can grow without bound from the user's perspective. The active model context is finite and should reference transcript records rather than duplicating them. + +## Source Requirements + +- `L1-REQ-CONV-001` requires durable session lifecycle behavior. +- `L1-REQ-CONV-002` requires auditable turn lifecycle behavior. +- `L1-REQ-CONV-003` requires durable and visible `steer` and `queue` handling during active turns. +- `L1-REQ-CONV-004` requires session forking from a specific turn and fork traceability. +- `L1-REQ-CONV-005` requires append-only editing of the immediately preceding eligible user-authored message. 
+- `L1-REQ-AGENT-001` requires durable enough execution history for review and recovery. +- `L1-REQ-AGENT-002` requires completed steps, outputs, and file-change state to be preserved after interruption and resume. +- `L1-REQ-AGENT-003` requires visible task planning with status updates. +- `L1-REQ-CHANGE-001` requires rollback and recovery behavior for file changes. +- `L1-REQ-EDIT-001` requires file edits to be reviewable and recoverable. +- `L1-REQ-GOAL-001` requires durable Ralph Loop goal state across turns and recoverable session resumes. +- `L1-REQ-GIT-001` constrains git-oriented change management. +- `L1-REQ-APP-012` requires data ownership controls for user-visible stored data and model-visible context. +- `L1-REQ-MEM-001` defines persistent memory as core-maintained internal state. +- `L1-REQ-CONTEXT-001` requires useful model context management across long-running sessions. +- `L1-REQ-CONTEXT-003` requires context compression when context approaches model limits. +- `L1-REQ-INPUT-001` requires attachments and multimodal input as first-class task context. +- `L1-REQ-TUI-003` requires transcript review and audit behavior. +- `L1-REQ-LLM-001` requires token-efficient stable context construction. +- `L1-REQ-MODEL-001` and `L2-DES-MODEL-001` define model and model-binding references used by session metadata. +- `L1-REQ-TOOL-002` and `L2-DES-TOOL-001` define built-in tool and plan-tool records. + +## Design Requirement + +The durable session file should be an append-only JSONL log. The log should contain versioned records that can rebuild: + +- Session metadata. +- Transcript turns and items. +- Coalesced streaming content segments. +- Usage deltas and totals. +- Active context snapshots. +- Context compression outputs. + +The server may materialize indexed or cached runtime state from the JSONL log, but the JSONL log remains the durable source of truth. + +## Event Planes + +The program should distinguish three event planes: + +1. 
Provider/core events: high-frequency runtime events emitted while an LLM provider streams a response. +2. Server-client events: live protocol events sent to connected clients for rendering and interaction. +3. Durable JSONL events: compact replay records persisted to session storage. + +These planes may carry related information, but they should not share the same event granularity. + +Provider/core events may include fine-grained deltas such as reasoning text deltas, assistant text deltas, and partial tool-call argument deltas. These events are useful for runtime orchestration and live rendering, but they are too frequent to persist directly as one JSONL record per provider delta. + +Server-client events may include live content updates for responsiveness. The server may coalesce or throttle these updates before sending them to clients. + +Durable JSONL events should use replay-friendly records that preserve session state without storing every provider streaming delta. + +## Durable Record Categories + +Conceptual JSONL record categories: + +- `session_created` +- `session_forked` +- `session_metadata_updated` +- `session_deleted` +- `plan_created` +- `plan_updated` +- `goal_created` +- `goal_replaced` +- `goal_status_changed` +- `goal_budget_accounted` +- `goal_progress_recorded` +- `goal_context_snapshot_recorded` +- `goal_cleared` +- `turn_started` +- `turn_interrupt_requested` +- `turn_completed` +- `turn_failed` +- `turn_interrupted` +- `turn_resume_started` +- `turn_superseded` +- `item_started` +- `item_content_appended` +- `item_completed` +- `item_failed` +- `message_edit_recorded` +- `turn_workspace_checkpoint_recorded` +- `turn_workspace_change_recorded` +- `turn_workspace_diff_recorded` +- `turn_workspace_restore_started` +- `turn_workspace_restore_completed` +- `steer_recorded` +- `queue_item_recorded` +- `queue_item_resolved` +- `usage_recorded` +- `memory_link_recorded` +- `context_snapshot_recorded` +- `context_compaction_started` +- 
`context_compaction_completed` + +Each record should carry a schema version and enough identity data to be replayed independently. + +`item_content_appended` is not a separate transcript item. It is an append operation against one logical item created by `item_started`. Replay folds all append operations into the same item content. + +The append operation should identify the target `item_id`, content part, append offset, content kind, and appended content or content reference. Append boundaries should be chosen to balance crash recovery and storage efficiency, such as by byte threshold, time threshold, semantic boundary, or provider event boundary. + +## Session Metadata + +Session metadata describes current session state and references durable configuration. It is not a transcript turn. + +Conceptual fields: + +- `session_id` +- `workspace_root` +- `active_model_binding` +- `canonical_model_slug` +- `reasoning_effort` +- `permission_profile` +- `agent_mode` +- `persona` +- `instruction_set` +- `latest_context` +- `usage_totals` +- `last_invocation_usage` +- `active_plan` +- `active_goal` +- `created_at` +- `updated_at` + +`instruction_set` includes base instructions, system instructions, active mode instructions, persona instructions, and other high-priority model-visible instruction sources. These instructions belong to metadata or context assembly state, not to user transcript turns. + +When the model request is built, the request assembler may serialize metadata-derived instructions into provider-specific system or developer messages. That serialization does not make those instructions transcript turns. + +## Transcript Structure + +A transcript is an ordered set of turns. A turn is an execution cycle that begins with user-submitted input and ends as completed, failed, or interrupted. 
+ +Conceptual turn fields: + +- `turn_id` +- `session` +- `status` +- `started_at` +- `completed_at` +- `item_refs` +- `usage_delta` +- `superseded_by_edit` +- `workspace_checkpoint_refs` +- `workspace_restore_refs` +- `interrupt_refs` +- `resume_of_turn_id` +- `resume_turn_refs` + +Conceptual item fields: + +- `item_id` +- `turn` +- `kind` +- `status` +- `role` +- `content_parts` +- `mentions` +- `visibility` +- `created_at` +- `completed_at` +- `revision_of` +- `superseded_by` + +Item kinds include: + +- `user_input` +- `assistant_text` +- `assistant_reasoning` +- `tool_call` +- `tool_result` +- `approval_request` +- `question_request` +- `steer_message` +- `queue_message` +- `error` +- `context_summary` + +System instructions, base instructions, and initial metadata-derived instructions are not `user_input` items and are not their own turns. + +## Tool Items + +Tool call and tool result items should preserve both structured tool-domain data and natural-language summaries that help later model decisions and user review. + +Conceptual tool call fields: + +- `tool_call_id` +- `tool_name` +- `command_description` +- `arguments` +- `arguments_preview` +- `approval_state` +- `safety_state` +- `created_at` + +Conceptual tool result fields: + +- `tool_call_id` +- `status` +- `result_summary` +- `structured_status` +- `canonical_content_ref` +- `display_content_ref` +- `redaction_state` +- `completed_at` + +`command_description` is the model-provided natural-language purpose for a shell or command execution tool invocation. It should be recorded with the tool call for command-like tools and should not be treated as a required field for every tool category. + +`result_summary` is a factual natural-language summary derived from observed tool output or structured status. It should coexist with exact structured fields such as exit code, HTTP status, process id, or changed-file counts. + +## Plan State + +Plan state is user-visible task state maintained by the plan tool. 
It is not private model reasoning and is not merely assistant prose. + +Plan records should be durable so replay can reconstruct the active plan after restart, reconnect, or session review. + +Conceptual plan fields: + +- `plan_id` +- `session` +- `created_turn_id` +- `updated_turn_id` +- `objective` +- `status`: active, completed, blocked, abandoned, or superseded. +- `item_refs` +- `created_at` +- `updated_at` + +Conceptual plan item fields: + +- `plan_item_id` +- `plan_id` +- `text` +- `status`: pending, in_progress, completed, blocked, or canceled. +- `details` +- `parent_item_id` +- `parallel_group_id` +- `source_turn_id` +- `updated_at` + +Replay should project at most one active plan by default unless a later requirement explicitly allows multiple concurrent plans. Historical, superseded, completed, and abandoned plans remain auditable. + +Plan state may be rendered by clients separately from transcript items. A plan update may also correspond to a tool call/result in the transcript, but the plan projection should come from durable plan records rather than parsing assistant text. + +## Goal State + +Goal state is durable session state for Ralph Loop autonomous work. It is not private model reasoning and is not merely assistant prose. + +Replay should project at most one current goal per session in the first milestone. Historical, replaced, completed, budget-limited, canceled, and cleared goals remain auditable. + +Conceptual goal fields: + +- `goal_id` +- `session` +- `objective` +- `status`: active, paused, blocked, complete, budget_limited, or canceled. +- `token_budget` +- `time_budget_seconds` +- `turn_budget` +- `tokens_used` +- `time_used_seconds` +- `turns_used` +- `progress_summary` +- `blocker_summary` +- `verification_summary` +- `created_at` +- `updated_at` + +Goal records should be append-only. 
Creating, replacing, pausing, resuming, blocking, completing, canceling, budget-limiting, clearing, or accounting usage for a goal should append a record rather than mutate earlier records. + +Hidden goal context used for a model invocation should be reconstructable through `goal_context_snapshot_recorded` or through a context snapshot that references the exact hidden goal context. This hidden context is model-visible but not rendered as a normal transcript turn. + +If SQLite or another indexed store is used to accelerate goal lookup, it is a projection of these durable records and must be rebuildable from the JSONL log. + +## Interrupted And Resumed Turns + +An interrupted turn remains a durable terminal turn. Partial assistant content, reasoning, tool calls, tool results, usage records, and workspace change-set records accepted before interruption must remain auditable. + +Conceptual interrupt record fields: + +- `interrupt_id` +- `session` +- `turn_id` +- `requested_by_client` +- `target_kind` +- `target_id` +- `interrupt_mode` +- `interrupt_status` +- `cleanup_state` +- `created_at` +- `resolved_at` + +Resuming an interrupted task should create a linked continuation turn rather than mutating the interrupted turn. The continuation turn should reference the interrupted turn with `resume_of_turn_id` or equivalent provenance. + +Conceptual resume record fields: + +- `resume_id` +- `session` +- `resume_of_turn_id` +- `resume_turn_id` +- `resume_mode` +- `resume_content_refs` +- `context_snapshot_id` +- `created_by_client` +- `created_at` + +Replay should preserve the interrupted turn and then project the resume turn as a later continuation. Active context assembly for the resumed turn may use the original user request, partial outputs, completed tool results, workspace change summaries, and user-provided resume instructions where safe.
+ +## Active Turn Messages + +`steer` and `queue` submissions are user-authored records, but they are not ordinary completed user turns when they are submitted during active work. + +Conceptual durable fields: + +- `message_id` +- `session` +- `active_turn` +- `submission_mode`: `steer` or `queue`. +- `content_parts` +- `mentions` +- `status` +- `created_at` +- `resolved_at` + +`steer` records are associated with the active turn they are intended to influence. `queue` records preserve user-visible order until they become a later turn, are canceled, or are otherwise resolved. + +Restored `steer` and `queue` records must remain distinguishable from already-executed transcript turns. + +## Immediate Previous Message Edits + +Editing a previous message should be represented as append-only revision data. The original user-authored message, original turn, and any original assistant/tool outputs remain durable records. + +Conceptual edit record fields: + +- `edit_id` +- `session` +- `target_message_id` +- `target_turn_id` +- `replacement_message_id` +- `replacement_turn_id` +- `edited_content_parts` +- `edited_mentions` +- `edit_mode` +- `created_by_client` +- `created_at` +- `edit_state` + +For a completed, failed, or interrupted latest turn, replay should produce: + +1. The original turn as a durable historical turn. +2. A `message_edit_recorded` event that links the original message to the replacement message. +3. Optional workspace restoration records for files changed by the superseded turn. +4. A `turn_superseded` event that marks the original turn as superseded in the current branch projection. +5. A replacement turn that uses the edited message as the current branch's user input. + +For a queued message that has not started, replay should fold the accepted edit into that queue item's effective content while retaining prior queue-message revisions for audit. 
+ +For an active running turn, replay must not reinterpret already-started model or tool execution as though the edited message had been used. The edit is accepted only if the runtime explicitly interrupts or otherwise records a safe transition. + +Client projections may show the edited message as the current branch content and collapse the superseded turn by default, but audit projections must be able to recover the original message and original turn. + +Active context assembly for future model invocations should use the replacement branch content. It should not include both original and edited user text as ordinary user intent unless an explicit audit or comparison task asks for that history. + +## Turn Workspace Change Tracking + +The program should record enough workspace change data during each turn to support later restoration if immediate message editing supersedes that turn. This change data is core-owned durable state, not client-owned rollback state. + +The checkpoint should capture the workspace baseline before the turn's first mutating file operation where possible, and should record the resulting post-turn file state for changed files after each structured mutation or at turn completion. In git workspaces, an implementation may maintain hidden per-turn tree snapshots or ghost commits so the previous checkpoint can act as the pre-turn restore source. + +The program should not rely on whole-workspace scanning as the primary change detection mechanism for every turn. Structured mutating tools should report exact file deltas as they complete, and the core should accumulate those deltas into a per-turn workspace change set. Broader filesystem snapshots or hidden git checkpoints may supplement this when structured deltas are insufficient, especially for shell-command changes. + +Client-visible diffs are projections of the core-owned change set. 
They may be emitted for review, display, or progress feedback, but they are not the authoritative restore mechanism. Restoration must use the durable per-turn change set, inverse operations, content snapshots, or internal checkpoints owned by the core. + +Conceptual turn checkpoint fields: + +- `checkpoint_id` +- `session` +- `turn_id` +- `workspace_root` +- `checkpoint_strategy`: structured tool inverse records, hidden git checkpoint, filesystem snapshot, or unsupported. +- `baseline_ref` +- `baseline_hash` +- `created_at` +- `tool_coverage` +- `unattributed_change_policy` + +Conceptual turn workspace change-set fields: + +- `change_set_id` +- `session` +- `turn_id` +- `checkpoint_id` +- `structured_tool_coverage` +- `shell_change_coverage` +- `file_change_refs` +- `display_diff_ref` +- `restore_data_ref` +- `change_set_status` +- `created_at` +- `updated_at` + +Conceptual per-file change fields: + +- `file_change_id` +- `turn_id` +- `tool_call_id` +- `tool_name` +- `path` +- `change_kind`: create, modify, delete, rename, or mode change. +- `pre_state_ref` +- `pre_state_hash` +- `post_state_ref` +- `post_state_hash` +- `inverse_ref` +- `display_diff_hunk_ref` +- `attribution_confidence` + +Known structured file-editing tools such as `write` and `apply_patch` should produce per-file change records with enough before/after state or inverse operation data to restore the pre-turn state. + +Shell commands may modify files outside structured file-editing tools. Those changes should be restorable only if a turn-level workspace checkpoint, filesystem snapshot, hidden git checkpoint, or other attribution mechanism captured them. Otherwise replay should mark them as unsupported or unattributed for automatic restoration. + +The program may store or emit a unified diff for a turn. 
A unified diff is a useful display artifact and may be sufficient for some manual review workflows, but it must not be the only durable source required for automatic restoration when richer before/after state, inverse operations, or checkpoint references are available. + +Conceptual restore result fields: + +- `restore_id` +- `session` +- `superseded_turn_id` +- `edit_id` +- `checkpoint_id` +- `restore_policy` +- `started_at` +- `completed_at` +- `file_results` + +Conceptual per-file restore result fields: + +- `path` +- `restore_status`: restored, skipped_current_state_kept, unsupported, failed, or not_needed. +- `reason` +- `expected_current_hash` +- `actual_current_hash` +- `restored_to_hash` +- `source_change_id` + +Restoration should be performed by the server/core before the replacement turn begins. Clients may request an allowed restore policy and display restore outcomes, but they should not be responsible for applying inverse patches or mutating the workspace. For each file, the program should restore the pre-turn state only when the current file state still matches the expected post-turn state or another explicitly safe predicate. If the file has diverged, replay should record `skipped_current_state_kept` and the replacement turn should proceed from the current file content for that path. + +A hidden git checkpoint or ghost commit can be used as an internal checkpoint strategy. It should be treated as a content-addressed workspace snapshot, not as a user-visible commit. It must not be published, staged as user work, or used to rewrite visible branch history unless the user explicitly requests that. If a git checkpoint would discard user edits made after the superseded turn, the default restore result for those files should be `skipped_current_state_kept`. + +Workspace restoration affects file state only. It does not undo external API effects, running process effects, network calls, published git commits, or other non-file side effects. 
Those effects remain auditable in the superseded turn. + +## Mentions + +User input items should include a dedicated `mentions` field. A mention records structured references detected or selected inside a user message, separate from the user-visible content text. + +Mention examples: + +- Skill reference. +- File or directory reference. +- MCP server, resource, or template reference. +- Tool or connector reference. +- Session, turn, or transcript reference. +- Image, pasted artifact, or attachment reference. + +Conceptual mention fields: + +- `mention_id` +- `kind` +- `display_text` +- `target` +- `source_range` +- `resolution_status` +- `visibility` + +`content_parts` represent what the user submitted or what the assistant/tool produced. `mentions` represent structured references extracted from or attached to that content. A pasted image may therefore appear as a multimodal content part and also have a mention record that tracks how the program resolved or referenced that artifact. + +## Content Parts + +Items should support multimodal content parts. + +Conceptual content part kinds: + +- `text` +- `image_ref` +- `file_ref` +- `audio_ref` +- `video_ref` +- `tool_call_json` +- `tool_result_text` +- `provider_metadata` + +Large binary artifacts should be stored outside inline JSONL content and referenced by stable artifact references. + +## Fork Origin And Inherited History + +A forked session should store durable origin metadata and a replayable inherited-history segment rather than copying the full parent transcript into every fork. + +The origin metadata explains where the fork came from. It is not the only content pointer for replay, because a parent session may later be deleted or unavailable. 
+ +Conceptual fork origin fields: + +- `parent_session_id` +- `fork_turn_id` +- `fork_created_at` +- `parent_display_label` +- `fork_turn_display_label` +- `fork_turn_digest` +- `origin_snapshot_hash` +- `parent_availability` + +Fork origin fields preserve provenance and user-facing labels. They are not guaranteed to remain dereferenceable links after parent deletion. + +Conceptual inherited-history segment fields: + +- `inherited_segment_id` +- `source_parent_session_id` +- `source_range` +- `storage_strategy`: protected shared segment, materialized fork segment, or protected retained source records. +- `record_refs` or `materialized_record_refs` +- `segment_hash` +- `availability_state` + +The inherited-history segment describes the parent transcript range that is visible and usable in the fork. A fork replay must be able to reconstruct that inherited transcript from the inherited segment without requiring the deleted parent session file to be opened. + +`source_parent_session_id` and `source_range` identify where the segment came from, but they are provenance keys after parent deletion. They must not be treated as the only way to load the inherited content. + +Parent deletion rules: + +- Deleting a parent session must not make surviving forked sessions unusable. +- If forked sessions still reference inherited source records, those records must remain in a protected shared segment, be materialized into the fork, or be protected by another explicit retention mechanism before the parent session is made inaccessible. +- A surviving fork must not rely only on `parent_session_id` plus `fork_turn_id` to recover inherited content after parent deletion. +- Deleting the parent session may make the parent session index entry and parent event file inaccessible, but it must not remove any inherited-history segment still required by a surviving fork. 
+- A forked session whose parent has been deleted should retain a fork indicator with `parent_availability` set to deleted or unavailable. +- Navigation to a deleted parent may fail, but inherited history visible in the fork must remain understandable. +- Hard purge of parent records referenced by surviving forks must be blocked unless the inherited segment is first materialized or moved to protected shared storage, or unless the user explicitly chooses cascade deletion of dependent forks where supported. + +## Internal Persistent Memory Links + +Persistent memory is stored outside the session transcript and is maintained by the core agent runtime. Durable session records may include internal provenance links where useful for replay, debugging, safety, or context-quality analysis. + +These links are not client-managed transcript items and are not part of the routine client projection. + +Conceptual memory link fields: + +- `memory_id` +- `source_session_id` +- `source_turn_id` +- `source_item_id` +- `derivation_event` +- `source_availability` + +When a session is deleted, the core may update, unlink, retain, or remove internal memory links according to internal memory policy. Session replay must not require clients to make per-memory decisions, and ordinary client projections should not expose memory-link records. + +## Active Context + +The active context object is distinct from the full transcript. It should reference transcript items, item ranges, summaries, instruction-set records, and artifact records rather than duplicating their full content. 
+ +Conceptual context snapshot fields: + +- `context_id` +- `session` +- `created_for_turn` +- `model_binding` +- `instruction_set_ref` +- `entries` +- `token_estimate` +- `immutable_prefix_hash` +- `created_at` + +Conceptual context entry kinds: + +- `instruction_ref` +- `transcript_item_ref` +- `transcript_range_ref` +- `context_summary_ref` +- `artifact_ref` + +When context approaches the model's effective context limit, compaction creates a summary item or summary record and a later context snapshot references that summary instead of all older detailed transcript records. The full transcript remains available for user review. + +Compaction lifecycle records should preserve whether compaction was triggered manually by the user or automatically by context pressure. Clients use that trigger source to render transcript-area status cells with `Manual Compaction Started`, `Automatically Compaction Started`, and `Compaction Done` while keeping the compaction summary itself as context state rather than ordinary user or assistant transcript text. + +## Token Usage + +Token usage should be recorded as per-invocation deltas and derived totals. + +Conceptual usage fields: + +- `input_tokens` +- `cached_input_tokens` +- `output_tokens` +- `reasoning_output_tokens` +- `cache_creation_input_tokens` +- `total_tokens` + +`reasoning_output_tokens` is an optional breakdown when a provider reports it. It should not be added a second time if the provider already includes it inside output or completion tokens. + +For context pressure, the active context token estimate should be tracked separately from billing or response totals. + +## Streaming Persistence + +Streaming responses should be persisted incrementally. 
+ +Conceptual streaming sequence: + +```text +turn_started +item_started +item_content_appended +item_content_appended +item_completed +usage_recorded +turn_completed +``` + +If the program exits unexpectedly, replay can recover completed content append operations and mark the incomplete turn or item as interrupted. + +The durable storage layer should not write one JSONL record for every provider SSE delta. It should buffer and append coalesced content operations often enough to preserve useful partial output after a crash while avoiding unbounded storage overhead. + +This design keeps a single logical response item. Multiple append records are storage operations used to rebuild the item after replay; they are not multiple response objects in the transcript or client projection. + +## Metadata Changes And Model Visibility + +Because JSONL is append-only, session metadata changes are represented by appended metadata events or snapshots rather than in-place mutation. + +Before each LLM invocation, the program should: + +1. Replay or load the current metadata state. +2. Detect metadata changes relevant to the next invocation. +3. Build active context from metadata and transcript references. +4. Serialize only the model-visible subset into provider request messages. + +Not every metadata change must become model-visible text. For example, token totals are runtime metadata. Persona, active mode, permission posture, workspace, and instruction-set changes may need model-visible representation depending on the current requirements. + +## Reference Representation Note + +This design uses ID-shaped fields such as `session_id`, `turn_id`, `item_id`, and `context_id` to make durable references explicit in diagrams and JSONL records. + +Implementation data structures do not need to store every relationship as a raw ID string after records are loaded. Server runtime code may hold resolved references, owned structs, indexes, arenas, handles, or other idiomatic structures. 
The requirement is that durable JSONL records preserve stable references and that runtime structures can be projected back to the durable reference model without ambiguity. + +## Server And Client Projections + +The server owns replay, mutation, context assembly, provider request construction, and persistence. + +Client projections should receive only the data needed for display and interaction, such as: + +- Session summary. +- Turn status. +- Visible transcript items. +- Token usage summary. +- Current model and reasoning display. +- Active plan state. +- Active goal state. +- Plan Mode or permission state display. +- Mention display and resolution status. + +Clients do not need full active-context internals unless they are explicitly showing context diagnostics. + +Clients also do not need persistent-memory internals. Persistent memory may affect model-visible context assembled by the core, but routine client projections should not expose memory records, memory-link records, or memory-change events. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-CONV-001 | 1 | specs/L1/L1-REQ-CONV-001-session-lifecycle.md | Defines the durable session data model used for lifecycle persistence. | +| refines | L1-REQ-CONV-002 | 1 | specs/L1/L1-REQ-CONV-002-turn-lifecycle.md | Defines turn and item data structures for auditable turns. | +| related-to | L1-REQ-AGENT-001 | 1 | specs/L1/L1-REQ-AGENT-001-execution-workflow.md | Durable records preserve execution workflow state. | +| related-to | L1-REQ-AGENT-002 | 1 | specs/L1/L1-REQ-AGENT-002-interrupt-resume.md | Durable records preserve interrupted and resumed turn state. | +| related-to | L1-REQ-AGENT-003 | 1 | specs/L1/L1-REQ-AGENT-003-task-planning.md | Durable plan records preserve visible task planning state. 
| +| related-to | L1-REQ-CONV-003 | 1 | specs/L1/L1-REQ-CONV-003-active-turn-message-handling.md | Defines durable steer and queue records. | +| refines | L1-REQ-CONV-004 | 1 | specs/L1/L1-REQ-CONV-004-session-forking.md | Defines fork references and deletion retention behavior. | +| refines | L1-REQ-CONV-005 | 1 | specs/L1/L1-REQ-CONV-005-immediate-message-editing.md | Defines append-only message edit records and replacement turn references. | +| related-to | L1-REQ-CHANGE-001 | 1 | specs/L1/L1-REQ-CHANGE-001-rollback-and-recovery.md | Defines core-owned workspace change-set and restoration records for superseded turns. | +| related-to | L1-REQ-EDIT-001 | 1 | specs/L1/L1-REQ-EDIT-001-file-editing-workflow.md | Structured tool file changes provide restoration inputs. | +| related-to | L1-REQ-GOAL-001 | 1 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | Durable goal records preserve objective, status, budget, progress, and hidden goal context snapshots for replay. | +| related-to | L1-REQ-GIT-001 | 1 | specs/L1/L1-REQ-GIT-001-change-management.md | Hidden git checkpoints may support turn restoration. | +| related-to | L1-REQ-TOOL-002 | 1 | specs/L1/L1-REQ-TOOL-002-tools.md | Durable records preserve built-in tool calls, results, and plan state. | +| related-to | L1-REQ-APP-002 | 1 | specs/L1/L1-REQ-APP-002-persistence.md | Defines JSONL replay and recovery records for durable conversation history. | +| related-to | L1-REQ-APP-012 | 1 | specs/L1/L1-REQ-APP-012-privacy-data-ownership.md | Defines privacy handling for model-visible user data. | +| related-to | L1-REQ-MEM-001 | 1 | specs/L1/L1-REQ-MEM-001-persistent-memory.md | Defines persistent memory as core-maintained internal state. | +| related-to | L1-REQ-CONTEXT-001 | 1 | specs/L1/L1-REQ-CONTEXT-001-management.md | Defines active context as references into transcript and metadata. 
| +| related-to | L1-REQ-CONTEXT-003 | 1 | specs/L1/L1-REQ-CONTEXT-003-compress.md | Defines compaction output as durable summary records referenced by active context. | +| related-to | L1-REQ-INPUT-001 | 1 | specs/L1/L1-REQ-INPUT-001-attachments-and-multimodal.md | Defines content parts and mentions for attachments and multimodal input. | +| related-to | L1-REQ-TUI-003 | 1 | specs/L1/L1-REQ-TUI-003-transcript.md | Defines transcript structures that clients render. | +| related-to | L1-REQ-LLM-001 | 1 | specs/L1/L1-REQ-LLM-001-token-efficiency.md | Preserves immutable context prefixes through append-only storage and context references. | +| related-to | L2-DES-MODEL-001 | 1 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | Session metadata references configured model bindings. | +| related-to | L2-DES-TOOL-001 | 1 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | Defines tool and plan records persisted by the session data model. | +| related-to | L2-DES-GOAL-001 | 1 | specs/L2/goal/L2-DES-GOAL-001-ralph-loop-goals.md | Defines the goal records and projection semantics persisted by the session data model. | +| specified-by | TBD | TBD | specs/L3/conv/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial session JSONL data model with metadata, transcript, mentions, active context references, streaming persistence, and reference representation note. | +| 1 | 2026-05-22 | Human | Refinement | Split provider, server-client, and durable event planes and replaced durable item deltas with coalesced content append operations. | +| 1 | 2026-05-22 | Human | Refinement | Clarified that durable content append records are operations on one logical item, not multiple item objects. 
| +| 1 | 2026-05-22 | Human | Refinement | Added steer and queue records, fork deletion retention behavior, and persistent memory provenance links. | +| 1 | 2026-05-22 | Human | Refinement | Added append-only immediate previous message edit records and replacement turn projection behavior. | +| 1 | 2026-05-22 | Human | Refinement | Clarified that fork origin keys may become non-dereferenceable after parent deletion and cannot be the fork's only inherited content pointer. | +| 1 | 2026-05-22 | Human | Refinement | Added turn workspace checkpoints and restore result records for superseded-turn file restoration. | +| 1 | 2026-05-22 | Human | Refinement | Reframed persistent memory links as internal core provenance outside routine client projections. | +| 1 | 2026-05-22 | Human | Refinement | Clarified that per-turn workspace change sets are core-owned restore data, while client-visible diffs are display projections. | +| 1 | 2026-05-22 | Human | Refinement | Added durable interrupt and resume records for execution engine recovery. | +| 1 | 2026-05-22 | Human | Refinement | Added durable plan records for plan-tool state and replay. | +| 1 | 2026-05-22 | Human | Refinement | Added durable command descriptions and natural-language result summaries. | +| 1 | 2026-05-23 | Human | Refinement | Added durable Ralph Loop goal records and active-goal projection fields. | +| 1 | 2026-05-25 | Human | Refinement | Added manual versus automatic compaction trigger provenance for transcript-area compaction notices. 
| diff --git a/specs/L2/goal/L2-DES-GOAL-001-ralph-loop-goals.md b/specs/L2/goal/L2-DES-GOAL-001-ralph-loop-goals.md new file mode 100644 index 00000000..76b55f81 --- /dev/null +++ b/specs/L2/goal/L2-DES-GOAL-001-ralph-loop-goals.md @@ -0,0 +1,428 @@ +--- +artifact_id: L2-DES-GOAL-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-GOAL-001 — Ralph Loop Goals + +## Purpose + +Refine the Ralph Loop goal requirement into a durable, optionally bounded, autonomous goal loop that can be controlled by the user, observed by clients, resumed after restart, and executed by the server without polluting the visible transcript with hidden continuation prompts. + +## Background / Context + +A normal chat turn is request-response: the user submits input, the program works, and the turn stops. A Ralph Loop goal changes that interaction model. The user sets a durable objective once, and the program continues across turns while the goal is active, verified incomplete, unpaused, and within any configured budget or stop policy. + +The goal feature must remain user-owned. The model may work toward the objective and report completion or blockers, but it must not silently rewrite the objective, expand the budget, or hide why the loop is continuing. + +## Source Requirements + +- `L1-REQ-GOAL-001` requires users to create, view, pause, resume, clear, and complete a bounded Ralph Loop goal. +- `L1-REQ-AGENT-001` requires the server-side execution workflow from accepted input through turn completion. +- `L1-REQ-AGENT-002` requires interrupt and resume control. +- `L1-REQ-AGENT-005` defines Plan Mode behavior that must remain distinct from autonomous execution. +- `L1-REQ-APP-002` requires durable persistence and recovery. +- `L1-REQ-CONV-001` requires durable session lifecycle behavior. +- `L1-REQ-CONV-002` requires observable turn lifecycle behavior. 
+- `L1-REQ-TUI-006` requires discoverable command invocation from the TUI. +- `L1-REQ-TUI-004` requires visible state for running, waiting, stopped, and failed work. +- `L1-REQ-LLM-003` requires model usage observability. +- `L2-DES-AGENT-001` defines the execution engine that runs goal continuation turns. +- `L2-DES-AGENT-002` defines interruption and resume control. +- `L2-DES-APP-003` defines the client-server protocol and multi-client event model. +- `L2-DES-CONV-001` defines JSONL as the durable session source of truth. +- `L2-DES-CONTEXT-001` defines context assembly and hidden metadata-derived context. +- `L2-DES-TUI-003` defines slash-command discovery and submission. + +## Design Requirement + +The program should support one current Ralph Loop goal per session in the first milestone. + +Historical goal records remain auditable. A new goal may replace a prior non-terminal goal only through an explicit user action. The replacement creates a new `goal_id`; it must not mutate the previous goal record in place. + +The goal loop should: + +- Persist the objective, status, optional budget, usage, progress, and blocker state. +- Inject hidden goal context into model invocations while the goal is active. +- Start autonomous continuation turns only when the session is idle and policy permits. +- Account token and time usage incrementally. +- Stop automatically when complete, blocked, paused, canceled, or limited by a configured budget or stop policy. +- Explain to the user why work continues or why it stopped. + +## State Model + +Conceptual statuses: + +| Status | User Label | Terminal | Meaning | +|---|---|---|---| +| `active` | pursuing | no | The program may continue autonomous work toward the objective. | +| `paused` | paused | no | The user stopped autonomous continuation without discarding the goal. | +| `blocked` | blocked | no | The program cannot make useful progress until user input or an external state change. 
| +| `complete` | completed | yes | The objective has been verified as satisfied. | +| `budget_limited` | budget-limited | yes | The configured budget or stop limit was reached before verified completion. | +| `canceled` | canceled | yes | The user ended the goal without marking it complete. | + +Terminal statuses are irreversible. A user who wants to continue after a terminal status should create a new goal, optionally seeded from the previous objective or progress summary. + +Allowed state transitions: + +```text +none + -> active + +active + -> paused + -> blocked + -> complete + -> budget_limited + -> canceled + +paused + -> active + -> canceled + +blocked + -> active + -> canceled + +complete + -> terminal + +budget_limited + -> terminal + +canceled + -> terminal +``` + +## User Sovereignty + +The user owns objective text, explicit optional budget, pause/resume, cancellation, clear, and replacement. + +Model-facing goal tools should be narrower than client or slash-command controls. The model may report: + +- `complete`, with evidence and verification summary. +- `blocked`, with blocker details and what input or external change is needed. + +The model must not be allowed to: + +- Change the objective. +- Increase or reset the budget. +- Clear or cancel the goal. +- Replace the active goal. +- Resume a user-paused goal. + +## Persistent Data Model + +Conceptual persistent goal fields: + +| Field | Purpose | +|---|---| +| `session_id` | Session that owns the current goal projection. | +| `goal_id` | Version identifier regenerated when the goal is replaced. | +| `objective` | User-provided objective and success condition. | +| `status` | Current goal status. | +| `token_budget` | Optional token budget. | +| `time_budget_seconds` | Optional wall-clock budget. | +| `turn_budget` | Optional continuation-turn budget. | +| `tokens_used` | Accounted non-cached input plus output tokens. | +| `time_used_seconds` | Accounted wall-clock runtime. 
| +| `turns_used` | Counted goal-driven turns. | +| `progress_summary` | Current concise progress state for display and continuation. | +| `blocker_summary` | Current blocker state when blocked. | +| `verification_summary` | Evidence used when marking complete. | +| `created_at` | Creation timestamp. | +| `updated_at` | Last state-change timestamp. | +| `completed_at` | Completion timestamp where applicable. | +| `expected_goal_id` | Optional optimistic-concurrency guard on mutation requests. | + +`goal_id` is not the same as `session_id`. A session has one current goal projection, but replacing that goal generates a new `goal_id` so stale updates can become no-ops. + +## JSONL Source Of Truth And SQLite Projection + +The session JSONL rollout remains the authoritative durable source of truth. Every goal state that must survive restart must be representable by append-only JSONL records. + +SQLite may be introduced as a rebuildable projection and operational index: + +- It may store the current goal projection for fast lookup. +- It may support indexed session lists and goal status filters. +- It may help perform atomic runtime accounting and optimistic concurrency checks. +- It must be rebuildable from JSONL rollout files. +- It must not be the only durable copy of objective, status, usage, budget, or progress state. + +If SQLite is used during live execution, a successful projection mutation must correspond to a durable goal event. The L3 design should define an append-and-project or outbox pattern so crashes cannot permanently create a SQLite-only goal state that replay cannot reconstruct. + +On recovery, replay of JSONL records is authoritative. Stale or missing SQLite rows should be regenerated from rollout files. + +## Durable JSONL Events + +Conceptual durable record kinds: + +| Record Kind | Purpose | +|---|---| +| `goal_created` | Creates the first current goal for a session. 
| +| `goal_replaced` | Ends the previous current goal and creates a new current goal with a new `goal_id`. | +| `goal_status_changed` | Records pause, resume, block, complete, cancel, or budget-limit transitions. | +| `goal_budget_accounted` | Records token, time, and turn deltas applied to the goal. | +| `goal_progress_recorded` | Records concise progress, blocker, or verification summaries. | +| `goal_context_snapshot_recorded` | Records or references the exact hidden goal context used for a model invocation. | +| `goal_cleared` | Removes the current goal projection from normal UI views while retaining audit records. | + +Goal events should include: + +- `schema_version` +- `session_id` +- `goal_id` +- `event_id` +- `expected_previous_goal_id` where mutation races matter +- `turn_id` or `invocation_id` where the event was produced by execution +- `created_by`: user, model_tool, system, or recovery +- `timestamp` + +`goal_context_snapshot_recorded` exists so replay can explain the model-visible hidden goal context used for a turn. It should store either the exact serialized hidden context or a content-addressed reference to it. + +## Runtime State + +Runtime goal state is transient and should not be mixed into durable session metadata unless it affects replay. + +Conceptual runtime fields: + +- Active `goal_id` loaded for the current turn. +- Per-turn token baseline. +- Per-turn wall-clock baseline. +- Last accounted token and time counters. +- Continuation semaphore or lock. +- Reserved active-turn slot for autonomous continuation. +- Whether budget-limit guidance has already been injected for the current turn. +- Whether the last autonomous continuation did no useful work. +- Pending continuation reason. + +Runtime state may be lost on restart. Recovery reconstructs durable goal state from JSONL and may resume autonomous continuation only after rechecking current session and goal conditions. 
+ +## Budget Accounting + +Budget accounting should happen incrementally rather than only at turn end. + +The first milestone does not assign a default token, time, or turn budget when the user creates a goal through `/goal <objective>`. When no explicit budget is configured, accounting still records usage for display, recovery, and future limits, but absence of a budget must not by itself transition the goal to `budget_limited`. + +Accounting points: + +- Turn start: capture baselines. +- Tool completion: account observed usage and time deltas. +- Model-facing goal tool completion: account usage and suppress redundant budget prompts where appropriate. +- Turn completion: account final usage and time deltas. +- Interruption or abort: account consumed usage before pausing or stopping. +- External goal mutation: best-effort account current usage before applying the mutation. + +Token accounting should use normalized model usage: + +```text +goal_token_delta = non_cached_input_tokens + output_tokens +``` + +Cached input tokens are excluded because they represent reused context rather than newly consumed context cost for the current goal. If a provider reports reasoning tokens separately, the model usage normalization layer must state whether they are already included in `output_tokens`. Goal accounting must not double-count reasoning tokens. + +Budget transitions should be atomic at the projection layer. A successful accounting operation should increment usage and switch `active` to `budget_limited` in one logical operation when a configured limit is reached. + +## Continuation Loop + +Autonomous continuation should run only when all preconditions are true: + +- Goals feature is enabled. +- The session has a current goal with status `active`. +- The session is not in Plan Mode. +- No turn is currently active. +- No queued user work has priority. +- No approval or question prompt is waiting. +- Budget permits another continuation.
+- Continuation is not currently suppressed because the previous autonomous continuation did no useful work. + +The continuation launch pattern should be: + +```text +pre-check + load candidate active goal + verify session is idle and eligible + +reserve + acquire continuation lock + reserve active-turn slot + +re-check + reload goal projection + verify same goal_id and status active + verify budget and mode still permit work + +launch + record goal context snapshot + create hidden continuation input + start continuation turn through the normal execution engine +``` + +The re-check is required because the user may pause, cancel, replace, or clear the goal between pre-check and launch. + +If an autonomous continuation ends without tool calls, verification, progress update, or useful assistant output, the server should suppress the next automatic continuation and report that the goal needs user input or review. This prevents empty loop cycles. + +## Hidden Goal Context + +Goal context should be injected into model requests as hidden context, not as a normal user-visible transcript item. + +Conceptual hidden context: + +```xml +<goal_context goal_id="..."> + <status>active</status> + <untrusted_objective>...</untrusted_objective> + <budget token_budget="..." time_budget_seconds="..." turn_budget="..." /> + <usage tokens_used="..." time_used_seconds="..." turns_used="..." /> + <progress_summary>...</progress_summary> + <instructions> + Continue working toward the active session goal. + Verify completion against the objective before reporting the goal complete. + Base decisions on current workspace evidence and tool results. + </instructions> +</goal_context> +``` + +Rules: + +- User-provided objective text must be XML-escaped. +- User-provided objective text should be placed inside an explicitly untrusted tag such as `untrusted_objective`. +- If no budget is configured, hidden goal context should state that no explicit budget is configured or omit limit fields; it must not fabricate a default budget. +- Hidden goal context must not render as an ordinary transcript turn. +- The context snapshot must be auditable through JSONL records or context snapshot references.
+- The hidden context may be serialized with provider-specific roles during request construction, but that serialization does not make it a transcript item. + +## Plan Mode Interaction + +Plan Mode and autonomous goal continuation are mutually exclusive execution modes. + +When Plan Mode is active: + +- The current goal may still be viewed by the user. +- The user may pause, cancel, clear, or inspect the goal. +- Autonomous continuation must not start. +- Goal hidden context should not be injected into Plan Mode turns unless a later L3 design explicitly defines a read-only planning interaction. +- Goal usage accounting should not charge Plan Mode exploratory turns to the autonomous goal budget unless the user explicitly asks the Plan Mode turn to operate on the goal. + +When the session returns to Build mode, the server may re-evaluate continuation eligibility. + +## Model-Facing Goal Tool + +The model-facing goal update tool should be deliberately narrow. + +Conceptual tool input: + +| Field | Purpose | +|---|---| +| `status` | Allowed values: `complete` or `blocked`. | +| `verification_summary` | Required when `status = complete`. | +| `blocker_summary` | Required when `status = blocked`. | +| `evidence_refs` | Optional references to tests, files, commands, or tool outputs. | +| `expected_goal_id` | Optimistic-concurrency guard supplied by hidden context. | + +Rules: + +- `complete` requires evidence that the objective is actually satisfied. +- `blocked` requires a concrete blocker and the needed user input or external change. +- If `expected_goal_id` does not match the current goal, the update should become a no-op. +- The tool cannot modify objective, budget, or user-owned controls. +- The tool result should be recorded as both a tool result and a goal status/progress event where applicable. 
+ +## Client And Protocol Surface + +Client requests should expose user-owned controls: + +| Method | Purpose | +|---|---| +| `goal.get` | Return the current goal projection for a session. | +| `goal.create` | Create or explicitly replace the current goal. | +| `goal.pause` | Pause autonomous continuation. | +| `goal.resume` | Resume a paused or blocked goal. | +| `goal.complete` | Let the user mark the goal complete. | +| `goal.cancel` | End the goal without completion. | +| `goal.clear` | Remove the current goal from normal UI views while retaining audit records. | + +Server notifications should include: + +| Notification/Event | Purpose | +|---|---| +| `goal.updated` | Broadcast canonical status, budget, progress, or blocker changes to subscribed clients. | +| `goal.continuationStarted` | Tell clients an autonomous continuation turn has started. | +| `goal.budgetLimited` | Tell clients the goal stopped because a configured budget was reached. | + +Goal protocol responses should be immediate. Long-running work caused by a resumed active goal should be reported through subsequent turn and goal events. + +## TUI Integration + +The `/goal` slash command is the primary TUI entry point for goal control. + +The TUI should support: + +- Opening a current-goal panel with objective, status, progress, blockers, verification, and budget where available. +- Creating a goal directly from `/goal <objective>` when none exists, with no default budget prompt. +- Explicitly replacing an existing non-terminal goal after confirmation. +- Pausing and resuming goal continuation. +- Canceling, clearing, or user-marking completion. +- Showing why automatic continuation stopped. + +The current goal should also be visible in state surfaces such as `/status` or an active-work strip when it affects current execution. + +## Recovery And Replay + +Replay must reconstruct: + +- Current goal projection. +- Historical goal replacements and terminal states. +- Usage totals.
+- Progress, blocker, and verification summaries. +- Whether the goal was active, paused, blocked, terminal, or cleared at the end of the rollout. +- Hidden goal context snapshots used for model invocations. + +After restart, the server must not blindly continue just because the last durable status is `active`. It must re-evaluate session idleness, Plan Mode, queued work, approvals, budgets, and the continuation suppression state that can be reconstructed or safely inferred. + +## Invariants + +- A session has at most one current goal projection in the first milestone. +- User-owned goal fields cannot be modified by the model. +- Complete, budget-limited, and canceled are terminal states. +- JSONL rollout records are the replayable source of truth. +- SQLite, if present, is a derived projection and may be rebuilt. +- Autonomous continuation uses the normal execution engine and produces normal turn records. +- Hidden goal context is auditable but not rendered as a user transcript turn. +- Budget accounting must exclude cached input tokens and must not double-count separately reported reasoning tokens. +- A goal created without an explicit budget has no default token, time, or turn budget. +- Every subscribed client receives canonical goal updates when any client or model-facing tool changes the goal. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-GOAL-001 | 1 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | Defines durable goal state, statuses, continuation, budget accounting, model tool limits, and client controls. | +| related-to | L1-REQ-AGENT-001 | 1 | specs/L1/L1-REQ-AGENT-001-execution-workflow.md | Goal continuations run through the normal execution engine. | +| related-to | L1-REQ-AGENT-002 | 1 | specs/L1/L1-REQ-AGENT-002-interrupt-resume.md | Interrupts and resumes update or re-evaluate goal runtime state.
| +| related-to | L1-REQ-AGENT-005 | 1 | specs/L1/L1-REQ-AGENT-005-plan-mode.md | Plan Mode suppresses autonomous goal continuation. | +| related-to | L1-REQ-APP-002 | 1 | specs/L1/L1-REQ-APP-002-persistence.md | Goal state must replay from durable storage after restart. | +| related-to | L1-REQ-CONV-001 | 1 | specs/L1/L1-REQ-CONV-001-session-lifecycle.md | Goals are session-owned durable state. | +| related-to | L1-REQ-CONV-002 | 1 | specs/L1/L1-REQ-CONV-002-turn-lifecycle.md | Goal continuation produces ordinary durable turns. | +| related-to | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | The `/goal` command is the TUI control surface. | +| related-to | L1-REQ-LLM-003 | 1 | specs/L1/L1-REQ-LLM-003-observability.md | Goal budgets depend on normalized model usage. | +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | Defines the execution engine used by autonomous continuations. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Defines client requests and notifications for goal control and broadcast. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Defines durable JSONL event and replay principles used by goal records. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Defines slash command discovery and invocation behavior. | +| specified-by | TBD | TBD | specs/L3/goal/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial Ralph Loop goal architecture with JSONL source of truth, optional SQLite projection, bounded continuation loop, budget accounting, model-tool limits, and `/goal` integration. | +| 1 | 2026-05-25 | Human | Refinement | Set first-milestone `/goal <objective>` creation to default to no explicit budget.
| +| 1 | 2026-05-25 | Assistant | Refinement | Clarified that budget fields are optional, usage accounting still occurs without a configured budget, and hidden context must not fabricate a default budget. | diff --git a/specs/L2/llm/L2-DES-LLM-003-model-usage-observability.md b/specs/L2/llm/L2-DES-LLM-003-model-usage-observability.md new file mode 100644 index 00000000..967ace64 --- /dev/null +++ b/specs/L2/llm/L2-DES-LLM-003-model-usage-observability.md @@ -0,0 +1,298 @@ +--- +artifact_id: L2-DES-LLM-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-23 +--- + +# L2-DES-LLM-003 — Model Usage Observability + +## Purpose + +Refine model usage observability into a technical design for usage metrics, context-window pressure reporting, model invocation correlation, and trace-mode streaming response diagnostics. + +## Background / Context + +Model calls are expensive, latency-sensitive, and provider-dependent. Users need to know how many tokens were read, written, cached, or estimated; whether the current context is near the effective limit; and whether provider streaming behaved as expected. The design must preserve provider-reported facts without inventing unavailable values and must protect sensitive prompt, response, and credential data. + +## Source Requirements + +- `L1-REQ-LLM-003` requires model usage, cached input usage, context-window usage, estimate labeling, unavailable-value reporting, and trace-mode stream records. +- `L1-REQ-APP-004` requires structured logging, configurable log levels, actionable diagnostics, and trace logging for large language model streaming response events. +- `L1-REQ-LLM-001` requires token-efficient context construction. +- `L1-REQ-CONTEXT-001` requires useful active context management. +- `L1-REQ-CONTEXT-003` requires context compression near model limits. +- `L1-REQ-MODEL-001` defines model capability metadata and effective context length. 
+- `L2-DES-APP-004` defines the cross-system observability architecture. +- `L2-DES-AGENT-001` defines where model invocations occur in the execution engine. +- `L2-DES-CONV-001` defines durable usage records in the session JSONL data model. +- `L2-DES-MODEL-001` defines resolved model profiles and model-provider bindings. + +## Design Requirement + +The program should represent model observability as invocation-scoped facts with explicit provenance. + +For each model invocation, the program should track: + +- Which session and turn caused the invocation. +- Which model binding and provider method were used. +- What context size was assembled or estimated. +- What usage values the provider reported. +- Which usage values were estimated locally. +- Which usage values were unavailable. +- How the response stream progressed when trace mode is enabled. + +The program must not infer exact provider usage when the provider does not report it. Locally computed token counts are estimates unless an adapter can prove they match provider accounting. + +## Invocation Identity + +Each model call should receive an `invocation_id` before provider execution starts. + +Conceptual invocation fields: + +- `invocation_id` +- `session_id` +- `turn_id` +- `context_snapshot_id` +- `model_binding_id` +- `canonical_model_slug` +- `provider_id` +- `invocation_method` +- `reasoning_effort` +- `started_at` +- `completed_at` +- `status`: running, completed, failed, interrupted, or canceled. +- `streaming`: whether the invocation used streaming. +- `trace_id` when trace mode is enabled. + +The `invocation_id` should appear in logs, usage records, server-client usage events, provider error diagnostics, and trace-mode stream records. + +## Usage Metric Model + +Usage metrics should use a common wrapper that distinguishes measured, estimated, unavailable, and redacted values. 
+ +Conceptual `UsageMetric` fields: + +- `name` +- `value` +- `unit`: tokens, bytes, milliseconds, events, or provider-defined. +- `source`: provider_reported, local_estimate, unavailable, or redacted. +- `provider_field` where applicable. +- `confidence`: exact, approximate, unknown, or provider_defined. +- `included_in` where the provider defines an inclusion relationship. +- `notes` where needed for provider-specific interpretation. + +Core token metrics: + +| Metric | Meaning | +|---|---| +| `input_tokens` | Tokens read by the model for this invocation. | +| `cached_input_tokens` | Input tokens served from provider-side cache where reported. | +| `output_tokens` | Tokens generated by the model for this invocation. | +| `reasoning_tokens` | Provider-reported internal reasoning token count where available. | +| `total_tokens` | Provider-reported total where available. | +| `context_tokens` | Token count or estimate for active model-visible context. | +| `effective_context_window` | Effective context limit used by the program for this model call. | + +Reasoning tokens are provider-defined. The program should not assume they are inside or outside `output_tokens` unless the provider report or adapter explicitly identifies that relationship. When known, the usage metric should record `included_in: output_tokens`, `included_in: total_tokens`, or another explicit relationship. When unknown, the display should avoid double-counting reasoning tokens in totals. + +## Read, Write, And Cached-Read Display + +For user-facing display, the program should map core token metrics into readable terms: + +- Read: `input_tokens` +- Cached read: `cached_input_tokens` +- Write: `output_tokens` + +If a provider uses different names, the provider adapter should map them into the common metric model while preserving the original `provider_field`. + +Display rules: + +- Provider-reported values should be labeled as measured or provider-reported. 
+- Local estimates should be labeled as estimates. +- Unavailable values should be shown as unavailable, not zero. +- Redacted values should be shown as redacted. +- Totals should identify whether they are provider-reported or derived. + +## Context Pressure + +Context pressure describes how close the invocation is to the effective context window. + +Conceptual context pressure fields: + +- `context_tokens` +- `context_tokens_source` +- `effective_context_window` +- `pressure_ratio` +- `pressure_state`: normal, high, near_limit, over_limit, or unknown. +- `compaction_threshold` +- `compaction_status`: not_needed, recommended, scheduled, running, completed, failed, or unavailable. + +When exact context tokenization is unavailable, the program may estimate context size. Estimated context size must be labeled as an estimate. + +Context pressure should be emitted before or around model invocation where possible, and updated when provider usage confirms or corrects the estimate. + +## Durable Usage Records + +Durable session records should preserve usage summaries needed for replay and later inspection. + +Durable usage records should include: + +- `session_id` +- `turn_id` +- `invocation_id` +- `model_binding_id` +- `canonical_model_slug` +- `provider_id` +- Usage metrics. +- Context pressure summary. +- Whether values are provider-reported, estimated, unavailable, or redacted. +- `recorded_at` + +Durable usage records should not store prompt content, response content, credential material, or raw provider request headers. + +The durable session JSONL file remains the source of truth for usage summaries that affect session inspection. Trace-mode stream records are diagnostic artifacts and should be stored under observability retention policy rather than treated as transcript content. + +## Server-Client Usage Projection + +Clients should receive safe usage projections through the server-client protocol. 
+ +Usage projections should include: + +- Current invocation usage where known. +- Turn-level usage delta. +- Session-level usage totals. +- Context pressure. +- Metric source labels. +- Unavailable or redacted markers. + +Clients should not need provider-specific parsing logic to display token usage. Provider-specific details may be exposed as safe metadata when useful for diagnostics. + +## Trace-Mode Stream Records + +When trace logging is enabled, the model provider adapter and execution engine should record stream events in invocation order. + +Conceptual stream trace fields: + +- `trace_id` +- `invocation_id` +- `session_id` +- `turn_id` +- `sequence` +- `timestamp` +- `elapsed_ms` +- `provider_event_kind` +- `normalized_event_kind` +- `content_policy`: omitted, redacted, inline, or content_ref. +- `delta_text` or `delta_ref` where allowed. +- `delta_bytes` +- `delta_chars` +- `finish_reason` where available. +- `usage_fragment` where available. +- `error_fragment` where available. +- `raw_event_ref` where configured and allowed. + +Trace records should preserve timing and ordering even when content is redacted. If content logging is disabled or redacted, the record should still preserve event kind, sequence, timing, content length, and completion state where available. + +## Trace Content Policy + +Trace-mode stream records are more sensitive than ordinary usage records because response deltas may reveal user data, generated code, or secrets. + +The program should support these trace content modes: + +| Mode | Behavior | +|---|---| +| `metadata_only` | Records event kind, sequence, timing, sizes, usage fragments, and completion state, but omits streamed text. | +| `redacted_content` | Records deltas after configured redaction. | +| `content_ref` | Stores content in a protected diagnostic artifact and references it from the trace record. | +| `inline_content` | Stores streamed content inline only when explicitly enabled for local debugging. 
| + +Default trace behavior should prefer `metadata_only` or `redacted_content`. Inline content should be opt-in because it increases privacy risk. + +## Provider Adapter Responsibilities + +Provider adapters should normalize usage and stream events without hiding provider-specific uncertainty. + +Adapter responsibilities: + +- Map provider usage fields into common metrics. +- Preserve original provider field names in `provider_field`. +- Mark missing values as unavailable. +- Mark local token counts as estimates unless exact. +- Preserve reasoning-token inclusion relationships only when known. +- Emit normalized stream event kinds. +- Preserve provider finish reasons and error categories. +- Apply redaction before writing content-bearing trace records. + +Provider adapters should not calculate monetary cost unless a later requirement defines provider pricing inputs and billing rules. + +## Diagnostic Examples + +Example usage projection: + +```json +{ + "invocation_id": "inv_01", + "read": { "value": 18420, "source": "provider_reported" }, + "cached_read": { "value": 12000, "source": "provider_reported" }, + "write": { "value": 930, "source": "provider_reported" }, + "reasoning": { + "value": 460, + "source": "provider_reported", + "included_in": "output_tokens" + }, + "context_pressure": { + "pressure_state": "near_limit", + "pressure_ratio": 0.91, + "source": "local_estimate" + } +} +``` + +Example unavailable value: + +```json +{ + "invocation_id": "inv_02", + "cached_read": { + "source": "unavailable", + "notes": "Provider did not report cached input tokens." + } +} +``` + +## Privacy And Retention + +Rules: + +- Usage summaries may be durable session metadata because they are needed for later inspection. +- Stream traces are diagnostic logs, not transcript records. +- Trace records should follow the retention policy from `L2-DES-APP-004`. +- Prompt content and response content should not be written to normal logs. 
+- Trace content should be redacted, referenced, or omitted according to trace content policy. +- Credential material and authorization headers must never be recorded. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-LLM-003 | 1 | specs/L1/L1-REQ-LLM-003-observability.md | Defines model usage metrics, context pressure, measured versus estimated values, unavailable values, and trace-mode stream records. | +| related-to | L1-REQ-APP-004 | 1 | specs/L1/L1-REQ-APP-004-observability.md | Uses application observability controls for logs, trace mode, privacy, and diagnostics. | +| related-to | L1-REQ-LLM-001 | 1 | specs/L1/L1-REQ-LLM-001-token-efficiency.md | Context pressure and cached input usage inform token efficiency. | +| related-to | L1-REQ-CONTEXT-001 | 1 | specs/L1/L1-REQ-CONTEXT-001-management.md | Active context size and pressure are context-management diagnostics. | +| related-to | L1-REQ-CONTEXT-003 | 1 | specs/L1/L1-REQ-CONTEXT-003-compress.md | Context pressure explains compaction behavior. | +| related-to | L2-DES-APP-004 | 1 | specs/L2/app/L2-DES-APP-004-observability-architecture.md | Provides the cross-system observability architecture used by model-specific observability. | +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | Model invocations occur inside the execution engine. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Durable usage records are stored with session data. | +| related-to | L2-DES-MODEL-001 | 1 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | Usage records reference model bindings and resolved provider profiles. | +| specified-by | TBD | TBD | specs/L3/llm/TBD.md | L3 behavior has not been authored yet. 
| + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial model usage and stream observability design. | diff --git a/specs/L2/mcp/L2-DES-MCP-001-mcp-integration-architecture.md b/specs/L2/mcp/L2-DES-MCP-001-mcp-integration-architecture.md new file mode 100644 index 00000000..99b431f3 --- /dev/null +++ b/specs/L2/mcp/L2-DES-MCP-001-mcp-integration-architecture.md @@ -0,0 +1,361 @@ +--- +artifact_id: L2-DES-MCP-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-MCP-001 - MCP Integration Architecture + +## Purpose + +Define the technical design for integrating user-configured Model Context Protocol (MCP) servers into the program as discoverable, observable, and safety-gated capabilities. + +## Background / Context + +The MCP reference model separates three roles: + +- The program is the MCP host. +- The program creates one MCP client per configured MCP server connection. +- External MCP servers provide capabilities such as tools, resources, resource templates, and prompts through standardized protocol operations. + +MCP servers are useful because they let users connect the agent to external systems without hardcoding every integration. They are risky because the program does not own those external systems, their descriptions, or their side effects. The integration must therefore normalize MCP capabilities into the program's server-owned tool, context, configuration, and safety systems. + +## Source Requirements + +- `L1-REQ-APP-008` requires user-configured MCP integrations, discovery, status, startup errors, and safety participation. +- `L1-REQ-TOOL-001` requires tool safety, approval, redaction, and bounded output. +- `L1-REQ-TOOL-002` requires controlled tool execution through the built-in tool lifecycle. 
+- `L1-REQ-LLM-002` requires model-requested tool use to be validated and supervised. +- `L1-REQ-APP-010` requires persistent configuration and unavailable-state behavior. +- `L1-REQ-APP-011` requires actionable error recovery. +- `L1-REQ-APP-012` requires privacy, credential safety, and user data ownership. +- `L2-DES-APP-002` defines configuration precedence for user-scoped and project-scoped settings. +- `L2-DES-APP-003` defines client/server protocol events and request behavior. +- `L2-DES-AGENT-001` defines the execution engine that dispatches tools. +- `L2-DES-CONTEXT-001` defines context assembly. +- `L2-DES-TOOL-001` defines the server-owned tool registry and tool supervisor. + +## Design Requirement + +The program should integrate MCP by maintaining an MCP manager that owns configured server connections, lifecycle state, capability discovery, and normalized dispatch into the program's existing runtime boundaries. + +MCP capabilities must not bypass the program's registry, context assembly, safety, approval, redaction, output bounding, observability, or durable recording. From the model's perspective, MCP-provided tools are available only as server-approved tool definitions. From the user's perspective, every MCP server has visible status and failures. + +## Standards Alignment + +The design should align with the MCP architecture and capability model: + +| MCP concept | Program design | +|---|---| +| Host | The program runtime that owns sessions, context, tools, clients, and user control. | +| Client | A per-server connection object managed by the MCP manager. | +| Server | An external process or remote endpoint configured by the user or workspace. | +| Tools | Normalized into server-owned tool definitions and dispatched through the tool supervisor. | +| Resources | Exposed as readable context objects through a controlled resource-read path. | +| Resource templates | Exposed as typed resource patterns requiring parameter validation before read. 
| +| Prompts | Discovered as reusable prompt templates, but not automatically injected into model context. | +| Roots | Sent as advisory workspace context when policy allows; not treated as a security boundary. | +| Sampling | Disabled by default; if enabled later, host-owned model invocation, budgets, and approval apply. | +| Elicitation | Converted into user-visible pending input owned by the program, not raw server-to-model control. | +| Logging | Captured as server diagnostics with redaction and source attribution. | + +The required L1 surface is tools, resources, resource templates, status, and safety. MCP prompts, sampling, and elicitation are protocol primitives that should be represented explicitly so the integration can fail safely or support them later without redefining the architecture. + +## MCP Configuration + +MCP configuration should follow the precedence and source-tracking rules from `L2-DES-APP-002`. + +The concrete TOML shape for persisted MCP server records is defined by `L2-DES-APP-005` under `[mcp.servers.<server_id>]`. Secret material used by MCP servers is stored in companion `auth.json` files and referenced from TOML by credential id. + +Conceptual `McpServerConfig` fields: + +- `server_id`: stable local identifier. +- `display_name`: user-facing server name. +- `enabled`: whether the server may be used. +- `transport`: stdio, streamable HTTP, or another MCP-approved transport added later. +- `command`: command and arguments for stdio servers. +- `cwd`: optional working directory for stdio servers. +- `env`: non-secret environment values or credential-id references that the host injects into a stdio server process at runtime. +- `base_url`: endpoint for HTTP servers. +- `auth_ref`: `auth.json` credential id for HTTP authorization, not routine plaintext. +- `startup_policy`: eager, lazy, or manual. +- `trust_policy`: user, project, or untrusted workspace source.
+- `allowed_capabilities`: optional allowlist for tools, resources, templates, prompts, sampling, and elicitation. +- `roots_policy`: which workspace roots may be shared with the server. +- `output_limits`: per-server output and diagnostic limits where configured. + +Project-scoped MCP configuration can start local processes or route data to external services. Therefore project-scoped MCP servers must be visible to the user before first use. A project may suggest MCP servers, but the runtime should not silently grant broad trust to an unreviewed project configuration. + +## Server Lifecycle + +Each configured server has an independent lifecycle. One server failure must not disable built-in tools or unrelated MCP servers. + +Conceptual server states: + +- `disabled` +- `not_started` +- `starting` +- `ready` +- `degraded` +- `auth_required` +- `failed` +- `stopped` + +Lifecycle rules: + +- Eager servers start during runtime bootstrap if they are enabled and trusted for the current workspace. +- Lazy servers start when their status is inspected, a capability is needed, or the user requests refresh. +- Manual servers start only through explicit user action. +- Stdio server stdout is reserved for MCP protocol messages. Diagnostic output from stderr is captured as logs and bounded. +- HTTP server authentication failures produce `auth_required` or `failed` status with credential-safe diagnostics. +- Restart and refresh operations are per-server; they do not rebuild the whole runtime unless configuration precedence changes require it. + +## Capability Discovery + +After initialization and capability negotiation, the MCP manager discovers server catalogs. + +Discovery should collect: + +- Tool names, descriptions, annotations where available, and input schemas. +- Resource URIs, names, descriptions, MIME types where available, and size hints where available. +- Resource-template URI patterns, parameter descriptions, and descriptions. 
+- Prompt names, descriptions, and argument schemas where supported. +- Server capability flags and protocol version. +- Last successful refresh time and last failure. + +Discovery output must be normalized into a catalog that retains source identity: + +- `server_id` +- source configuration path and scope. +- original MCP name. +- normalized program-facing name. +- capability kind. +- user-facing description. +- schema or parameter contract. +- availability and failure state. + +Descriptions supplied by an MCP server are user-facing hints, not trusted safety policy. The program must classify risk using configuration, schema, tool kind, permission policy, and runtime behavior. + +## Tool Normalization + +MCP tools should become ordinary tool definitions in the program registry. + +Normalized MCP tool definition fields: + +- `tool_name`: collision-free program-facing name. +- `display_name`: readable name including the originating server when needed. +- `server_id` +- `mcp_tool_name`: original MCP tool name. +- `description` +- `input_schema` +- `capability_kind`: mcp_tool. +- `execution_mode`: read_only, mutating, external_side_effect, network, or unknown. +- `permission_profile` +- `permission_policy` +- `redaction_policy` +- `output_limit_policy` +- `availability` + +Name collisions must be impossible in the model-facing registry. The registry should namespace MCP tools by server identity or otherwise generate stable unique names while preserving the original MCP name in metadata. + +## Tool Invocation + +MCP tool calls follow the same lifecycle as built-in tool calls: + +1. The model requests a normalized MCP tool. +2. The execution engine resolves the tool definition. +3. Input is validated against the MCP tool schema. +4. Mode, permission, safety, approval, configuration, and server availability gates run. +5. The MCP manager sends `tools/call` to the originating server only after gates pass. +6.
Tool output is normalized into structured content, text content, resource references, and status. +7. Output is bounded and redacted before model, client, or durable exposure. +8. The tool result is recorded with server id, tool name, terminal state, and diagnostics. + +If the originating server is unavailable, authentication is missing, or the tool disappeared after refresh, the call must complete with a structured unavailable result. It must not silently disappear from the transcript or be replaced with invented output. + +## Resource Access + +MCP resources should be treated as external context objects, not as automatic prompt content. + +Rules: + +- Resource lists may be shown to users and summarized to the model only when useful and bounded. +- Reading a resource requires an explicit resource-read operation selected by the user, a model-requested controlled tool, or another approved workflow. +- Resource reads must apply size limits, MIME-type filtering, redaction, and token-budget policy before context insertion. +- Resource templates require typed parameter validation before the URI is expanded. +- A resource URI is data, not authority. It must not bypass filesystem, network, or privacy policy. +- Resource subscriptions are optional. If supported, updates should produce client-visible resource-changed diagnostics and should not mutate prior transcript records. + +Large resource catalogs should not be injected wholesale into every model context. The context assembler should prefer compact capability summaries and on-demand resource reads. + +## Prompt Templates + +MCP prompts are reusable server-provided prompt templates. They should be discovered and exposed as user-controllable templates, not automatically treated as higher-priority instructions. + +Rules: + +- Prompt discovery records names, descriptions, and argument schemas. 
+- Prompt execution, if enabled, should be a user-visible action that materializes prompt content into a normal submitted message or explicit context attachment. +- Prompt content must remain lower priority than system, developer, user, safety, and workspace instructions. +- Prompt output must not be injected silently as hidden model instructions. +- Prompt support may be deferred without blocking the required MCP tools/resources/resource-template integration, but unsupported prompts must be reported honestly in capability status. + +## Roots, Sampling, And Elicitation + +### Roots + +When the runtime shares roots with an MCP server, roots are advisory context about the user's workspace. They are not a sandbox. The program must continue enforcing its own filesystem, command, permission, and approval checks for every tool call. + +Project-scoped roots should be minimized to the active workspace and only shared with servers allowed by configuration and trust policy. + +### Sampling + +MCP sampling lets a server ask the host to perform an LLM call. Sampling is disabled by default. + +If enabled by a later design, the host must own: + +- Model selection. +- Prompt inspection and redaction. +- User approval where required. +- Token and cost budgets. +- Tool availability for the sampled call. +- Durable recording and attribution. + +An MCP server must never receive an unrestricted pass-through to the user's configured model or secrets. + +### Elicitation + +MCP elicitation lets a server ask the user for structured input. Elicitation should be represented as a server-originated pending prompt owned by the program. + +Rules: + +- The prompt must identify the requesting server and operation. +- The user may approve, answer, deny, or cancel. +- Answers are returned only to the requesting MCP server for the active operation. +- Elicitation must not be used to bypass Plan Mode question-tool restrictions because it is not a model-facing question tool. 
It is an external-server prompt under user control. +- Secret collection through elicitation requires explicit credential-handling policy and must not be stored unless the user chooses a durable credential target. + +## Context Assembly + +MCP capability state feeds context assembly through tool availability and compact metadata, not through unbounded catalog injection. + +Context assembly should include: + +- Available normalized MCP tool schemas when they are enabled for the current session, mode, model, and permission posture. +- A compact MCP status summary only when relevant to the user request or recent failure. +- No raw resource content unless a resource was explicitly read and selected for context. +- No MCP prompt content unless the user or an approved workflow invoked the prompt. + +When an MCP capability catalog changes during a session, the next context snapshot should reflect the new tool availability. If a model calls a stale tool, the runtime returns a structured unavailable result. + +## Client Visibility + +Clients should be able to show MCP integration state without reading raw configuration secrets. + +Client projections should include: + +- Server id and display name. +- Configuration source scope and safe source path where useful. +- Enabled/disabled state. +- Startup state. +- Authentication state. +- Last refreshed time. +- Safe startup or protocol error summary. +- Counts and names of tools, resources, resource templates, and prompts. +- Capability disabled reasons. + +Representative protocol surfaces may include: + +- `mcp.listServers` +- `mcp.refreshServer` +- `mcp.startServer` +- `mcp.stopServer` +- `mcp.listCapabilities` +- `mcp.readResource` + +These protocol surfaces are client/server methods. Model-requested MCP tool execution still flows through the tool supervisor. + +## Error Handling + +MCP errors should be normalized into stable categories: + +- Configuration invalid. +- Server disabled. +- Server not trusted. +- Startup failed. 
+- Transport failed. +- Protocol negotiation failed. +- Authentication required. +- Capability unavailable. +- Input invalid. +- Tool invocation failed. +- Resource read failed. +- Output rejected by policy. +- Operation canceled. + +Every error shown to the user should include the affected server and safe recovery context. Errors must not print plaintext credentials or unbounded external output by default. + +## Security And Privacy + +MCP servers are external capability providers and must be treated as untrusted unless configured otherwise. + +Security rules: + +- Project-provided MCP servers require trust-aware visibility before first use. +- MCP tools use the same approval and safety model as built-in tools. +- MCP server-provided descriptions and schemas do not grant permission. +- Secrets are read from `auth.json` credential references and passed only to the specific configured server operation that requires them. +- Tool outputs, logs, and resources are redacted before model and client exposure. +- MCP servers cannot alter system, developer, or user instruction priority. +- MCP resources and prompts cannot silently become hidden instructions. +- Roots are advisory and must not replace host-side sandboxing or permission checks. +- Sampling is disabled unless a later approved design enables it with host-owned controls. + +## Invariants + +- The program is the MCP host and owns all user-facing state. +- Every configured MCP server has visible status. +- One MCP server failure does not disable unrelated capabilities. +- MCP tools execute only through the server-owned tool supervisor. +- MCP resource content enters model context only through explicit controlled read paths. +- MCP prompts are user-controlled templates, not automatic higher-priority instructions. +- MCP server credentials are not exposed in routine client projections. +- Stale or disappeared MCP capabilities fail with structured unavailable results. 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-APP-008 | 1 | specs/L1/L1-REQ-APP-008-mcp.md | Defines MCP configuration, lifecycle, capability discovery, status, safety, and failure behavior. | +| related-to | L1-REQ-TOOL-001 | 1 | specs/L1/L1-REQ-TOOL-001-safety.md | MCP tool calls follow the same safety, approval, redaction, and output limits as built-in tools. | +| related-to | L1-REQ-TOOL-002 | 1 | specs/L1/L1-REQ-TOOL-002-tools.md | MCP tools are normalized into the server-owned tool registry. | +| related-to | L1-REQ-LLM-002 | 1 | specs/L1/L1-REQ-LLM-002-tools.md | Model-requested MCP tool use is controlled by the execution engine and tool supervisor. | +| related-to | L1-REQ-APP-010 | 1 | specs/L1/L1-REQ-APP-010-configuration.md | MCP servers are configured through user-scoped and project-scoped configuration. | +| related-to | L1-REQ-APP-011 | 1 | specs/L1/L1-REQ-APP-011-error-recovery.md | MCP startup, authentication, protocol, and tool errors require actionable recovery context. | +| related-to | L1-REQ-APP-012 | 1 | specs/L1/L1-REQ-APP-012-privacy-data-ownership.md | MCP credential and external-resource handling must preserve privacy boundaries. | +| related-to | L2-DES-APP-002 | 1 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | Configuration precedence resolves MCP server records. | +| related-to | L2-DES-APP-005 | 1 | specs/L2/app/L2-DES-APP-005-config-toml-schema.md | Defines concrete TOML fields for persisted MCP server records and `auth.json` credential references. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Clients inspect MCP state and receive MCP-related status events through the server protocol. | +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | The execution engine dispatches normalized MCP tool calls. 
| +| related-to | L2-DES-CONTEXT-001 | 1 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | MCP tool schemas and selected resources participate in context assembly. | +| related-to | L2-DES-TOOL-001 | 1 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | MCP tools are external tool definitions governed by the built-in tool lifecycle. | +| specified-by | TBD | TBD | specs/L3/mcp/TBD.md | L3 behavior has not been authored yet. | + +## References + +- [Model Context Protocol architecture](https://modelcontextprotocol.io/docs/learn/architecture) +- [Model Context Protocol server concepts](https://modelcontextprotocol.io/docs/learn/server-concepts) + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-25 | Assistant | Initial | Initial MCP integration architecture based on MCP reference documentation and product requirements. | +| 1 | 2026-05-25 | Human | Refinement | Linked MCP configuration to the concrete `config.toml` schema. | +| 1 | 2026-05-25 | Human | Refinement | Clarified that MCP credentials are stored in companion `auth.json` files and injected only at runtime when needed. | +| 1 | 2026-05-25 | Human | Refinement | Renamed normalized MCP tool metadata from `approval_policy` to `permission_policy`. | diff --git a/specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md b/specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md new file mode 100644 index 00000000..cf95de33 --- /dev/null +++ b/specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md @@ -0,0 +1,203 @@ +--- +artifact_id: L2-DES-MODEL-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-MODEL-001 — Model Provider Binding + +## Purpose + +Refine model and provider requirements into a data model that separates supported model capability metadata, reusable user-defined providers, and configured invocable model bindings. 
+ +## Background / Context + +The program supports models first. A supported model definition describes a model capability profile known by the program. Providers are user-defined connection entries because users may add OpenAI-compatible gateways, local endpoints, or other provider endpoints over time. + +The same supported model may be exposed by different providers under different model names. The invocation method can also differ by model-provider pair. Therefore invocation method belongs to the binding between a supported model and a user-defined provider, not to the provider itself. + +User providers and model-provider bindings are persisted through application configuration. Effective values are resolved from user-scoped and project-scoped configuration, with project-scoped configuration taking precedence for overlapping records. + +The concrete `config.toml` field shape for persisted providers, model bindings, and defaults is defined by `L2-DES-APP-005`. Credential material is stored in the companion `auth.json` file and referenced from provider records by credential id. + +## Source Requirements + +- `L1-REQ-MODEL-001` requires built-in supported model definitions and configured invocable models. +- `L1-REQ-MODEL-002` requires configurable model providers and safe credential handling. +- `L1-REQ-MODEL-003` requires onboarding for missing model/provider setup. +- `L1-REQ-TUI-010` requires TUI onboarding for model setup. +- `L1-REQ-APP-010` requires persistent model and reasoning defaults. +- `L1-REQ-APP-012` requires privacy and credential-handling controls. + +## Design Requirement + +The program should model invocation through three related data concepts: + +1. `SupportedModelDefinition` +2. `UserProvider` +3. `ModelProviderBinding` + +`ModelProviderBinding` is the invocable edge. It links one supported model capability profile to one user-defined provider and records the provider-specific model name, invocation method, and reasoning effort. 
+ +## SupportedModelDefinition + +`SupportedModelDefinition` represents intrinsic model metadata known by the program. + +Required conceptual fields: + +- `canonical_model_slug`: stable program-known model slug, such as `openai/gpt-5.5`. +- `display_name`: human-readable model name. +- `base_instructions`: model-specific base instruction text or reference. +- `context_window`: total model context window where known. +- `effective_context_window`: program-safe effective context window where known. +- `modalities`: supported input modalities such as text, image, and video where applicable. +- `reasoning_capability`: unsupported, binary, or named reasoning efforts such as low, medium, high, xhigh, max, or adaptive. +- `default_reasoning_effort`: default reasoning effort where applicable. + +`SupportedModelDefinition` must not contain: + +- Provider name. +- Provider identifier. +- Base URL. +- API key or credential material. +- Provider-specific model name. +- Invocation method. +- Tool support flags, because current supported models are assumed to support tool calling. + +## UserProvider + +`UserProvider` represents a reusable provider connection endpoint created by the user. + +Required conceptual fields: + +- `provider_id`: stable identifier generated by the program. +- `provider_name`: user-entered display name used to recognize the provider later. +- `base_url`: provider API base URL. +- `credential_ref`: credential id referencing stored credential material in effective `auth.json`, or credential status when projected to clients. +- `availability_status`: optional provider availability or validation status. + +`UserProvider` must not contain: + +- Canonical model slug. +- Provider-specific model name. +- Invocation method. +- Reasoning effort. + +The same `UserProvider` may be reused by multiple model-provider bindings. + +## ModelProviderBinding + +`ModelProviderBinding` represents a configured invocable model. 
+ +Required conceptual fields: + +- `binding_id`: stable identifier generated by the program. +- `canonical_model_slug`: reference to `SupportedModelDefinition`. +- `provider_id`: reference to `UserProvider`. +- `model_name`: provider-specific model name used in API calls. +- `invocation_method`: API protocol or SDK adapter, such as OpenAI Responses, OpenAI Chat Completions, or Anthropic Messages. +- `reasoning_effort`: configured default reasoning effort when the supported model permits reasoning. +- `is_default`: optional default-selection marker where configuration policy supports it. + +The binding must be valid only when: + +- The supported model slug exists in the built-in supported-model list. +- The provider exists and has sufficient connection details. +- The invocation method is supported by the program. +- The reasoning effort is allowed by the selected supported model. + +## Configuration Persistence + +`UserProvider` and `ModelProviderBinding` are durable configuration records. Onboarding and model setup create or update these records, while `SupportedModelDefinition` remains built-in capability metadata. + +Persistent model configuration conceptually stores: + +- User provider records. +- Model-provider binding records. +- Default selected binding where supported. +- Default reasoning effort where supported. + +Persistent model configuration should store references to supported model slugs rather than copying the full supported model definition into user or project configuration. + +Current-session model and reasoning selection is not itself a `ModelProviderBinding`. A session may select a configured binding and use a session-local reasoning effort without rewriting the binding record. Configuration writes should occur only when a provider, binding, or persisted default selection is created or changed by the relevant workflow. 
+ +The TOML schema stores providers under `[providers.<provider_id>]`, invocable bindings under `[model_bindings.<binding_id>]`, and durable default selection under `[defaults]`. Provider records reference credentials by id. Runtime-only `ResolvedModelProfile` values and plaintext credential values are not persisted in `config.toml`. + +The effective provider and binding set is resolved through configuration precedence: + +```text +User-scoped providers and bindings + + +Project-scoped providers and bindings + ↓ +Effective invocable model configuration +``` + +When project-scoped and user-scoped configuration define overlapping provider or binding records, the project-scoped records take precedence. + +Credential storage details are defined by configuration and privacy design. This L2 model design expects provider records to refer to `auth.json` credential ids through `credential_ref` or equivalent credential state, while routine client projections still avoid plaintext API keys by default. + +## Resolution + +For an actual model call, the program resolves: + +```text +SupportedModelDefinition + + +UserProvider + + +ModelProviderBinding + + +Session current model/reasoning selection + ↓ +ResolvedModelProfile + ↓ +Provider request +``` + +`ResolvedModelProfile` is a runtime-only profile for one model call. It combines capability metadata, provider connection details, binding details, and session selection state. It should not be treated as the durable source of truth. + +## Client Projection + +Routine client views should receive model/provider/binding projections rather than raw secret-bearing configuration. + +Client projections may include: + +- Supported model slug and display name. +- Provider name. +- Credential status. +- Provider availability status. +- Model name used for invocation. +- Invocation method. +- Reasoning effort. + +Routine client projections must not include plaintext API keys by default.
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-MODEL-001 | 1 | specs/L1/L1-REQ-MODEL-001-config.md | Defines supported models and invocable model-provider bindings. | +| refines | L1-REQ-MODEL-002 | 1 | specs/L1/L1-REQ-MODEL-002-provider.md | Defines user-defined reusable providers. | +| refines | L1-REQ-MODEL-003 | 1 | specs/L1/L1-REQ-MODEL-003-onboard.md | Defines the data created by onboarding. | +| related-to | L1-REQ-TUI-010 | 1 | specs/L1/L1-REQ-TUI-010-onboarding-ui.md | TUI onboarding collects the values that create provider and binding records. | +| related-to | L1-REQ-APP-010 | 1 | specs/L1/L1-REQ-APP-010-configuration.md | Persistent model and reasoning defaults reference configured bindings. | +| related-to | L1-REQ-APP-012 | 1 | specs/L1/L1-REQ-APP-012-privacy-data-ownership.md | Credential references and client projections follow privacy requirements. | +| related-to | L2-DES-APP-002 | 1 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | Configuration precedence resolves durable provider and binding records. | +| related-to | L2-DES-APP-005 | 1 | specs/L2/app/L2-DES-APP-005-config-toml-schema.md | Defines concrete TOML fields for persisted providers, bindings, and defaults plus `auth.json` credential storage. | +| specified-by | TBD | TBD | specs/L3/model/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial model/provider/binding data design from approved product rules. | +| 1 | 2026-05-22 | Human | Refinement | Added persistent configuration source precedence for providers and model-provider bindings. | +| 1 | 2026-05-25 | Human | Refinement | Clarified that binding reasoning effort is configured default state and session-local reasoning selection does not rewrite binding records by itself. 
| +| 1 | 2026-05-25 | Human | Refinement | Linked provider and binding persistence to the concrete `config.toml` schema. | +| 1 | 2026-05-25 | Human | Refinement | Clarified that provider credentials are stored in companion `auth.json` files, not inline in `config.toml`. | diff --git a/specs/L2/skills/L2-DES-SKILLS-001-agent-skills-architecture.md b/specs/L2/skills/L2-DES-SKILLS-001-agent-skills-architecture.md new file mode 100644 index 00000000..8d05c888 --- /dev/null +++ b/specs/L2/skills/L2-DES-SKILLS-001-agent-skills-architecture.md @@ -0,0 +1,328 @@ +--- +artifact_id: L2-DES-SKILLS-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-SKILLS-001 - Agent Skills Architecture + +## Purpose + +Define the technical design for discovering, presenting, selecting, loading, and applying Agent Skills as reusable instruction packages. + +## Background / Context + +Agent Skills are reusable packages centered on a `SKILL.md` file. The external Agent Skills guidance emphasizes progressive disclosure: expose concise metadata first, load the full skill instructions only when relevant, and let the skill package refer to supporting scripts, references, and assets that can be loaded on demand. + +The program should use skills to improve task behavior without letting skill content override the user's current request, safety policy, project instructions, or approval boundaries. + +## Source Requirements + +- `L1-REQ-APP-009` requires skill discovery, explicit skill requests, visible skill use, and clear missing-skill handling. +- `L1-REQ-APP-010` requires persistent configuration and unavailable-state behavior. +- `L1-REQ-WORKSPACE-001` requires workspace context that respects local project state. +- `L1-REQ-CONTEXT-001` requires useful model context management. +- `L1-REQ-LLM-001` requires token-efficient context construction. 
+- `L1-REQ-LLM-004` requires communication behavior to remain controlled by configured instructions. +- `L1-REQ-TOOL-001` requires safety, approval, and redaction for actions triggered while using skills. +- `L2-DES-APP-002` defines configuration precedence. +- `L2-DES-APP-003` defines client/server protocol visibility. +- `L2-DES-CONTEXT-001` defines metadata-derived context assembly. +- `L2-DES-TOOL-001` defines the tool system used for skill activation and related file/script work. +- `L2-DES-WORKSPACE-001` defines project instruction discovery, which remains separate from skills. + +## Design Requirement + +The program should provide a skill catalog, skill resolver, and skill activation workflow. + +The catalog discovers skill metadata from configured roots and exposes only concise name/description/source information by default. The resolver loads a skill package when the user explicitly requests it or when the model selects it through a controlled activation path. The activated skill becomes task-scoped guidance in model context and may reference package files for on-demand reading, but it does not gain authority over safety, tools, or user intent. + +## Skill Package Model + +A skill package should be a directory containing a required `SKILL.md` file and optional supporting files. + +Conceptual package layout: + +```text +skill-name/ + SKILL.md + references/ + scripts/ + assets/ +``` + +`SKILL.md` is the entrypoint. It should contain frontmatter followed by instructions. Required and recommended metadata should follow the Agent Skills specification where possible. + +Conceptual frontmatter fields: + +- `name`: stable skill name. +- `description`: concise description used for discovery and model selection. +- `version`: optional package version. +- `enabled`: optional local enablement marker. +- `tags`: optional categorization. +- `compatibility`: optional client or model compatibility hints. 
+- `allowed_tools`: optional advisory list of tools the skill may need. + +Only `name` and `description` should be required for a package to be discoverable as a normal skill. Missing or malformed optional fields should produce diagnostics, not hard failures, unless a later L3 validator requires stricter authoring mode. + +Supporting files are package resources. They must not be eagerly loaded into model context just because the skill exists. + +## Skill Sources + +The catalog should support multiple source scopes: + +| Source | Purpose | +|---|---| +| Built-in | Skills shipped with the program. | +| User | Skills installed for the current user across workspaces. | +| Workspace | Skills committed to or placed inside the active workspace. | +| Plugin | Skills contributed by installed plugins. | +| External package | Skills installed from a package or repository by an explicit user action. | + +User and project scopes follow configuration precedence from `L2-DES-APP-002`. Workspace skills are useful but potentially untrusted, because opening a repository should not silently grant that repository authority to steer the agent. + +## Discovery Roots + +Discovery roots are configuration-driven. Recommended default roots: + +- User native root: `~/.devo/skills/` +- User interoperability root: `~/.agents/skills/` +- Workspace native root: `<workspace_root>/.devo/skills/` +- Workspace interoperability root: `<workspace_root>/.agents/skills/` +- Plugin-provided skill directories from installed plugin metadata. + +The concrete TOML shape for persisted skill enablement and discovery roots is defined by `L2-DES-APP-005` under `[skills]` and `[skills.roots.<name>]`. + +Discovery rules: + +- Scan only configured skill roots and immediate package directories unless a root explicitly declares another layout. +- Ignore unrelated large build or dependency directories. +- Require a `SKILL.md` entrypoint for normal package discovery. +- Treat non-UTF-8 or unreadable `SKILL.md` files as invalid and report diagnostics. 
+- Canonicalize paths before comparing roots and package identities. +- Bound the number of discovered skills and the total metadata bytes returned to context. +- Do not read supporting package files during catalog discovery. + +If the same skill name appears in multiple sources, resolution must be deterministic and visible. The design should prefer explicit source priority over silent replacement. A duplicate can be represented as an error, a shadowed lower-priority record, or a user-resolved conflict, but it must not be ambiguous to the model. + +## Skill Catalog + +The skill catalog is the discovery output consumed by clients, context assembly, and activation. + +Conceptual `SkillCatalogEntry` fields: + +- `skill_id`: stable local identifier. +- `name` +- `description` +- `source` +- `package_root` +- `entrypoint_path` +- `enabled` +- `trust_state` +- `version` +- `tags` +- `compatibility` +- `diagnostics` +- `last_loaded_at` +- `last_changed_at` + +The model-visible catalog should usually include only `name`, `description`, and a stable activation identifier. Full paths, diagnostics, and source details are client-visible or debug-visible but should not consume routine model context unless needed. + +## Activation Paths + +Skills can be activated in two ways: + +1. User-explicit activation: the user names or selects a skill. +2. Model-selected activation: the model selects a skill from the concise catalog through a controlled activation tool or equivalent runtime path. + +User-explicit activation has priority because it is part of the user's current intent. If the user asks for a missing skill, the program should explain that it is unavailable and continue only if the task can be performed without it. + +Model-selected activation must be mediated by the runtime. The model should not be expected to search arbitrary directories and decide that a file is a skill. 
A dedicated activation path allows the program to enforce trust, source, availability, diagnostics, token limits, and audit records. + +Conceptual activation input: + +- `skill_id` +- `activation_reason` +- `requested_by`: user, model, client, or automation. +- `turn_id` +- `workspace_root` + +Conceptual activation result: + +- `skill_id` +- `skill_name` +- `source` +- `entrypoint_content` +- `package_root` +- `available_supporting_files` +- `diagnostics` +- `loaded_at` + +The assistant should tell the user when a skill is being used and why, satisfying the visibility requirement from `L1-REQ-APP-009`. + +## Context Integration + +Skill context follows progressive disclosure. + +Context assembly should include: + +- A bounded catalog of available skills when skill use is enabled and relevant. +- Full `SKILL.md` content only for activated skills. +- Supporting file content only when explicitly read or selected by the skill instructions through normal tools. +- A concise activation record so replay can explain why the skill was present. + +Activated skill instructions are task-scoped metadata-derived content. They are not user transcript items and should not silently rewrite prior context. If a skill is activated after a turn has started, the activation applies to the next model invocation or next turn according to the execution state. + +If multiple skills are activated for one task, their order must be deterministic: + +1. User-explicit skills in user-specified order. +2. Runtime-required skills. +3. Model-selected skills in activation order. + +Conflicts between active skills are resolved by instruction precedence. Where precedence is equal, a later activation must not override earlier active skill guidance unless the override is accompanied by a visible activation record. + +## Instruction Precedence + +Skill instructions are lower priority than: + +- System and developer instructions. +- Safety and permission policy. +- The user's current request. +- Explicit project instruction files. 
+- Current interaction mode instructions. +- Active configuration and permission posture. + +Skill instructions may specialize how work is performed only inside those boundaries. A skill cannot grant tool permissions, disable approval, override user constraints, change privacy policy, or require the assistant to hide its use. + +## Supporting Files And Scripts + +Skills may contain scripts, references, templates, examples, assets, or other package files. These files are package resources. + +Rules: + +- Supporting files are not loaded during normal discovery. +- Relative paths mentioned by `SKILL.md` resolve inside the skill package root unless explicitly allowed otherwise. +- Reading supporting files uses normal file-read behavior with output limits and redaction. +- Running scripts uses normal command execution, approval, and workspace policy. +- A skill's `allowed_tools` or similar metadata is advisory. It may help the model choose tools, but it does not authorize tool use. +- Generated artifacts from a skill are ordinary workspace changes and must be attributed to the active turn. + +## Trust And Safety + +Skills are instruction packages and may contain prompt-injection attempts, unsafe commands, stale guidance, or misleading descriptions. + +Safety rules: + +- Workspace-provided skills require trust-aware visibility before automatic model activation. +- User-explicit skill activation may load an untrusted skill only with clear source visibility where policy requires it. +- Skills cannot override higher-priority instructions. +- Skills cannot make hidden network, filesystem, or command actions happen without normal tool calls. +- Skill descriptions are selection hints, not trusted policy. +- Skill content must be bounded and redacted before model insertion. +- Skill package paths and diagnostic details are safe client/debug projections, not routine model context. +- Skill activation and supporting-file reads should be durable enough for replay and audit. 
+ +## Client Visibility + +Clients should expose skill state without forcing users to inspect filesystem paths. + +Client projections should include: + +- Skill name and description. +- Source scope. +- Enabled or disabled state. +- Trust state. +- Diagnostics for missing, invalid, duplicate, or incompatible skills. +- Active skills for the current turn or task. +- Last refresh time. + +Representative protocol surfaces may include: + +- `skills.list` +- `skills.refresh` +- `skills.activate` +- `skills.deactivate` +- `skills.inspect` + +Activation/deactivation are session or task state changes. They should produce server-client events so every connected client can render the same active skill state. + +## Refresh And Change Handling + +Skill discovery should refresh when: + +- The effective configuration changes. +- The active workspace changes. +- A watched skill root changes. +- The user explicitly requests refresh. +- A skill activation attempts to load a stale entry. + +Refresh should be atomic from the perspective of context assembly. If discovery is in progress, the runtime may use the last successful catalog and apply the refresh to later turns. + +If an activated skill changes on disk during a session, the runtime should not silently replace already-injected content. A later turn may load the new version with a visible refresh or activation record. + +## Error Handling + +Skill errors should be normalized into stable categories: + +- Discovery disabled. +- Root unavailable. +- Skill not found. +- Skill disabled. +- Skill untrusted. +- Skill incompatible. +- Invalid metadata. +- Entrypoint unreadable. +- Duplicate skill name. +- Supporting file unavailable. +- Activation rejected by policy. +- Content exceeds limit. + +Errors should be actionable. For example, a missing explicitly requested skill should name the missing skill and the active discovery roots where appropriate, without dumping irrelevant filesystem data into model context. 
+ +## Invariants + +- A skill is discovered from metadata before full content is loaded. +- Full skill instructions enter context only after activation. +- Supporting files are loaded on demand, not during catalog discovery. +- Skill use is visible to the user. +- Missing or invalid skills do not fail the whole session unless the requested task depends on that skill. +- Skill instructions cannot override higher-priority instructions, safety, or approval rules. +- Skill activation is bounded, auditable, and replayable. +- Duplicate skill names are resolved deterministically or reported as conflicts. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-APP-009 | 1 | specs/L1/L1-REQ-APP-009-skills.md | Defines skill package discovery, activation, context integration, trust, and visibility behavior. | +| related-to | L1-REQ-APP-010 | 1 | specs/L1/L1-REQ-APP-010-configuration.md | Skill roots, enablement, and refresh behavior are configuration-driven. | +| related-to | L1-REQ-WORKSPACE-001 | 1 | specs/L1/L1-REQ-WORKSPACE-001-project-context.md | Workspace skills are part of workspace context but remain separate from project instruction files. | +| related-to | L1-REQ-CONTEXT-001 | 1 | specs/L1/L1-REQ-CONTEXT-001-management.md | Skill catalog and activated skill content participate in context management. | +| related-to | L1-REQ-LLM-001 | 1 | specs/L1/L1-REQ-LLM-001-token-efficiency.md | Progressive disclosure avoids injecting every skill body into every request. | +| related-to | L1-REQ-LLM-004 | 1 | specs/L1/L1-REQ-LLM-004-persona.md | Skill instructions must not override configured persona and communication behavior. | +| related-to | L1-REQ-TOOL-001 | 1 | specs/L1/L1-REQ-TOOL-001-safety.md | Scripts and tool use triggered by skills remain subject to safety and approval. 
| +| related-to | L2-DES-APP-002 | 1 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | Configuration precedence resolves skill roots and enablement. | +| related-to | L2-DES-APP-005 | 1 | specs/L2/app/L2-DES-APP-005-config-toml-schema.md | Defines concrete TOML fields for skill enablement and discovery roots. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Clients inspect and receive events for skill discovery and activation state. | +| related-to | L2-DES-CONTEXT-001 | 1 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | Activated skills are task-scoped metadata-derived context. | +| related-to | L2-DES-TOOL-001 | 1 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | Skill activation and supporting file/script use flow through controlled tools. | +| related-to | L2-DES-WORKSPACE-001 | 1 | specs/L2/workspace/L2-DES-WORKSPACE-001-project-instruction-discovery.md | Project instruction files and workspace skills are separate context sources. | +| specified-by | TBD | TBD | specs/L3/skills/TBD.md | L3 behavior has not been authored yet. | + +## References + +- [Agent Skills](https://agentskills.io/) +- [Agent Skills specification](https://agentskills.io/specification) +- [Adding skills support to agents](https://agentskills.io/client-implementation/adding-skills-support) + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-25 | Assistant | Initial | Initial Agent Skills architecture based on Agent Skills reference documentation and product requirements. | +| 1 | 2026-05-25 | Human | Refinement | Linked skill configuration to the concrete `config.toml` schema. 
| diff --git a/specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md b/specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md new file mode 100644 index 00000000..2936d6af --- /dev/null +++ b/specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md @@ -0,0 +1,379 @@ +--- +artifact_id: L2-DES-TOOL-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-TOOL-001 — Built-In Tool System + +## Purpose + +Define the built-in tool system used by the agent execution engine, including baseline tool categories, tool lifecycle, safety gates, visibility, and the plan tool that maintains user-visible task planning state. + +## Background / Context + +The agent execution engine depends on tools to inspect the workspace, modify files, run commands, ask for approval, ask Plan Mode clarification questions, search, fetch external content, coordinate subagents, and keep a visible plan. + +`L2-DES-AGENT-001` defines where model-requested tool dispatch occurs in the execution loop. This design defines the tool system that dispatch uses. + +The plan tool is part of the tool system rather than private model reasoning. It updates a user-visible to-do list that represents planned work and execution status. + +## Source Requirements + +- `L1-REQ-TOOL-002` requires a baseline set of built-in tools. +- `L1-REQ-AGENT-003` requires visible task planning with status updates. +- `L1-REQ-AGENT-005` restricts the question tool to Plan Mode. +- `L1-REQ-LLM-002` requires model-requested tool use through a controlled lifecycle. +- `L1-REQ-TOOL-001` requires tool safety, approval, redaction, and bounded output. +- `L1-REQ-TOOL-005` requires background process visibility and manual stop behavior. +- `L1-REQ-TOOL-003` requires configurable web search behavior. +- `L1-REQ-TOOL-004` requires explicit parallel tool orchestration where enabled. +- `L1-REQ-AGENT-004` requires subagent delegation where enabled. 
+- `L1-REQ-GOAL-001` requires verified completion and blocker reporting for Ralph Loop goals. +- `L1-REQ-APP-010` requires configuration and unavailable-state behavior. +- `L2-DES-AGENT-001` defines the execution engine that dispatches tools. +- `L2-DES-AGENT-002` defines interruption and background process control. +- `L2-DES-APP-003` defines protocol events that expose tool and plan state. +- `L2-DES-CONV-001` defines durable tool, plan, and transcript records. +- `L2-DES-GOAL-001` defines the narrow model-facing goal update tool. + +## Design Requirement + +The program should provide a server-owned tool registry and tool supervisor. The model may request tools through structured tool calls, but the server validates, authorizes, executes, records, and reports every tool call. + +Tool calls are not arbitrary code paths. Each tool must be defined by metadata, schema, capability classification, safety policy, output policy, and runtime handler. + +## Tool Registry + +Conceptual `ToolDefinition` fields: + +- `tool_name` +- `display_name` +- `description` +- `input_schema` +- `output_schema` +- `tool_category` +- `execution_mode`: read_only, mutating, command, background_process, user_prompt, planning, goal_status, delegation, web, or internal. +- `availability`: available, disabled, needs_configuration, unsupported, or blocked_by_mode. +- `configuration_refs` +- `permission_profile` +- `permission_policy` +- `redaction_policy` +- `output_limit_policy` +- `supports_streaming_output` +- `supports_cancellation` +- `supports_parallel_execution` + +The registry should expose only tools available to the current session mode, permission posture, configuration, and model capability. A tool that is disabled or misconfigured should fail explicitly with a structured unavailable result rather than fabricating output. + +## Permission Policy And Sandbox + +Tool execution is governed by permission policy and sandbox policy as separate layers. 
+ +`permission_policy` controls whether a tool call may proceed automatically, requires review, or requires user approval. The initial policy values are: + +- `default`: baseline permission behavior for normal sessions. +- `auto_review`: review-oriented behavior that classifies tool calls before execution and requires user approval when risk or ambiguity remains. +- `full_access`: broad permission behavior for trusted contexts, while still preserving validation, mode constraints, privacy rules, audit recording, and sandbox restrictions. + +Sandbox policy controls what the tool execution process can do at the host boundary. The durable sandbox configuration schema is not finalized, but the sandbox design target is to restrict system calls such as `open`, `read`, and `write`, with practical controls over: + +- Directory read access. +- Directory write access. +- File creation, mutation, rename, and deletion. +- Process execution boundaries where supported. +- Network access at the domain level. + +The permission policy must not be treated as the sandbox. `full_access` can reduce approval prompts only inside the limits still imposed by the sandbox and tool validation. + +## Command Intent Inputs + +Shell or command execution tools should include an invocation-level `description` field at the beginning of their input schema. The field should be a concise natural-language sentence describing what the model intends the command to accomplish before the model provides the command text. + +This requirement is specific to command-like tools. Structured tools such as file read, file write, apply patch, search, approval, question, plan, and subagent coordination should not be forced to add a generic `description` field when their schema already carries intent through typed arguments. + +Conceptual command tool invocation fields: + +- `description` +- `command` +- `timeout` where applicable. +- `working_directory` where applicable. 
+- `tool_call_id` +- `session_id` +- `turn_id` +- `requested_at` + +The command `description` field is not hidden reasoning and not a substitute for validation. It is an explicit intent summary used by the runtime, audit trail, and model-facing context. It helps bind a shell command to a natural-language purpose before the command text is generated and executed. + +The command text may still include normal shell comments, including a first-line intent comment, when that is useful for command readability or model generation quality. Such comments are part of the executable script text sent to the shell. The runtime should not parse shell comments as protocol metadata, and it should not require a command comment to duplicate the structured `description` field. + +Command execution tools should use an input shape where `description` precedes `command`. For example: + +- `description`: one sentence describing the command's intended outcome. +- `command`: the command text to execute. + +Example command-tool input: + +```json +{ + "description": "Check if the dev server is running, then capture any errors.", + "command": "# Check if the dev server is running, then capture any errors.\nsleep 5\nSERVER_STATUS=$(curl -s -o /dev/null -w \"%{http_code}\" http://localhost:3000)\nif [ \"$SERVER_STATUS\" = \"200\" ]; then\n echo \"Dev server is healthy, checking for runtime errors\"\n cat /tmp/app.log | grep -i \"error\" | tail -20\nelse\n echo \"Dev server failed to start (HTTP $SERVER_STATUS), showing startup logs\"\n cat /tmp/app.log | tail -50\nfi" +} +``` + +In this example, the command description states the canonical intent before the command body. The command body also includes a normal shell comment and translates raw HTTP status into natural-language status lines that are easier for the model and user to interpret. + +The server should validate the actual command independently from the description. 
If the description and command conflict, the server should treat the call as suspicious, invalid, or approval-worthy according to safety policy. + +## Baseline Tool Categories + +The baseline built-in tool set should cover these categories: + +| Category | Purpose | Examples | +|---|---|---| +| File read | Inspect file contents and metadata. | read file, list directory. | +| File mutation | Create, edit, delete, or rename files through structured operations. | write, apply patch. | +| Search | Find files or content in the workspace. | file-name search, content search. | +| Command execution | Run shell commands with bounded output. | one-shot command execution. | +| Background process | Track long-running commands and process stdin. | dev server, test watcher, interactive command. | +| Planning | Maintain visible task plan state. | plan tool. | +| Goal status | Report verified goal completion or blockers. | goal update tool. | +| Approval | Ask the user for permission before risky actions. | approval request. | +| Question | Ask the user for clarification in Plan Mode only. | question tool. | +| Web | Fetch or search external content where configured. | web fetch, web search. | +| Delegation | Start or coordinate subagents where enabled. | subagent spawn/status/result. | +| Parallel orchestration | Execute an explicit group of valid tool calls concurrently. | `multi_tool_use`. | + +Exact tool names and schemas are L3 concerns. This L2 design defines the categories and lifecycle constraints. + +## Tool Invocation Lifecycle + +The normal lifecycle for a model-requested tool call is: + +1. Provider stream emits a structured tool request, including a command intent `description` for shell or command execution tools. +2. Execution engine normalizes the request into an internal tool invocation. +3. Tool registry resolves the tool definition. +4. Input is validated against the tool schema. +5. Mode, permission, safety, and configuration gates are evaluated. +6. 
Approval is requested if required. +7. The tool supervisor executes the handler or returns a structured denial/unavailable result. +8. Streaming progress is emitted where supported. +9. Output is bounded, redacted, and normalized. +10. Durable tool call and tool result records are appended. +11. Server-client events update subscribed clients. +12. The structured result is returned to the model if the provider interaction continues. + +Every tool invocation should produce one of these terminal states: + +- `completed` +- `denied` +- `blocked_by_mode` +- `needs_configuration` +- `invalid_input` +- `failed` +- `canceled` +- `interrupted` + +## Plan Tool + +The plan tool maintains a visible to-do list for the current session or active task. It is the primary mechanism for satisfying visible task-planning requirements without exposing private model reasoning. + +The plan tool should be available in Normal Mode and Plan Mode. In Plan Mode it may be used to build a strategic plan without mutating files. In Normal Mode it may be used to track execution progress. + +Conceptual plan fields: + +- `plan_id` +- `session_id` +- `turn_id` where created or last updated. +- `objective` +- `items` +- `status`: active, completed, blocked, abandoned, or superseded. +- `created_by`: agent, user, or imported. +- `created_at` +- `updated_at` + +Conceptual plan item fields: + +- `plan_item_id` +- `text` +- `status`: pending, in_progress, completed, blocked, or canceled. +- `details` +- `parent_item_id` +- `parallel_group_id` +- `source_turn_id` +- `updated_at` + +Plan tool operations should include: + +- Create or replace an active plan. +- Add plan items. +- Update item status. +- Mark the overall plan complete, blocked, abandoned, or superseded. +- Attach a blocker or short status note. +- Represent explicit parallel work through multiple in-progress items or a `parallel_group_id`. + +The plan tool output is program state, not hidden chain-of-thought. 
Plan item text should be concise, user-visible, and action-oriented. + +## Plan Consistency + +The execution engine should keep plan state consistent with actual execution state: + +- When a planned step starts, the corresponding item should become `in_progress`. +- When a planned step finishes, the corresponding item should become `completed`. +- When execution cannot continue, the corresponding item should become `blocked` with a concise reason. +- If the user's objective changes, the active plan should be updated, superseded, or explicitly abandoned. +- If work is delegated to subagents, plan state should identify delegated or parallel work. + +The plan tool should not be mandatory for trivial one-step tasks. The agent may create a plan when task complexity, risk, user request, Plan Mode, or parallel work justifies it. + +## Goal Update Tool + +The goal update tool lets the model report that the current Ralph Loop goal is verified complete or blocked. It is not the user-facing `/goal` command and must not expose user-owned goal controls to the model. + +Allowed operations: + +- Mark the current goal `complete` with a verification summary and evidence references. +- Mark the current goal `blocked` with a blocker summary and the user input or external state change needed to continue. + +Disallowed operations: + +- Create a goal. +- Replace a goal. +- Edit the objective. +- Increase, reset, or remove a budget. +- Pause, resume, clear, or cancel a goal. + +The tool should include `expected_goal_id` so stale model output cannot update a replaced goal. If the goal changed after context assembly, the tool result should become a no-op with a factual stale-state summary. + +The tool result is program state. It should produce durable goal records and client-visible goal events as defined by `L2-DES-GOAL-001`, `L2-DES-CONV-001`, and `L2-DES-APP-003`. + +## Mode Gating + +Tools must respect session-local interaction mode and session-level agent mode. 
+ +Plan Mode constraints: + +- Mutating file tools must be blocked. +- The question tool may be available for clarification. +- Read-only tools, search tools, and safe inspection tools may be available. +- The plan tool should be available. +- Command, web, and subagent tools should follow explicit mode policy because they may have non-file side effects. + +Normal Mode constraints: + +- The question tool must be blocked unless a later requirement explicitly allows another mode. +- The plan tool remains available for visible progress tracking. +- Mutating tools may be available subject to permission and safety policy. + +Tool availability should be resolved before tool schemas are exposed to the model where practical. If a provider still emits a blocked tool call, the server must return a structured blocked result. + +## Parallel Tool Orchestration + +`multi_tool_use` is an explicit orchestration tool. It should not bypass the lifecycle of individual tool calls. + +Rules: + +- Each child tool call must be independently resolved, validated, authorized, and recorded. +- Child calls should execute concurrently only when their tool definitions allow parallel execution. +- Mutating child calls require conflict and safety handling before concurrent execution. +- The parent orchestration result should preserve child ordering, child ids, terminal states, and partial failures. +- Progress events should be emitted per child call so clients can render work before the entire group completes. + +## Output And Redaction + +Tool output should be split into: + +- Natural-language status summary returned to the model. +- Canonical result content returned to the model. +- Display content safe for clients. +- Structured status fields such as exit code, HTTP status, process id, or file counts. +- Durable output references where output is large. +- Redaction metadata explaining whether secrets or unsafe content were removed. 
+ +The natural-language status summary should translate raw tool-domain signals into a concise statement that supports the next agent decision. For example, a command result should not expose only `exit_code: 1`; it should also provide a summary such as "Command failed: port 3000 is already in use by process 8432." Structured fields should still be retained for exactness, replay, and UI display. + +Tool result summaries should be factual and derived from observed tool output or structured status. They must not invent likely causes or next actions. When the tool can identify a cause, the summary should name it directly; when it cannot, it should say what is known. + +Output must be bounded so noisy commands, large files, web content, or background processes do not make the client or model context unusable. + +## Background Processes + +Command tools may start processes that continue after the originating tool call returns. Such processes should be registered with the tool supervisor and exposed through `L2-DES-AGENT-002` and `L2-DES-APP-003`. + +The built-in tool system should record: + +- Process id where available. +- Command label. +- Session and turn association. +- Workspace root. +- Runtime status. +- Recent output reference. +- Stdin capability. +- Stop capability. + +Detailed process termination semantics are refined by interrupt/resume and future tool L3 designs. + +## Durable Recording + +The tool system should produce durable records through `L2-DES-CONV-001` for: + +- Tool calls. +- Tool results. +- Tool progress summaries where needed for replay. +- Plan creation and updates. +- Goal completion or blocker reports. +- Approval and question requests. +- Background process registration and terminal state. +- Workspace change-set records from structured mutating tools. + +Live server-client events may be more frequent than durable records, but replay must recover the final visible tool and plan state. 
+ +## Invariants + +- Tools execute only through the server-owned tool supervisor. +- Tool schemas exposed to the model reflect current mode, configuration, and capability state where practical. +- Shell or command execution tool calls include a concise intent `description` before the command text. +- Tool calls cannot bypass validation, safety, approval, or mode gates. +- The question tool is blocked in Normal Mode. +- The plan tool creates visible plan state and does not expose private model reasoning. +- The goal update tool can only report verified completion or blockers; it cannot modify user-owned goal parameters. +- Mutating tools report file changes to the core-owned workspace change set where supported. +- Tool outputs are bounded and redacted before model or client exposure. +- Tool outputs include natural-language status summaries alongside structured status fields. +- A tool unavailable due to configuration returns a clear unavailable result. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TOOL-002 | 1 | specs/L1/L1-REQ-TOOL-002-tools.md | Defines the built-in tool registry, categories, lifecycle, and baseline tool behavior. | +| refines | L1-REQ-AGENT-003 | 1 | specs/L1/L1-REQ-AGENT-003-task-planning.md | Defines the plan tool as visible to-do state for task planning and progress. | +| related-to | L1-REQ-AGENT-005 | 1 | specs/L1/L1-REQ-AGENT-005-plan-mode.md | Applies Plan Mode restrictions to mutating tools and question-tool availability. | +| related-to | L1-REQ-LLM-002 | 1 | specs/L1/L1-REQ-LLM-002-tools.md | Defines the controlled lifecycle for model-requested tools. | +| related-to | L1-REQ-TOOL-001 | 1 | specs/L1/L1-REQ-TOOL-001-safety.md | Tool safety, approval, and redaction gates apply to all tool calls. 
| +| related-to | L1-REQ-APP-003 | 1 | specs/L1/L1-REQ-APP-003-safety.md | Tool execution remains bounded by permissions, sandboxing, and user approval. | +| related-to | L1-REQ-TOOL-003 | 1 | specs/L1/L1-REQ-TOOL-003-web-search-configuration.md | Web search is a configurable built-in tool category. | +| related-to | L1-REQ-TOOL-004 | 1 | specs/L1/L1-REQ-TOOL-004-parallel-tool-orchestration.md | `multi_tool_use` is the explicit parallel orchestration tool. | +| related-to | L1-REQ-TOOL-005 | 1 | specs/L1/L1-REQ-TOOL-005-background-process-management.md | Command tools can register background processes for inspection and stop control. | +| related-to | L1-REQ-AGENT-004 | 1 | specs/L1/L1-REQ-AGENT-004-subagents.md | Subagent coordination is a built-in delegation tool category where enabled. | +| related-to | L1-REQ-GOAL-001 | 1 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | Defines the narrow model-facing goal update tool for verified completion and blockers. | +| related-to | L2-DES-AGENT-001 | 1 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | The execution engine dispatches tools through this tool system. | +| related-to | L2-DES-AGENT-002 | 1 | specs/L2/agent/L2-DES-AGENT-002-interrupt-resume-control.md | Interrupt and resume control active tool and background process work. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Protocol events expose tool and plan state to clients. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Durable records preserve tool and plan state. | +| related-to | L2-DES-GOAL-001 | 1 | specs/L2/goal/L2-DES-GOAL-001-ralph-loop-goals.md | Defines goal status transitions exposed through the goal update tool. | +| specified-by | TBD | TBD | specs/L3/tool/TBD.md | L3 behavior has not been authored yet. 
| + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial built-in tool system and plan tool design. | +| 1 | 2026-05-22 | Human | Refinement | Added command intent inputs and natural-language tool status summaries. | +| 1 | 2026-05-23 | Human | Refinement | Added the narrow model-facing goal update tool for Ralph Loop completion and blockers. | +| 1 | 2026-05-25 | Human | Refinement | Renamed tool metadata from `approval_policy` to `permission_policy` and separated permission policy from sandbox enforcement direction. | diff --git a/specs/L2/tui/L2-DES-TUI-001-onboarding-ui-flow.md b/specs/L2/tui/L2-DES-TUI-001-onboarding-ui-flow.md new file mode 100644 index 00000000..42bebdd9 --- /dev/null +++ b/specs/L2/tui/L2-DES-TUI-001-onboarding-ui-flow.md @@ -0,0 +1,208 @@ +--- +artifact_id: L2-DES-TUI-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-TUI-001 — Onboarding UI Flow + +## Purpose + +Refine the TUI onboarding requirement into a concrete terminal interaction design for required model setup. + +## Background / Context + +The TUI onboarding flow must let a user configure an invocable model without leaving the terminal. L1 requires model slug selection, provider selection or creation, provider-specific model name entry, invocation method selection, and reasoning effort selection where supported. + +This L2 design defines the concrete terminal flow, inline presentation, popup behavior, and interaction sequence. It does not define storage backends, provider validation protocol, or final visual styling values. + +## Source Requirements + +- `L1-REQ-TUI-010` requires TUI onboarding for required model setup. +- `L1-REQ-MODEL-001` defines supported model definitions, invocable model configuration, and credential status expectations. 
+- `L1-REQ-MODEL-002` defines provider setup and provider availability behavior. +- `L1-REQ-MODEL-003` defines onboarding as the product-level setup path. +- `L1-REQ-APP-010` defines persistent configuration and project-over-user configuration precedence. +- `L1-REQ-APP-012` defines privacy and credential-handling expectations. +- `L2-DES-MODEL-001` defines the supported model, user provider, and model-provider binding records created by this flow. +- `L2-DES-APP-002` defines the configuration source precedence and default persistence target behavior. + +## Design Requirement + +The TUI onboarding UI should use a searchable popup for discrete selections and an inline form for provider, model name, and credential entry. Every active input field or popup must show a concise hint that explains the current value expected from the user. + +The flow order is: + +1. Search and select supported model slug. +2. Close the model slug popup after confirmation. +3. Select an existing provider or choose to add a provider. +4. If adding a provider, enter provider name. +5. If adding a provider, enter base URL. +6. If adding a provider, enter API key. +7. Enter the model name expected by the selected provider. +8. Search and select invocation method. +9. Search and select reasoning effort when the selected model supports reasoning. +10. Persist setup results to configuration. +11. Complete setup and continue to a usable session. + +## Interaction Sketch + +The following ASCII sketch defines the required interaction structure and visible control groups. It is not a final styling specification for dimensions, color, or focus rings. + +Onboarding controls should be visually unframed. Popup sections must not use outer ASCII box borders such as `+--------+` or full-frame side borders. The inline setup stack should use a single vertical rail to connect the configured fields from top to bottom. 
+ +```text +Select Model Slug +Hint: Choose the model capability profile the program should use. + +Search: gpt + +> openai/gpt-5.5 + openai/gpt-5.4 + anthropic/claude-opus + local/qwen3-coder + +Enter: select and close popup +Esc: cancel + +Select Provider +Hint: Choose a provider or add one. + +Search: open + +> OpenAI + OpenRouter + Add provider... + +Enter: select and close popup +Esc: back + +Model: openai/gpt-5.5 +| +* provider name: +| Hint: Enter a name to recognize this provider later. +| OpenRouter +| +* base url: +| Hint: Enter the provider API base URL. +| https://api.example +| +* api key: +| Hint: Enter the API key for this provider. +| [hidden input] +| +* model name: +| Hint: Enter the model name this provider expects. +| openai/gpt-5.5 +| +* invocation method: +| Hint: Choose the API protocol used for this binding. +| [open popup] +| +* reasoning effort: +| Hint: Choose the default reasoning effort for this binding. +| [open popup if the model supports reasoning] +| + +Invocation Method +Hint: Choose the API protocol used to call this model. + +Search: openai + +> OpenAI Responses + OpenAI Chat Completions + Anthropic Messages + +Enter: select and close popup +Esc: back + +Reasoning Effort +Hint: Choose the default reasoning effort for this binding. + +> medium + high + xhigh + +Enter: select and close popup +Esc: back +``` + +## Flow Behavior + +- The model slug selector is the first control in model onboarding. +- The model slug selector must show a hint that tells the user they are choosing the model capability profile the program should use. +- The model slug selector must support search or filtering by slug text. +- Pressing Enter on a highlighted model slug confirms the selection and closes the popup. +- Pressing Esc from the model slug selector cancels onboarding or returns to the previous onboarding step where one exists. +- After model slug confirmation, the provider selector opens. 
+- The provider selector must show existing providers plus an add-provider option. +- The provider selector must show a hint that tells the user to choose an existing provider or add one. +- Pressing Enter on a highlighted provider confirms the selection and closes the popup. +- If the user chooses to add a provider, provider detail entry is inline rather than a boxed popup. +- The inline setup view must display the selected model slug before editable provider fields. +- The inline setup view must use a single continuous vertical rail to connect model display, provider name entry, base URL entry, API key entry, model name entry, invocation method selection, and reasoning effort selection where applicable. The rail should appear under each field marker rather than before the `* field` label. +- The inline setup rail is a guide for the setup sequence, not an outer frame; it must not wrap the content on both sides or draw top/bottom box borders. +- Provider name entry must appear before base URL entry when adding a provider. +- Provider name entry must show a hint that tells the user to enter a name for recognizing the provider later. +- Base URL entry must show a hint that tells the user to enter the provider API base URL. +- API key entry must use hidden or masked input by default. +- API key entry must show a hint that tells the user to enter the API key for the selected provider. +- Model name entry appears after provider selection or provider creation. +- Model name entry must show a hint that tells the user to enter the model name this provider expects for API calls. +- Invocation method selection appears after model name entry. +- Invocation method selection uses the same search-popup interaction pattern as model slug selection. +- Invocation method selection must show a hint that tells the user to choose the API protocol used to call this model through this provider. 
+- Invocation method choices include OpenAI Responses, OpenAI Chat Completions, and Anthropic Messages where available. +- Pressing Enter on a highlighted invocation method confirms the selection, closes the popup, and returns to the inline setup view. +- If the selected model supports reasoning, reasoning effort selection appears after invocation method selection. +- Reasoning effort selection uses the same search-popup interaction pattern as model slug selection. +- Reasoning effort selection must show a hint that tells the user to choose the default reasoning effort for this model binding. +- Pressing Enter on a highlighted reasoning effort confirms the selection, closes the popup, and returns to the inline setup view. +- If the selected model does not support reasoning, the inline setup view omits the reasoning effort selection step. +- Successful setup submits the selected values for persistent configuration storage before normal model invocation begins. +- If onboarding runs with an active project directory and no explicit target selection is available, the default persistence target is the project-scoped configuration file. +- If onboarding runs without an active project directory and no explicit target selection is available, the default persistence target is the user-scoped configuration file. +- Validation failures should preserve the selected model slug and safe completed fields where useful. + +## Error And Recovery Behavior + +- Invalid base URL input should produce a concise inline error near the base URL field. +- Invalid or rejected API key input should produce a concise provider setup error without writing the plaintext key into transcript history. +- Unsupported invocation method selection should be prevented by the selection list where possible. +- If provider validation fails after submission, the TUI should return to the inline setup view with safe completed fields preserved. 
+- If persistence fails after valid setup input, the TUI should report the configuration target and return to a recoverable setup state. +- The user should be able to go back from provider, invocation method, and reasoning effort popups without losing earlier safe fields. + +## Privacy Constraints + +- API key entry is an explicit credential-handling flow. +- Plaintext API keys must not appear in routine transcript, model list, model switcher, logging, or telemetry paths by default. +- The inline setup view may show credential status or masked input, but must not display plaintext credential values by default after entry. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-010 | 1 | specs/L1/L1-REQ-TUI-010-onboarding-ui.md | Provides the concrete terminal interaction design for TUI onboarding. | +| related-to | L1-REQ-MODEL-001 | 1 | specs/L1/L1-REQ-MODEL-001-config.md | Uses supported model and invocable model configuration requirements. | +| related-to | L1-REQ-MODEL-002 | 1 | specs/L1/L1-REQ-MODEL-002-provider.md | Uses provider setup requirements. | +| related-to | L1-REQ-MODEL-003 | 1 | specs/L1/L1-REQ-MODEL-003-onboard.md | Refines the TUI presentation of model onboarding. | +| related-to | L1-REQ-APP-010 | 1 | specs/L1/L1-REQ-APP-010-configuration.md | Uses persistent configuration and project-over-user precedence requirements. | +| related-to | L1-REQ-APP-012 | 1 | specs/L1/L1-REQ-APP-012-privacy-data-ownership.md | Carries credential-handling constraints into UI design. | +| related-to | L2-DES-MODEL-001 | 1 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | The flow creates user provider and model-provider binding records. | +| related-to | L2-DES-APP-002 | 1 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | Defines where successful onboarding results are persisted and how they are resolved. 
| +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial L2 design extracted from the approved concrete TUI onboarding sketch. | +| 1 | 2026-05-22 | Human | Refinement | Updated the flow to model-first, provider-select-or-add, provider name/base URL/API key, provider-specific model name, invocation method, reasoning effort, and per-field hints. | +| 1 | 2026-05-22 | Human | Refinement | Added persistent configuration storage and default target behavior for successful onboarding. | +| 1 | 2026-05-25 | Human | Refinement | Removed outer ASCII frames while keeping a continuous inline rail under the setup field markers. | diff --git a/specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md b/specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md new file mode 100644 index 00000000..c538ca20 --- /dev/null +++ b/specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md @@ -0,0 +1,230 @@ +--- +artifact_id: L2-DES-TUI-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-TUI-002 — Modern TUI Shell Layout + +## Purpose + +Refine the terminal user interface, transcript, state visibility, and responsive layout requirements into a concrete modern TUI shell design. + +## Background / Context + +The TUI is the first interactive client. It must feel like a capable terminal-native work surface, not a decorative chat page. The layout should be dense, calm, readable, and stable while the agent streams model output, runs tools, waits for approvals, and accepts user input. + +This document defines the high-level visual structure and responsive behavior. Specific composer behavior, streaming cell behavior, and terminal lifecycle cleanup are refined by adjacent TUI L2 documents. 
+ +## Source Requirements + +- `L1-REQ-APP-007` requires an inline-capable terminal UI with header or status area, transcript area, composer area, onboarding, command discovery, and visible active work state. +- `L1-REQ-TUI-003` requires a readable and reviewable transcript. +- `L1-REQ-TUI-004` requires visible current execution state. +- `L1-REQ-TUI-007` requires responsive layout and readability. +- `L1-REQ-CLIENT-001` requires Unicode-safe and localization-ready client display behavior. +- `L2-DES-APP-003` defines the canonical client/server events used to drive the UI. +- `L2-DES-CONV-001` defines transcript turns, items, and durable replay state. +- `L2-DES-TUI-003` defines composer and input-mode behavior. +- `L2-DES-TUI-004` defines streaming transcript and state rendering. +- `L2-DES-TUI-005` defines terminal lifecycle safety. +- `L2-DES-TUI-006` defines the full transcript alternate-screen overlay entered by `Ctrl+T`. + +## Design Requirement + +The TUI should be organized as a stable vertical shell with five conceptual regions: + +1. Session header. +2. Transcript viewport. +3. Active work strip or working indicator. +4. Composer. +5. Bottom status line. + +Inline mode and alternate-screen mode should use the same conceptual shell. Inline mode additionally preserves useful terminal scrollback above the live region where possible. + +## Modern TUI Principles + +- Prioritize task state, transcript content, and composer usability over decoration. +- Keep region boundaries visually clear but lightweight. +- Use stable row allocation so streaming updates do not cause avoidable layout jitter. +- Use icons or compact labels only when they improve scanning. +- Avoid nested boxed panels. Use rows, separators, indentation, and compact cells instead. +- Treat color as secondary information. Text labels must still communicate state without color. +- Prefer summary lines plus expandable or scrollable detail for long content. 
+- Keep the composer visible whenever the terminal is large enough for interaction. + +## Standard Layout + +The following sketch is normative for region order and relative priority. It is not a final choice of border glyphs, color, or exact column widths. + +```text + +┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ ██████╗ ███████╗██╗ ██╗ ██████╗ ┃ +┃ ██╔══██╗ ██╔════╝██║ ██║██╔═══██╗ ┃ +┃ ██║ ██║ █████╗ ██║ ██║██║ ██║ v0.1.9┃ +┃ ██║ ██║ ██╔══╝ ╚██╗ ██╔╝██║ ██║ ┃ +┃ ██████╔╝ ███████╗ ╚████╔╝ ╚██████╔╝ ┃ +┃ ╚═════╝ ╚══════╝ ╚═══╝ ╚═════╝ ┃ +┣━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┫ +┃ Model deepseek-v4-pro Reasoning high ┃ +┃ Workspace ~/Desktop/devo ┃ +┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ + + Tip: Ready in /Users/username/Desktop/devo + +┃ Fix the parser regression and run the focused tests. + +┃ Thought: The failure is isolated to escaped quote handling. + +┃ I will update the parser branch, add a regression test, and run the + focused suite. + +┃ Explore + ┗ Read crates/parser/src/lib.rs + Grep "quoted_escape" crates/parser tests + +┃ Edit crates/parser/src/lib.rs + + @@ parse_value + - return parse_bare_value(input); + + return parse_quoted_or_bare_value(input); + +⠋ Working · 12s + +┃ Ask Devo + + Build · deepseek-v4-pro high ↑0[cached 0 0%] ↓0 ▱▱▱▱▱▱▱▱▱▱ 0% 0/950k +``` + +Required characteristics: + +- The startup header provides product identity, version, current model, workspace, and reasoning effort. +- The transcript viewport owns most vertical space. +- The active work strip appears only when useful and shows transient live state such as `⠋ Working · 12s`. +- The composer remains above the bottom status line. +- The bottom status line is reserved for active mode, current model/reasoning, token usage, and context-window pressure. +- The `┃` glyph in transcript and composer regions is normally a single leading marker for the active prompt line, cell title, or first content line. 
User-message cells and the bottom input composer are background-band surfaces: when they contain multiple user-entered lines, each content line may repeat `┃`, while top and bottom padding rows keep the shared background without rendering the marker. Diff detail, tool output detail, and assistant wrapped text align under their content column and do not repeat the marker unless they are separate logical cells. + +## Startup Header Visual Rules + +The startup header should use the boxed layout from the standard sketch. It should be a first-screen identity and environment summary, not a repeating transcript cell. + +Theme mapping: + +- The ASCII-art product wordmark uses the theme primary foreground color. +- Header border glyphs such as `┏`, `━`, `┣`, `┃`, and `┗` use a muted grey foreground. +- Header metadata labels `Model`, `Workspace`, and `Reasoning` use muted grey. +- Header version text such as `v0.1.9` uses muted grey. +- Header metadata values, such as model slug, workspace path, and reasoning value, use normal white foreground. +- The header box should not rely on color alone; the border, labels, and values must remain readable in monochrome terminals. + +The tip line below the header should use bold styling for `Tip:` and normal styling for the tip content. + +```text + Tip: Ready in /Users/username/Desktop/devo +``` + +In the rendered TUI, only `Tip:` is bold. + +## Transcript Viewport In The Shell + +The transcript viewport sits between the startup/session header and the bottom composer region. It is a scrollable list of transcript cells defined by `L2-DES-TUI-004`. + +Shell-level responsibilities: + +- Allocate most vertical space to the transcript viewport. +- Keep the single transcript cell marker column aligned with the composer marker column where practical. +- Preserve enough bottom space for the working indicator, composer, and status line. +- Let completed transcript cells scroll away while live composer and status regions remain fixed.
+- Avoid embedding the transcript viewport inside a decorative card or nested box. + +The shell owns placement and clipping. `L2-DES-TUI-004` owns the detailed rendering of user, assistant, tool, shell, working, and completed-turn cells. + +Example shell composition with transcript content: + +```text +┃ Add escaping support for quoted parser values. + +┃ Thinking: Checking existing parser tests and the quoted branch. + +┃ The tests show escaped quotes are currently treated as ordinary text. + +┃ Explore + ┗ Read crates/parser/src/lib.rs + Read tests/parser/quoted.rs + +┃ Running cargo test parser::quoted -- --nocapture + +⠋ Working · 8s + +┃ Ask Devo + + Build · deepseek-v4-pro high ↑420[cached 300 71%] ↓12 ▰▰▱▱▱▱▱▱▱▱ 20% 190k/950k +``` + +## Region Responsibilities + +| Region | Responsibility | Must Avoid | +|---|---|---| +| Header | Startup identity, version, model, workspace, and reasoning effort. | Consuming too many rows or repeating full configuration. | +| Transcript viewport | Durable user-visible conversation, tool, approval, question, and error history. | Showing unlimited raw output inline. | +| Active work strip | Current live work summary, waiting reason, running background process summary. | Becoming the only place important state appears. | +| Composer | Current editable input, popups, mode-specific input affordances. | Being pushed off-screen during streaming. | +| Bottom status line | Current mode, model/reasoning, token/cache usage, and context-window usage. | Duplicating long transcript content. | + +## Responsive Rules + +- The composer must remain usable at every supported terminal size. +- Long lines should wrap, fold, or truncate by display columns, not bytes. +- Folding or truncation must be visible when important content is omitted. +- Terminal resize should recompute layout from current state rather than incrementally shifting old rows. +- Optional metadata should collapse before transcript or composer content. 
+- Popups should be constrained to the visible terminal and should not cover the composer unless the popup is directly editing composer state. +- If the terminal is too small for meaningful interaction, the TUI should show a concise minimum-size message and preserve terminal safety. + +## State Sources + +The TUI shell should render from server-confirmed state whenever possible: + +- Session snapshots from `session.open` or `session.subscribe`. +- Turn events from `turn.event`. +- Usage updates from `usage_updated`. +- Context pressure from `context_updated`. +- Tool and background process events from `tool_call_*` and `background_process_updated`. +- Error diagnostics from `error_reported`. + +Clients may optimistically render local input, but canonical state comes from the server protocol. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-APP-007 | 1 | specs/L1/L1-REQ-APP-007-tui.md | Defines the high-level terminal shell, core regions, inline/fullscreen consistency, and visible active-work layout. | +| refines | L1-REQ-TUI-007 | 1 | specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md | Defines responsive priorities, narrow layout behavior, and non-overlap rules. | +| related-to | L1-REQ-TUI-003 | 1 | specs/L1/L1-REQ-TUI-003-transcript.md | Provides the transcript viewport placement and layout constraints. | +| related-to | L1-REQ-TUI-004 | 1 | specs/L1/L1-REQ-TUI-004-state-visibility.md | Defines shell regions that expose execution state. | +| related-to | L1-REQ-CLIENT-001 | 1 | specs/L1/L1-REQ-CLIENT-001-localization-readiness.md | Responsive layout must account for Unicode and localized display width. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Protocol events provide canonical state for the shell. 
| +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Durable transcript records are rendered in the viewport. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Composer and input mode behavior fills the shell's bottom regions. | +| related-to | L2-DES-TUI-004 | 1 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | Streaming transcript cells and state indicators populate the shell. | +| related-to | L2-DES-TUI-005 | 1 | specs/L2/tui/L2-DES-TUI-005-terminal-lifecycle-safety.md | Terminal lifecycle behavior constrains inline and alternate-screen shell modes. | +| related-to | L2-DES-TUI-006 | 1 | specs/L2/tui/L2-DES-TUI-006-full-transcript-alternate-screen.md | Defines the alternate-screen transcript review surface entered from the inline shell. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial modern TUI shell and responsive layout design. | +| 1 | 2026-05-23 | Human | Refinement | Added concrete startup header sketch, theme-color rules, bold tip label, composer prompt band, and token/context status line shape. | +| 1 | 2026-05-23 | Human | Refinement | Clarified that `┃` is a single leading marker, not a rail repeated through the full cell. | +| 1 | 2026-05-23 | Human | Refinement | Updated working indicator examples to use the spinner frame style and kept consecutive read calls on separate Explore lines. | +| 1 | 2026-05-23 | Human | Refinement | Clarified that multi-line composer and user-message background bands may repeat `┃` on content lines, but not on padding rows. | +| 1 | 2026-05-25 | Assistant | Refinement | Linked the shell layout to the `Ctrl+T` full transcript alternate-screen design. 
| diff --git a/specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md b/specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md new file mode 100644 index 00000000..a32f7a32 --- /dev/null +++ b/specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md @@ -0,0 +1,351 @@ +--- +artifact_id: L2-DES-TUI-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-TUI-003 — Composer And Input Modes + +## Purpose + +Refine TUI composer, terminal command prefix, and session-local input mode requirements into a concrete interaction design. + +## Background / Context + +The composer is the user's main control surface. It must handle normal chat input, multi-line input, command discovery, TUI-only shell command entry, and Plan Mode input without surprising the user or breaking Unicode/IME text entry. + +Session-local input modes affect how the TUI interprets the next composer submission. They are distinct from session-level agent modes. + +## Source Requirements + +- `L1-REQ-TUI-001` requires reliable text entry, multi-line input, intentional submit, command discovery, and session-local input modes. +- `L1-REQ-TUI-006` requires discoverable and intentional command invocation from the TUI. +- `L1-REQ-TUI-008` requires leading `!` input to enter Shell Mode and execute through the terminal command capability. +- `L1-REQ-TUI-009` requires Default Input Mode, Shell Mode, Plan Mode, and bottom status line labels for `Build`, `Plan`, and `Shell`. +- `L1-REQ-TUI-007` requires the composer and bottom status line to remain readable and non-overlapping. +- `L1-REQ-CLIENT-001` requires Unicode, IME, and wide-character safety. +- `L1-REQ-AGENT-005` defines agent-level Plan Mode behavior. +- `L1-REQ-TOOL-002` defines the command execution capability used by Shell Mode. +- `L2-DES-TUI-002` defines the shell regions occupied by composer and status line. 
+- `L2-DES-CLIENT-001` defines localization and Unicode readiness. + +## Design Requirement + +The composer should be a stable bottom-region editor with explicit submission semantics, visible non-default input modes, and predictable prefix handling. + +The TUI should support these session-local input modes: + +| Input Mode | Purpose | Status Label | +|---|---|---| +| Default Input Mode | Normal build/task input. | `Build` | +| Shell Mode | Terminal command input routed to the program's command execution capability. | `Shell` | +| Plan Mode | Plan-oriented input governed by agent-level Plan Mode behavior. | `Plan` | + +## Composer Layout + +Default composer: + +```text + +┃ Ask Devo + + Build · deepseek-v4-pro high ↑0[cached 0 0%] ↓0 ▱▱▱▱▱▱▱▱▱▱ 0% 0/950k +``` + +Multi-line composer: + +```text + +┃ Refactor the parser in three steps: +┃ 1. isolate quoted value parsing +┃ 2. add regression tests +┃ 3. run the focused suite + + Build · deepseek-v4-pro high ↑0[cached 0 0%] ↓0 ▰▰▱▱▱▱▱▱▱▱ 20% 190k/950k +``` + +Shell Mode composer: + +```text + +┃ cargo test parser::quoted -- --nocapture + + Shell · deepseek-v4-pro high ↑420[cached 300 71%] ↓12 ▰▰▱▱▱▱▱▱▱▱ 20% 190k/950k +``` + +Plan Mode composer: + +```text + +┃ Plan the migration steps before changing files. + + Plan · deepseek-v4-pro high ↑420[cached 300 71%] ↓12 ▰▰▱▱▱▱▱▱▱▱ 20% 190k/950k +``` + +Rules: + +- Composer content lines use a left `┃` marker in the theme primary foreground color. +- The composer is rendered as a full-width input band: one padding line above the content, the content lines, and one padding line below the content share the same background span. +- For multi-line input, each user-entered content line may repeat the `┃` marker. The top and bottom padding rows must keep the input-band background but must not render `┃`. +- `Ask Devo` is the empty-input hint. It uses muted grey text and disappears as soon as the user types content. 
+- User-entered input replaces the hint and uses normal input foreground styling. +- The status label is the first field in the bottom status line. +- `Build`, `Plan`, and `Shell` must use distinct colors. +- The bottom status line appears below the composer. +- Composer height may grow for multi-line input within configured bounds, then scroll internally or show a line count. + +## Bottom Status Line + +The bottom status line has this conceptual shape: + +```text + Build · deepseek-v4-pro high ↑0[cached 0 0%] ↓0 ▱▱▱▱▱▱▱▱▱▱ 0% 0/950k +``` + +Fields: + +- `Build`, `Plan`, or `Shell`: current TUI input/work mode. `Build` is the normal default work-state label. `Plan` and `Shell` replace it when those session-local input modes are active. +- `deepseek-v4-pro`: current model name or model slug. +- `high`: current reasoning effort. +- `↑0[cached 0 0%]`: input token count, cached input token count, and input cache hit rate. +- `↓0`: output token count. +- `▱▱▱▱▱▱▱▱▱▱`: context-window usage bar. +- `0%`: context-window usage percentage. +- `0/950k`: context-window usage and effective context-window length. + +The status line should derive model, reasoning, token, cache, and context values from server-confirmed usage and context events where available. If a value is unavailable, estimated, or redacted, the status line should use a compact marker defined by L3 rather than inventing an exact value. + +## Submission Semantics + +The TUI should separate text editing from submission. + +Rules: + +- Plain submit sends the current composer content according to active input mode. +- Supported modified-enter input inserts a newline instead of submitting. +- If a terminal cannot report the required key sequence, the TUI may expose an alternate newline action through command discovery or documented keybinding. +- Empty input should not create a normal chat turn. +- The composer must preserve the submitted content exactly as entered, subject only to intentional mode-specific parsing. 
+- Submission should use client-generated ids so reconnect or retry does not duplicate messages. + +This L2 design does not mandate exact keybindings because terminal event support differs. The L3 design should define required keybindings and fallbacks for supported terminals. + +## Prefix Handling + +The TUI-specific terminal command prefix is `!` at the first character of composer input. + +Rules: + +- If the first character of composer input is `!`, the TUI enters Shell Mode. +- Leading whitespace before `!` does not trigger Shell Mode. This keeps pasted text and indented examples from becoming commands unexpectedly. +- A literal normal-chat message beginning with `!` should be escapable by prefixing a backslash, for example `\!important`. The backslash is removed before normal chat submission. +- Input that consists only of `!` enters Shell Mode without executing a command. +- Input that starts with `!` followed by command text may be treated as one-shot Shell Mode submission. +- Shell Mode should exit back to Default Input Mode after a command completes unless the user explicitly enters a persistent Shell Mode in a later approved design. + +## Shell Mode Execution + +Shell Mode turns composer content into a command execution request. + +Rules: + +- The command text is the composer content after the leading `!` prefix or the Shell Mode editor content. +- Shell Mode command execution must use the program's terminal command capability, not an unmanaged client-local shell. +- Shell Mode must respect workspace, permission policy, safety, privacy, and sandbox constraints. +- Shell Mode results should appear in the transcript as command/tool output with bounded display. +- If approval is required, the TUI should show the approval prompt and keep Shell Mode state understandable. +- Command output should be summarized or folded when long. +- Failed commands should show status, exit code where available, and a natural-language result summary. 
+ +Example Shell Mode flow: + +```text +User types: + +! cargo test parser::quoted + +TUI state: + +┃ Running cargo test parser::quoted + +⠋ Working · 2s + +┃ Ask Devo + + Shell · deepseek-v4-pro high ↑420[cached 300 71%] ↓12 ▰▰▱▱▱▱▱▱▱▱ 20% 190k/950k + +After completion: + +┃ Run cargo test parser::quoted failed 2.1s + ┗ output 42 lines hidden, 12 shown Ctrl+T for full transcript + Test failed: parser::quoted_escape expected escaped quote handling. +``` + +## Plan Mode Input + +Plan Mode is a session-local TUI input mode that activates agent-level Plan Mode behavior for submitted input. + +Rules: + +- Plan Mode must be visible in the bottom status line. +- Plan Mode does not change the session-level agent mode. +- Submitted Plan Mode input must be marked so the server applies Plan Mode rules. +- While Plan Mode is active, file modification is prohibited by agent/tool policy. +- The question tool may be used only where Plan Mode permits it. +- The TUI must not present Plan Mode as permission to make changes. +- Leaving Plan Mode returns the composer to Default Input Mode. + +Plan Mode can be entered by slash command, command palette, or another L3-defined control. This L2 document defines mode behavior, not the final keybinding. + +## Command Discovery + +The composer should provide command discovery without replacing typed text unexpectedly. + +Command discovery behavior: + +- Typing `/` in an empty composer opens the slash-command list. +- The composer line remains visible as `┃ /` while the list is open. +- `!` enters Shell Mode as defined above. +- Slash command suggestions should appear directly below the composer prompt line. +- The slash-command list has an eight-row visible height. +- Slash command rows render with two-character left padding. +- Up and Down arrow keys move the active selection. +- Enter confirms the active selection. +- Esc closes suggestions and preserves typed input where safe. 
+- The active row uses the theme primary foreground color for both the command name and description. +- In inactive rows, the command name uses normal white foreground and the description uses muted foreground. +- Selecting a suggestion must either invoke the command or open the relevant flow. + +Open slash-command list: + +```text +┃ / + + /theme switch the UI theme + /model choose the active model + /compact compact the current session context + /resume resume a saved chat + /goal set or view the goal for a long-running task + /new start a new chat + /status show current session configuration and token usage + /permissions choose what Devo is allowed to do +``` + +Full slash-command list: + +```text + /theme switch the UI theme + /model choose the active model + /compact compact the current session context + /resume resume a saved chat + /goal set or view the goal for a long-running task + /new start a new chat + /status show current session configuration and token usage + /permissions choose what Devo is allowed to do + /clear clear the current transcript + /onboard configure model provider connection + /btw start a side conversation in an ephemeral fork + /exit exit Devo +``` + +The eight-row visible list shows the first eight matching commands by default. When there are more than eight matches, the selection may scroll through the full list while preserving the two-character left padding and row color rules. + +Slash-command inline rendering: + +- When composer input begins with `/` and matches an existing slash command, the matched command token uses the theme primary foreground color. +- Parameters or placeholder text following the matched slash command use muted foreground color. +- If the typed slash command does not match an existing command, the composer should not apply matched-command coloring. +- Inline command coloring is presentational only. Command parsing and validation still happen when the user confirms or submits the command. 
+- For `/goal`, free-form text after the command is the objective. Pressing Enter submits that objective directly to the goal command instead of opening a budget prompt or create wizard. + +Example matched slash command with parameter hint: + +```text +┃ /btw + + Build · deepseek-v4-pro high ↑0[cached 0 0%] ↓0 ▱▱▱▱▱▱▱▱▱▱ 0% 0/950k +``` + +In the rendered TUI, `/btw` uses the primary foreground color and the parameter hint that follows it uses muted foreground color. + +Command purposes: + +- `/theme`: switch the UI theme. +- `/model`: choose the active model. +- `/compact`: compact the current session context. +- `/resume`: resume a saved chat. +- `/goal`: set or view the goal for a long-running task. +- `/new`: start a new chat. +- `/status`: show current session configuration and token usage. +- `/permissions`: choose what Devo is allowed to do. +- `/clear`: clear the current transcript. +- `/onboard`: configure model provider connection. +- `/btw`: start a side conversation in an ephemeral fork. +- `/exit`: exit Devo. + +Command-specific L2 designs: + +| Command | Design Artifact | +|---|---| +| `/theme` | `L2-DES-TUI-CMD-001` | +| `/model` | `L2-DES-TUI-CMD-002` | +| `/compact` | `L2-DES-TUI-CMD-003` | +| `/resume` | `L2-DES-TUI-CMD-004` | +| `/goal` | `L2-DES-TUI-CMD-010` | +| `/new` | `L2-DES-TUI-CMD-005` | +| `/status` | `L2-DES-TUI-CMD-006` | +| `/permissions` | `L2-DES-TUI-CMD-007` | +| `/clear` | `L2-DES-TUI-CMD-008` | +| `/onboard` | `L2-DES-TUI-CMD-009` | +| `/btw` | `L2-DES-TUI-CMD-011` | +| `/exit` | `L2-DES-TUI-CMD-012` | + +## Unicode And IME Constraints + +Composer editing must be text-model based rather than byte-position based. + +Rules: + +- Cursor movement should respect grapheme clusters and display columns. +- CJK and other wide characters should not corrupt wrapping or cursor placement. +- IME composition should not submit partial composition text. +- Non-ASCII text should remain intact through local editing, submission, server transport, transcript display, and replay. 
+- The composer should rely on `L2-DES-CLIENT-001` for cross-client localization and Unicode rules. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-001 | 1 | specs/L1/L1-REQ-TUI-001-composer.md | Defines composer layout, multi-line behavior, submission semantics, command discovery, and Unicode constraints. | +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines slash-command trigger behavior, list height, keyboard navigation, row styling, and the initial command list. | +| refines | L1-REQ-TUI-008 | 1 | specs/L1/L1-REQ-TUI-008-terminal-command-prefix.md | Defines leading `!` prefix behavior, Shell Mode execution, escaping, and command result display. | +| refines | L1-REQ-TUI-009 | 1 | specs/L1/L1-REQ-TUI-009-session-input-modes.md | Defines Default, Shell, and Plan Mode behavior plus bottom status line labels. | +| related-to | L1-REQ-TUI-007 | 1 | specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md | Composer and bottom status line must remain readable across terminal sizes. | +| related-to | L1-REQ-CLIENT-001 | 1 | specs/L1/L1-REQ-CLIENT-001-localization-readiness.md | Composer input must preserve Unicode, IME, and wide-character text. | +| related-to | L1-REQ-AGENT-005 | 1 | specs/L1/L1-REQ-AGENT-005-plan-mode.md | Plan Mode input must trigger agent-level planning-only behavior. | +| related-to | L1-REQ-TOOL-002 | 1 | specs/L1/L1-REQ-TOOL-002-tools.md | Shell Mode uses the built-in command execution capability. | +| related-to | L2-DES-TUI-002 | 1 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | Defines the shell regions used by composer and status line. | +| related-to | L2-DES-CLIENT-001 | 1 | specs/L2/client/L2-DES-CLIENT-001-localization-readiness.md | Defines shared Unicode and localization design constraints. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. 
| + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial composer, Shell Mode, Plan Mode, and command discovery design. | +| 1 | 2026-05-23 | Human | Refinement | Added `Ask Devo` prompt band styling, `Build` default status label, distinct mode colors, and bottom token/cache/context status fields. | +| 1 | 2026-05-23 | Human | Refinement | Added slash-command popup behavior, eight-row command list, selection styling, keyboard navigation, and initial command catalog. | +| 1 | 2026-05-23 | Human | Refinement | Added inline slash-command coloring for matched command tokens and muted parameter hints. | +| 1 | 2026-05-23 | Human | Refinement | Clarified that multi-line composer input repeats `┃` on content lines while top and bottom padding rows remain background-only. | +| 1 | 2026-05-23 | Human | Refinement | Reconciled shell-mode examples and slash-command catalog with the current TUI visual grammar and approved command list. | +| 1 | 2026-05-23 | Human | Refinement | Linked the shared command catalog to command-specific L2 design artifacts. | +| 1 | 2026-05-23 | Human | Refinement | Removed `/diff` from the slash-command catalog. | +| 1 | 2026-05-23 | Human | Refinement | Changed `/btw` from active-turn injection to a side conversation in an ephemeral fork. | +| 1 | 2026-05-23 | Human | Refinement | Added `/goal` as the TUI entry point for Ralph Loop goals. | +| 1 | 2026-05-25 | Human | Refinement | Clarified that `/goal <objective>` submits the following text as the objective without a default budget prompt. | +| 1 | 2026-05-25 | Assistant | Refinement | Added composer-level handling guidance for direct `/goal` objective submission. 
| diff --git a/specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md b/specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md new file mode 100644 index 00000000..02cb9719 --- /dev/null +++ b/specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md @@ -0,0 +1,538 @@ +--- +artifact_id: L2-DES-TUI-004 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-TUI-004 — Streaming Transcript And State + +## Purpose + +Refine TUI streaming rendering, transcript review, and state visibility requirements into a concrete display model for live and completed session activity. + +## Background / Context + +The TUI must show progress while work is happening and preserve a readable audit trail afterward. Model output, tool execution, approvals, questions, errors, and background processes arrive as ordered server events. The TUI should render those events promptly without treating transient live state as durable transcript truth. + +## Source Requirements + +- `L1-REQ-TUI-002` requires timely streaming of assistant text, reasoning summaries, tool starts, tool output deltas, and completion states. +- `L1-REQ-TUI-003` requires a durable, readable, scrollable transcript. +- `L1-REQ-TUI-004` requires visible idle, generating, tool, waiting, interrupted, failed, completed, background process, and input-mode states. +- `L1-REQ-TUI-007` requires stable layout during streaming and resize. +- `L1-REQ-APP-004` requires actionable diagnostics. +- `L1-REQ-TOOL-005` requires visibility and manual stop access for background processes. +- `L2-DES-APP-003` defines server-client event payloads. +- `L2-DES-CONV-001` defines durable transcript records. +- `L2-DES-TOOL-001` defines tool lifecycle and result summaries. +- `L2-DES-APP-004` defines observability fields used by diagnostic display. +- `L2-DES-CONTEXT-002` defines compaction lifecycle records and user-visible compaction notices. 
+ +## Design Requirement + +The TUI should render from a transcript projection plus a live overlay: + +```text +Durable transcript projection + + +Live server-client events + + +Local composer state + ↓ +Visible TUI frame +``` + +The durable transcript projection provides stable review content. The live overlay provides in-progress streaming text, running tool output, waiting prompts, spinners, and active process state. When the server finalizes an item, the live overlay should reconcile into the durable transcript cell. + +## Shell Placement Boundary + +`L2-DES-TUI-002` owns the overall shell placement of the transcript viewport, working indicator, composer, and bottom status line. This document owns the rendering rules for cells inside that transcript viewport and for the live working indicator that appears immediately below the transcript while a turn is active. + +Placement contract: + +- Transcript cells render in the transcript viewport. +- The live working indicator renders after the latest transcript content and before the composer. +- The composer and bottom status line remain outside the transcript viewport. +- Entering full-screen alternate transcript mode, such as through `Ctrl+T`, may use the same transcript cell renderers with expanded output limits. + +Compact shell placement: + +```text + +┃ Thought: The fix is isolated to escaped quote handling. +┃ I will patch the parser and run the focused suite. +┃ Running cargo test parser::quoted -- --nocapture + +⠋ Working · 12s + + +┃ Ask Devo + + + Build · deepseek-v4-pro high ↑420[cached 300 71%] ↓12 ▰▰▱▱▱▱▱▱▱▱ 20% 190k/950k +``` + +## Transcript Cell Types + +The TUI should use explicit cell types rather than a single generic text block. + +| Cell Type | Purpose | Durable | +|---|---|---| +| User message | User-submitted content and mentions. | Yes | +| Assistant message | Final or streaming assistant response. | Yes | +| Reasoning summary | User-visible reasoning summary where available and allowed. 
| Yes, when emitted as visible item | +| Explore tool group | Read, glob, and grep/search activity grouped for scanning. | Yes | +| File mutation tool | Create/write and edit/apply-patch activity with diff preview. | Yes | +| Shell running cell | Active shell command state. | No, reconciles into shell result | +| Shell result cell | Completed shell command output summary and folded output. | Yes | +| Tool call | Other tool starts, arguments preview, approval state, command description. | Yes | +| Tool output | Other tool result summary, bounded output, status, redaction state. | Yes | +| Running tool overlay | Live output and spinner for an active tool. | No, reconciles into tool cells | +| Approval prompt | Pending or resolved approval request. | Yes | +| Question prompt | Pending or resolved Plan Mode question. | Yes | +| Error | Recoverable or terminal failure with recovery hint. | Yes | +| Plan update | Current plan item changes. | Yes | +| Background process | Tracked process state and stop affordance. | Yes for lifecycle events, live for recent output | +| Context/usage status | Token usage, context pressure, compaction notice. | Yes when recorded, live in header/status | +| Working indicator | Active turn indicator between transcript and composer. | No | +| Completed turn summary | Final assistant turn metadata after completion. | Yes | + +## Transcript Area Visual Design + +The transcript area is a vertical list of cells. Cells fall into three main categories: + +1. User message cells. +2. Assistant message cells. +3. Tool message cells. + +The left marker `┃` is the main visual anchor for transcript cells. For assistant and tool cells, it marks the first visible line of a logical cell. For user-message cells, which are background-band surfaces, it may repeat on each user-authored content line when the message has multiple lines. 
The marker column should align consistently across transcript content, use color to distinguish role/state, and remain readable without color. + +### User Message Cells + +User message cells begin with a left `┃` rendered in the theme primary foreground color. The user cell is a background band. For single-line messages, the marker appears only on the content line. For multi-line messages, each user-authored content line may repeat the marker. + +A single-line user message renders as a three-row band: + +```text + +┃ Fix the parser regression and run the focused tests. + +``` + +Rules: + +- The top padding row, content row, and bottom padding row share the same background span. +- Only content rows carry the primary-colored `┃` marker. +- User message text uses normal foreground color. +- Multi-line user messages keep the same background band and preserve author-entered line breaks. +- Multi-line user-authored content lines may repeat `┃`; the top and bottom padding rows must not render `┃`. + +Multi-line example: + +```text + +┃ Refactor the parser in three steps: +┃ 1. isolate quoted-value parsing +┃ 2. add regression tests +┃ 3. run the focused suite + +``` + +### Assistant Reasoning And Reply Cells + +Assistant cells also begin with a single `┃`, but they do not use a background band. Wrapped continuation lines align with the assistant text column and do not repeat the marker. + +Reasoning cells: + +- Appear above the reply cell for the same assistant turn. +- Use muted foreground for reasoning body text. +- Begin with `Thinking:` while reasoning is streaming. +- Change to `Thought:` after the reasoning item is complete. +- Render `Thinking:` and `Thought:` in italic styling when supported. +- Use a distinct muted/accent color for the label so it differs from primary text but remains visually quiet. + +Streaming reasoning example: + +```text +┃ Thinking: Inspecting the parser branch and matching it against the + existing quoted-value tests. 
+``` + +Completed reasoning example: + +```text +┃ Thought: The failure is isolated to escaped quote handling, so the + smallest fix is a parser branch update plus a focused regression test. +``` + +Reply cells: + +- Appear below reasoning cells. +- Use normal white foreground. +- Stream incrementally as assistant text arrives. +- Begin with a single `┃` and do not use a background band. + +Streaming reply example: + +```text +┃ The parser accepts quoted values, but the escape branch currently + treats a backslash before a quote as ordinary text. I will update the + branch and add a regression test before running the focused suite. +``` + +### Completed Turn Summary + +After an assistant turn completes, the assistant reply cell should show a compact completed-turn summary. + +```text +┃ The focused parser tests now pass. I also added a regression test for + escaped quotes. + + ▣ Build · DeepSeek V4 Pro · 2.1s +``` + +Rules: + +- `▣` uses the theme primary foreground color. +- `Build` is the active mode label for the completed turn. It may be `Plan` when the turn ran in Plan Mode. +- `DeepSeek V4 Pro` is the display model name. +- `2.1s` is total turn duration. +- The summary appears only after completion and should not duplicate the live working indicator. + +## Tool Message Visual Design + +Tool message cells begin with a single `┃` and should communicate the tool family, target, and outcome without requiring raw logs. + +### Explore Tools: Read, Glob, Grep + +`read`, `glob`, and `grep` are grouped under `Explore`. + +Rules: + +- Consecutive `read` calls may be grouped under a single `Explore` title. +- Each `read` call renders on its own line. Multiple read targets must not be merged into one `Read` line. +- The `read` target must include the file parameter. +- `glob` and `grep` each render as their own line even when consecutive. +- `glob` is file-pattern search. `grep` is content search. 
+ +Example: + +```text +┃ Explore + ┗ Read crates/core/src/query.rs + Read crates/core/src/session/turn.rs + Glob crates/**/*.rs + Grep "execute_turn" crates/core +``` + +If another `read` of `crates/core/src/query.rs` arrives immediately after the group above, it is still a distinct tool call and should render as its own `Read` line. + +### File Mutation Tools: Create And Edit + +`write` and `apply_patch` render as file mutation cells with a diff preview. + +Rules: + +- `write` renders as `Create <path>`. +- `apply_patch` renders as `Edit <path>`. +- If the target is inside the workspace, the path is workspace-relative. +- If the target is outside the workspace, the path is absolute. +- A blank line separates the title from the diff. +- Diff content should use a git-diff-like layout. +- Diff lines should render on a diff background. +- Added, removed, and metadata lines should be distinguishable by color and symbol even when the diff background is present. + +Create example: + +```text +┃ Create crates/parser/src/quoted.rs + + diff --git a/crates/parser/src/quoted.rs b/crates/parser/src/quoted.rs + new file mode 100644 + +pub fn parse_quoted(input: &str) -> Result { + + todo!("parse escaped quotes") + +} +``` + +Edit example: + +```text +┃ Edit crates/parser/src/lib.rs + + diff --git a/crates/parser/src/lib.rs b/crates/parser/src/lib.rs + @@ parse_value + - return parse_bare_value(input); + + return parse_quoted_or_bare_value(input); +``` + +### Shell Tool: Running And Run Cells + +Shell calls render as two related cells: + +1. `Running`: one-line active command state. +2. `Run`: completed command result with compressed output. + +Running example: + +```text +┃ Running cargo test parser::quoted -- --nocapture +``` + +Completed output example: + +```text +┃ Run cargo test parser::quoted -- --nocapture failed 2.3s +┗ output 64 lines hidden, 12 shown Ctrl+T for full transcript + test parser::quoted_empty ... ok + test parser::quoted_escape ... 
FAILED + assertion failed: expected escaped quote handling +``` + +Rules: + +- The `Running` cell updates while the process is active. +- The `Run` cell replaces or follows the `Running` cell when the command completes. +- `Run` output is compressed by default. +- The `┗` relationship marker connects the command title to its output summary. +- Pressing `Ctrl+T` enters the full-screen alternate transcript mode defined by `L2-DES-TUI-006` for reviewing the full transcript and full output. +- The compressed output should show enough lines to explain the result and must indicate hidden line counts. + +### Context And Compaction Cells + +Context and compaction status cells render in the transcript area so the user can later review when context was compacted. + +Compaction lifecycle cells must use these exact visible labels: + +```text +┃ Manual Compaction Started +┃ Automatic Compaction Started +┃ Compaction Done +``` + +Rules: + +- `Manual Compaction Started` appears when compaction starts because the user requested it, such as through `/compact`. +- `Automatic Compaction Started` appears when compaction starts because context pressure crossed the configured threshold. +- `Compaction Done` appears when a compaction event completes successfully and the active context snapshot has been updated. +- These cells are transcript-area status cells, not assistant messages, user messages, or model-visible context content. +- Inline rendering should preserve the exact label text. Counts, token estimates, or summary inspection affordances may be available in an expanded detail view, but must not change the inline label. +- On replay, durable compaction records should project back into the same transcript-area status cells. + +## Active Turn Working Indicator + +When the current turn has not completed, the TUI should show a live working indicator between the transcript area and the bottom composer. 
+ +Example: + +```text +⠋ Working · 12s + +┃ Ask Devo +``` + +Rules: + +- The left side is an animated spinner using this frame sequence: `⠋`, `⠙`, `⠹`, `⠸`, `⠼`, `⠴`, `⠦`, `⠧`, `⠇`, `⠏`. +- The text `Working` identifies the active turn state. +- A dot separates the state from elapsed time. +- Elapsed time is compact and may use seconds, minutes, hours, or days, such as `12s`, `3m`, `2h`, or `1d`. +- The working indicator is live-only and disappears when the turn completes. +- After completion, the completed-turn summary replaces the need for the working indicator. +- The shell layout reserves placement for this indicator; transcript rendering provides the content and state transition semantics. + +## Live Streaming Layout + +Live streaming examples should use the same transcript cell grammar as completed content. The TUI should not introduce a second table-like visual language for live state. + +Normal streaming assistant response: + +```text +┃ The parser accepts quoted values, but the escape branch currently + treats a backslash before a quote as ordinary text. I will update... +``` + +Reasoning summary: + +```text +┃ Thinking: Inspect parser branch -> add regression -> patch escape handling -> run tests. +``` + +Running tool with output deltas: + +```text +┃ Running cargo test parser::quoted -- --nocapture + ┗ output +18 lines + test parser::quoted_empty ... ok + test parser::quoted_escape ... FAILED +``` + +Tool completed with folded output: + +```text +┃ Run cargo test parser::quoted -- --nocapture failed 2.3s + ┗ output 64 lines hidden, 12 shown Ctrl+T for full transcript + test parser::quoted_empty ... ok + test parser::quoted_escape ... FAILED + assertion failed: expected escaped quote handling +``` + +Approval wait: + +```text +┃ Approval required waiting + apply_patch wants to modify 2 files. 
+ [Approve] [Deny] [Details] +``` + +Background process: + +```text +┃ Background npm run dev running 03:14 + ┗ output http://localhost:3000 ready + recent output +5 lines [Stop] +``` + +## State Mapping + +The TUI should map canonical server events into visible state. + +| Server Event | TUI State | +|---|---| +| `turn_started` | New turn row and running status. | +| `item_started` assistant | Assistant cell appears immediately. | +| `item_content_update` | Existing cell updates before final completion. | +| `item_completed` | Live cell becomes completed transcript cell. | +| `tool_call_started` | Tool row appears even before final output exists. | +| `tool_call_updated` | Tool progress/output preview updates. | +| `tool_call_completed` | Tool result summary and status become durable review content. | +| `approval.requested` | Approval prompt appears and bottom status shows waiting reason. | +| `question.requested` | Question prompt appears and bottom status shows waiting for answer. | +| `background_process_updated` | Background process strip and transcript state update. | +| `usage_updated` | Header/context or turn usage display updates. | +| `context_updated` with compaction start | Transcript area shows `Manual Compaction Started` or `Automatic Compaction Started` based on compaction trigger source. | +| `context_updated` with compaction completion | Transcript area shows `Compaction Done` and context pressure display updates. | +| `context_updated` without lifecycle change | Context pressure display updates. | +| `error_reported` | Error cell appears with phase and recovery action. | +| `turn_status_changed` | Header/status and terminal turn cell update. | + +The TUI must not wait for all parallel tools to finish before showing a started or updated sibling tool when the server has emitted the sibling event. + +## Timeliness Requirements + +The TUI should be event-driven and repaint promptly after meaningful server events. 
+ +Rules: + +- Assistant deltas should update the visible assistant cell before the final response completes. +- Tool start should be visible before tool completion. +- Tool output deltas should update the visible tool cell before final completion. +- Parallel tool events should be independently visible. +- Approval and question waits should interrupt ambiguous "running" display with a specific waiting reason. +- The TUI may coalesce frequent deltas to avoid flicker, but coalescing must not make active work appear frozen during normal operation. +- Live Markdown rendering should preserve readable partial output and avoid corrupting completed transcript layout. + +## Transcript Review + +Completed transcript content should support audit and recovery. + +Rules: + +- Completed turns remain reviewable after live rendering finishes. +- Tool calls should show command or tool summary, status, timing, and bounded result. +- Approval and question cells should show the prompt and final resolution. +- Error cells should show phase, concise message, recoverability, and recovery action where available. +- Long output should be folded, truncated, or referenced rather than rendered without bound. +- Omitted content must be marked with a visible line count, byte count, or content reference. +- Scrollback should preserve logical item boundaries so users can find relevant work. + +## Active Work Strip + +The active work strip should summarize the most important current state without replacing transcript content. + +Examples: + +```text +⠋ Working · 12s + running cargo test parser::quoted output +24 lines + waiting approval for apply_patch modifies 2 files + context 81% near limit compaction available + cleanup interrupted turn 1 background process running +``` + +Priority: + +1. Approval or question waiting state. +2. Active tool or background process state. +3. Model generation state. +4. Context pressure or compaction state. +5. Idle or ready state. 
+ +## Failure And Interruption Display + +Failure and interruption states should be explicit transcript events. + +Example: + +```text +┃ Interrupted user requested + completed read src/parser.rs + stopped cargo test parser::quoted + next resume, edit previous message, or submit a new request +``` + +Rules: + +- Terminal turn state must remain visible after the live spinner stops. +- Partial assistant/tool content should remain visible if it was already emitted. +- Recovery actions should be shown when the server provides them. +- The TUI should distinguish failed, interrupted, canceled, and completed states. + +## Markdown And Wrapping + +Markdown rendering should be readable in both live and completed cells. + +Rules: + +- Live Markdown may use a tolerant incremental renderer. +- Completed Markdown may be re-rendered from final content for better formatting. +- Code blocks should preserve indentation and wrap or scroll according to transcript policy. +- Tables may degrade to preformatted text or simplified columns in narrow terminals. +- Links and file references should remain visible as text in terminals that cannot open them directly. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-002 | 1 | specs/L1/L1-REQ-TUI-002-streaming.md | Defines live streaming behavior for assistant text, reasoning summaries, tool starts, tool deltas, and completion. | +| refines | L1-REQ-TUI-003 | 1 | specs/L1/L1-REQ-TUI-003-transcript.md | Defines transcript cell types, review behavior, folding, and durable/live reconciliation. | +| refines | L1-REQ-TUI-004 | 1 | specs/L1/L1-REQ-TUI-004-state-visibility.md | Defines visible state mapping for idle, generating, tools, approvals, questions, failures, interruptions, and background processes. 
| +| related-to | L1-REQ-TUI-007 | 1 | specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md | Streaming and transcript rendering must remain stable across resize and narrow widths. | +| related-to | L1-REQ-TOOL-005 | 1 | specs/L1/L1-REQ-TOOL-005-background-process-management.md | Background process state and stop controls are rendered in the TUI. | +| related-to | L1-REQ-APP-004 | 1 | specs/L1/L1-REQ-APP-004-observability.md | User-facing diagnostics and waiting reasons inform state display. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Server-client events drive live rendering. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Durable transcript records are the replay source. | +| related-to | L2-DES-TOOL-001 | 1 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | Tool lifecycle, command descriptions, and result summaries feed tool cells. | +| related-to | L2-DES-APP-004 | 1 | specs/L2/app/L2-DES-APP-004-observability-architecture.md | Diagnostic fields provide recovery and phase display. | +| related-to | L2-DES-CONTEXT-002 | 1 | specs/L2/context/L2-DES-CONTEXT-002-context-compaction.md | Compaction lifecycle records render as transcript-area status cells. | +| related-to | L2-DES-TUI-006 | 1 | specs/L2/tui/L2-DES-TUI-006-full-transcript-alternate-screen.md | Defines full transcript alternate-screen projection, live-tail sync, and pager controls. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial streaming transcript and state visibility design. | +| 1 | 2026-05-23 | Human | Refinement | Added concrete transcript-area visual design for user, assistant, tool, shell, working, and completed-turn cells. 
| +| 1 | 2026-05-23 | Human | Refinement | Clarified shell placement boundary between transcript viewport, working indicator, composer, and status line. | +| 1 | 2026-05-23 | Human | Refinement | Clarified that `┃` is a single leading marker for each logical cell, not a full-cell rail. | +| 1 | 2026-05-23 | Human | Refinement | Defined the working spinner frame sequence and changed Explore read rendering to one line per read call. | +| 1 | 2026-05-23 | Human | Refinement | Updated live streaming examples to reuse the transcript cell visual grammar. | +| 1 | 2026-05-23 | Human | Refinement | Clarified that multi-line user-message background bands may repeat `┃` on content lines while padding rows remain background-only. | +| 1 | 2026-05-23 | Human | Refinement | Reconciled active work and interruption examples with the current transcript and working-indicator visual grammar. | +| 1 | 2026-05-25 | Human | Refinement | Added exact transcript-area labels for manual compaction start, automatic compaction start, and compaction completion. | +| 1 | 2026-05-25 | Assistant | Refinement | Linked `Ctrl+T` full transcript review to `L2-DES-TUI-006`. | diff --git a/specs/L2/tui/L2-DES-TUI-005-terminal-lifecycle-safety.md b/specs/L2/tui/L2-DES-TUI-005-terminal-lifecycle-safety.md new file mode 100644 index 00000000..2aadfc7a --- /dev/null +++ b/specs/L2/tui/L2-DES-TUI-005-terminal-lifecycle-safety.md @@ -0,0 +1,180 @@ +--- +artifact_id: L2-DES-TUI-005 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-TUI-005 — Terminal Lifecycle Safety + +## Purpose + +Refine terminal lifecycle safety into a design for TUI startup, inline mode, alternate-screen mode, interrupt handling, cleanup, and shell prompt handoff. + +## Background / Context + +The TUI runs inside a user's terminal. It may change terminal modes, render live regions, handle interrupts, and exit while work is active. 
Users rely on the terminal after exit, so cleanup correctness is more important than preserving decorative UI state. + +## Source Requirements + +- `L1-REQ-TUI-005` requires safe startup, terminal mode restoration, consistent normal and interrupt exit, useful inline scrollback preservation, stale live-region cleanup, and understandable cleanup failures. +- `L1-REQ-APP-007` requires inline mode and alternate full-screen mode where appropriate. +- `L1-REQ-TUI-007` requires stable layout across resize events. +- `L2-DES-TUI-002` defines the TUI shell used by inline and alternate-screen modes. +- `L2-DES-TUI-006` defines one concrete alternate-screen overlay entered by `Ctrl+T`. + +## Design Requirement + +The TUI should treat terminal lifecycle as an explicit state machine. + +Conceptual states: + +- `not_started` +- `starting` +- `running_inline` +- `running_alternate_screen` +- `stopping` +- `restored` +- `restore_failed` + +The TUI should enter terminal modes deliberately, restore them exactly once where possible, and avoid relying on fragile assumptions about shell prompt position. + +## Inline Mode Model + +Inline mode renders a live TUI region inside the existing terminal scrollback. + +```text +Before start: + +$ program +previous shell output remains above + +During inline TUI: + +previous shell output remains above +┌──────────────── live TUI region ────────────────┐ +│ header │ +│ transcript │ +│ composer │ +└──────────────── bottom status ──────────────────┘ + +After exit: + +previous shell output remains above +$ _ +``` + +Rules: + +- Inline mode should preserve useful scrollback above the live region. +- The live region may be cleared or compacted on exit so stale TUI rows do not confuse the next shell prompt. +- The shell owns the next prompt after the program exits. +- Cleanup must not depend on predicting where the shell prompt will be printed. 
+ +## Alternate-Screen Mode Model + +Alternate-screen mode may use the terminal's alternate screen when configured or appropriate. + +Rules: + +- Entering alternate-screen mode must save the previous terminal screen according to terminal capability. +- Exiting alternate-screen mode must restore the normal screen and terminal modes. +- The same logical shell layout should be used inside alternate-screen mode. +- If alternate-screen entry fails, the TUI should fall back to inline mode or fail with an understandable message. +- Overlay-style alternate-screen surfaces, including full transcript review and the resume session browser, should return control to inline rendering through a single cleanup path that leaves alternate screen and schedules a fresh inline frame. + +## Startup Rules + +Startup should: + +- Detect terminal capability where practical. +- Record which terminal modes were changed. +- Enter raw mode, bracketed paste, alternate screen, mouse mode, or keyboard enhancement modes only when required by supported behavior. +- Initialize the shell layout after terminal mode changes succeed. +- Preserve a cleanup guard that can restore terminal modes on normal exit or interrupt. + +If startup fails after partially changing terminal modes, cleanup should attempt to restore any changed modes before reporting failure. + +## Exit Rules + +Normal exit and interrupt-triggered exit should share the same cleanup path. + +Cleanup should: + +1. Stop accepting new composer input. +2. Resolve or detach active client subscriptions according to server policy. +3. Stop live rendering. +4. Clear or compact the live TUI region where inline mode requires it. +5. Restore terminal modes changed by the TUI. +6. Leave shell prompt placement to the shell. +7. Report cleanup failure only after best-effort restore. + +Cleanup should prioritize terminal usability over preserving the final decorative TUI frame. 
+ +## Interrupt Handling + +Interrupt handling must distinguish TUI process exit from agent turn interruption. + +Rules: + +- A user interrupt intended to stop the current turn should be routed through the server interrupt protocol when the TUI remains open. +- A user interrupt intended to exit the TUI should trigger terminal cleanup. +- If the TUI exits while server work continues, the TUI should, where possible, tell the user before exit whether that work will keep running or be stopped under the server lifecycle policy. +- If cleanup occurs during active streaming, partial live output already persisted by the server remains recoverable through session replay. + +## Stale Region Prevention + +The TUI should avoid leaving stale live-rendered rows below or around the next shell prompt. + +Rules: + +- Inline cleanup should clear the owned live region when possible. +- Cleanup should avoid writing extra prompt-like text after terminal restore. +- Cleanup should avoid double-restoring terminal modes. +- Cleanup should not rely on exact shell prompt row prediction. +- If the terminal cannot clear the live region reliably, the TUI should prefer a concise final status line over a partially stale UI. + +## Resize Handling + +Resize handling should be safe in both inline and alternate-screen modes. + +Rules: + +- Resize events should trigger full layout recomputation. +- The TUI should not preserve stale absolute row assumptions after resize. +- If a resize makes the terminal too small, the TUI should show a minimum-size message or simplified frame. +- Exiting after resize should still restore terminal modes and avoid prompt corruption. + +## Cleanup Failure Display + +If cleanup cannot fully restore terminal state, the program should provide a concise, terminal-safe message after best-effort restore. + +Example: + +```text +program: terminal cleanup may be incomplete; run `reset` if input looks wrong. +``` + +The message should be emitted only when useful and should not expose internal debug details by default. 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-005 | 1 | specs/L1/L1-REQ-TUI-005-terminal-lifecycle-safety.md | Defines terminal lifecycle states, startup, cleanup, inline scrollback preservation, alternate-screen behavior, and interrupt-safe exit. | +| related-to | L1-REQ-APP-007 | 1 | specs/L1/L1-REQ-APP-007-tui.md | Inline and alternate-screen modes are high-level TUI requirements. | +| related-to | L1-REQ-TUI-007 | 1 | specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md | Resize handling must preserve layout and safe exit behavior. | +| related-to | L2-DES-TUI-002 | 1 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | Defines the shell layout whose live region must be cleaned up safely. | +| related-to | L2-DES-TUI-006 | 1 | specs/L2/tui/L2-DES-TUI-006-full-transcript-alternate-screen.md | Defines a concrete alternate-screen overlay and its return-to-inline lifecycle. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial terminal lifecycle safety design. | +| 1 | 2026-05-25 | Assistant | Refinement | Linked terminal lifecycle safety to the current full transcript overlay and resume-browser alternate-screen cleanup path. 
| diff --git a/specs/L2/tui/L2-DES-TUI-006-full-transcript-alternate-screen.md b/specs/L2/tui/L2-DES-TUI-006-full-transcript-alternate-screen.md new file mode 100644 index 00000000..58e57d9d --- /dev/null +++ b/specs/L2/tui/L2-DES-TUI-006-full-transcript-alternate-screen.md @@ -0,0 +1,176 @@ +--- +artifact_id: L2-DES-TUI-006 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-TUI-006 — Full Transcript Alternate Screen + +## Purpose + +Define the current TUI behavior for the `Ctrl+T` full transcript alternate-screen overlay. + +## Current Implementation Contract + +- Trigger: `Ctrl+T` from the normal inline TUI event loop. +- Rendering mode: terminal alternate screen. +- Data source: the current `ChatWidget` history plus the active live tail, projected through transcript overlay cells. +- Server effect: no server command is sent when the overlay opens. +- Transcript effect: opening, scrolling, and closing the overlay do not create transcript items. +- Blocking rule: `Ctrl+T` does not open the transcript overlay while the `/resume` browser is loading or open. + +The host owns alternate-screen lifecycle through the overlay state. The transcript overlay owns pager rendering, scrolling, user-message selection, and close detection. The chat widget owns conversion from current history and live state into renderable transcript cells. + +## Entry And Exit + +Opening the overlay follows this sequence: + +1. The host receives a `Ctrl+T` key event while no overlay is active and no resume browser is open. +2. The host reads the current terminal width and enters the alternate screen. +3. The overlay is initialized from `transcript_overlay_cells(width)`. +4. The initial scroll offset is set to the bottom of the transcript. +5. A new frame is scheduled and all subsequent draw/key events are routed to the overlay until it closes. 
+ +Close keys: + +| Key | Behavior | +|---|---| +| `Ctrl+T` | Close the transcript overlay and return to inline TUI rendering. | +| `q` | Close the transcript overlay and return to inline TUI rendering. | +| `Ctrl+C` | Close the transcript overlay and return to inline TUI rendering. | + +`Esc` does not close the transcript overlay. It selects the previous user message for edit preview, matching the footer hint. + +When the overlay is done, the host clears the overlay, leaves alternate screen, schedules a frame, and resumes normal inline drawing. + +## Layout + +The overlay uses the full terminal area. + +```text +/ T R A N S C R I P T + +~ +~ + ↑/↓ to scroll pgup/pgdn to page home/end to jump 100% + q to quit esc/← to edit prev → to edit next enter to edit message +``` + +Rules: + +- The header line is dim and starts with `/ T R A N S C R I P T`. +- The content area starts one row below the header and ends above the two-line bottom bar. +- Empty content rows after the rendered transcript are filled with `~`. +- The bottom bar occupies two rows. +- Both bottom-bar rows are initialized as dim separator rows, then overwritten with hint text. +- The first bottom-bar row shows scroll hints and a right-aligned scroll percentage. +- The second bottom-bar row shows close and edit-selection hints. + +Footer hint text: + +```text + ↑/↓ to scroll pgup/pgdn to page home/end to jump + q to quit esc/← to edit prev → to edit next enter to edit message +``` + +## Transcript Projection + +Committed transcript content is projected from the widget's current history: + +- Each history cell becomes a transcript overlay cell using `transcript_lines(width)`. +- User history cells also carry an editable `UserMessage` payload containing text, text elements, local image paths, and remote image URLs. +- Stream-continuation cells are marked so the overlay can avoid inserting a visual gap before continuation content. +- Non-user cells render as wrapped paragraphs. 
+- User-message cells render as a background-band surface using the user-message style. +- A selected user-message cell uses a stronger highlighted background and foreground. +- Non-continuation cells after the first rendered cell receive a one-row top inset. + +Live turn content is projected as a live tail: + +- The overlay considers a live tail present when the widget has an active cell, active text items, active tool calls, or pending tool calls. +- The live tail key includes width, active-cell revision, stream-continuation status, and optional animation tick. +- If the overlay is scrolled to the bottom, committed-cell replacement or live-tail updates keep it at the bottom. +- If the overlay is not scrolled to the bottom, live updates do not force the user away from the current review position. +- Active animation ticks schedule follow-up frames while the overlay is at the bottom. + +On every draw, the host synchronizes the overlay with the current widget: + +- If terminal width or committed cell count changed, committed cells are rebuilt. +- The live tail is refreshed when its key changed. +- The overlay then renders using the latest synchronized content. + +## Scrolling + +The pager supports press and repeat key events. + +| Key | Behavior | +|---|---| +| `Up` / `k` | Scroll one line up. | +| `Down` / `j` | Scroll one line down. | +| `PageUp` / `Ctrl+B` | Scroll one page up. | +| `PageDown` / `Ctrl+F` | Scroll one page down. | +| `Space` | Scroll one page down. | +| `Shift+Space` | Scroll one page up. | +| `Ctrl+D` | Scroll half a page down. | +| `Ctrl+U` | Scroll half a page up. | +| `Home` | Jump to the top. | +| `End` | Jump to the bottom. | + +The page height is derived from the last rendered content area height, falling back to the current viewport-derived content height before the first render. + +## Previous Message Editing + +The full transcript overlay also hosts the current previous-message edit preview behavior. 
+ +Selection behavior: + +- `Esc` or `Left` selects the previous user message cell. +- `Right` selects the next user message cell. +- If no user message is selected yet, selection starts at the latest user message. +- The selected user message is scrolled into view and highlighted. +- If there are no user messages, selection remains empty. + +Edit confirmation: + +- Pressing `Enter` while a user message is selected is intercepted by the host before normal overlay key handling. +- The chat widget truncates visible history to include user turns through the selected user message. +- The selected message is restored into the composer with its text, text elements, local image paths, and remote image URLs. +- The overlay closes, alternate screen is left, and the status message becomes `Previous message loaded`. +- Pressing `Enter` without a selected user message has no edit effect. + +## Terminal Lifecycle + +While the overlay is active, it takes precedence over normal inline TUI key handling and drawing. The inline transcript, composer, and status line are not rendered into the alternate screen. + +Lifecycle rules: + +- Alternate-screen entry happens only after reading terminal width for the initial transcript projection. +- Overlay draw uses `tui.draw(u16::MAX, ...)`, so it owns the full viewport height. +- Closing the overlay always routes through the overlay state cleanup path, which leaves alternate screen and schedules a new inline frame. +- Resize is handled by the normal draw synchronization path because width changes rebuild committed overlay cells. +- If another alternate-screen surface such as `/resume` is open, `Ctrl+T` is ignored rather than stacking overlays. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-003 | 1 | specs/L1/L1-REQ-TUI-003-transcript.md | Defines full-screen transcript review, scrolling, live-tail sync, and previous-message selection behavior. 
| +| related-to | L1-REQ-TUI-005 | 1 | specs/L1/L1-REQ-TUI-005-terminal-lifecycle-safety.md | Uses alternate-screen entry, cleanup, resize, and restore behavior. | +| related-to | L1-REQ-TUI-007 | 1 | specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md | Rebuilds transcript projection when width changes and provides pager controls for narrow or long content. | +| related-to | L1-REQ-CONV-005 | 1 | specs/L1/L1-REQ-CONV-005-immediate-message-editing.md | Hosts previous-message selection and composer restore behavior. | +| related-to | L2-DES-TUI-002 | 1 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | Complements the inline transcript viewport with an alternate-screen review surface. | +| related-to | L2-DES-TUI-004 | 1 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | Reuses transcript cell projection, live overlay state, and full-output review semantics. | +| related-to | L2-DES-TUI-005 | 1 | specs/L2/tui/L2-DES-TUI-005-terminal-lifecycle-safety.md | Depends on alternate-screen lifecycle safety and restore behavior. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-25 | Human | Initial | Requested a design document for `Ctrl+T` full transcript alternate-screen behavior according to current `crates/tui` implementation. | +| 1 | 2026-05-25 | Assistant | Initial | Documented current entry, exit, rendering, scrolling, live sync, previous-message edit preview, and terminal lifecycle behavior. 
| diff --git a/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-001-theme.md b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-001-theme.md new file mode 100644 index 00000000..b7825490 --- /dev/null +++ b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-001-theme.md @@ -0,0 +1,64 @@ +--- +artifact_id: L2-DES-TUI-CMD-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-23 +--- + +# L2-DES-TUI-CMD-001 — Slash Command: /theme + +## Purpose + +Define the TUI behavior for `/theme`, which lets the user switch the terminal UI theme without leaving the current session. + +## Command Contract + +- Command: `/theme` +- Description: `switch the UI theme` +- Parameters: none in the first milestone. +- Mutability: client configuration only. +- Active-turn availability: allowed during active work because it does not change agent execution. + +## UI Flow + +`/theme` opens a compact searchable popup using the same popup behavior as slash-command discovery. + +```text +┃ /theme + + devo-dark + devo-light + terminal-default +``` + +Rules: + +- The current theme is preselected. +- Up and Down move selection; Enter applies; Esc cancels. +- Theme preview may apply optimistically while the popup is open. +- If canceled, the TUI restores the previous theme. +- The selected theme should be persisted after confirmation. + +## State And Error Behavior + +- The command must not create a transcript turn. +- The command must not modify session metadata that affects model behavior. +- If persistence fails, the TUI may keep the theme for the current process but must show a concise warning that it was not saved. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines command-specific behavior for a discoverable TUI command. 
| +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Uses shared slash-command discovery, popup, and invocation behavior. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial `/theme` command design. | diff --git a/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-002-model.md b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-002-model.md new file mode 100644 index 00000000..9f1153fe --- /dev/null +++ b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-002-model.md @@ -0,0 +1,218 @@ +--- +artifact_id: L2-DES-TUI-CMD-002 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-TUI-CMD-002 — Slash Command: /model + +## Purpose + +Define the TUI behavior for `/model`, the post-onboarding command for changing the active session model, provider binding, invocation method, and reasoning effort where applicable. + +## Command Contract + +- Command: `/model` +- Description: `choose the active model` +- Parameters: none in the first milestone. +- Mutability: session metadata; model-provider configuration only when adding or repairing configuration; default-selection configuration only where the application configuration lifecycle requires it. +- Active-turn availability: blocked while a turn is generating, running tools, or waiting on active execution. + +## Design Requirement + +`/model` should first show configured model-provider bindings from effective configuration. These bindings may have been created by onboarding or defined directly in configuration files. The last row in the configured binding list is always `Add model...`. + +Selecting an existing configured binding does not immediately apply the final session model when the model supports reasoning. 
`/model` groups model and provider together as the configured binding step, then treats reasoning effort as a distinct follow-up step. If the selected model does not support reasoning, the configured binding can be applied immediately. If the user selects `Add model...`, `/model` enters the same add-model setup sequence as onboarding: + +1. Select configured model-provider binding, or choose `Add model...`. +2. If adding a model, select supported model slug. +3. If adding a model, select an existing provider or add a provider. +4. When adding a provider, enter provider name. +5. When adding a provider, enter base URL. +6. When adding a provider, enter API key. +7. Enter the model name expected by the selected provider. +8. Select invocation method when a new binding requires it. +9. Select reasoning effort when the selected model supports reasoning. +10. Apply the resulting model-provider binding and reasoning effort to future turns in the current session, persisting newly created provider or binding records before treating them as configured. + +The interaction surface differs from onboarding. Onboarding may show a longer inline setup stack because it is the initial setup experience. `/model` is a focused slash-command workflow and should show only the current step directly below the composer, using the slash-command popup visual grammar from `L2-DES-TUI-003`. The slash-command surface occupies the same bottom region as the bottom status line, so while `/model` is visible the normal bottom status line is hidden. When the user confirms or submits a step, that step disappears and the next step replaces it in the same below-composer command area. Previously completed steps are retained in command state but are not rendered as visible inline history. + +## UI Flow + +`/model` opens a transient command surface below the composer. The first surface is a configured model-provider binding list, not the supported-model slug selector. 
The configured binding list may show model and provider together, but it must not conflate reasoning effort into the binding-selection row. Discrete choices use the same searchable popup pattern as onboarding and the row layout rules from `L2-DES-TUI-003`: two-character left padding, active row primary foreground, inactive command/name text in normal foreground, and secondary details in muted foreground. Free-text values use a single active input prompt below the composer. + +```text +┃ /model + +> deepseek-v4-pro OpenRouter + gpt-5.5 OpenAI + claude-sonnet-5 Anthropic + Add model... +``` + +Selecting an existing configured binding removes the binding list and shows the reasoning effort step when the selected model supports reasoning: + +```text +┃ /model + + Reasoning Effort + Hint: Choose the reasoning effort for gpt-5.5 through OpenAI. + + > medium + high + xhigh + + Enter: select and apply + Esc: back +``` + +Selecting `Add model...` removes the configured binding list and shows the first add-model step: + +```text +┃ /model + + Select Model Slug + Hint: Choose the model capability profile the session should use. + + Search: gpt + + > openai/gpt-5.5 + openai/gpt-5.4 + anthropic/claude-opus + local/qwen3-coder + + Enter: select and continue + Esc: back +``` + +After selecting a model slug in the add-model flow, the model selector is removed and the provider step appears: + +```text +┃ /model + + Select Provider + Hint: Choose a provider or add one. + + Search: open + + > OpenAI + OpenRouter + Add provider... + + Enter: select and continue + Esc: back +``` + +When a free-text step is active, only that step is shown: + +```text +┃ /model + + Model Name + Hint: Enter the model name this provider expects. 
+ + openai/gpt-5.5 + + Enter: continue + Esc: back +``` + +When an invocation method is required for a new binding, the command surface replaces the previous step with the invocation selector: + +```text +┃ /model + + Invocation Method + Hint: Choose the API protocol used to call this model. + + Search: openai + + > OpenAI Responses + OpenAI Chat Completions + Anthropic Messages + + Enter: select and continue + Esc: back +``` + +The below-composer command surface must not render the onboarding-style inline history stack: + +```text +Model: openai/gpt-5.5 +| +* provider name: +| ... +| +* base url: +| ... +``` + +That stacked rail view belongs to onboarding. `/model` uses one active step at a time below the composer. + +## Step Behavior + +- The configured model-provider binding list is the first `/model` step. +- The configured binding list must be populated from effective configuration, including bindings created by onboarding and bindings defined directly in configuration files. +- The configured binding list must show configured model-provider bindings and an `Add model...` row at the bottom. It may show provider identity in each row, but it must not include reasoning effort. +- While any `/model` command surface is visible, the normal bottom status line must not be rendered because the command surface occupies that same area below the composer. +- Pressing Enter on a highlighted configured binding records the selected model-provider binding in command-local state. +- Reasoning effort selection is required after configured binding selection when the selected model supports reasoning, even when the binding already has a default or last-used reasoning effort. +- If the selected configured binding's model does not support reasoning, the binding may be applied immediately after selection. +- The final selection is applied only after the configured binding and any required reasoning effort step have completed. 
+- Selecting an existing configured binding after the first user message updates the current session selection only; it must not rewrite provider records, binding records, or default-selection fields. +- Selecting an existing configured binding before the first user message may persist the default selected binding and reasoning effort according to application configuration rules, but it must not duplicate or rewrite unchanged provider and binding records. +- Pressing Enter on `Add model...` removes the configured binding list and starts the add-model flow at supported model slug selection. +- The model slug selector must support search or filtering by slug text. +- Pressing Enter on a highlighted model slug confirms the slug, removes the model selector, and shows the provider step. +- The provider step must let the user choose an existing provider or add a provider. +- If the user chooses to add a provider, `/model` prompts for provider name, base URL, and API key as separate current-step views below the composer. +- API key entry must use hidden or masked input by default. +- After provider selection or creation, `/model` prompts for the provider-specific model name. +- Invocation method selection appears after model name entry and uses the same searchable selection pattern as onboarding. +- If the selected model supports reasoning, reasoning effort selection appears after invocation method selection and uses the same searchable selection pattern as onboarding. +- If the selected model does not support reasoning, the reasoning effort step is skipped. +- Pressing Esc returns to the previous step when one exists; otherwise it cancels `/model` and clears the below-composer command surface. +- Completed step values are stored in command-local state so back navigation and final application remain correct, but completed steps are not shown as inline history below the composer. 
+- The command may show a concise final confirmation or success status after applying the selection, but it must not expand the full completed-step history. + +## State And Error Behavior + +- The TUI should use `model.list` to populate configured model-provider bindings for the first `/model` screen and supported model choices for the `Add model...` flow. +- The final selection should use `model.select`. +- New or modified provider and model-provider binding data should follow the same validation and persistence expectations as onboarding and should be persisted before the command applies a newly created binding. +- Persistence errors for newly created or modified configuration must keep `/model` in a recoverable command state rather than silently falling back to a session-only binding. +- The command must show credential status but must not display plaintext API keys in routine lists. +- If invoked during active work, the TUI shows a concise blocked message such as `Cannot change model while generating`. +- The selected model and reasoning effort affect the next turn, not an already-running invocation. +- Validation failures keep the user on the current below-composer step with a concise error near that step. +- If persistence fails after valid setup input, `/model` reports the target configuration scope and leaves the command in a recoverable state. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines `/model`, the required post-onboarding model-selection command. | +| related-to | L1-REQ-MODEL-001 | 1 | specs/L1/L1-REQ-MODEL-001-config.md | Model selection uses configured model-provider bindings. | +| related-to | L1-REQ-APP-010 | 1 | specs/L1/L1-REQ-APP-010-configuration.md | Defines when model selection changes are persisted as defaults versus session state. 
| +| related-to | L2-DES-MODEL-001 | 1 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | Defines supported models, user providers, and model-provider bindings. | +| related-to | L2-DES-APP-002 | 1 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | Defines configuration write scope, persistence target behavior, and distinction between session selection and durable records. | +| related-to | L2-DES-TUI-001 | 1 | specs/L2/tui/L2-DES-TUI-001-onboarding-ui-flow.md | Reuses the onboarding model setup sequence while using a transient below-composer command surface instead of an inline history stack. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Uses shared slash-command discovery, popup, and invocation behavior. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial `/model` command design. | +| 1 | 2026-05-25 | Human | Refinement | Aligned `/model` with the onboarding model setup sequence and specified one-step-at-a-time rendering below the composer. | +| 1 | 2026-05-25 | Human | Refinement | Changed the first `/model` screen to configured binding selection with `Add model...` as the entry to the add-model flow. | +| 1 | 2026-05-25 | Human | Refinement | Clarified that the `/model` command surface replaces the bottom status line while visible. | +| 1 | 2026-05-25 | Human | Refinement | Split configured model, provider, and reasoning effort into distinct `/model` steps. | +| 1 | 2026-05-25 | Human | Refinement | Grouped model and provider back into the configured binding selection while keeping reasoning effort separate. 
| +| 1 | 2026-05-25 | Human | Refinement | Clarified that existing binding selection is session state after the first user message, while newly created provider or binding records require configuration persistence. | diff --git a/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-003-compact.md b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-003-compact.md new file mode 100644 index 00000000..6239d5c3 --- /dev/null +++ b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-003-compact.md @@ -0,0 +1,72 @@ +--- +artifact_id: L2-DES-TUI-CMD-003 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-TUI-CMD-003 — Slash Command: /compact + +## Purpose + +Define the TUI behavior for `/compact`, which asks the server to compact the current session context while preserving the full transcript. + +## Command Contract + +- Command: `/compact` +- Description: `compact the current session context` +- Parameters: none in the first milestone. +- Mutability: active context snapshot and durable compaction records. +- Active-turn availability: blocked while a turn is actively generating or running tools. + +## UI Flow + +`/compact` asks for confirmation before starting compaction. + +```text +┃ /compact +``` + +During compaction: + +```text +┃ Manual Compaction Started + +⠋ Working · 4s +``` + +After successful compaction: + +```text +┃ Compaction Done +``` + +## State And Error Behavior + +- The command must not delete transcript items. +- Starting manual compaction must add a transcript-area status cell with the exact text `Manual Compaction Started`. +- Successful compaction creates durable context summary state and updates the active context snapshot. +- The TUI should show `Compaction Done` in the transcript area when `context_updated` reports successful compaction completion. +- If compaction fails, the prior context snapshot remains active and the TUI shows an error with a recovery hint. 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines command-specific behavior for a discoverable TUI command. | +| related-to | L1-REQ-CONTEXT-003 | 1 | specs/L1/L1-REQ-CONTEXT-003-compress.md | Context compaction is the underlying workflow. | +| related-to | L2-DES-CONTEXT-002 | 1 | specs/L2/context/L2-DES-CONTEXT-002-context-compaction.md | Defines compaction triggers, summaries, and context updates. | +| related-to | L2-DES-TUI-004 | 1 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | Defines working indicator and context update display. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Uses shared slash-command discovery and invocation behavior. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial `/compact` command design. | +| 1 | 2026-05-25 | Human | Refinement | Added transcript-area notices for manual compaction start and completion. | diff --git a/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-004-resume.md b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-004-resume.md new file mode 100644 index 00000000..c5ee577b --- /dev/null +++ b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-004-resume.md @@ -0,0 +1,112 @@ +--- +artifact_id: L2-DES-TUI-CMD-004 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-TUI-CMD-004 — Slash Command: /resume + +## Purpose + +Define the TUI behavior for `/resume`, which lets the user reopen a saved chat session through the current alternate-screen session browser. 
+ +## Command Contract + +- Command: `/resume` +- Description: `resume a saved chat` +- Parameters: none. +- Mutability: current interactive client session selection. +- Transcript effect: no model-visible user turn is created by the command itself. +- Rendering mode: alternate-screen session browser, not the below-composer slash-command surface. +- Search: not implemented in the current browser. +- Active-turn confirmation: not implemented in the current browser; command availability is governed by normal slash-command busy-state gating before the browser opens. + +## UI Flow + +Submitting `/resume` clears any previous resume browser state, marks the browser as loading, sends the host command `session list`, and sets the status message to `Loading sessions`. + +The host enters the terminal alternate screen before asking the background worker to list sessions. While the worker request is pending, the full visible area renders the loading view: + +```text +Devo Sessions + +Resume Session +Loading saved sessions... + +Please wait. +``` + +When sessions are returned, the loading view is replaced by a full-screen session browser: + +```text +Devo Sessions + +Resume Session 2 / 8 · 33% + + Title Session ID Updated + -------------------------- ------------------------------------ ----------------------- + earlier investigation 019db434-c4b4-7c81-ba66-74c58f0fbd60 2026-05-24 22:13:01 UTC +> active refactor 019db45d-61ec-7b02-894f-a847b78f7ac3 2026-05-25 04:09:44 UTC current + release notes 019db467-c5ef-7127-9ffd-5e0d9393c3ac 2026-05-25 05:31:10 UTC + +↑/↓ select pgup/pgdn page home/end jump +enter resume q back +``` + +Rules: + +- The selected row is marked with `>`. +- The current active session row is suffixed with `current`. +- The initial selection is the active session when present, otherwise the first row. +- The list shows session title, stable session ID, updated timestamp, and progress through the list. +- Long titles are truncated to fit the title column. 
+- If the list does not fit, `↑ more` and `↓ more` marker rows indicate hidden rows above or below. +- If no sessions are returned, the browser shows `No saved sessions found.` with the footer `q back`. + +Keyboard behavior: + +| Key | Behavior | +|---|---| +| `Up` / `Down` | Move selection by one visible row. | +| `PageUp` / `PageDown` | Move selection by one browser page. | +| `Home` / `End` | Jump to first or last session. | +| `Enter` | Resume the selected session when a row is selected. | +| `Esc` / `q` | Close the browser, leave the current session unchanged, and return status to `Ready`. | + +`Ctrl+T` full transcript review is disabled while the resume browser is loading or open. + +## State And Error Behavior + +- The TUI requests the persisted session list through the worker's session-list operation. The worker calls the server session-list API with a five-second timeout and maps each entry into title, session ID, updated timestamp, and active-session marker fields. +- When session listing succeeds, `SessionsListed` clears the host's pending browser flag and opens the browser in the already-active alternate screen. +- When session listing fails or times out, the worker emits a failure event. The TUI clears resume-browser loading state, records the error in the transcript as the normal failure path, and the host leaves alternate screen once no browser is open or pending. +- Pressing `Enter` on a session row clears the visible session UI for switching, sends `SwitchSession { session_id }`, and closes the browser. +- Before dispatching the switch, the widget clears completed history, the active streaming cell, active tool calls, pending tool calls, active text items, and the composer, then sets status to `Resuming session`. +- The host leaves alternate screen before switching, marks `session_switch_pending`, replaces the inline session UI, and asks the worker to resume the selected session. 
+- The worker resumes by calling the server session-resume API for the selected session ID. On success it emits `SessionSwitched` with restored working directory, optional title, model, thinking selection, reasoning effort, token counters, history items, rich history items, loaded item count, and pending texts. +- On `SessionSwitched`, the widget rebuilds visible history from rich restored items when available, falls back to projected transcript items otherwise, restores pending input queue cells, updates session metadata and token counters, clears busy state, and sets status to `Session switched`. +- Resuming a session must not delete the previously active persisted session. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines command-specific behavior for a discoverable TUI command. | +| related-to | L1-REQ-CONV-001 | 1 | specs/L1/L1-REQ-CONV-001-session-lifecycle.md | Resuming saved sessions is a session lifecycle workflow. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Defines session listing, opening, and subscription behavior. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Uses shared slash-command discovery and invocation behavior. | +| related-to | L2-DES-TUI-005 | 1 | specs/L2/tui/L2-DES-TUI-005-terminal-lifecycle-safety.md | Uses alternate-screen entry and restore behavior for the session browser. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial `/resume` command design. | +| 1 | 2026-05-25 | Human | Refinement | Requested alignment with the current `crates/tui` alternate-screen implementation. 
| +| 1 | 2026-05-25 | Assistant | Refinement | Replaced the stale inline searchable-popup design with the current loading view, alternate-screen session browser, key bindings, worker flow, and session-switch restore behavior. | diff --git a/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-005-new.md b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-005-new.md new file mode 100644 index 00000000..dabf79d5 --- /dev/null +++ b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-005-new.md @@ -0,0 +1,74 @@ +--- +artifact_id: L2-DES-TUI-CMD-005 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-TUI-CMD-005 — Slash Command: /new + +## Purpose + +Define the TUI behavior for `/new`, which starts a new chat session. + +## Command Contract + +- Command: `/new` +- Description: `start a new chat` +- Parameters: none. +- Mutability: prepares a fresh session slot for the next user message. +- Confirmation: no confirmation prompt is shown. +- Active-turn availability: unavailable while the current session has active work; the command is gated by normal busy-state slash-command handling rather than a confirmation flow. + +## UI Flow + +`/new` immediately prepares a new chat using the current workspace and effective default model configuration. The command does not ask the user to confirm. + +```text +┃ /new + +
+ +┃ + +New session ready; send a prompt to start it +``` + +Rules: + +- The current session remains durable and resumable. +- The visible transcript is reset to a new session HEADER box. +- The TUI then waits for the user to send the first message before entering the new session's first turn. +- The new session starts with current workspace metadata and the current effective model, thinking, and reasoning configuration. +- Token counters, active streaming cells, active tool state, and pending tool state reset to the fresh-session baseline. +- If onboarding or model configuration is incomplete, the command should route to `/onboard`. + +## State And Error Behavior + +- The TUI should request a new session preparation from the background worker. +- The server may defer durable session creation until the first user message is submitted. +- When preparation succeeds, the widget appends or refreshes the HEADER box, clears active turn state, clears pending input cells, resets token counters, and shows `New session ready; send a prompt to start it`. +- The command must not delete or overwrite the previous persisted session. It may clear the local visible transcript because the UI is now showing the prepared fresh session. +- If preparation fails, the TUI remains in the current session and shows a concise error. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines command-specific behavior for a discoverable TUI command. | +| related-to | L1-REQ-CONV-001 | 1 | specs/L1/L1-REQ-CONV-001-session-lifecycle.md | Starting a new chat creates a new session. | +| related-to | L2-DES-APP-003 | 1 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | Defines session creation behavior. 
| +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Uses shared slash-command discovery and invocation behavior. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial `/new` command design. | +| 1 | 2026-05-25 | Human | Refinement | Removed confirmation and specified that `/new` adds a HEADER box, then waits for the next user message to enter the new session. | +| 1 | 2026-05-25 | Assistant | Refinement | Updated the flow from immediate session creation with confirmation to no-confirmation new-session preparation. | diff --git a/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-006-status.md b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-006-status.md new file mode 100644 index 00000000..6dac293d --- /dev/null +++ b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-006-status.md @@ -0,0 +1,66 @@ +--- +artifact_id: L2-DES-TUI-CMD-006 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-23 +--- + +# L2-DES-TUI-CMD-006 — Slash Command: /status + +## Purpose + +Define the TUI behavior for `/status`, which displays current session configuration, token usage, context pressure, and runtime state. + +## Command Contract + +- Command: `/status` +- Description: `show current session configuration and token usage` +- Parameters: none in the first milestone. +- Mutability: read-only. +- Active-turn availability: allowed during active work. + +## UI Flow + +`/status` opens a compact status panel or inserts a bounded status cell. + +```text +┃ Status + model deepseek-v4-pro + reasoning high + workspace ~/Desktop/devo + mode Build + tokens ↑420[cached 300 71%] ↓12 + context ▰▰▱▱▱▱▱▱▱▱ 20% 190k/950k +``` + +Rules: + +- The display must use the same token/cache/context style as the bottom status line. 
+- Provider credentials must be represented by safe status only, never plaintext secret values. +- Active work should be shown concisely if a turn is running. + +## State And Error Behavior + +- The command should use server-confirmed session snapshots, usage events, and safe configuration projections. +- Missing or estimated values must be marked instead of invented. +- `/status` must not create a model-visible transcript turn. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines command-specific behavior for a discoverable TUI command. | +| related-to | L1-REQ-TUI-004 | 1 | specs/L1/L1-REQ-TUI-004-state-visibility.md | Status exposes current session and runtime state. | +| related-to | L2-DES-LLM-003 | 1 | specs/L2/llm/L2-DES-LLM-003-model-usage-observability.md | Defines usage fields and uncertainty handling. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Defines bottom status-line fields reused by this command. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial `/status` command design. 
| diff --git a/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-007-permissions.md b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-007-permissions.md new file mode 100644 index 00000000..389a5ca8 --- /dev/null +++ b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-007-permissions.md @@ -0,0 +1,65 @@ +--- +artifact_id: L2-DES-TUI-CMD-007 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-23 +--- + +# L2-DES-TUI-CMD-007 — Slash Command: /permissions + +## Purpose + +Define the TUI behavior for `/permissions`, which lets the user choose what the program is allowed to do in the current session. + +## Command Contract + +- Command: `/permissions` +- Description: `choose what Devo is allowed to do` +- Parameters: none in the first milestone. +- Mutability: session permission metadata and possibly durable configuration, depending on persistence choice. +- Active-turn availability: allowed only for future work; it must not retroactively authorize already-running tool calls. + +## UI Flow + +`/permissions` opens a selection popup. + +```text +┃ /permissions + + default + auto-approved + full access +``` + +Rules: + +- The current permission mode is preselected. +- The popup must summarize the operational effect of each mode. +- Enter applies; Esc cancels. +- If a tool approval is currently pending, the TUI must distinguish changing the default permission mode from answering that specific approval. + +## State And Error Behavior + +- Permission changes should be recorded as session metadata changes. +- The change applies to later tool decisions, not to already-issued provider or tool requests. +- If a mode is blocked by policy, the TUI should show why and keep the existing mode. 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines command-specific behavior for a discoverable TUI command. | +| related-to | L1-REQ-TOOL-001 | 1 | specs/L1/L1-REQ-TOOL-001-safety.md | Permissions constrain tool safety and approval behavior. | +| related-to | L2-DES-TUI-004 | 1 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | Approval and waiting states must remain visible. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Uses shared slash-command discovery and invocation behavior. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial `/permissions` command design. | diff --git a/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-008-clear.md b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-008-clear.md new file mode 100644 index 00000000..50c9fdfb --- /dev/null +++ b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-008-clear.md @@ -0,0 +1,63 @@ +--- +artifact_id: L2-DES-TUI-CMD-008 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-23 +--- + +# L2-DES-TUI-CMD-008 — Slash Command: /clear + +## Purpose + +Define the TUI behavior for `/clear`, which clears the current TUI transcript view without deleting durable session history. + +## Command Contract + +- Command: `/clear` +- Description: `clear the current transcript` +- Parameters: none in the first milestone. +- Mutability: local TUI view state only. +- Active-turn availability: allowed, but active live work remains visible after the clear. + +## UI Flow + +`/clear` clears completed visible transcript cells from the current TUI viewport. 
+ +```text +┃ /clear + + Transcript view cleared. Session history remains saved. +``` + +Rules: + +- The command must not delete session JSONL records. +- The command must not create a new session; `/new` handles new chats. +- Active turn content, pending approvals, questions, working indicators, and composer state remain visible. +- Resuming or reloading the session may restore durable transcript content unless a later requirement defines a persistent view-clear marker. + +## State And Error Behavior + +- `/clear` is display-only in the first milestone. +- If there is nothing to clear, the TUI may show no-op feedback. +- The command must not affect active context, compaction, model behavior, or transcript replay. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines command-specific behavior for a discoverable TUI command. | +| related-to | L1-REQ-TUI-003 | 1 | specs/L1/L1-REQ-TUI-003-transcript.md | Clear affects local transcript presentation, not durable transcript data. | +| related-to | L2-DES-TUI-004 | 1 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | Defines durable transcript and live overlay boundaries. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Uses shared slash-command discovery and invocation behavior. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial `/clear` command design. 
| diff --git a/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-009-onboard.md b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-009-onboard.md new file mode 100644 index 00000000..5fc39206 --- /dev/null +++ b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-009-onboard.md @@ -0,0 +1,59 @@ +--- +artifact_id: L2-DES-TUI-CMD-009 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-23 +--- + +# L2-DES-TUI-CMD-009 — Slash Command: /onboard + +## Purpose + +Define the TUI behavior for `/onboard`, which enters the onboarding process after the initial startup path. + +## Command Contract + +- Command: `/onboard` +- Description: `configure model provider connection` +- Parameters: none in the first milestone. +- Mutability: determined by the onboarding process. +- Active-turn availability: blocked while active work is running. + +## UI Flow + +`/onboard` is an entry command. After invocation, the TUI enters the onboarding process defined by `L2-DES-TUI-001`. + +Rules: + +- This command must not duplicate or redefine the onboarding sequence. +- Model selection, provider selection or creation, provider fields, invocation method selection, reasoning effort selection, validation, credential handling, and persistence are owned by `L2-DES-TUI-001`. +- The command should hand control to the onboarding UI without creating a transcript turn. +- If onboarding is already active, invoking `/onboard` should focus or resume the existing onboarding flow rather than starting a conflicting second flow. + +## State And Error Behavior + +- If active work blocks onboarding entry, the TUI must explain why and keep the current session state unchanged. +- Once onboarding starts, all state changes, validation failures, persistence failures, and successful setup behavior follow `L2-DES-TUI-001`. +- Canceling onboarding returns the user to the prior session view. 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines command-specific behavior for a discoverable TUI command. | +| related-to | L1-REQ-TUI-010 | 1 | specs/L1/L1-REQ-TUI-010-onboarding-ui.md | `/onboard` reopens the TUI onboarding workflow. | +| related-to | L2-DES-TUI-001 | 1 | specs/L2/tui/L2-DES-TUI-001-onboarding-ui-flow.md | Defines the concrete onboarding UI flow. | +| related-to | L2-DES-MODEL-001 | 1 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | Defines provider and model-provider binding records. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Uses shared slash-command discovery and invocation behavior. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial `/onboard` command design. | +| 1 | 2026-05-23 | Human | Refinement | Clarified that `/onboard` only enters the onboarding process owned by `L2-DES-TUI-001`. | diff --git a/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-010-goal.md b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-010-goal.md new file mode 100644 index 00000000..fb317dd5 --- /dev/null +++ b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-010-goal.md @@ -0,0 +1,106 @@ +--- +artifact_id: L2-DES-TUI-CMD-010 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-25 +--- + +# L2-DES-TUI-CMD-010 — Slash Command: /goal + +## Purpose + +Define the TUI behavior for `/goal`, which lets the user create, view, pause, resume, complete, cancel, or clear the session's Ralph Loop goal. 
+ +## Command Contract + +- Command: `/goal` +- Description: `set or view the goal for a long-running task` +- Parameters: optional free-form objective text. When present, the text following `/goal` is the objective. +- Mutability: goal/session state. +- Active-turn availability: viewing is allowed during active work; mutating actions must be server-serialized and must not rewrite an already-running turn. +- Default budget: none. The first milestone does not prompt for a token, time, or turn budget during goal creation. + +## UI Flow + +Typing `/goal` without objective text opens the current-goal panel. If no goal exists, it shows an empty state that tells the user to submit `/goal <objective>`. + +```text +┃ /goal + + Goal + status pursuing + objective Eliminate the failing parser tests and verify the full parser suite. + progress quoted values fixed; escape regression still failing + budget none + + [Pause] [Complete] [Cancel] [Clear] + + Build · deepseek-v4-pro high ↑420[cached 300 71%] ↓12 ▰▰▱▱▱▱▱▱▱▱ 20% 190k/950k +``` + +Typing `/goal <objective>` creates and activates a goal directly. The prompt following `/goal` is the objective; pressing Enter begins execution. + +```text +┃ /goal Eliminate the failing parser tests and verify the full parser suite. + + Goal + objective Eliminate the failing parser tests and verify the full parser suite. + budget none + status starting +``` + +Rules: + +- `/goal` without parameters opens the current-goal panel or a no-goal empty state. +- `/goal <objective>` treats `<objective>` as the objective and submits goal creation when the user presses Enter. +- The create path does not open a separate objective editor and does not ask for a budget. +- The first milestone creates goals with no default budget. Optional budget configuration may be added later as an explicit edit/control, not as part of the default create prompt. +- If a non-terminal goal already exists, replacing it requires explicit confirmation.
+- The panel must show objective, status, progress, blocker, verification, and budget fields where available. If no budget is configured, the budget field renders as `none` or is omitted in narrow layouts. +- User-owned actions include pause, resume, complete, cancel, clear, create, and replace. +- The model cannot trigger `/goal`; model-originated goal status changes are shown as server events. +- Successful mutations should close the popup or update it in place according to L3 interaction rules. + +## Inline Rendering + +When the composer recognizes `/goal`, the command token uses the theme primary color and parameter text uses muted color. + +```text +┃ /goal + + Build · deepseek-v4-pro high ↑0[cached 0 0%] ↓0 ▱▱▱▱▱▱▱▱▱▱ 0% 0/950k +``` + +## State And Error Behavior + +- The command uses server-owned goal APIs; the TUI does not mutate local goal state independently. +- Read-only viewing should return the current server-confirmed projection. +- Direct creation with `/goal <objective>` sends only the objective and omits all budget fields unless the user has explicitly supplied budget configuration through a control introduced by a later design. +- After successful direct creation, the goal becomes active and the server may begin execution when continuation preconditions permit. +- Mutating actions should pass `expected_goal_id` where the TUI has one, so stale panels do not overwrite newer goal state. +- If the server rejects a stale action, the TUI should refresh the panel and show a concise message. +- If the goal is active and a turn is running, pause/cancel/clear may take effect immediately for future continuation but must not rewrite the current turn's already-built model context. +- If Plan Mode is active, `/goal` remains viewable and user-controllable, but autonomous continuation remains suppressed until Build mode is active. +- `/goal` must not create a model-visible transcript turn. 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines command-specific behavior for a discoverable TUI command. | +| related-to | L1-REQ-GOAL-001 | 1 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | `/goal` is the TUI control surface for Ralph Loop goals. | +| related-to | L2-DES-GOAL-001 | 1 | specs/L2/goal/L2-DES-GOAL-001-ralph-loop-goals.md | Defines the goal state model, continuation loop, and protocol behavior controlled by this command. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Defines slash-command discovery, inline command rendering, and command submission. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial `/goal` command design. | +| 1 | 2026-05-25 | Human | Refinement | Set default goal creation to no budget and made `/goal ` submit the objective directly on Enter. | +| 1 | 2026-05-25 | Assistant | Refinement | Removed the default create panel budget prompt and documented direct objective submission. | diff --git a/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-011-btw.md b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-011-btw.md new file mode 100644 index 00000000..399a2a5d --- /dev/null +++ b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-011-btw.md @@ -0,0 +1,70 @@ +--- +artifact_id: L2-DES-TUI-CMD-011 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-23 +--- + +# L2-DES-TUI-CMD-011 — Slash Command: /btw + +## Purpose + +Define the TUI behavior for `/btw`, which starts a side conversation inside an ephemeral fork. 
+ +## Command Contract + +- Command: `/btw` +- Description: `start a side conversation in an ephemeral fork` +- Parameter: free-form text after the command token. +- Mutability: ephemeral runtime state only. +- Active-turn availability: allowed when a session has enough current context to fork. + +## UI Flow + +Inline command rendering shows the command token in primary color and the parameter hint in muted color. + +```text +┃ /btw + + Build · deepseek-v4-pro high ↑0[cached 0 0%] ↓0 ▱▱▱▱▱▱▱▱▱▱ 0% 0/950k +``` + +Submission example: + +```text +┃ /btw what if we solve this with a smaller parser-only change? + +⠋ Working · 18s + side ephemeral fork running +``` + +## State And Error Behavior + +- The command starts a side conversation using an ephemeral fork of the current session context. +- The side conversation must not write session, turn, item, queue, steer, or fork records to durable storage. +- The side conversation must not mutate the current session transcript, active turn, active context, or persistent configuration. +- The side conversation may use the current visible context as input, but any messages and model responses inside the side conversation are runtime-only. +- Closing or completing the side conversation discards its transcript unless a later explicit command promotes or copies content back into the durable session. +- The TUI must visually distinguish the side conversation from the durable transcript so the user understands it is temporary. +- If the current session context cannot be forked safely, the command should fail with a concise explanation rather than becoming a normal message. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines command-specific behavior for a discoverable TUI command. 
| +| related-to | L1-REQ-CONV-004 | 1 | specs/L1/L1-REQ-CONV-004-session-forking.md | `/btw` uses the fork concept for temporary side exploration, while explicitly avoiding durable fork persistence. | +| related-to | L2-DES-CONV-001 | 1 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | Defines the durable session model that `/btw` must not write to while running as an ephemeral fork. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Defines inline slash-command coloring and command submission. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial `/btw` command design. | +| 1 | 2026-05-23 | Human | Refinement | Changed `/btw` from active-turn injection to an ephemeral-fork side conversation that is not persisted. | diff --git a/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-012-exit.md b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-012-exit.md new file mode 100644 index 00000000..21a85fc0 --- /dev/null +++ b/specs/L2/tui/slash-commands/L2-DES-TUI-CMD-012-exit.md @@ -0,0 +1,61 @@ +--- +artifact_id: L2-DES-TUI-CMD-012 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-23 +--- + +# L2-DES-TUI-CMD-012 — Slash Command: /exit + +## Purpose + +Define the TUI behavior for `/exit`, which exits the TUI while preserving terminal safety and server-owned session durability. + +## Command Contract + +- Command: `/exit` +- Description: `exit Devo` +- Parameters: none in the first milestone. +- Mutability: TUI process lifecycle only, unless the user also chooses an active-work policy. +- Active-turn availability: allowed, but must clearly handle active work. + +## UI Flow + +If no active turn exists, `/exit` starts terminal cleanup immediately. 
+ +If active work exists, the TUI must present a concise choice before exit. + +```text +┃ /exit + + Active work is still running. + [Keep Running And Exit] [Interrupt And Exit] [Cancel] +``` + +## State And Error Behavior + +- `/exit` must use the terminal lifecycle cleanup path defined by `L2-DES-TUI-005`. +- The TUI must restore terminal modes and leave shell prompt placement to the shell. +- The command must not delete the current session. +- If the server continues active work after the TUI exits, the session must remain resumable. +- If cleanup fails, the TUI should emit the terminal-safe cleanup warning defined by the lifecycle design. + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-TUI-006 | 1 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | Defines command-specific behavior for a discoverable TUI command. | +| related-to | L1-REQ-TUI-005 | 1 | specs/L1/L1-REQ-TUI-005-terminal-lifecycle-safety.md | Exit must use terminal-safe cleanup behavior. | +| related-to | L2-DES-TUI-005 | 1 | specs/L2/tui/L2-DES-TUI-005-terminal-lifecycle-safety.md | Defines exit, cleanup, and shell prompt handoff. | +| related-to | L2-DES-TUI-003 | 1 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | Uses shared slash-command discovery and invocation behavior. | +| specified-by | TBD | TBD | specs/L3/tui/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-23 | Assistant | Initial | Initial `/exit` command design. 
| diff --git a/specs/L2/workspace/L2-DES-WORKSPACE-001-project-instruction-discovery.md b/specs/L2/workspace/L2-DES-WORKSPACE-001-project-instruction-discovery.md new file mode 100644 index 00000000..b207b668 --- /dev/null +++ b/specs/L2/workspace/L2-DES-WORKSPACE-001-project-instruction-discovery.md @@ -0,0 +1,192 @@ +--- +artifact_id: L2-DES-WORKSPACE-001 +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: Assistant +last_updated: 2026-05-22 +--- + +# L2-DES-WORKSPACE-001 — Project Instruction File Discovery + +## Purpose + +Define how the program discovers, reads, and assembles project instruction files along the directory hierarchy from the project root to the current working directory, plus global user-level instruction files. + +## Background / Context + +`L1-REQ-WORKSPACE-002` defines the requirement: discover instruction files on the linear path from project root to cwd, using a per-directory filename priority of `AGENTS.override.md`, `AGENTS.md`, and configurable fallbacks, with root-to-cwd concatenation and no arbitrary depth limit. This design defines the concrete discovery algorithm, filename resolution, global-instruction support, size bounding, configuration surface, and refresh behavior. + +The assembled instruction content feeds into `L2-DES-CONTEXT-001` as part of the instruction hierarchy (metadata-derived content, not transcript turns). + +## Source Requirements + +- `L1-REQ-WORKSPACE-002` requires linear ancestor-chain discovery, per-directory filename priority, configurable fallback filenames, global instruction files, size bounding, and no artificial depth limit. +- `L1-REQ-WORKSPACE-001` requires workspace context that respects local project instructions. +- `L1-REQ-APP-010` requires persistent configuration for user-scoped and project-scoped settings. +- `L2-DES-CONTEXT-001` defines the context assembly step that consumes the assembled instruction content. 
+- `L2-DES-APP-002` defines configuration source precedence for the settings that control discovery. + +## Design Requirement + +The program should discover project instruction files by walking the linear directory hierarchy from the project root down to the current working directory, checking each directory for a recognized instruction file in priority order, and assembling the discovered content in root-to-cwd sequence. Global user-level instruction files should be included as a top-level prefix. + +Discovery should be deterministic, auditable, bounded, and fast. It must not scan irrelevant directory trees or impose arbitrary depth limits. + +## Filename Priority + +Each directory is checked for instruction files in this fixed priority order: + +| Priority | Filename | Description | +|---|---|---| +| 1 (highest) | `AGENTS.override.md` | User-local override that replaces the standard instruction file in this directory. | +| 2 | `AGENTS.md` | The standard project instruction file. | +| 3+ | User-configured fallbacks | Additional filenames provided through configuration, checked in configuration-specified order. | + +Only the first matching regular file found in a given directory is used. If `AGENTS.override.md` exists, `AGENTS.md` and all fallbacks are skipped for that directory. An empty or whitespace-only file is treated as absent — it does not contribute content and does not block lower-priority files in the same directory. + +Fallback filenames are not hardcoded to match specific external assistants by name. Instead they are user-configured, which lets projects that already maintain instruction files for other tools (e.g., `CLAUDE.md`, `PROMPT.md`) be recognized without code changes. The default fallback set should cover common external instruction filenames. + +## Discovery Algorithm + +### Step 1 — Resolve Project Root + +Walk upward from the canonicalized current working directory through parent directories. 
At each ancestor, check for the presence of any configured project-root marker. The default marker set is `[".git"]`. + +Stop at the first ancestor that contains any marker. That ancestor is the project root. If no ancestor contains a marker, the project has no discoverable root. + +Markers are matched by directory-entry name, not by entry type. A marker matches when a directory entry with that name exists at that path, regardless of whether it is a file or directory. + +The marker set is configurable. An empty marker list disables upward traversal entirely — only the cwd itself is searched. + +### Step 2 — Collect Search Path + +If a project root was found, collect all directories on the linear path from root to cwd: + +```text +cwd → parent → parent → ... → root +``` + +Reverse this sequence so root comes first: + +```text +[root, ..., parent, cwd] +``` + +If no project root was found, the search path contains only the cwd: + +```text +[cwd] +``` + +### Step 3 — Check Each Directory + +For each directory in the search path, for each candidate filename in priority order: + +1. Check whether a filesystem entry with that name exists in the directory. +2. If the entry is a regular file and is non-empty after trimming whitespace, select it and stop checking this directory. +3. If the entry is a directory, a special file, empty, or missing, continue to the next candidate filename. +4. If a read error occurs (permission denied, IO error), record a diagnostic for that file and continue checking remaining candidates and remaining directories. A single unreadable file must not abort discovery. +5. Skip entries whose names start with `.` unless they explicitly match a candidate filename (candidates can be hidden files, but arbitrary hidden entries are not candidates). + +### Step 4 — Load Global Instructions + +Before project-specific instructions, check the user-level configuration directory for global instruction files. 
On macOS and Linux this is `~/.devo/`; on Windows this is `%USERPROFILE%\.devo\`. + +Check in the same priority order as project directories: `AGENTS.override.md`, then `AGENTS.md`. Fallback filenames are not checked at the global level — only the two primary filenames apply. + +A missing global directory or missing global instruction files is normal and does not produce a diagnostic. + +### Step 5 — Read and Assemble + +Read each discovered file's content. The assembled result is: + +```text +[Global AGENTS.override.md content, if present] +[Global AGENTS.md content, if present] +[Project root directory instruction file content] +[Intermediate directory instruction file content, in path order] + ... +[Cwd instruction file content] +``` + +### Step 6 — Apply Size Bound + +The total assembled byte length must not exceed the configured maximum. If it does, truncate from the end of the last file's content and indicate truncation in the assembled output so the model and user understand that content was omitted. + +The size check applies after assembly, not per-file. This allows earlier (root-level) files to contribute fully while only the tail of the last file is affected when the total exceeds the bound. + +When the maximum is set to zero, all discovery is disabled and the assembled result is empty. + +## Global Instructions + +Global instruction files in `~/.devo/` apply across all projects and sessions. They are the top-level prefix of the assembled instruction content, appearing before project-root instructions. + +Global instructions are discovered on every session start and whenever the assembled instruction content is refreshed. They follow the same filename priority as project directories (`AGENTS.override.md` → `AGENTS.md`), but fallback filenames do not apply at the global level. 
+ +Global instructions are subject to the same size bound as project instructions — the total assembled content, including global and project files, must not exceed the configured maximum. + +## Configuration + +Discovery behavior is controlled through persistent configuration following `L2-DES-APP-002` precedence rules. + +| Key | Type | Default | Purpose | +|---|---|---|---| +| `project_doc_max_bytes` | `usize` | `32768` (32 KiB) | Maximum total bytes of assembled instruction content. `0` disables all discovery. | +| `project_doc_fallback_filenames` | `Vec<String>` | sensible default set | Additional filenames to check per directory after `AGENTS.override.md` and `AGENTS.md`. | +| `project_root_markers` | `Vec<String>` | `[".git"]` | Directory entry names that identify project roots during upward traversal. Empty list disables parent traversal. | + +Project-scoped configuration overrides user-scoped configuration for all three keys. A project may specify an empty `project_root_markers` list to scope instruction-file discovery to the cwd only, or a custom marker set to identify roots in non-git workspaces. + +## Refresh + +Instruction file content must be refreshed when: + +- The current working directory changes. Re-discovery must run along the new path before the next model context is assembled. +- A previously discovered instruction file on the active path is modified. The program should detect the change through filesystem watchers, stat polling, or an explicit re-read trigger, and refresh the assembled instructions before the next model invocation. + +Refresh should not block session startup. If a refresh is in progress when context assembly begins, the program may use the most recent successfully assembled content and apply the refresh result to subsequent turns. + +When a file that was previously absent appears on the path, or a previously present file is deleted, the assembled instructions must reflect the current filesystem state after the next refresh. 
+ +## Auditability + +The program must make the discovery result understandable to the user. After discovery, the program should expose: + +- The canonicalized current working directory. +- The resolved project root, or a statement that no root was found. +- Each discovered file, its directory, and which priority level it matched. +- The total assembled byte count and whether truncation occurred. +- Any diagnostic for unreadable files. + +This information should be available through a configuration-inspection or debug view. It should not be emitted as routine model context unless the user requests it. + +## Edge Cases + +- **No project root found**: Discovery scope is cwd only plus global instructions. No error or warning is produced. +- **Empty marker list**: Parent traversal is disabled. Only cwd is checked (same as no root found, but by user choice rather than because no marker matched). +- **Symlinks**: The canonical path is used for directory comparison so symlink chains are resolved before determining the search path. +- **Filesystem boundaries**: The upward traversal for root discovery stops at the filesystem root. If no marker is found by then, no project root exists. +- **Concurrent modification**: If a file is being written while discovery reads it, the program may read partial content. This is acceptable; the next refresh will correct it. +- **Binary files**: Files that contain null bytes or non-UTF-8 content should be skipped and a diagnostic produced. They are not treated as instruction files. +- **Very large individual files**: A single file that alone exceeds the maximum byte limit is read up to the limit, included as the last (and possibly only) contributing file, and truncation is indicated. Discovery does not skip it just because it is large. 
+ +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refines | L1-REQ-WORKSPACE-002 | 1 | specs/L1/L1-REQ-WORKSPACE-002-project-instruction-files.md | Defines the concrete discovery algorithm, filename resolution, global-instruction support, size bounding, configuration surface, and refresh behavior. | +| related-to | L1-REQ-WORKSPACE-001 | 1 | specs/L1/L1-REQ-WORKSPACE-001-project-context.md | Instruction file discovery provides the local project instructions required by workspace context. | +| related-to | L1-REQ-APP-010 | 1 | specs/L1/L1-REQ-APP-010-configuration.md | Discovery behavior is controlled through persistent configuration with project-over-user precedence. | +| related-to | L2-DES-CONTEXT-001 | 1 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | Assembled instruction content feeds into the instruction hierarchy during context assembly. | +| related-to | L2-DES-APP-002 | 1 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | Configuration precedence resolves discovery settings from user-scoped and project-scoped sources. | +| specified-by | TBD | TBD | specs/L3/workspace/TBD.md | L3 behavior has not been authored yet. | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | 2026-05-22 | Assistant | Initial | Initial project instruction file discovery design covering filename priority, search algorithm, global instructions, configuration, refresh, and auditability. | + diff --git a/specs/l1_l2_traceability_gaps.py b/specs/l1_l2_traceability_gaps.py new file mode 100644 index 00000000..b618b083 --- /dev/null +++ b/specs/l1_l2_traceability_gaps.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +"""Report L1 requirements that are not refined or linked to L2 designs. 
+ +By default this scans specs/L1 and compares the artifacts against +specs/traceability/l1_to_l2.md from the repository root inferred from this +script's location. +""" + +from __future__ import annotations + +import argparse +import json +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path + + +DEFAULT_REPO = Path(__file__).parent.parent +TRACEABILITY_PATH = Path("specs") / "traceability" / "l1_to_l2.md" +L1_ID_RE = re.compile(r"\bL1-[A-Z0-9]+(?:-[A-Z0-9]+)*-\d{3}\b") + + +@dataclass(frozen=True) +class L1Item: + artifact_id: str + path: Path + title: str + + +@dataclass(frozen=True) +class TraceLink: + source_id: str + source_path: str + target_id: str + target_path: str + relationship: str + rationale: str + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Find L1 requirements missing L2 traceability links." + ) + parser.add_argument( + "--repo", + type=Path, + default=DEFAULT_REPO, + help="Repository root to scan. Default: parent of this script directory.", + ) + parser.add_argument( + "--json", + action="store_true", + help="Emit machine-readable JSON instead of a text report.", + ) + return parser.parse_args() + + +def read_text(path: Path) -> str: + try: + return path.read_text(encoding="utf-8") + except FileNotFoundError: + raise SystemExit(f"Missing required path: {path}") from None + + +def resolve_repo(path: Path) -> Path: + expanded = path.expanduser() + if expanded.is_absolute(): + return expanded.resolve() + return (Path.cwd() / expanded).resolve() + + +def display_path(path: Path) -> str: + try: + relative = os.path.relpath(path, start=Path.cwd()) + except ValueError: + return str(path) + return "." if relative == "." 
else relative + + +def markdown_cells(line: str) -> list[str]: + stripped = line.strip() + if not stripped.startswith("|") or not stripped.endswith("|"): + return [] + return [cell.strip() for cell in stripped.strip("|").split("|")] + + +def extract_title(text: str, fallback: str) -> str: + for line in text.splitlines(): + stripped = line.strip() + if stripped.startswith("# "): + return stripped.lstrip("#").strip() + return fallback + + +def extract_l1_id(path: Path, text: str) -> str | None: + artifact_match = re.search(r"(?m)^artifact_id:\s*(L1-[A-Z0-9-]+)\s*$", text) + if artifact_match: + return artifact_match.group(1) + + filename_match = L1_ID_RE.search(path.name) + if filename_match: + return filename_match.group(0) + + text_match = L1_ID_RE.search(text) + if text_match: + return text_match.group(0) + + return None + + +def collect_l1_items(repo: Path) -> dict[str, L1Item]: + l1_dir = repo / "specs" / "L1" + if not l1_dir.is_dir(): + raise SystemExit(f"Missing L1 directory: {l1_dir}") + + items: dict[str, L1Item] = {} + duplicate_ids: dict[str, list[Path]] = {} + + for path in sorted(l1_dir.glob("*.md")): + text = read_text(path) + artifact_id = extract_l1_id(path, text) + if artifact_id is None: + print(f"warning: could not identify L1 artifact id in {path}", file=sys.stderr) + continue + + if artifact_id in items: + duplicate_ids.setdefault(artifact_id, [items[artifact_id].path]).append(path) + continue + + items[artifact_id] = L1Item( + artifact_id=artifact_id, + path=path.relative_to(repo), + title=extract_title(text, fallback=path.stem), + ) + + if duplicate_ids: + details = "\n".join( + f" {artifact_id}: {', '.join(str(p) for p in paths)}" + for artifact_id, paths in duplicate_ids.items() + ) + raise SystemExit(f"Duplicate L1 artifact ids found:\n{details}") + + return items + + +def collect_trace_links(repo: Path) -> list[TraceLink]: + matrix_path = repo / TRACEABILITY_PATH + text = read_text(matrix_path) + links: list[TraceLink] = [] + + for line in 
text.splitlines(): + cells = markdown_cells(line) + if len(cells) != 6: + continue + + source_id, source_path, target_id, target_path, relationship, rationale = cells + if source_id == "Source ID" or set(source_id) <= {"-"}: + continue + if not source_id.startswith("L1-"): + continue + if not target_id.startswith("L2-"): + continue + + links.append( + TraceLink( + source_id=source_id, + source_path=source_path, + target_id=target_id, + target_path=target_path, + relationship=relationship, + rationale=rationale, + ) + ) + + return links + + +def item_to_dict(item: L1Item) -> dict[str, str]: + return { + "artifact_id": item.artifact_id, + "path": str(item.path), + "title": item.title, + } + + +def link_to_dict(link: TraceLink) -> dict[str, str]: + return { + "source_id": link.source_id, + "source_path": link.source_path, + "target_id": link.target_id, + "target_path": link.target_path, + "relationship": link.relationship, + "rationale": link.rationale, + } + + +def print_item_list(title: str, items: list[L1Item]) -> None: + print(f"\n{title} ({len(items)})") + print("-" * len(f"{title} ({len(items)})")) + if not items: + print("None") + return + + for item in items: + print(f"- {item.artifact_id} | {item.path} | {item.title}") + + +def print_linked_only_related(items: list[L1Item], links_by_source: dict[str, list[TraceLink]]) -> None: + title = "Linked to L2 but not refined-by" + print(f"\n{title} ({len(items)})") + print("-" * len(f"{title} ({len(items)})")) + if not items: + print("None") + return + + for item in items: + links = links_by_source[item.artifact_id] + target_summary = ", ".join( + f"{link.target_id} ({link.relationship})" for link in links + ) + print(f"- {item.artifact_id} | {item.path} | {target_summary}") + + +def main() -> int: + args = parse_args() + repo = resolve_repo(args.repo) + repo_display = display_path(repo) + + l1_items = collect_l1_items(repo) + trace_links = collect_trace_links(repo) + + links_by_source: dict[str, list[TraceLink]] = {} 
+ for link in trace_links: + links_by_source.setdefault(link.source_id, []).append(link) + + unlinked = [ + item + for artifact_id, item in sorted(l1_items.items()) + if artifact_id not in links_by_source + ] + linked_not_refined = [ + item + for artifact_id, item in sorted(l1_items.items()) + if artifact_id in links_by_source + and not any( + link.relationship == "refined-by" for link in links_by_source[artifact_id] + ) + ] + refined = [ + item + for artifact_id, item in sorted(l1_items.items()) + if any(link.relationship == "refined-by" for link in links_by_source.get(artifact_id, [])) + ] + stale_trace_sources = sorted( + source_id for source_id in links_by_source if source_id not in l1_items + ) + + if args.json: + payload = { + "repo": repo_display, + "traceability_path": str(TRACEABILITY_PATH), + "counts": { + "l1_total": len(l1_items), + "refined": len(refined), + "linked_not_refined": len(linked_not_refined), + "unlinked": len(unlinked), + "stale_trace_sources": len(stale_trace_sources), + }, + "unlinked": [item_to_dict(item) for item in unlinked], + "linked_not_refined": [ + { + **item_to_dict(item), + "links": [ + link_to_dict(link) for link in links_by_source[item.artifact_id] + ], + } + for item in linked_not_refined + ], + "stale_trace_sources": stale_trace_sources, + } + print(json.dumps(payload, indent=2, sort_keys=True)) + return 0 + + print("L1 to L2 Traceability Gap Report") + print(f"Repository: {repo_display}") + print(f"Traceability matrix: {TRACEABILITY_PATH}") + print() + print(f"L1 total: {len(l1_items)}") + print(f"Refined by at least one L2 item: {len(refined)}") + print(f"Linked to L2 but not refined-by: {len(linked_not_refined)}") + print(f"No L2 link: {len(unlinked)}") + print(f"Trace rows pointing at missing L1 IDs: {len(stale_trace_sources)}") + + print_item_list("No L2 link", unlinked) + print_linked_only_related(linked_not_refined, links_by_source) + + if stale_trace_sources: + print("\nTrace rows pointing at missing L1 IDs") + 
print("-------------------------------------") + for source_id in stale_trace_sources: + print(f"- {source_id}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/specs/templates/spec-l1-requirement.md b/specs/templates/spec-l1-requirement.md new file mode 100644 index 00000000..3ced21d9 --- /dev/null +++ b/specs/templates/spec-l1-requirement.md @@ -0,0 +1,58 @@ +--- +artifact_id: L1-REQ-- +revision: 1 +status: Draft +active_baseline: no +supersedes: +superseded_by: +owner: +last_updated: YYYY-MM-DD +--- + +# L1-REQ-- + +## Purpose + +Describe the user or business need this requirement exists to satisfy. + +## Background / Context + +Describe the business context, users, constraints, assumptions, and relevant prior decisions. + +## User / Business Requirement + +State the requirement from the user or business perspective. + +## Functional Requirements + +- +- + +## Non-Functional Requirements + +- + +## Acceptance Criteria + +- Given , when , then . +- Given , when , then . + +## Out of Scope + +- + +## Open Questions + +- + +## Traceability + +| Relationship | Target ID | Target Revision | Target Path | Rationale | +|---|---|---:|---|---| +| refined-by | L2-DES-- | 1 | specs/L2//.md | | + +## Revision Notes + +| Revision | Date | Author | Change Type | Notes | +|---:|---|---|---|---| +| 1 | YYYY-MM-DD | | Initial | Initial draft. 
| diff --git a/specs/traceability/l1_to_l2.md b/specs/traceability/l1_to_l2.md new file mode 100644 index 00000000..f7febc58 --- /dev/null +++ b/specs/traceability/l1_to_l2.md @@ -0,0 +1,151 @@ +# L1 to L2 Traceability Matrix + +| Source ID | Source Path | Target ID | Target Path | Relationship | Rationale | +|---|---|---|---|---|---| +| L1-REQ-AGENT-001 | specs/L1/L1-REQ-AGENT-001-execution-workflow.md | L2-DES-AGENT-001 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | refined-by | The agent execution engine design refines the workflow from accepted user input through context assembly, model invocation, tool dispatch, and terminal outcome. | +| L1-REQ-AGENT-001 | specs/L1/L1-REQ-AGENT-001-execution-workflow.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol exposes execution lifecycle events and requests to clients. | +| L1-REQ-AGENT-001 | specs/L1/L1-REQ-AGENT-001-execution-workflow.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | The session JSONL data model preserves execution history for review and recovery. | +| L1-REQ-AGENT-002 | specs/L1/L1-REQ-AGENT-002-interrupt-resume.md | L2-DES-AGENT-002 | specs/L2/agent/L2-DES-AGENT-002-interrupt-resume-control.md | refined-by | The interrupt and resume control design refines server-owned cancellation, inspection, cleanup, and continuation behavior. | +| L1-REQ-AGENT-002 | specs/L1/L1-REQ-AGENT-002-interrupt-resume.md | L2-DES-AGENT-001 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | related-to | Interrupt and resume act on execution engine runtime state. | +| L1-REQ-AGENT-002 | specs/L1/L1-REQ-AGENT-002-interrupt-resume.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol exposes interrupt, resume, active-work inspection, and background process stop methods. 
| +| L1-REQ-AGENT-002 | specs/L1/L1-REQ-AGENT-002-interrupt-resume.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | The session JSONL data model preserves interrupted and resumed turn state. | +| L1-REQ-AGENT-002 | specs/L1/L1-REQ-AGENT-002-interrupt-resume.md | L2-DES-CONTEXT-001 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | related-to | Interrupt state informs the consolidated change-signal message before the next user input. | +| L1-REQ-AGENT-003 | specs/L1/L1-REQ-AGENT-003-task-planning.md | L2-DES-TOOL-001 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | refined-by | The built-in tool system defines the plan tool as visible to-do state for task planning and execution progress. | +| L1-REQ-AGENT-003 | specs/L1/L1-REQ-AGENT-003-task-planning.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol exposes plan updates to subscribed clients. | +| L1-REQ-AGENT-003 | specs/L1/L1-REQ-AGENT-003-task-planning.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | The session JSONL data model preserves visible plan state for replay. | +| L1-REQ-AGENT-004 | specs/L1/L1-REQ-AGENT-004-subagents.md | L2-DES-TOOL-001 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | related-to | The built-in tool system treats subagent coordination as a delegation tool category where enabled. | +| L1-REQ-APP-001 | specs/L1/L1-REQ-APP-001-client-server-arch.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | refined-by | The client-server protocol design refines shared runtime transport, event, and process ownership behavior. | +| L1-REQ-APP-002 | specs/L1/L1-REQ-APP-002-persistence.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | refined-by | The session JSONL data model refines durable history, replay, and recovery behavior. 
| +| L1-REQ-APP-002 | specs/L1/L1-REQ-APP-002-persistence.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol design defines reconnect and catch-up behavior from durable state. | +| L1-REQ-APP-004 | specs/L1/L1-REQ-APP-004-observability.md | L2-DES-APP-004 | specs/L2/app/L2-DES-APP-004-observability-architecture.md | refined-by | The observability architecture refines structured logs, log levels, diagnostics, trace-mode controls, correlation, redaction, retention, and telemetry boundaries. | +| L1-REQ-APP-004 | specs/L1/L1-REQ-APP-004-observability.md | L2-DES-LLM-003 | specs/L2/llm/L2-DES-LLM-003-model-usage-observability.md | related-to | Model usage and stream observability provide model-specific diagnostic data used by the application observability design. | +| L1-REQ-APP-004 | specs/L1/L1-REQ-APP-004-observability.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol exposes safe diagnostics, usage updates, context updates, tool status, and error reports to clients. | +| L1-REQ-APP-004 | specs/L1/L1-REQ-APP-004-observability.md | L2-DES-AGENT-001 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | related-to | The execution engine supplies phase and lifecycle capture points for observability. | +| L1-REQ-APP-005 | specs/L1/L1-REQ-APP-005-lightweight.md | L2-DES-APP-001 | specs/L2/app/L2-DES-APP-001-memory-efficient-rust-data-models.md | refined-by | The memory-efficient Rust data model design refines the lightweight operation requirement. | +| L1-REQ-APP-007 | specs/L1/L1-REQ-APP-007-tui.md | L2-DES-TUI-002 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | refined-by | The modern TUI shell layout refines inline/fullscreen terminal structure, core regions, active-work visibility, and responsive layout. 
| +| L1-REQ-APP-007 | specs/L1/L1-REQ-APP-007-tui.md | L2-DES-TUI-003 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | related-to | Composer, command discovery, and input modes provide the terminal input surface. | +| L1-REQ-APP-007 | specs/L1/L1-REQ-APP-007-tui.md | L2-DES-TUI-004 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | related-to | Streaming transcript and state rendering make active work understandable. | +| L1-REQ-APP-007 | specs/L1/L1-REQ-APP-007-tui.md | L2-DES-TUI-005 | specs/L2/tui/L2-DES-TUI-005-terminal-lifecycle-safety.md | related-to | Terminal lifecycle safety defines inline scrollback preservation and safe exit behavior. | +| L1-REQ-APP-007 | specs/L1/L1-REQ-APP-007-tui.md | L2-DES-TUI-001 | specs/L2/tui/L2-DES-TUI-001-onboarding-ui-flow.md | related-to | Onboarding is one of the required TUI flows. | +| L1-REQ-APP-007 | specs/L1/L1-REQ-APP-007-tui.md | L2-DES-TUI-006 | specs/L2/tui/L2-DES-TUI-006-full-transcript-alternate-screen.md | related-to | Full transcript alternate-screen review is one of the TUI fullscreen workflows. | +| L1-REQ-APP-008 | specs/L1/L1-REQ-APP-008-mcp.md | L2-DES-MCP-001 | specs/L2/mcp/L2-DES-MCP-001-mcp-integration-architecture.md | refined-by | The MCP integration architecture refines configured servers, discovery, status, safety, and error handling. | +| L1-REQ-APP-008 | specs/L1/L1-REQ-APP-008-mcp.md | L2-DES-APP-005 | specs/L2/app/L2-DES-APP-005-config-toml-schema.md | related-to | The config and auth schema defines the persisted MCP server configuration and credential reference shape. | +| L1-REQ-APP-009 | specs/L1/L1-REQ-APP-009-skills.md | L2-DES-SKILLS-001 | specs/L2/skills/L2-DES-SKILLS-001-agent-skills-architecture.md | refined-by | The Agent Skills architecture refines discovery, activation, progressive disclosure, trust, and visibility behavior. 
| +| L1-REQ-APP-009 | specs/L1/L1-REQ-APP-009-skills.md | L2-DES-APP-005 | specs/L2/app/L2-DES-APP-005-config-toml-schema.md | related-to | The TOML schema defines the persisted skill enablement and discovery-root shape. | +| L1-REQ-APP-010 | specs/L1/L1-REQ-APP-010-configuration.md | L2-DES-APP-002 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | refined-by | The configuration precedence design refines configuration file locations, project-over-user precedence, and onboarding persistence. | +| L1-REQ-APP-010 | specs/L1/L1-REQ-APP-010-configuration.md | L2-DES-APP-005 | specs/L2/app/L2-DES-APP-005-config-toml-schema.md | refined-by | The config and auth schema design refines the concrete persistent configuration and credential file formats. | +| L1-REQ-APP-010 | specs/L1/L1-REQ-APP-010-configuration.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol design defines configuration inspection and update methods. | +| L1-REQ-APP-011 | specs/L1/L1-REQ-APP-011-error-recovery.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol design defines error, retry, and recovery event fields. | +| L1-REQ-APP-012 | specs/L1/L1-REQ-APP-012-privacy-data-ownership.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol design defines export, deletion, and safe projection behavior. | +| L1-REQ-AGENT-005 | specs/L1/L1-REQ-AGENT-005-plan-mode.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol design constrains question prompts to Plan Mode. | +| L1-REQ-AGENT-005 | specs/L1/L1-REQ-AGENT-005-plan-mode.md | L2-DES-TOOL-001 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | related-to | The built-in tool system enforces Plan Mode tool gating for questions and mutating tools. 
| +| L1-REQ-AGENT-005 | specs/L1/L1-REQ-AGENT-005-plan-mode.md | L2-DES-CONTEXT-001 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | refined-by | Context assembly defines plan as an interaction_mode value with a mode-specific prompt set and consolidated change signal. | +| L1-REQ-CHANGE-001 | specs/L1/L1-REQ-CHANGE-001-rollback-and-recovery.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | The session JSONL data model defines turn checkpoints and restore result records. | +| L1-REQ-CHANGE-001 | specs/L1/L1-REQ-CHANGE-001-rollback-and-recovery.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol design defines workspace restoration events for superseded turns. | +| L1-REQ-CLIENT-001 | specs/L1/L1-REQ-CLIENT-001-localization-readiness.md | L2-DES-CLIENT-001 | specs/L2/client/L2-DES-CLIENT-001-localization-readiness.md | refined-by | The localization-readiness design refines Unicode, IME, display-width, non-ASCII path, diagnostics, and future UI-string structure. | +| L1-REQ-CLIENT-001 | specs/L1/L1-REQ-CLIENT-001-localization-readiness.md | L2-DES-TUI-002 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | related-to | TUI layout must account for Unicode and localized text width. | +| L1-REQ-CLIENT-001 | specs/L1/L1-REQ-CLIENT-001-localization-readiness.md | L2-DES-TUI-003 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | related-to | Composer input must preserve Unicode and IME text. | +| L1-REQ-CLIENT-001 | specs/L1/L1-REQ-CLIENT-001-localization-readiness.md | L2-DES-TUI-004 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | related-to | Transcript and streaming cells must preserve localized and non-ASCII content. 
| +| L1-REQ-CONV-001 | specs/L1/L1-REQ-CONV-001-session-lifecycle.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | refined-by | The session JSONL data model refines durable session lifecycle storage. | +| L1-REQ-CONV-001 | specs/L1/L1-REQ-CONV-001-session-lifecycle.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol design defines session open, subscribe, and reconnect behavior. | +| L1-REQ-CONV-002 | specs/L1/L1-REQ-CONV-002-turn-lifecycle.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | refined-by | The session JSONL data model refines turn and item lifecycle persistence. | +| L1-REQ-CONV-002 | specs/L1/L1-REQ-CONV-002-turn-lifecycle.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol design defines live turn event notifications. | +| L1-REQ-CONV-003 | specs/L1/L1-REQ-CONV-003-active-turn-message-handling.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | refined-by | The client-server protocol design defines steer and queue submissions and events. | +| L1-REQ-CONV-003 | specs/L1/L1-REQ-CONV-003-active-turn-message-handling.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | The session JSONL data model defines durable steer and queue records. | +| L1-REQ-CONV-004 | specs/L1/L1-REQ-CONV-004-session-forking.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | refined-by | The session JSONL data model defines fork references and retained inherited history. | +| L1-REQ-CONV-004 | specs/L1/L1-REQ-CONV-004-session-forking.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol design defines session fork and deletion behavior. 
| +| L1-REQ-CONV-005 | specs/L1/L1-REQ-CONV-005-immediate-message-editing.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | refined-by | The session JSONL data model defines append-only edit records and replacement turn references. | +| L1-REQ-CONV-005 | specs/L1/L1-REQ-CONV-005-immediate-message-editing.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol design defines immediate previous message edit requests and events. | +| L1-REQ-CONV-005 | specs/L1/L1-REQ-CONV-005-immediate-message-editing.md | L2-DES-TUI-006 | specs/L2/tui/L2-DES-TUI-006-full-transcript-alternate-screen.md | related-to | The full transcript overlay exposes previous-message selection and composer restore behavior. | +| L1-REQ-CONTEXT-001 | specs/L1/L1-REQ-CONTEXT-001-management.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | The session JSONL data model defines active context snapshots as references into transcript and metadata. | +| L1-REQ-CONTEXT-001 | specs/L1/L1-REQ-CONTEXT-001-management.md | L2-DES-CONTEXT-001 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | related-to | Context assembly produces the model-visible context from immutable prefix, metadata-derived content, and change-signal messages. | +| L1-REQ-CONTEXT-001 | specs/L1/L1-REQ-CONTEXT-001-management.md | L2-DES-CONTEXT-002 | specs/L2/context/L2-DES-CONTEXT-002-context-compaction.md | related-to | Compaction is the primary mechanism for managing context growth across long sessions. | +| L1-REQ-CONTEXT-001 | specs/L1/L1-REQ-CONTEXT-001-management.md | L2-DES-CONTEXT-003 | specs/L2/context/L2-DES-CONTEXT-003-context-normalization.md | related-to | Normalization ensures context is well-formed and bounded before model invocation. 
| +| L1-REQ-CONTEXT-002 | specs/L1/L1-REQ-CONTEXT-002-normalize.md | L2-DES-CONTEXT-003 | specs/L2/context/L2-DES-CONTEXT-003-context-normalization.md | refined-by | Defines the three-pass normalization pipeline: modality filter, item size bounds, and token-budget reduction. | +| L1-REQ-CONTEXT-003 | specs/L1/L1-REQ-CONTEXT-003-compress.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | The session JSONL data model defines compaction outputs as durable summary records referenced by context snapshots. | +| L1-REQ-CONTEXT-003 | specs/L1/L1-REQ-CONTEXT-003-compress.md | L2-DES-CONTEXT-002 | specs/L2/context/L2-DES-CONTEXT-002-context-compaction.md | refined-by | Context compaction defines triggers, eligibility, summary content, durable recording, and context snapshot updates. | +| L1-REQ-INPUT-001 | specs/L1/L1-REQ-INPUT-001-attachments-and-multimodal.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | The session JSONL data model defines item content parts and mentions for attachments and multimodal input. | +| L1-REQ-INPUT-001 | specs/L1/L1-REQ-INPUT-001-attachments-and-multimodal.md | L2-DES-CONTEXT-003 | specs/L2/context/L2-DES-CONTEXT-003-context-normalization.md | related-to | Multimodal content parts are subject to modality filtering during normalization. | +| L1-REQ-EDIT-001 | specs/L1/L1-REQ-EDIT-001-file-editing-workflow.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | Structured file-editing tool records support superseded-turn restoration. | +| L1-REQ-EDIT-001 | specs/L1/L1-REQ-EDIT-001-file-editing-workflow.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | File restoration outcomes are reported through protocol events. 
| +| L1-REQ-GOAL-001 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | L2-DES-GOAL-001 | specs/L2/goal/L2-DES-GOAL-001-ralph-loop-goals.md | refined-by | The Ralph Loop goal design refines durable goal state, statuses, budget accounting, autonomous continuation, model-tool limits, and client controls. | +| L1-REQ-GOAL-001 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol exposes user-owned goal controls and canonical goal events to clients. | +| L1-REQ-GOAL-001 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | The session JSONL data model preserves goal events for replay and recovery. | +| L1-REQ-GOAL-001 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | L2-DES-TOOL-001 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | related-to | The built-in tool system defines the narrow model-facing goal update tool for verified completion and blockers. | +| L1-REQ-GOAL-001 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | L2-DES-AGENT-001 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | related-to | Goal-driven continuation turns run through the normal execution engine and provide budget-accounting signals. | +| L1-REQ-GOAL-001 | specs/L1/L1-REQ-GOAL-001-ralph-loop.md | L2-DES-CONTEXT-001 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | related-to | Context assembly injects hidden goal context without creating user-visible transcript turns. | +| L1-REQ-GIT-001 | specs/L1/L1-REQ-GIT-001-change-management.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | Hidden git checkpoints may support turn-level restore records. 
| +| L1-REQ-GIT-001 | specs/L1/L1-REQ-GIT-001-change-management.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | Git checkpoint use remains internal while protocol exposes restore results. | +| L1-REQ-MEM-001 | specs/L1/L1-REQ-MEM-001-persistent-memory.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | The session JSONL data model may store internal persistent-memory provenance links, but clients do not manage memory entries. | +| L1-REQ-LLM-001 | specs/L1/L1-REQ-LLM-001-token-efficiency.md | L2-DES-CONTEXT-001 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | refined-by | Context assembly defines immutable prefix, append-only metadata changes, and consolidated change-signal for cache-friendly context. | +| L1-REQ-LLM-001 | specs/L1/L1-REQ-LLM-001-token-efficiency.md | L2-DES-CONTEXT-002 | specs/L2/context/L2-DES-CONTEXT-002-context-compaction.md | related-to | Compaction uses append-only summary records to avoid prefix mutation while reducing token usage. | +| L1-REQ-LLM-001 | specs/L1/L1-REQ-LLM-001-token-efficiency.md | L2-DES-CONTEXT-003 | specs/L2/context/L2-DES-CONTEXT-003-context-normalization.md | related-to | Token-budget pass and size bounding prevent wasted tokens from oversized items. | +| L1-REQ-LLM-002 | specs/L1/L1-REQ-LLM-002-tools.md | L2-DES-TOOL-001 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | related-to | The built-in tool system defines the controlled lifecycle for model-requested tools. | +| L1-REQ-LLM-003 | specs/L1/L1-REQ-LLM-003-observability.md | L2-DES-LLM-003 | specs/L2/llm/L2-DES-LLM-003-model-usage-observability.md | refined-by | The model usage observability design refines usage metrics, context pressure, measured versus estimated values, unavailable values, and trace-mode stream records. 
| +| L1-REQ-LLM-003 | specs/L1/L1-REQ-LLM-003-observability.md | L2-DES-APP-004 | specs/L2/app/L2-DES-APP-004-observability-architecture.md | related-to | The application observability architecture defines logging, trace-mode, privacy, retention, and telemetry boundaries. | +| L1-REQ-LLM-003 | specs/L1/L1-REQ-LLM-003-observability.md | L2-DES-AGENT-001 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | related-to | Model invocation observability is emitted during execution engine model-call phases. | +| L1-REQ-LLM-003 | specs/L1/L1-REQ-LLM-003-observability.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | Durable usage summaries are stored with session data for replay and later inspection. | +| L1-REQ-LLM-004 | specs/L1/L1-REQ-LLM-004-persona.md | L2-DES-CONTEXT-001 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | refined-by | Context assembly defines persona as metadata-derived instruction with append-only change signaling. | +| L1-REQ-MODEL-001 | specs/L1/L1-REQ-MODEL-001-config.md | L2-DES-MODEL-001 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | refined-by | The model provider binding design refines supported models, user providers, and invocable model bindings. | +| L1-REQ-MODEL-001 | specs/L1/L1-REQ-MODEL-001-config.md | L2-DES-APP-002 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | related-to | The configuration precedence design defines how persisted model provider and binding records are resolved. | +| L1-REQ-MODEL-001 | specs/L1/L1-REQ-MODEL-001-config.md | L2-DES-APP-005 | specs/L2/app/L2-DES-APP-005-config-toml-schema.md | related-to | The config and auth schema defines persisted model binding, credential reference, and default-selection fields. 
| +| L1-REQ-MODEL-001 | specs/L1/L1-REQ-MODEL-001-config.md | L2-DES-CONTEXT-003 | specs/L2/context/L2-DES-CONTEXT-003-context-normalization.md | related-to | Model capability metadata drives modality filtering decisions during normalization. | +| L1-REQ-MODEL-002 | specs/L1/L1-REQ-MODEL-002-provider.md | L2-DES-MODEL-001 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | refined-by | The model provider binding design refines user-defined provider records and generated provider identifiers. | +| L1-REQ-MODEL-002 | specs/L1/L1-REQ-MODEL-002-provider.md | L2-DES-APP-002 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | related-to | The configuration precedence design defines how persisted provider records are resolved. | +| L1-REQ-MODEL-002 | specs/L1/L1-REQ-MODEL-002-provider.md | L2-DES-APP-005 | specs/L2/app/L2-DES-APP-005-config-toml-schema.md | related-to | The config and auth schema defines persisted provider fields and `auth.json` credential entries. | +| L1-REQ-MODEL-003 | specs/L1/L1-REQ-MODEL-003-onboard.md | L2-DES-MODEL-001 | specs/L2/model/L2-DES-MODEL-001-model-provider-binding.md | refined-by | The model provider binding design defines the records created by onboarding. | +| L1-REQ-MODEL-003 | specs/L1/L1-REQ-MODEL-003-onboard.md | L2-DES-APP-002 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | related-to | The configuration precedence design defines how onboarding-created configuration is persisted. | +| L1-REQ-MODEL-003 | specs/L1/L1-REQ-MODEL-003-onboard.md | L2-DES-APP-005 | specs/L2/app/L2-DES-APP-005-config-toml-schema.md | related-to | The config and auth schema defines where onboarding-created providers, bindings, and credentials are written. 
| +| L1-REQ-REVIEW-001 | specs/L1/L1-REQ-REVIEW-001-code-review.md | L2-DES-CONTEXT-001 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | refined-by | Context assembly defines review as an interaction_mode value with a mode-specific prompt set sharing the mode field. | +| L1-REQ-TOOL-001 | specs/L1/L1-REQ-TOOL-001-safety.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol design defines tool redaction and safety event fields. | +| L1-REQ-TOOL-001 | specs/L1/L1-REQ-TOOL-001-safety.md | L2-DES-TOOL-001 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | related-to | The built-in tool system applies safety, approval, redaction, and output limits to tool calls. | +| L1-REQ-TOOL-002 | specs/L1/L1-REQ-TOOL-002-tools.md | L2-DES-TOOL-001 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | refined-by | The built-in tool system refines tool categories, registry, lifecycle, mode gating, and plan tool behavior. | +| L1-REQ-TOOL-002 | specs/L1/L1-REQ-TOOL-002-tools.md | L2-DES-AGENT-001 | specs/L2/agent/L2-DES-AGENT-001-execution-engine.md | related-to | The execution engine dispatches model-requested tools through the tool supervisor. | +| L1-REQ-TOOL-002 | specs/L1/L1-REQ-TOOL-002-tools.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol exposes tool lifecycle and plan update events. | +| L1-REQ-TOOL-002 | specs/L1/L1-REQ-TOOL-002-tools.md | L2-DES-CONV-001 | specs/L2/conv/L2-DES-CONV-001-session-jsonl-data-model.md | related-to | The session JSONL data model preserves tool calls, tool results, and plan state. | +| L1-REQ-TOOL-003 | specs/L1/L1-REQ-TOOL-003-web-search-configuration.md | L2-DES-TOOL-001 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | related-to | The built-in tool system treats web search as a configurable tool category with unavailable-state behavior. 
| +| L1-REQ-TOOL-004 | specs/L1/L1-REQ-TOOL-004-parallel-tool-orchestration.md | L2-DES-TOOL-001 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | related-to | The built-in tool system defines `multi_tool_use` as explicit parallel orchestration without bypassing child tool validation. | +| L1-REQ-TOOL-005 | specs/L1/L1-REQ-TOOL-005-background-process-management.md | L2-DES-AGENT-002 | specs/L2/agent/L2-DES-AGENT-002-interrupt-resume-control.md | related-to | The interrupt and resume design includes active work inspection and tracked background process cleanup. | +| L1-REQ-TOOL-005 | specs/L1/L1-REQ-TOOL-005-background-process-management.md | L2-DES-APP-003 | specs/L2/app/L2-DES-APP-003-client-server-protocol.md | related-to | The client-server protocol exposes active work inspection and tracked background process stop methods. | +| L1-REQ-TOOL-005 | specs/L1/L1-REQ-TOOL-005-background-process-management.md | L2-DES-TOOL-001 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | related-to | The built-in tool system defines background process tools as tracked command tool outputs. | +| L1-REQ-TUI-001 | specs/L1/L1-REQ-TUI-001-composer.md | L2-DES-TUI-003 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | refined-by | The composer and input modes design refines text entry, multi-line input, submission, command discovery, and mode-specific handling. | +| L1-REQ-TUI-001 | specs/L1/L1-REQ-TUI-001-composer.md | L2-DES-CLIENT-001 | specs/L2/client/L2-DES-CLIENT-001-localization-readiness.md | related-to | Unicode, IME, grapheme, and display-width handling constrain composer implementation. | +| L1-REQ-TUI-002 | specs/L1/L1-REQ-TUI-002-streaming.md | L2-DES-TUI-004 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | refined-by | The streaming transcript design refines assistant, reasoning, tool, approval, question, background process, and Markdown streaming behavior. 
| +| L1-REQ-TUI-003 | specs/L1/L1-REQ-TUI-003-transcript.md | L2-DES-TUI-004 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | refined-by | The streaming transcript design refines transcript cells, durable/live reconciliation, scrolling review, folding, and failure display. | +| L1-REQ-TUI-003 | specs/L1/L1-REQ-TUI-003-transcript.md | L2-DES-TUI-006 | specs/L2/tui/L2-DES-TUI-006-full-transcript-alternate-screen.md | refined-by | The full transcript alternate-screen design refines pager review, live-tail sync, and transcript edit-preview behavior. | +| L1-REQ-TUI-003 | specs/L1/L1-REQ-TUI-003-transcript.md | L2-DES-TUI-002 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | related-to | The shell layout defines the transcript viewport. | +| L1-REQ-TUI-003 | specs/L1/L1-REQ-TUI-003-transcript.md | L2-DES-CLIENT-001 | specs/L2/client/L2-DES-CLIENT-001-localization-readiness.md | related-to | Transcript rendering must preserve localized and non-ASCII content. | +| L1-REQ-TUI-004 | specs/L1/L1-REQ-TUI-004-state-visibility.md | L2-DES-TUI-004 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | refined-by | The streaming transcript design refines visible state mapping for idle, running, waiting, failed, interrupted, completed, and background process states. | +| L1-REQ-TUI-004 | specs/L1/L1-REQ-TUI-004-state-visibility.md | L2-DES-TUI-002 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | related-to | The shell layout defines the header, active work strip, and bottom status regions. | +| L1-REQ-TUI-004 | specs/L1/L1-REQ-TUI-004-state-visibility.md | L2-DES-TUI-003 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | related-to | The composer design defines input mode visibility in the bottom status line. 
| +| L1-REQ-TUI-005 | specs/L1/L1-REQ-TUI-005-terminal-lifecycle-safety.md | L2-DES-TUI-005 | specs/L2/tui/L2-DES-TUI-005-terminal-lifecycle-safety.md | refined-by | The terminal lifecycle design refines startup, inline mode, alternate-screen mode, interrupt handling, cleanup, terminal restore, and shell prompt handoff. | +| L1-REQ-TUI-005 | specs/L1/L1-REQ-TUI-005-terminal-lifecycle-safety.md | L2-DES-TUI-002 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | related-to | The shell layout defines the live region that terminal cleanup must manage. | +| L1-REQ-TUI-005 | specs/L1/L1-REQ-TUI-005-terminal-lifecycle-safety.md | L2-DES-TUI-006 | specs/L2/tui/L2-DES-TUI-006-full-transcript-alternate-screen.md | related-to | The full transcript overlay uses alternate-screen entry, cleanup, and restore behavior. | +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-003 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | refined-by | The composer and input modes design refines slash-command trigger behavior, list height, keyboard navigation, row styling, and the initial command list. | +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-CMD-001 | specs/L2/tui/slash-commands/L2-DES-TUI-CMD-001-theme.md | refined-by | Defines the `/theme` command behavior. | +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-CMD-002 | specs/L2/tui/slash-commands/L2-DES-TUI-CMD-002-model.md | refined-by | Defines the `/model` command behavior. | +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-CMD-003 | specs/L2/tui/slash-commands/L2-DES-TUI-CMD-003-compact.md | refined-by | Defines the `/compact` command behavior. | +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-CMD-004 | specs/L2/tui/slash-commands/L2-DES-TUI-CMD-004-resume.md | refined-by | Defines the `/resume` command behavior. 
| +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-CMD-005 | specs/L2/tui/slash-commands/L2-DES-TUI-CMD-005-new.md | refined-by | Defines the `/new` command behavior. | +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-CMD-006 | specs/L2/tui/slash-commands/L2-DES-TUI-CMD-006-status.md | refined-by | Defines the `/status` command behavior. | +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-CMD-007 | specs/L2/tui/slash-commands/L2-DES-TUI-CMD-007-permissions.md | refined-by | Defines the `/permissions` command behavior. | +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-CMD-008 | specs/L2/tui/slash-commands/L2-DES-TUI-CMD-008-clear.md | refined-by | Defines the `/clear` command behavior. | +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-CMD-009 | specs/L2/tui/slash-commands/L2-DES-TUI-CMD-009-onboard.md | refined-by | Defines the `/onboard` command behavior. | +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-CMD-010 | specs/L2/tui/slash-commands/L2-DES-TUI-CMD-010-goal.md | refined-by | Defines the `/goal` command behavior. | +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-CMD-011 | specs/L2/tui/slash-commands/L2-DES-TUI-CMD-011-btw.md | refined-by | Defines the `/btw` command behavior. | +| L1-REQ-TUI-006 | specs/L1/L1-REQ-TUI-006-command-discovery-control.md | L2-DES-TUI-CMD-012 | specs/L2/tui/slash-commands/L2-DES-TUI-CMD-012-exit.md | refined-by | Defines the `/exit` command behavior. | +| L1-REQ-TUI-007 | specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md | L2-DES-TUI-002 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | refined-by | The modern TUI shell layout refines responsive priorities, narrow layout, non-overlap rules, and graceful degradation. 
| +| L1-REQ-TUI-007 | specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md | L2-DES-TUI-003 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | related-to | Composer and bottom status line layout affect responsiveness. | +| L1-REQ-TUI-007 | specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md | L2-DES-TUI-004 | specs/L2/tui/L2-DES-TUI-004-streaming-transcript-and-state.md | related-to | Streaming and transcript cells must remain stable across resize and narrow widths. | +| L1-REQ-TUI-007 | specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md | L2-DES-TUI-006 | specs/L2/tui/L2-DES-TUI-006-full-transcript-alternate-screen.md | related-to | The full transcript overlay rebuilds transcript projection on width changes and supports pager navigation for long content. | +| L1-REQ-TUI-007 | specs/L1/L1-REQ-TUI-007-responsive-layout-readability.md | L2-DES-CLIENT-001 | specs/L2/client/L2-DES-CLIENT-001-localization-readiness.md | related-to | Display-width aware rendering is required for Unicode and localized readability. | +| L1-REQ-TUI-008 | specs/L1/L1-REQ-TUI-008-terminal-command-prefix.md | L2-DES-TUI-003 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | refined-by | The composer and input modes design refines leading `!` Shell Mode entry, escaping, command submission, result display, and safety boundaries. | +| L1-REQ-TUI-008 | specs/L1/L1-REQ-TUI-008-terminal-command-prefix.md | L2-DES-TOOL-001 | specs/L2/tool/L2-DES-TOOL-001-built-in-tool-system.md | related-to | Shell Mode routes execution through the built-in command execution capability. | +| L1-REQ-TUI-009 | specs/L1/L1-REQ-TUI-009-session-input-modes.md | L2-DES-TUI-003 | specs/L2/tui/L2-DES-TUI-003-composer-and-input-modes.md | refined-by | The composer and input modes design refines Default Input Mode, Shell Mode, Plan Mode, labels, and Plan Mode submission behavior. 
| +| L1-REQ-TUI-009 | specs/L1/L1-REQ-TUI-009-session-input-modes.md | L2-DES-TUI-002 | specs/L2/tui/L2-DES-TUI-002-modern-tui-shell-layout.md | related-to | The shell layout defines the bottom status line where non-default input modes are displayed. | +| L1-REQ-TUI-009 | specs/L1/L1-REQ-TUI-009-session-input-modes.md | L2-DES-CONTEXT-001 | specs/L2/context/L2-DES-CONTEXT-001-context-assembly.md | related-to | Plan Mode input is serialized as an interaction mode for context assembly. | +| L1-REQ-WORKSPACE-001 | specs/L1/L1-REQ-WORKSPACE-001-project-context.md | L2-DES-WORKSPACE-001 | specs/L2/workspace/L2-DES-WORKSPACE-001-project-instruction-discovery.md | related-to | Instruction file discovery provides the local project instructions required by workspace context. | +| L1-REQ-WORKSPACE-002 | specs/L1/L1-REQ-WORKSPACE-002-project-instruction-files.md | L2-DES-WORKSPACE-001 | specs/L2/workspace/L2-DES-WORKSPACE-001-project-instruction-discovery.md | refined-by | Defines the concrete discovery algorithm, filename priority, global-instruction support, configuration, and refresh behavior. | +| L1-REQ-TUI-010 | specs/L1/L1-REQ-TUI-010-onboarding-ui.md | L2-DES-TUI-001 | specs/L2/tui/L2-DES-TUI-001-onboarding-ui-flow.md | refined-by | The TUI onboarding UI flow design refines the user-facing onboarding requirement into concrete terminal interactions. | +| L1-REQ-TUI-010 | specs/L1/L1-REQ-TUI-010-onboarding-ui.md | L2-DES-APP-002 | specs/L2/app/L2-DES-APP-002-configuration-precedence.md | related-to | The configuration precedence design defines the persistence target for successful TUI onboarding results. | +| L1-REQ-TUI-010 | specs/L1/L1-REQ-TUI-010-onboarding-ui.md | L2-DES-APP-005 | specs/L2/app/L2-DES-APP-005-config-toml-schema.md | related-to | The config and auth schema defines the persisted fields and credentials produced by successful TUI onboarding. 
| diff --git a/specs/traceability/l2_to_l3.md b/specs/traceability/l2_to_l3.md new file mode 100644 index 00000000..e69de29b diff --git a/specs/traceability/l3_to_impl.md b/specs/traceability/l3_to_impl.md new file mode 100644 index 00000000..e69de29b diff --git a/specs/traceability/verification.md b/specs/traceability/verification.md new file mode 100644 index 00000000..e69de29b