awf-project · pocky · Apr 13, 2026 · Apr 13, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- **F082**: Human-readable streaming output for agent steps — when running with `awf run --output streaming`, agent responses now display as clean text instead of raw NDJSON; `output_format` field controls filtering (text/none formats filter NDJSON, `json` format passes through raw); buffered mode (`--output buffered`) displays filtered text in post-execution summary; raw NDJSON always preserved in `state.Output` for template interpolation; `--output silent` remains silent regardless of `output_format`; per-provider extractors implemented for Claude (parses `content_block_delta` events) with stubs for Gemini/Codex/OpenCode
 - **F081**: Model validation by prefix/pattern for Gemini and Codex providers — Gemini validates that `model` starts with `gemini-` (enables use of any Gemini model without CLI updates); Codex validates `model` against prefixes `gpt-`, `codex-`, or o-series pattern (`o` followed by digit, e.g., `o1`, `o3-mini`); validation errors include format guidance to guide correction
 - **F078**: OpenCode `--model` flag support — `model` option in workflow YAML now passed as `--model <value>` to OpenCode CLI in both `Execute` and `ExecuteConversation`; OpenCode always passes `--format json` for structured output
 - **F077**: `dangerously_skip_permissions` support for Gemini (`--approval-mode=yolo`) and Codex (`--yolo`) providers — unified permission bypass key works across all three agent providers (Claude, Gemini, Codex)

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -217,8 +217,6 @@ func TestWorkflowValidation(t *testing.T) {
 
 ## Architecture Rules
 
-- Document discovered runtime bugs in .specify/implementation/ISSUE/bug/ directory before implementation; prevents scope creep and enables separate tracking from test fixes
-- Own timeout responsibility in application layer via context.WithTimeout; infrastructure adapters must respect context cancellation without enforcing additional timeouts
 - Evaluate step transitions before fallback behaviors; transitions take priority over OnSuccess, OnFailure, and ContinueOnError (ADR-001)
 - Use pointer types (*T) for optional config fields in infrastructure types; apply defaults during mapping to distinguish omitted from explicit zero values
 - Implement private per-provider extraction methods (no shared interface) when output formats diverge fundamentally; avoids premature abstraction and enables independent testing
@@ -240,6 +238,10 @@ func TestWorkflowValidation(t *testing.T) {
 - Synchronize provider CLI flag changes across both implementation files and central options configuration (options.go); verify declarations and validation rules align
 - When extracting shared infrastructure behavior across multiple provider implementations, apply the delegation pattern uniformly; partial refactoring creates inconsistent ownership
 
+- When wiring optional transformations across multiple execution paths (ExecuteConversation, runWorkflow, etc.), apply them consistently to all paths; a missing stub in any path indicates incomplete cross-layer wiring
+
+- When adding hook fields to shared infrastructure types, implement them (stubs are acceptable for future providers) across all concrete providers in the same layer; a missing implementation in any provider blocks deployment
+
 ## Common Pitfalls
 
 - Never block on I/O without context support; use goroutine+channel+select with buffered channel (cap 1) to enable graceful cancellation
@@ -286,8 +288,6 @@ func TestWorkflowValidation(t *testing.T) {
 
 ## Test Conventions
 
-- Write unit tests for prompt file validation, interpolation, and YAML mapping before integration tests; use table-driven tests for path resolution scenarios
-- Never use switch statements to populate table-driven test variables; declare all fields in struct literals to prevent silent zero-value failures from missed case names
 - Write table-driven tests for inline error object parsing (message + status validation) before integration tests; use yamlStep.OnFailure field as 'any' type in test fixtures to validate both string and object forms
 - Use distinct file naming for unit vs integration tests: *_unit_test.go vs *_test.go; prevents error analysis tools from reporting incorrect file scopes
 - Never hardcode OS-specific values in test assertions (usernames, paths, shell names); use `os/user.Current()` or mock dependencies for reproducible tests across environments
@@ -307,5 +307,9 @@ func TestWorkflowValidation(t *testing.T) {
 - When flipping integration test assertions for newly-enabled features, transition from 'not configured' errors to provider-level implementation errors; verify assertions change state, not disappear
 - Create separate test files for delegation patterns (*_delegation_test.go) to validate shared behavior independently from provider-specific unit tests
 
+- When adding fields to internal state types (DisplayOutput, cache fields, etc.), write explicit tests verifying the field is NOT resolvable in template interpolation context; prevents accidental exposure of implementation details
+
+- Add BenchmarkXxx functions for new I/O processing components; measure throughput and memory allocation, and verify capacity constraints (1MB buffer, etc.) are respected
+
 ## Review Standards
 
diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@ A Go CLI tool for orchestrating AI agents (Claude, Gemini, Codex, OpenAI-Compati
 - **State Machine Execution** - Define workflows as state machines with conditional transitions based on exit codes, command output, or custom expressions
 - **Inline Error Handling** - Specify error messages and exit codes directly on steps without creating separate terminal states
 - **Agent Steps** - Invoke AI agents via CLI tools (Claude, Codex, Gemini) or direct HTTP (OpenAI, Ollama, vLLM, Groq) with prompt templates, response parsing, and accurate token tracking
-- **Output Formatting for Agent Steps** - Automatically strip markdown code fences and validate JSON output
+- **Output Formatting for Agent Steps** - Automatically strip markdown code fences and validate JSON output; human-readable streaming display controlled by `output_format` field (text vs raw NDJSON)
 - **External Prompt Files** - Load agent prompts from `.md` files with full template interpolation, helper functions, and local override support
 - **External Script Files** - Load commands from external script files with shebang-based interpreter dispatch, template interpolation, path resolution, and local override support
 - **Conversation Mode** - Multi-turn conversations with native session resume for CLI providers (`claude`, `codex`, `gemini`, `opencode`), automatic context window management for HTTP providers, mid-conversation context injection via `inject_context` field, and token tracking across all turns

diff --git a/docs/README.md b/docs/README.md
@@ -30,6 +30,7 @@ Learn how to use AWF effectively:
 - [Interactive Input Collection](user-guide/interactive-inputs.md) - Automatic prompting for missing workflow inputs
 - [Agent Steps](user-guide/agent-steps.md) - Invoke AI agents via CLI (Claude, Codex, Gemini) or HTTP APIs (OpenAI, Ollama, vLLM, Groq)
   - [Output Formatting](user-guide/agent-steps.md#output-formatting) - Automatic code fence stripping and JSON validation (`output_format: json|text`)
+  - [Streaming Output Display](user-guide/agent-steps.md#streaming-output-display) - Human-readable filtered output for `--output streaming` and `--output buffered` modes
   - [External Prompt Files](user-guide/agent-steps.md#external-prompt-files) - Load prompts from Markdown files with template interpolation
   - [Model Validation](user-guide/agent-steps.md#model-validation) - Provider-specific model name validation (Claude, Gemini, Codex)
 - [Conversation Mode](user-guide/conversation-steps.md) - Multi-turn conversations with native session resume for CLI providers and context window management

diff --git a/docs/user-guide/agent-steps.md b/docs/user-guide/agent-steps.md
@@ -540,6 +540,11 @@ process_response:
 
 ## Output Formatting
 
+The `output_format` field serves two purposes:
+
+1. **Post-processing**: Strips markdown code fences and optionally validates JSON (F065)
+2. **Display filtering**: Controls how agent responses appear on terminal during streaming and buffered execution (F082)
+
 When an agent wraps its output in markdown code fences (common with many LLMs), use `output_format` to automatically strip the fences and optionally validate the content:
 
 ```yaml
@@ -645,6 +650,61 @@ analyze:
   on_success: next
 ```
 
+### Streaming Output Display
+
+The `output_format` field also controls how agent responses appear on the terminal when running with `awf run --output streaming` or `--output buffered`:
+
+| `output_format` | Streaming Display | Buffered Display | Raw Storage |
+|---|---|---|---|
+| `text` (or omitted) | Human-readable filtered text | Filtered text in summary | Raw NDJSON |
+| `json` | Raw NDJSON (unfiltered) | Raw NDJSON (unfiltered) | Raw NDJSON |
+
+#### Streaming Mode (`--output streaming`)
+
+When running with streaming output, agent responses display incrementally as they're generated:
+
+```bash
+# Raw NDJSON appears on terminal when the step sets output_format: json (hard to read)
+awf run code-review --output streaming
+# Output: {"type":"content_block_delta",...}{"type":"content_block_delta",...}
+
+# Human-readable text with default output_format
+awf run code-review --output streaming  # output_format: text (or omitted)
+# Output: The code has several issues...
+```
+
+**Filtering behavior:**
+- `output_format: text` or omitted — Extracted text content displayed (filtered NDJSON)
+- `output_format: json` — Raw NDJSON passed through unchanged
+
+#### Buffered Mode (`--output buffered`)
+
+When running with buffered output, the post-execution summary displays filtered text:
+
+```bash
+awf run code-review --output buffered
+
+# With output_format: text (or omitted):
+# Output of "analyze" step:
+# The code has several issues...
+
+# With output_format: json:
+# Output of "analyze" step:
+# {"type":"content_block_delta",...}
+```
+
+#### Silent Mode (`--output silent`)
+
+Silent mode suppresses all display regardless of `output_format`:
+
+```bash
+awf run code-review --output silent
+# No output displayed (silent mode is absolute)
+# state.Output still contains raw NDJSON for template interpolation
+```
+
+**Note:** `state.Output` always contains the raw NDJSON regardless of display filtering. Filtering only affects terminal display, not data storage.
+
 ### Error Handling
 
 When `output_format: json` is specified but the output is invalid JSON:

diff --git a/docs/user-guide/commands.md b/docs/user-guide/commands.md
@@ -163,8 +163,10 @@ awf run <workflow> [flags]
 | Mode | Description |
 |------|-------------|
 | `silent` | No command output displayed (default) |
-| `streaming` | Real-time output with [OUT]/[ERR] prefixes |
-| `buffered` | Show output after each step completes |
+| `streaming` | Real-time output with [OUT]/[ERR] prefixes; for agent steps, displays human-readable text (or raw NDJSON if `output_format: json`) |
+| `buffered` | Show output after each step completes; for agent steps, displays filtered text in post-execution summary (or raw NDJSON if `output_format: json`) |
+
+**Note:** For agent steps, the `output_format` field controls display filtering: `text` or omitted (default) shows human-readable output; `json` shows raw NDJSON. See [Streaming Output Display](agent-steps.md#streaming-output-display) for details.
 
 ### Examples
 

diff --git a/internal/application/conversation_manager.go b/internal/application/conversation_manager.go
@@ -221,10 +221,10 @@ func (m *ConversationManager) ExecuteConversation(
 		return nil, err
 	}
 
-	options := step.Agent.Options
-	if options == nil {
-		options = make(map[string]any)
-	}
+	// Clone options to preserve FR-009 immutability of step.Agent.Options,
+	// and inject output_format so baseCLIProvider can route display filtering
+	// identically between executeAgentStep and conversation mode (F082).
+	options := cloneAndInjectOutputFormat(step.Agent.Options, step.Agent.OutputFormat)
 	if step.Agent.SystemPrompt != "" {
 		options["system_prompt"] = step.Agent.SystemPrompt
 	}

diff --git a/internal/application/execution_service.go b/internal/application/execution_service.go
@@ -2034,7 +2034,8 @@ func (s *ExecutionService) executeAgentStep(
 
 	// Execute the agent
 	s.logger.Debug("executing agent step", "step", step.Name, "provider", resolvedProvider)
-	result, execErr := provider.Execute(stepCtx, resolvedPrompt, step.Agent.Options, s.stdoutWriter, s.stderrWriter)
+	opts := cloneAndInjectOutputFormat(step.Agent.Options, step.Agent.OutputFormat)
+	result, execErr := provider.Execute(stepCtx, resolvedPrompt, opts, s.stdoutWriter, s.stderrWriter)
 
 	// Record step state
 	state := workflow.StepState{
@@ -2047,6 +2048,7 @@ func (s *ExecutionService) executeAgentStep(
 	// Populate state from result
 	if result != nil {
 		state.Output = result.Output
+		state.DisplayOutput = result.DisplayOutput
 		// AC5: JSON auto-parsed to states.step_name.Response
 		state.Response = result.Response
 		// AC6: Token usage in states.step_name.tokens_used
@@ -2194,6 +2196,7 @@ func (s *ExecutionService) executeConversationStep(
 
 	if result != nil {
 		state.Output = result.Output
+		state.DisplayOutput = result.DisplayOutput
 		state.Response = result.Response
 		state.TokensUsed = result.TokensTotal
 		state.Conversation = result.State
@@ -2226,6 +2229,26 @@ func (s *ExecutionService) executeConversationStep(
 	return s.resolveNextStep(step, intCtx, true)
 }
 
+// cloneAndInjectOutputFormat shallow-clones opts and injects output_format as string.
+// The original map is never mutated (FR-009). Precedence: an explicit
+// options["output_format"] set by the user wins (display-only intent); otherwise
+// the top-level step.Agent.OutputFormat is injected; otherwise defaults to text.
+// This keeps F065 post-processing (top-level) decoupled from F082 display intent (options).
+func cloneAndInjectOutputFormat(opts map[string]any, format workflow.OutputFormat) map[string]any {
+	cloned := make(map[string]any, len(opts)+2)
+	for k, v := range opts {
+		cloned[k] = v
+	}
+	if _, userSet := cloned["output_format"]; userSet {
+		return cloned
+	}
+	if format == workflow.OutputFormatNone {
+		format = workflow.OutputFormatText
+	}
+	cloned["output_format"] = string(format)
+	return cloned
+}
+
 // resolveOperationInputs resolves all string values in operation inputs via interpolation.
 func (s *ExecutionService) resolveOperationInputs(
 	inputs map[string]any,