From afac41e128f28b06ce31c10ae8e5832f114ad741 Mon Sep 17 00:00:00 2001 From: Hunter B Date: Sat, 6 Jun 2026 02:00:58 -0700 Subject: [PATCH] test(whaleflow): replay dogfood workflow from recorded trace --- CHANGELOG.md | 5 +- crates/tui/CHANGELOG.md | 5 +- crates/whaleflow/src/starlark_authoring.rs | 189 ++++++++++++++++++++- docs/V0_9_0_RELEASE_ACCEPTANCE.md | 2 +- 4 files changed, 197 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 986853607..61e4d7198 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -111,7 +111,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 WhaleFlow mock/runtime foundations, explicit external-memory boundaries, and docs alignment. Live workflow execution, provider calls, TraceStore writes, and mutation-oriented GUI endpoints remain deferred until their atomicity and - replay contracts are tested. + replay contracts are tested. The `rlm_cache_change.star` dogfood workflow can + now be replayed from recorded mock leaf/control records, and missing dogfood + records produce `ReplayDiverged` instead of falling back to live execution + (#2679). Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction. - Added a state-store v2 schema migration for WhaleFlow trace tables covering workflow, branch, leaf, control-node, and teacher-candidate runs. The diff --git a/crates/tui/CHANGELOG.md b/crates/tui/CHANGELOG.md index 986853607..61e4d7198 100644 --- a/crates/tui/CHANGELOG.md +++ b/crates/tui/CHANGELOG.md @@ -111,7 +111,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 WhaleFlow mock/runtime foundations, explicit external-memory boundaries, and docs alignment. Live workflow execution, provider calls, TraceStore writes, and mutation-oriented GUI endpoints remain deferred until their atomicity and - replay contracts are tested. + replay contracts are tested. 
The `rlm_cache_change.star` dogfood workflow can
+  now be replayed from recorded mock leaf/control records, and missing dogfood
+  records produce `ReplayDiverged` instead of falling back to live execution
+  (#2679).
   Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
 - Added a state-store v2 schema migration for WhaleFlow trace tables covering
   workflow, branch, leaf, control-node, and teacher-candidate runs. The
diff --git a/crates/whaleflow/src/starlark_authoring.rs b/crates/whaleflow/src/starlark_authoring.rs
index d930ac557..ff44bdc52 100644
--- a/crates/whaleflow/src/starlark_authoring.rs
+++ b/crates/whaleflow/src/starlark_authoring.rs
@@ -416,7 +416,13 @@ fn workflow_builtins(builder: &mut GlobalsBuilder) {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::{AgentType, ControlNodeKind, MockWorkflowExecutor, WorkflowRunStatus};
+    use std::collections::BTreeMap; // backs the resolved-output map built by the replay helpers
+
+    use crate::{
+        AgentType, ControlNodeKind, LeafResult, MockWorkflowExecutor, ReplayControlRecord,
+        ReplayLeafRecord, WorkflowReplayExecutor, WorkflowReplayTrace, WorkflowRunStatus,
+        compute_leaf_input_hash,
+    };
 
     #[test]
     fn starlark_compiles_to_ir() {
@@ -464,6 +470,187 @@ mod tests {
         );
     }
 
+    #[test] // Runs the dogfood workflow on the mock executor, then replays the trace built from that run.
+    fn rlm_cache_change_workflow_replays_from_recorded_mock_trace() {
+        let source = include_str!("../../../workflows/rlm_cache_change.star");
+        let workflow = compile_starlark_workflow("rlm_cache_change.star", source)
+            .expect("example should compile");
+        let execution = MockWorkflowExecutor::new()
+            .with_predicate_results("implement-until-tests-pass", vec![true]) // presumably the loop predicate passes on its first check — confirm
+            .run(&workflow)
+            .expect("dogfood workflow should run with mock leaves");
+        let trace = replay_trace_from_execution("trace-rlm-cache", &workflow, &execution); // mock run -> WorkflowReplayTrace
+
+        let replayed = WorkflowReplayExecutor::new(trace)
+            .run(&workflow)
+            .expect("recorded dogfood trace should replay");
+
+        assert_eq!(replayed.status, WorkflowRunStatus::Succeeded);
+        assert!(
+            replayed
+                .leaf_results
+                .iter()
+                .any(|result| result.leaf_id == "regression-tests") // this leaf must appear in the replayed results
+        );
+        assert!(
+            replayed
+                .control_node_results
+                .iter()
+                .any(|result| result.node_id == "teacher-review") // control node must appear in the replayed results
+        );
+        assert!(
+            replayed
+                .control_node_results
+                .iter()
+                .any(|result| result.node_id == "summarize-cache-change") // control node must appear in the replayed results
+        );
+    }
+
+    #[test] // Removes one recorded leaf from the trace and expects the replay to report ReplayDiverged.
+    fn rlm_cache_change_replay_diverges_when_record_missing() {
+        let source = include_str!("../../../workflows/rlm_cache_change.star");
+        let workflow = compile_starlark_workflow("rlm_cache_change.star", source)
+            .expect("example should compile");
+        let execution = MockWorkflowExecutor::new()
+            .with_predicate_results("implement-until-tests-pass", vec![true]) // same mock setup as the test above
+            .run(&workflow)
+            .expect("dogfood workflow should run with mock leaves");
+        let mut trace =
+            replay_trace_from_execution("trace-rlm-cache-missing", &workflow, &execution);
+        trace
+            .leaf_records
+            .retain(|record| record.leaf_id != "regression-tests"); // drop every record for the "regression-tests" leaf
+
+        let replayed = WorkflowReplayExecutor::new(trace)
+            .run(&workflow)
+            .expect("missing dogfood leaf record should be a replay result"); // run() must still return Ok
+
+        assert_eq!(replayed.status, WorkflowRunStatus::ReplayDiverged);
+        assert!(replayed.leaf_results.iter().any(|result| {
+            result.leaf_id == "regression-tests"
+                && result.status == WorkflowRunStatus::ReplayDiverged // the dropped leaf itself is marked diverged
+        }));
+    }
+
+    fn replay_trace_from_execution(
+        trace_id: &str,
+        workflow: &WorkflowSpec,
+        execution: &crate::WorkflowExecution,
+    ) -> WorkflowReplayTrace { // Test helper: builds a replay trace from a finished execution.
+        let mut resolved_outputs = BTreeMap::new(); // leaf id -> output, filled in by collect_leaf_records
+        let mut leaf_records = Vec::new();
+        collect_leaf_records(
+            trace_id,
+            workflow,
+            &workflow.nodes,
+            &execution.leaf_results,
+            &mut resolved_outputs,
+            &mut leaf_records,
+        );
+        let control_records = execution
+            .control_node_results
+            .iter()
+            .cloned() // own each control result so it can be moved into its record
+            .map(|result| ReplayControlRecord {
+                trace_id: trace_id.to_string(),
+                node_id: result.node_id.clone(),
+                kind: result.kind,
+                result, // moved last, after node_id and kind were read from it
+                generated_nodes: Vec::new(), // always recorded as empty by this helper
+            })
+            .collect();
+
+        WorkflowReplayTrace {
+            trace_id: trace_id.to_string(),
+
leaf_records,
+            control_records,
+        }
+    }
+
+    fn collect_leaf_records(
+        trace_id: &str,
+        workflow: &WorkflowSpec,
+        nodes: &[WorkflowNode],
+        results: &[LeafResult],
+        resolved_outputs: &mut BTreeMap>, // NOTE(review): generic arguments appear to have been stripped in extraction — restore from the repository
+        records: &mut Vec, // NOTE(review): element type appears stripped; the body pushes ReplayLeafRecord — confirm
+    ) { // Recursively walks `nodes`, pushing one ReplayLeafRecord per leaf in traversal order.
+        for node in nodes {
+            match node {
+                WorkflowNode::BranchSet(branch) => collect_leaf_records(
+                    trace_id,
+                    workflow,
+                    &branch.children,
+                    results,
+                    resolved_outputs,
+                    records,
+                ),
+                WorkflowNode::Leaf(leaf) => {
+                    let result = results
+                        .iter()
+                        .find(|result| result.leaf_id == leaf.id) // first recorded result carrying this leaf id
+                        .expect("mock execution should record every declared leaf") // panics when no result matches
+                        .clone();
+                    let resolved_inputs = leaf
+                        .depends_on_results
+                        .iter()
+                        .map(|dependency| {
+                            (
+                                dependency.clone(),
+                                resolved_outputs.get(dependency).cloned().unwrap_or(None), // None if the dependency is not in the map yet; NOTE(review): same as `.flatten()`
+                            )
+                        })
+                        .collect();
+                    records.push(ReplayLeafRecord {
+                        trace_id: trace_id.to_string(),
+                        leaf_id: leaf.id.clone(),
+                        input_hash: compute_leaf_input_hash(workflow, leaf, &resolved_inputs)
+                            .expect("leaf input hash should serialize"),
+                        result: result.clone(), // NOTE(review): second clone; only `result.output` is needed afterwards
+                    });
+                    resolved_outputs.insert(leaf.id.clone(), result.output); // later leaves can look this output up by leaf id
+                }
+                WorkflowNode::Sequence(sequence) => collect_leaf_records(
+                    trace_id,
+                    workflow,
+                    &sequence.children,
+                    results,
+                    resolved_outputs,
+                    records,
+                ),
+                WorkflowNode::LoopUntil(loop_until) => collect_leaf_records(
+                    trace_id,
+                    workflow,
+                    &loop_until.children,
+                    results,
+                    resolved_outputs,
+                    records,
+                ),
+                WorkflowNode::Cond(cond) => { // both branches are walked: then_nodes first, then else_nodes
+                    collect_leaf_records(
+                        trace_id,
+                        workflow,
+                        &cond.then_nodes,
+                        results,
+                        resolved_outputs,
+                        records,
+                    );
+                    collect_leaf_records(
+                        trace_id,
+                        workflow,
+                        &cond.else_nodes,
+                        results,
+                        resolved_outputs,
+                        records,
+                    );
+                }
+                WorkflowNode::Expand(_)
+                | WorkflowNode::Reduce(_)
+                | WorkflowNode::TeacherReview(_) => {} // these node kinds add no leaf records here
+            }
+        }
+    }
+
     #[test]
     fn starlark_repair_loop() {
         let source = r#"
diff --git a/docs/V0_9_0_RELEASE_ACCEPTANCE.md b/docs/V0_9_0_RELEASE_ACCEPTANCE.md
index c63c138c5..f935578db 100644
--- a/docs/V0_9_0_RELEASE_ACCEPTANCE.md
+++ b/docs/V0_9_0_RELEASE_ACCEPTANCE.md @@ -59,7 +59,7 @@ config source, result, and follow-up issue or PR. | Gate | Owner | Ship/defer decision | Evidence | | --- | --- | --- | --- | -| WhaleFlow typed IR, mock executor, replay, TeacherReview, StudentReplay, and cutline docs are tested | WhaleFlow steward | ship | #2821/#2824/#2831/#2833/#2839/#2840/#2841 plus focused local `cargo test -p codewhale-whaleflow --locked`; #2670 closed after `cargo test -p codewhale-whaleflow starlark --locked` passed 7/7 on current stewardship head. | +| WhaleFlow typed IR, mock executor, replay, TeacherReview, StudentReplay, and cutline docs are tested | WhaleFlow steward | ship | #2821/#2824/#2831/#2833/#2839/#2840/#2841 plus focused local `cargo test -p codewhale-whaleflow --locked`; #2670 closed after `cargo test -p codewhale-whaleflow starlark --locked` passed 7/7 on current stewardship head. The `rlm_cache_change.star` dogfood workflow now has recorded mock-trace replay coverage, including a missing-record divergence check. | | Live `workflow_run`, worktree application, provider calls, and TraceStore writes are deferred until cancellation/replay/atomicity semantics pass | WhaleFlow steward | defer | #2669 and #2679 remain open for live runtime execution, provider calls, TraceStore writes, Arcee/student replay, and CLI/TUI workflow mode; current v0.9 branch ships mock executor/replay foundations only. | | Model Lab / Hugging Face MVP is included or deferred with release-note wording | model-lab steward | decide | | | HarnessProfile runtime MVP is deferred; schema/resolver foundation ships with release-note wording | harness steward | ship foundation / defer runtime | #2844 (`efbcc681a`) documents the cutline; `HarnessPosture` / `HarnessProfile` config schema and strict validation are present; a pure resolver matches provider/model routes without changing runtime behavior; seed-profile runtime selection, telemetry, and status display remain follow-up work. |