From afac41e128f28b06ce31c10ae8e5832f114ad741 Mon Sep 17 00:00:00 2001
From: Hunter B <hmbown@gmail.com>
Date: Sat, 6 Jun 2026 02:00:58 -0700
Subject: [PATCH] test(whaleflow): replay dogfood workflow from recorded trace

---
 CHANGELOG.md                               |   5 +-
 crates/tui/CHANGELOG.md                    |   5 +-
 crates/whaleflow/src/starlark_authoring.rs | 189 ++++++++++++++++++++-
 docs/V0_9_0_RELEASE_ACCEPTANCE.md          |   2 +-
 4 files changed, 197 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 986853607..61e4d7198 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -111,7 +111,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   WhaleFlow mock/runtime foundations, explicit external-memory boundaries, and
   docs alignment. Live workflow execution, provider calls, TraceStore writes,
   and mutation-oriented GUI endpoints remain deferred until their atomicity and
-  replay contracts are tested.
+  replay contracts are tested. The `rlm_cache_change.star` dogfood workflow can
+  now be replayed from recorded mock leaf/control records, and missing dogfood
+  records produce `ReplayDiverged` instead of falling back to live execution
+  (#2679).
   Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
 - Added a state-store v2 schema migration for WhaleFlow trace tables covering
   workflow, branch, leaf, control-node, and teacher-candidate runs. The
diff --git a/crates/tui/CHANGELOG.md b/crates/tui/CHANGELOG.md
index 986853607..61e4d7198 100644
--- a/crates/tui/CHANGELOG.md
+++ b/crates/tui/CHANGELOG.md
@@ -111,7 +111,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   WhaleFlow mock/runtime foundations, explicit external-memory boundaries, and
   docs alignment. Live workflow execution, provider calls, TraceStore writes,
   and mutation-oriented GUI endpoints remain deferred until their atomicity and
-  replay contracts are tested.
+  replay contracts are tested. The `rlm_cache_change.star` dogfood workflow can
+  now be replayed from recorded mock leaf/control records, and missing dogfood
+  records produce `ReplayDiverged` instead of falling back to live execution
+  (#2679).
   Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
 - Added a state-store v2 schema migration for WhaleFlow trace tables covering
   workflow, branch, leaf, control-node, and teacher-candidate runs. The
diff --git a/crates/whaleflow/src/starlark_authoring.rs b/crates/whaleflow/src/starlark_authoring.rs
index d930ac557..ff44bdc52 100644
--- a/crates/whaleflow/src/starlark_authoring.rs
+++ b/crates/whaleflow/src/starlark_authoring.rs
@@ -416,7 +416,13 @@ fn workflow_builtins(builder: &mut GlobalsBuilder) {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::{AgentType, ControlNodeKind, MockWorkflowExecutor, WorkflowRunStatus};
+    use std::collections::BTreeMap;
+
+    use crate::{
+        AgentType, ControlNodeKind, LeafResult, MockWorkflowExecutor, ReplayControlRecord,
+        ReplayLeafRecord, WorkflowReplayExecutor, WorkflowReplayTrace, WorkflowRunStatus,
+        compute_leaf_input_hash,
+    };
 
     #[test]
     fn starlark_compiles_to_ir() {
@@ -464,6 +470,187 @@ mod tests {
         );
     }
 
+    /// A trace derived from one mock run must replay to success with the regression-test
+    /// leaf and both recorded control nodes present in the replayed results.
+    #[test]
+    fn rlm_cache_change_workflow_replays_from_recorded_mock_trace() {
+        // Record: compile the dogfood workflow and execute it once with mock leaves.
+        let workflow = compile_starlark_workflow(
+            "rlm_cache_change.star",
+            include_str!("../../../workflows/rlm_cache_change.star"),
+        )
+        .expect("example should compile");
+        let recorded = MockWorkflowExecutor::new()
+            .with_predicate_results("implement-until-tests-pass", vec![true])
+            .run(&workflow)
+            .expect("dogfood workflow should run with mock leaves");
+
+        // Replay: feed the derived trace back through the replay executor.
+        let trace = replay_trace_from_execution("trace-rlm-cache", &workflow, &recorded);
+        let replayed = WorkflowReplayExecutor::new(trace)
+            .run(&workflow)
+            .expect("recorded dogfood trace should replay");
+
+        // Presence checks by id over the replayed leaf and control-node results.
+        let has_leaf = |id: &str| replayed.leaf_results.iter().any(|leaf| leaf.leaf_id == id);
+        let has_control = |id: &str| {
+            replayed
+                .control_node_results
+                .iter()
+                .any(|node| node.node_id == id)
+        };
+
+        assert_eq!(replayed.status, WorkflowRunStatus::Succeeded);
+        assert!(has_leaf("regression-tests"));
+        assert!(has_control("teacher-review"));
+        assert!(has_control("summarize-cache-change"));
+    }
+
+    #[test]
+    fn rlm_cache_change_replay_diverges_when_record_missing() {
+        let dropped_leaf = "regression-tests";
+        let source = include_str!("../../../workflows/rlm_cache_change.star");
+        let workflow = compile_starlark_workflow("rlm_cache_change.star", source)
+            .expect("example should compile");
+        let mocked = MockWorkflowExecutor::new()
+            .with_predicate_results("implement-until-tests-pass", vec![true])
+            .run(&workflow)
+            .expect("dogfood workflow should run with mock leaves");
+        // Drop the recorded leaf so the replay executor has no record to serve for it.
+        let mut trace = replay_trace_from_execution("trace-rlm-cache-missing", &workflow, &mocked);
+        trace
+            .leaf_records
+            .retain(|record| record.leaf_id != dropped_leaf);
+        let replayed = WorkflowReplayExecutor::new(trace)
+            .run(&workflow)
+            .expect("missing dogfood leaf record should be a replay result");
+
+        // Both the run as a whole and the affected leaf must report the divergence.
+        assert_eq!(replayed.status, WorkflowRunStatus::ReplayDiverged);
+        assert!(replayed.leaf_results.iter().any(|leaf| {
+            leaf.status == WorkflowRunStatus::ReplayDiverged && leaf.leaf_id == dropped_leaf
+        }));
+    }
+
+    /// Derives a replay trace from a mock execution: leaf records come from walking the
+    /// workflow tree, and every recorded control-node result becomes one control record.
+    fn replay_trace_from_execution(
+        trace_id: &str,
+        workflow: &WorkflowSpec,
+        execution: &crate::WorkflowExecution,
+    ) -> WorkflowReplayTrace {
+        let mut leaf_records = Vec::new();
+        collect_leaf_records(
+            trace_id,
+            workflow,
+            &workflow.nodes,
+            &execution.leaf_results,
+            &mut BTreeMap::new(),
+            &mut leaf_records,
+        );
+        let control_records = execution
+            .control_node_results
+            .iter()
+            .map(|recorded| ReplayControlRecord {
+                trace_id: trace_id.to_owned(),
+                node_id: recorded.node_id.clone(),
+                kind: recorded.kind,
+                result: recorded.clone(),
+                generated_nodes: Vec::new(),
+            })
+            .collect();
+
+        WorkflowReplayTrace {
+            trace_id: trace_id.to_owned(),
+            leaf_records,
+            control_records,
+        }
+    }
+
+    /// Walks `nodes` depth-first and pushes one `ReplayLeafRecord` per declared leaf, threading
+    /// each leaf's output through `resolved_outputs` so later leaves hash their resolved inputs.
+    /// Expand, reduce, and teacher-review nodes contribute no leaf records.
+    fn collect_leaf_records(
+        trace_id: &str,
+        workflow: &WorkflowSpec,
+        nodes: &[WorkflowNode],
+        results: &[LeafResult],
+        resolved_outputs: &mut BTreeMap<String, Option<String>>,
+        records: &mut Vec<ReplayLeafRecord>,
+    ) {
+        for node in nodes {
+            match node {
+                WorkflowNode::BranchSet(branch) => collect_leaf_records(
+                    trace_id,
+                    workflow,
+                    &branch.children,
+                    results,
+                    resolved_outputs,
+                    records,
+                ),
+                WorkflowNode::Leaf(leaf) => {
+                    let result = results
+                        .iter()
+                        .find(|recorded| recorded.leaf_id == leaf.id)
+                        .expect("mock execution should record every declared leaf");
+                    let resolved_inputs = leaf
+                        .depends_on_results
+                        .iter()
+                        .map(|dependency| {
+                            let output = resolved_outputs.get(dependency).cloned().flatten();
+                            (dependency.clone(), output)
+                        })
+                        .collect();
+                    records.push(ReplayLeafRecord {
+                        trace_id: trace_id.to_string(),
+                        leaf_id: leaf.id.clone(),
+                        input_hash: compute_leaf_input_hash(workflow, leaf, &resolved_inputs)
+                            .expect("leaf input hash should serialize"),
+                        result: result.clone(),
+                    });
+                    resolved_outputs.insert(leaf.id.clone(), result.output.clone());
+                }
+                WorkflowNode::Sequence(sequence) => collect_leaf_records(
+                    trace_id,
+                    workflow,
+                    &sequence.children,
+                    results,
+                    resolved_outputs,
+                    records,
+                ),
+                WorkflowNode::LoopUntil(loop_until) => collect_leaf_records(
+                    trace_id,
+                    workflow,
+                    &loop_until.children,
+                    results,
+                    resolved_outputs,
+                    records,
+                ),
+                WorkflowNode::Cond(cond) => {
+                    collect_leaf_records(
+                        trace_id,
+                        workflow,
+                        &cond.then_nodes,
+                        results,
+                        resolved_outputs,
+                        records,
+                    );
+                    collect_leaf_records(
+                        trace_id,
+                        workflow,
+                        &cond.else_nodes,
+                        results,
+                        resolved_outputs,
+                        records,
+                    );
+                }
+                WorkflowNode::Expand(_)
+                | WorkflowNode::Reduce(_)
+                | WorkflowNode::TeacherReview(_) => {}
+            }
+        }
+    }
+
     #[test]
     fn starlark_repair_loop() {
         let source = r#"
diff --git a/docs/V0_9_0_RELEASE_ACCEPTANCE.md b/docs/V0_9_0_RELEASE_ACCEPTANCE.md
index c63c138c5..f935578db 100644
--- a/docs/V0_9_0_RELEASE_ACCEPTANCE.md
+++ b/docs/V0_9_0_RELEASE_ACCEPTANCE.md
@@ -59,7 +59,7 @@ config source, result, and follow-up issue or PR.
 
 | Gate | Owner | Ship/defer decision | Evidence |
 | --- | --- | --- | --- |
-| WhaleFlow typed IR, mock executor, replay, TeacherReview, StudentReplay, and cutline docs are tested | WhaleFlow steward | ship | #2821/#2824/#2831/#2833/#2839/#2840/#2841 plus focused local `cargo test -p codewhale-whaleflow --locked`; #2670 closed after `cargo test -p codewhale-whaleflow starlark --locked` passed 7/7 on current stewardship head. |
+| WhaleFlow typed IR, mock executor, replay, TeacherReview, StudentReplay, and cutline docs are tested | WhaleFlow steward | ship | #2821/#2824/#2831/#2833/#2839/#2840/#2841 plus focused local `cargo test -p codewhale-whaleflow --locked`; #2670 closed after `cargo test -p codewhale-whaleflow starlark --locked` passed 7/7 on current stewardship head. The `rlm_cache_change.star` dogfood workflow now has recorded mock-trace replay coverage, including a missing-record divergence check. |
 | Live `workflow_run`, worktree application, provider calls, and TraceStore writes are deferred until cancellation/replay/atomicity semantics pass | WhaleFlow steward | defer | #2669 and #2679 remain open for live runtime execution, provider calls, TraceStore writes, Arcee/student replay, and CLI/TUI workflow mode; current v0.9 branch ships mock executor/replay foundations only. |
 | Model Lab / Hugging Face MVP is included or deferred with release-note wording | model-lab steward | decide |  |
 | HarnessProfile runtime MVP is deferred; schema/resolver foundation ships with release-note wording | harness steward | ship foundation / defer runtime | #2844 (`efbcc681a`) documents the cutline; `HarnessPosture` / `HarnessProfile` config schema and strict validation are present; a pure resolver matches provider/model routes without changing runtime behavior; seed-profile runtime selection, telemetry, and status display remain follow-up work. |