Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
WhaleFlow mock/runtime foundations, explicit external-memory boundaries, and
docs alignment. Live workflow execution, provider calls, TraceStore writes,
and mutation-oriented GUI endpoints remain deferred until their atomicity and
replay contracts are tested.
replay contracts are tested. The `rlm_cache_change.star` dogfood workflow can
now be replayed from recorded mock leaf/control records, and missing dogfood
records produce `ReplayDiverged` instead of falling back to live execution
(#2679).
Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
- Added a state-store v2 schema migration for WhaleFlow trace tables covering
workflow, branch, leaf, control-node, and teacher-candidate runs. The
Expand Down
5 changes: 4 additions & 1 deletion crates/tui/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
WhaleFlow mock/runtime foundations, explicit external-memory boundaries, and
docs alignment. Live workflow execution, provider calls, TraceStore writes,
and mutation-oriented GUI endpoints remain deferred until their atomicity and
replay contracts are tested.
replay contracts are tested. The `rlm_cache_change.star` dogfood workflow can
now be replayed from recorded mock leaf/control records, and missing dogfood
records produce `ReplayDiverged` instead of falling back to live execution
(#2679).
Thanks @AdityaVG13 for the WhaleFlow draft and cost-tracking direction.
- Added a state-store v2 schema migration for WhaleFlow trace tables covering
workflow, branch, leaf, control-node, and teacher-candidate runs. The
Expand Down
189 changes: 188 additions & 1 deletion crates/whaleflow/src/starlark_authoring.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,13 @@ fn workflow_builtins(builder: &mut GlobalsBuilder) {
#[cfg(test)]
mod tests {
use super::*;
use crate::{AgentType, ControlNodeKind, MockWorkflowExecutor, WorkflowRunStatus};
use std::collections::BTreeMap;

use crate::{
AgentType, ControlNodeKind, LeafResult, MockWorkflowExecutor, ReplayControlRecord,
ReplayLeafRecord, WorkflowReplayExecutor, WorkflowReplayTrace, WorkflowRunStatus,
compute_leaf_input_hash,
};

#[test]
fn starlark_compiles_to_ir() {
Expand Down Expand Up @@ -464,6 +470,187 @@ mod tests {
);
}

#[test]
fn rlm_cache_change_workflow_replays_from_recorded_mock_trace() {
let source = include_str!("../../../workflows/rlm_cache_change.star");
let workflow = compile_starlark_workflow("rlm_cache_change.star", source)
.expect("example should compile");
let execution = MockWorkflowExecutor::new()
.with_predicate_results("implement-until-tests-pass", vec![true])
.run(&workflow)
.expect("dogfood workflow should run with mock leaves");
let trace = replay_trace_from_execution("trace-rlm-cache", &workflow, &execution);

let replayed = WorkflowReplayExecutor::new(trace)
.run(&workflow)
.expect("recorded dogfood trace should replay");

assert_eq!(replayed.status, WorkflowRunStatus::Succeeded);
assert!(
replayed
.leaf_results
.iter()
.any(|result| result.leaf_id == "regression-tests")
);
assert!(
replayed
.control_node_results
.iter()
.any(|result| result.node_id == "teacher-review")
);
assert!(
replayed
.control_node_results
.iter()
.any(|result| result.node_id == "summarize-cache-change")
);
}

#[test]
fn rlm_cache_change_replay_diverges_when_record_missing() {
let source = include_str!("../../../workflows/rlm_cache_change.star");
let workflow = compile_starlark_workflow("rlm_cache_change.star", source)
.expect("example should compile");
let execution = MockWorkflowExecutor::new()
.with_predicate_results("implement-until-tests-pass", vec![true])
.run(&workflow)
.expect("dogfood workflow should run with mock leaves");
let mut trace =
replay_trace_from_execution("trace-rlm-cache-missing", &workflow, &execution);
trace
.leaf_records
.retain(|record| record.leaf_id != "regression-tests");

let replayed = WorkflowReplayExecutor::new(trace)
.run(&workflow)
.expect("missing dogfood leaf record should be a replay result");

assert_eq!(replayed.status, WorkflowRunStatus::ReplayDiverged);
assert!(replayed.leaf_results.iter().any(|result| {
result.leaf_id == "regression-tests"
&& result.status == WorkflowRunStatus::ReplayDiverged
}));
}

fn replay_trace_from_execution(
trace_id: &str,
workflow: &WorkflowSpec,
execution: &crate::WorkflowExecution,
) -> WorkflowReplayTrace {
let mut resolved_outputs = BTreeMap::new();
let mut leaf_records = Vec::new();
collect_leaf_records(
trace_id,
workflow,
&workflow.nodes,
&execution.leaf_results,
&mut resolved_outputs,
&mut leaf_records,
);
let control_records = execution
.control_node_results
.iter()
.cloned()
.map(|result| ReplayControlRecord {
trace_id: trace_id.to_string(),
node_id: result.node_id.clone(),
kind: result.kind,
result,
generated_nodes: Vec::new(),
})
.collect();

WorkflowReplayTrace {
trace_id: trace_id.to_string(),
leaf_records,
control_records,
}
}

fn collect_leaf_records(
trace_id: &str,
workflow: &WorkflowSpec,
nodes: &[WorkflowNode],
results: &[LeafResult],
resolved_outputs: &mut BTreeMap<String, Option<String>>,
records: &mut Vec<ReplayLeafRecord>,
) {
for node in nodes {
match node {
WorkflowNode::BranchSet(branch) => collect_leaf_records(
trace_id,
workflow,
&branch.children,
results,
resolved_outputs,
records,
),
WorkflowNode::Leaf(leaf) => {
let result = results
.iter()
.find(|result| result.leaf_id == leaf.id)
.expect("mock execution should record every declared leaf")
.clone();
Comment on lines +588 to +593
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Limitation in Trace Reconstruction for Loops / Multiple Executions

The collect_leaf_records helper statically traverses the workflow AST (&workflow.nodes) to reconstruct the replay trace.

Because it performs a static traversal:

  1. It will only visit each Leaf node once, even if that leaf is executed multiple times (e.g., inside a LoopUntil block with multiple iterations).
  2. The .find() call on line 590 will always retrieve the first execution result of that leaf, ignoring subsequent iterations.

This means any workflow containing loops that execute more than once will produce an incomplete or incorrect replay trace, leading to ReplayDiverged errors during replay. Consider refactoring this helper to map directly over the dynamic execution results (execution.leaf_results) and resolve their dependencies dynamically, or document this limitation if it is strictly intended for single-iteration test scenarios.

let resolved_inputs = leaf
.depends_on_results
.iter()
.map(|dependency| {
(
dependency.clone(),
resolved_outputs.get(dependency).cloned().unwrap_or(None),
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The expression .cloned().unwrap_or(None) on an Option<Option<T>> can be simplified more idiomatically using .flatten(). This avoids the explicit unwrap_or(None) call and improves readability.

                                resolved_outputs.get(dependency).cloned().flatten(),

)
})
.collect();
records.push(ReplayLeafRecord {
trace_id: trace_id.to_string(),
leaf_id: leaf.id.clone(),
input_hash: compute_leaf_input_hash(workflow, leaf, &resolved_inputs)
.expect("leaf input hash should serialize"),
result: result.clone(),
});
resolved_outputs.insert(leaf.id.clone(), result.output);
}
WorkflowNode::Sequence(sequence) => collect_leaf_records(
trace_id,
workflow,
&sequence.children,
results,
resolved_outputs,
records,
),
WorkflowNode::LoopUntil(loop_until) => collect_leaf_records(
trace_id,
workflow,
&loop_until.children,
results,
resolved_outputs,
records,
),
WorkflowNode::Cond(cond) => {
collect_leaf_records(
trace_id,
workflow,
&cond.then_nodes,
results,
resolved_outputs,
records,
);
collect_leaf_records(
trace_id,
workflow,
&cond.else_nodes,
results,
resolved_outputs,
records,
);
}
WorkflowNode::Expand(_)
| WorkflowNode::Reduce(_)
| WorkflowNode::TeacherReview(_) => {}
}
}
}

#[test]
fn starlark_repair_loop() {
let source = r#"
Expand Down
2 changes: 1 addition & 1 deletion docs/V0_9_0_RELEASE_ACCEPTANCE.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ config source, result, and follow-up issue or PR.

| Gate | Owner | Ship/defer decision | Evidence |
| --- | --- | --- | --- |
| WhaleFlow typed IR, mock executor, replay, TeacherReview, StudentReplay, and cutline docs are tested | WhaleFlow steward | ship | #2821/#2824/#2831/#2833/#2839/#2840/#2841 plus focused local `cargo test -p codewhale-whaleflow --locked`; #2670 closed after `cargo test -p codewhale-whaleflow starlark --locked` passed 7/7 on current stewardship head. |
| WhaleFlow typed IR, mock executor, replay, TeacherReview, StudentReplay, and cutline docs are tested | WhaleFlow steward | ship | #2821/#2824/#2831/#2833/#2839/#2840/#2841 plus focused local `cargo test -p codewhale-whaleflow --locked`; #2670 closed after `cargo test -p codewhale-whaleflow starlark --locked` passed 7/7 on current stewardship head. The `rlm_cache_change.star` dogfood workflow now has recorded mock-trace replay coverage, including a missing-record divergence check. |
| Live `workflow_run`, worktree application, provider calls, and TraceStore writes are deferred until cancellation/replay/atomicity semantics pass | WhaleFlow steward | defer | #2669 and #2679 remain open for live runtime execution, provider calls, TraceStore writes, Arcee/student replay, and CLI/TUI workflow mode; current v0.9 branch ships mock executor/replay foundations only. |
| Model Lab / Hugging Face MVP is included or deferred with release-note wording | model-lab steward | decide | |
| HarnessProfile runtime MVP is deferred; schema/resolver foundation ships with release-note wording | harness steward | ship foundation / defer runtime | #2844 (`efbcc681a`) documents the cutline; `HarnessPosture` / `HarnessProfile` config schema and strict validation are present; a pure resolver matches provider/model routes without changing runtime behavior; seed-profile runtime selection, telemetry, and status display remain follow-up work. |
Expand Down
Loading