From 83fd5b7cd36e1b462094902e6ccc3eb949c47f6d Mon Sep 17 00:00:00 2001 From: robert Date: Fri, 26 Jun 2026 17:33:57 +1000 Subject: [PATCH 1/6] Let agent signal step failure via a tagged block A Claude CLI run always exits 0, so a user-requested failure condition ("fail the deployment if the health check is red") was undetectable from the outside. Add an octopus-fail-deployment skill that has the agent emit an block, and have ClaudeAgentOutcomeEvaluator scan the result for it and fail the step with the captured reason. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../ClaudeAgentOutcomeEvaluatorFixture.cs | 113 ++++++++++++++++++ .../ClaudeAgentOutcomeEvaluator.cs | 20 ++++ .../Skills/octopus-fail-deployment.md | 31 +++++ 3 files changed, 164 insertions(+) create mode 100644 source/Calamari.AiAgent/ClaudeCodeBehaviour/DefaultContext/Skills/octopus-fail-deployment.md diff --git a/source/Calamari.AiAgent.Tests/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluatorFixture.cs b/source/Calamari.AiAgent.Tests/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluatorFixture.cs index 9f2447af3..8dfceb47a 100644 --- a/source/Calamari.AiAgent.Tests/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluatorFixture.cs +++ b/source/Calamari.AiAgent.Tests/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluatorFixture.cs @@ -77,4 +77,117 @@ public void ExitZero_SuccessSubtype_WithPermissionDenials_Throws_NamingDeniedToo act.Should().Throw() .Which.Message.Should().Contain("denied permission").And.Contain("Bash").And.Contain("WebFetch"); } + + [Test] + public void FailureSignal_WithReason_Throws_IncludingReason() + { + var result = new ResultStreamEvent + { + Subtype = "success", + IsError = false, + Result = "Smoke test returned HTTP 500 from /health after 3 retries.", + }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().Throw() + .Which.Message.Should().Contain("step should fail").And.Contain("Smoke test returned HTTP 500"); + } + + [Test] + public void 
FailureSignal_WithMultiLineReason_Throws_PreservingReason() + { + var result = new ResultStreamEvent + { + Subtype = "success", + IsError = false, + Result = "\nHealth check failed:\n- /health returned 500\n- /ready timed out\n", + }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().Throw() + .Which.Message.Should().Contain("/health returned 500").And.Contain("/ready timed out"); + } + + [Test] + public void FailureSignal_EmptyBlock_Throws_WithGenericMessage() + { + var result = new ResultStreamEvent { Subtype = "success", IsError = false, Result = "" }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().Throw().WithMessage("*step should fail*"); + } + + [Test] + public void FailureSignal_SelfClosing_Throws_WithGenericMessage() + { + var result = new ResultStreamEvent { Subtype = "success", IsError = false, Result = "" }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().Throw().WithMessage("*step should fail*"); + } + + [Test] + public void FailureSignal_WithinLargerResult_Throws() + { + var result = new ResultStreamEvent + { + Subtype = "success", + IsError = false, + Result = "I checked the deployment health.\nHealth check is red.\nDone.", + }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().Throw().Which.Message.Should().Contain("Health check is red"); + } + + [Test] + public void FailureSignal_TakesPrecedenceOverNonSuccessSubtype() + { + var result = new ResultStreamEvent + { + Subtype = "error_max_turns", + IsError = true, + Result = "Validation failed.", + }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().Throw().Which.Message.Should().Contain("Validation failed"); + } + + [Test] + public void UnclosedFailureSignal_DoesNotThrow() + { + // A truncated message never wrote the closing tag, so we cannot treat it as a 
deliberate, complete failure. + var result = new ResultStreamEvent + { + Subtype = "success", + IsError = false, + Result = "Health check is red and then the message was cut off", + }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().NotThrow(); + } + + [Test] + public void ResultWithoutFailureSignal_DoesNotThrow() + { + var result = new ResultStreamEvent + { + Subtype = "success", + IsError = false, + Result = "The deployment looks healthy. No failure conditions were met.", + }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().NotThrow(); + } } diff --git a/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs b/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs index e32feeebb..934664fbe 100644 --- a/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs +++ b/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs @@ -1,5 +1,6 @@ using System; using System.Linq; +using System.Text.RegularExpressions; using Calamari.AiAgent.ClaudeCodeBehaviour.JsonResponseModels; using Calamari.Common.Commands; @@ -7,6 +8,14 @@ namespace Calamari.AiAgent.ClaudeCodeBehaviour { public static class ClaudeAgentOutcomeEvaluator { + // The agent emits this tagged block (see the octopus-fail-deployment skill) when a user-specified + // failure condition has been met. The CLI still exits 0, so this is the only way to detect an intentional + // failure from the outside. The inner text is the operator-facing reason; requiring the closing tag also + // confirms the agent finished writing the block rather than being truncated mid-message. + static readonly Regex FailureSignal = new( + @"|>(?.*?))", + RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled); + public static void EnsureSuccessful(int exitCode, ResultStreamEvent? 
result) { if (exitCode != 0) @@ -22,6 +31,17 @@ public static void EnsureSuccessful(int exitCode, ResultStreamEvent? result) return; } + // An intentional, user-requested failure takes precedence: the run is otherwise "successful" at the + // CLI level, so check the agent's signal before the generic CLI-status checks to surface a clear reason. + if (result.Result is { } text && FailureSignal.Match(text) is { Success: true } match) + { + var reason = match.Groups["reason"].Value.Trim(); + throw new CommandException( + string.IsNullOrEmpty(reason) + ? "The agent signalled that the step should fail." + : $"The agent signalled that the step should fail: {reason}"); + } + if (result.IsError == true || !"success".Equals(result.Subtype, StringComparison.OrdinalIgnoreCase)) { var subtype = string.IsNullOrEmpty(result.Subtype) ? "" : result.Subtype; diff --git a/source/Calamari.AiAgent/ClaudeCodeBehaviour/DefaultContext/Skills/octopus-fail-deployment.md b/source/Calamari.AiAgent/ClaudeCodeBehaviour/DefaultContext/Skills/octopus-fail-deployment.md new file mode 100644 index 000000000..959efc60e --- /dev/null +++ b/source/Calamari.AiAgent/ClaudeCodeBehaviour/DefaultContext/Skills/octopus-fail-deployment.md @@ -0,0 +1,31 @@ +--- +name: octopus-fail-deployment +description: Use when the user's prompt asks for the step to FAIL under some condition — e.g. "fail the deployment if the health check is red", "if X happens, fail this step", "the runbook should fail when Y". By default an agent run always succeeds (the process exits 0), so the ONLY way to make Octopus mark this step as failed is to emit the sentinel described here. Do NOT use when the user has not expressed any failure condition — absence of the sentinel means success. +--- +By default this step **succeeds** — when your run finishes normally, Octopus marks the step green regardless of what you found. 
+ +If the user's prompt states a condition under which the step should **fail** (for example "fail the deployment if the smoke test doesn't pass"), and you determine that condition has been met, you must explicitly signal the failure. The only way Octopus can detect this from the outside is a specific tagged block in your final response. + +## How to signal failure + +Emit this block as part of your **final** message, with the reason between the tags: + + +A short reason describing why the step failed. + + +For example: + + +Smoke test returned HTTP 500 from /health after 3 retries. + + +## Rules + +- Emit the block **only** when the user expressed a failure condition AND you have determined it is met. If the condition was not met, say nothing special and let the step succeed. +- Always write the **complete** block, including the closing `` tag. The closing tag is how Octopus confirms the message is whole — if you stop before writing it, the failure will not be detected, so finish the block before ending your turn. +- Put the tags on their **own lines**, as plain text. Do **not** wrap them in backticks, code fences, bold, or any other markdown. +- Emit the block **once**. One block is enough to fail the step. +- Keep the reason **concise and specific** — it is surfaced in the Octopus task log as the failure message, so write it for the operator who will read it. The reason may span multiple lines. +- The reason is optional but strongly encouraged; an empty block will still fail the step with a generic message. +- If you cannot determine whether the condition was met, do not guess silently — explain what you found. Only emit the block if the user's intent was that an unverifiable outcome should fail the step. 
\ No newline at end of file From 863e7cf2f15cdf627eec5db3dcc1a4c6f53e885a Mon Sep 17 00:00:00 2001 From: robert Date: Fri, 26 Jun 2026 20:40:32 +1000 Subject: [PATCH 2/6] Fix confusing error --- .../Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs b/source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs index 77432cf98..d33e49ffa 100644 --- a/source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs +++ b/source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs @@ -7,6 +7,7 @@ using Calamari.Common.Features.Behaviours; using Calamari.Common.Plumbing.Extensions; using Calamari.Common.Plumbing.FileSystem; +using Calamari.Common.Plumbing.Logging; using Calamari.Common.Plumbing.Variables; namespace Calamari.Common.Plumbing.Pipeline @@ -51,6 +52,8 @@ protected virtual IEnumerable OnFinish(OnFinishResolver reso public async Task Execute(ILifetimeScope lifetimeScope, IVariables variables) { var pathToPrimaryPackage = variables.GetPathToPrimaryPackage(lifetimeScope.Resolve(), false); + var log = lifetimeScope.Resolve(); + var deployment = new RunningDeployment(pathToPrimaryPackage, variables); try @@ -67,7 +70,7 @@ public async Task Execute(ILifetimeScope lifetimeScope, IVariables variables) } catch (Exception installException) { - Console.Error.WriteLine("Running rollback behaviours..."); + log.Verbose("Running rollback behaviours..."); deployment.Error(installException); @@ -78,7 +81,7 @@ public async Task Execute(ILifetimeScope lifetimeScope, IVariables variables) } catch (Exception rollbackException) { - Console.Error.WriteLine(rollbackException); + log.Error(rollbackException.Message); } throw; From e9f1247f01674cf8b8d1f8f48c60d44d7f2b05b6 Mon Sep 17 00:00:00 2001 From: robert Date: Mon, 29 Jun 2026 09:06:16 +1000 Subject: [PATCH 3/6] Remove comment --- .../ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs | 3 --- 1 file 
changed, 3 deletions(-) diff --git a/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs b/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs index 934664fbe..daded3547 100644 --- a/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs +++ b/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs @@ -23,9 +23,6 @@ public static void EnsureSuccessful(int exitCode, ResultStreamEvent? result) throw new CommandException($"Claude Code exited with code {exitCode}."); } - // Exit code 0 but no terminal result event: the CLI reported success at the process level, but we - // couldn't observe a result to inspect (output-format drift or an unparseable result line). We have - // no failure signal beyond the exit code, so we defer to it rather than fail on a parsing gap. if (result == null) { return; From 9c37036dd8daef11d9393cf15637dd0986f3abca Mon Sep 17 00:00:00 2001 From: robert Date: Mon, 29 Jun 2026 10:22:54 +1000 Subject: [PATCH 4/6] Better error formatting for logs --- source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs b/source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs index d33e49ffa..1f6cc7277 100644 --- a/source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs +++ b/source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs @@ -81,7 +81,7 @@ public async Task Execute(ILifetimeScope lifetimeScope, IVariables variables) } catch (Exception rollbackException) { - log.Error(rollbackException.Message); + log.Error(rollbackException.PrettyPrint()); } throw; From 44f53ff104e7a3309f7b5f0bd1a7bae311311ecf Mon Sep 17 00:00:00 2001 From: robert Date: Mon, 29 Jun 2026 10:31:39 +1000 Subject: [PATCH 5/6] Document self-closing failure signal tag Align the octopus-fail-deployment skill spec and code comment with the matcher, which accepts a 
self-closing as a reason-less failure. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../RunAgentCommandFixture.cs | 21 +++++++++++++++++++ .../ClaudeAgentOutcomeEvaluator.cs | 5 +++-- .../Skills/octopus-fail-deployment.md | 4 ++-- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/source/Calamari.AiAgent.Tests/RunAgentCommandFixture.cs b/source/Calamari.AiAgent.Tests/RunAgentCommandFixture.cs index a42111eba..a940e0159 100644 --- a/source/Calamari.AiAgent.Tests/RunAgentCommandFixture.cs +++ b/source/Calamari.AiAgent.Tests/RunAgentCommandFixture.cs @@ -166,4 +166,25 @@ public async Task ClaudeCode_AttachesArtifact_WhenExplicitlyAsked() // (path/name are base64-encoded, so assert on the message verb, not the file name). result.FullLog.Should().Contain("createArtifact"); } + + [Test] + [Category("Integration")] + public async Task ClaudeCode_ResultsInFailure_IfExplicitlyAsked() + { + var prompt = "I want you to analyse the results of the following set of numbers [1,2,3]. Fail this deployment if any of the numbers are greater than 2."; + var result = await CommandTestBuilder.CreateAsync() + .WithArrange(context => + { + context.Variables.Add(SpecialVariables.Action.Claude.SandboxMode, nameof(SandboxMode.None)); + context.Variables.Add(SpecialVariables.Action.Claude.ApiToken, Environment.GetEnvironmentVariable("ANTHROPIC_TOKEN")); + context.Variables.Add(SpecialVariables.Action.Claude.Prompt, prompt); + context.Variables.Add(SpecialVariables.Action.Claude.Permissions, """{"allow":["Bash", "Read"]}"""); + }) + .Execute(); + + result.WasSuccessful.Should().BeFalse(); + // NewOctopusArtifact emits an Info "##octopus[createArtifact ...]" service message + // (path/name are base64-encoded, so assert on the message verb, not the file name). 
+ result.FullLog.Should().Contain("createArtifact"); + } } diff --git a/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs b/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs index daded3547..7d1835cf2 100644 --- a/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs +++ b/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs @@ -10,8 +10,9 @@ public static class ClaudeAgentOutcomeEvaluator { // The agent emits this tagged block (see the octopus-fail-deployment skill) when a user-specified // failure condition has been met. The CLI still exits 0, so this is the only way to detect an intentional - // failure from the outside. The inner text is the operator-facing reason; requiring the closing tag also - // confirms the agent finished writing the block rather than being truncated mid-message. + // failure from the outside. The inner text is the operator-facing reason. We accept either a self-closing + // (a complete signal with no reason) or a paired block ending in ; + // an opening tag without its closing tag is treated as a truncated message and ignored. static readonly Regex FailureSignal = new( @"|>(?.*?))", RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled); diff --git a/source/Calamari.AiAgent/ClaudeCodeBehaviour/DefaultContext/Skills/octopus-fail-deployment.md b/source/Calamari.AiAgent/ClaudeCodeBehaviour/DefaultContext/Skills/octopus-fail-deployment.md index 959efc60e..c4e38b6bf 100644 --- a/source/Calamari.AiAgent/ClaudeCodeBehaviour/DefaultContext/Skills/octopus-fail-deployment.md +++ b/source/Calamari.AiAgent/ClaudeCodeBehaviour/DefaultContext/Skills/octopus-fail-deployment.md @@ -23,9 +23,9 @@ Smoke test returned HTTP 500 from /health after 3 retries. ## Rules - Emit the block **only** when the user expressed a failure condition AND you have determined it is met. If the condition was not met, say nothing special and let the step succeed. 
-- Always write the **complete** block, including the closing `` tag. The closing tag is how Octopus confirms the message is whole — if you stop before writing it, the failure will not be detected, so finish the block before ending your turn. +- Always write a **complete** block — either a paired block ending in `` or a self-closing ``. A closed tag is how Octopus confirms the message is whole — if you open the block but stop before closing it, the failure will not be detected, so finish the block before ending your turn. - Put the tags on their **own lines**, as plain text. Do **not** wrap them in backticks, code fences, bold, or any other markdown. - Emit the block **once**. One block is enough to fail the step. - Keep the reason **concise and specific** — it is surfaced in the Octopus task log as the failure message, so write it for the operator who will read it. The reason may span multiple lines. -- The reason is optional but strongly encouraged; an empty block will still fail the step with a generic message. +- The reason is optional but strongly encouraged; an empty block — or a self-closing `` — will still fail the step with a generic message. - If you cannot determine whether the condition was met, do not guess silently — explain what you found. Only emit the block if the user's intent was that an unverifiable outcome should fail the step. 
\ No newline at end of file From a3d40756456d1e2707c21f672ce190f0a90efa79 Mon Sep 17 00:00:00 2001 From: Rob E Date: Tue, 30 Jun 2026 15:44:43 +1000 Subject: [PATCH 6/6] Update source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs Co-authored-by: Eddy Moulton <8491021+eddymoulton@users.noreply.github.com> --- .../ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs b/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs index 7d1835cf2..9e29779a3 100644 --- a/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs +++ b/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs @@ -8,11 +8,9 @@ namespace Calamari.AiAgent.ClaudeCodeBehaviour { public static class ClaudeAgentOutcomeEvaluator { - // The agent emits this tagged block (see the octopus-fail-deployment skill) when a user-specified - // failure condition has been met. The CLI still exits 0, so this is the only way to detect an intentional - // failure from the outside. The inner text is the operator-facing reason. We accept either a self-closing - // (a complete signal with no reason) or a paired block ending in ; - // an opening tag without its closing tag is treated as a truncated message and ignored. + // The agent emits this tagged block when a user-specified failure condition has been met + // (see the octopus-fail-deployment skill) + // Matches either a self-closing or a paired block ending in , capturing the contents as the reason. static readonly Regex FailureSignal = new( @"|>(?.*?))", RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled);