diff --git a/source/Calamari.AiAgent.Tests/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluatorFixture.cs b/source/Calamari.AiAgent.Tests/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluatorFixture.cs index 9f2447af3a..8dfceb47ac 100644 --- a/source/Calamari.AiAgent.Tests/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluatorFixture.cs +++ b/source/Calamari.AiAgent.Tests/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluatorFixture.cs @@ -77,4 +77,117 @@ public void ExitZero_SuccessSubtype_WithPermissionDenials_Throws_NamingDeniedToo act.Should().Throw() .Which.Message.Should().Contain("denied permission").And.Contain("Bash").And.Contain("WebFetch"); } + + [Test] + public void FailureSignal_WithReason_Throws_IncludingReason() + { + var result = new ResultStreamEvent + { + Subtype = "success", + IsError = false, + Result = "Smoke test returned HTTP 500 from /health after 3 retries.", + }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().Throw() + .Which.Message.Should().Contain("step should fail").And.Contain("Smoke test returned HTTP 500"); + } + + [Test] + public void FailureSignal_WithMultiLineReason_Throws_PreservingReason() + { + var result = new ResultStreamEvent + { + Subtype = "success", + IsError = false, + Result = "\nHealth check failed:\n- /health returned 500\n- /ready timed out\n", + }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().Throw() + .Which.Message.Should().Contain("/health returned 500").And.Contain("/ready timed out"); + } + + [Test] + public void FailureSignal_EmptyBlock_Throws_WithGenericMessage() + { + var result = new ResultStreamEvent { Subtype = "success", IsError = false, Result = "" }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().Throw().WithMessage("*step should fail*"); + } + + [Test] + public void FailureSignal_SelfClosing_Throws_WithGenericMessage() + { + var result = new ResultStreamEvent { Subtype = "success", IsError = false, Result = "" }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().Throw().WithMessage("*step should fail*"); + } + + [Test] + public void FailureSignal_WithinLargerResult_Throws() + { + var result = new ResultStreamEvent + { + Subtype = "success", + IsError = false, + Result = "I checked the deployment health.\nHealth check is red.\nDone.", + }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().Throw().Which.Message.Should().Contain("Health check is red"); + } + + [Test] + public void FailureSignal_TakesPrecedenceOverNonSuccessSubtype() + { + var result = new ResultStreamEvent + { + Subtype = "error_max_turns", + IsError = true, + Result = "Validation failed.", + }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().Throw().Which.Message.Should().Contain("Validation failed"); + } + + [Test] + public void UnclosedFailureSignal_DoesNotThrow() + { + // A truncated message never wrote the closing tag, so we cannot treat it as a deliberate, complete failure. + var result = new ResultStreamEvent + { + Subtype = "success", + IsError = false, + Result = "Health check is red and then the message was cut off", + }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().NotThrow(); + } + + [Test] + public void ResultWithoutFailureSignal_DoesNotThrow() + { + var result = new ResultStreamEvent + { + Subtype = "success", + IsError = false, + Result = "The deployment looks healthy. No failure conditions were met.", + }; + + Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result); + + act.Should().NotThrow(); + } } diff --git a/source/Calamari.AiAgent.Tests/RunAgentCommandFixture.cs b/source/Calamari.AiAgent.Tests/RunAgentCommandFixture.cs index a42111ebad..a940e01596 100644 --- a/source/Calamari.AiAgent.Tests/RunAgentCommandFixture.cs +++ b/source/Calamari.AiAgent.Tests/RunAgentCommandFixture.cs @@ -166,4 +166,25 @@ public async Task ClaudeCode_AttachesArtifact_WhenExplicitlyAsked() // (path/name are base64-encoded, so assert on the message verb, not the file name). result.FullLog.Should().Contain("createArtifact"); } + + [Test] + [Category("Integration")] + public async Task ClaudeCode_ResultsInFailure_IfExplicitlyAsked() + { + var prompt = "I want you to analyse the results of the following set of numbers [1,2,3]. Fail this deployment if any of the numbers are greater than 2."; + var result = await CommandTestBuilder.CreateAsync() + .WithArrange(context => + { + context.Variables.Add(SpecialVariables.Action.Claude.SandboxMode, nameof(SandboxMode.None)); + context.Variables.Add(SpecialVariables.Action.Claude.ApiToken, Environment.GetEnvironmentVariable("ANTHROPIC_TOKEN")); + context.Variables.Add(SpecialVariables.Action.Claude.Prompt, prompt); + context.Variables.Add(SpecialVariables.Action.Claude.Permissions, """{"allow":["Bash", "Read"]}"""); + }) + .Execute(); + + result.WasSuccessful.Should().BeFalse(); + // NewOctopusArtifact emits an Info "##octopus[createArtifact ...]" service message + // (path/name are base64-encoded, so assert on the message verb, not the file name). + result.FullLog.Should().Contain("createArtifact"); + } } diff --git a/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs b/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs index e32feeebba..9e29779a3d 100644 --- a/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs +++ b/source/Calamari.AiAgent/ClaudeCodeBehaviour/ClaudeAgentOutcomeEvaluator.cs @@ -1,5 +1,6 @@ using System; using System.Linq; +using System.Text.RegularExpressions; using Calamari.AiAgent.ClaudeCodeBehaviour.JsonResponseModels; using Calamari.Common.Commands; @@ -7,6 +8,13 @@ namespace Calamari.AiAgent.ClaudeCodeBehaviour { public static class ClaudeAgentOutcomeEvaluator { + // The agent emits this tagged block when a user-specified failure condition has been met + // (see the octopus-fail-deployment skill) + // Matches either a self-closing or a paired block ending in , capturing the contents ass the reason. + static readonly Regex FailureSignal = new( + @"|>(?.*?))", + RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled); + public static void EnsureSuccessful(int exitCode, ResultStreamEvent? result) { if (exitCode != 0) @@ -14,14 +22,22 @@ public static void EnsureSuccessful(int exitCode, ResultStreamEvent? result) throw new CommandException($"Claude Code exited with code {exitCode}."); } - // Exit code 0 but no terminal result event: the CLI reported success at the process level, but we - // couldn't observe a result to inspect (output-format drift or an unparseable result line). We have - // no failure signal beyond the exit code, so we defer to it rather than fail on a parsing gap. if (result == null) { return; } + // An intentional, user-requested failure takes precedence: the run is otherwise "successful" at the + // CLI level, so check the agent's signal before the generic CLI-status checks to surface a clear reason. + if (result.Result is { } text && FailureSignal.Match(text) is { Success: true } match) + { + var reason = match.Groups["reason"].Value.Trim(); + throw new CommandException( + string.IsNullOrEmpty(reason) + ? "The agent signalled that the step should fail." + : $"The agent signalled that the step should fail: {reason}"); + } + if (result.IsError == true || !"success".Equals(result.Subtype, StringComparison.OrdinalIgnoreCase)) { var subtype = string.IsNullOrEmpty(result.Subtype) ? "" : result.Subtype; diff --git a/source/Calamari.AiAgent/ClaudeCodeBehaviour/DefaultContext/Skills/octopus-fail-deployment.md b/source/Calamari.AiAgent/ClaudeCodeBehaviour/DefaultContext/Skills/octopus-fail-deployment.md new file mode 100644 index 0000000000..c4e38b6bf2 --- /dev/null +++ b/source/Calamari.AiAgent/ClaudeCodeBehaviour/DefaultContext/Skills/octopus-fail-deployment.md @@ -0,0 +1,31 @@ +--- +name: octopus-fail-deployment +description: Use when the user's prompt asks for the step to FAIL under some condition — e.g. "fail the deployment if the health check is red", "if X happens, fail this step", "the runbook should fail when Y". By default an agent run always succeeds (the process exits 0), so the ONLY way to make Octopus mark this step as failed is to emit the sentinel described here. Do NOT use when the user has not expressed any failure condition — absence of the sentinel means success. +--- +By default this step **succeeds** — when your run finishes normally, Octopus marks the step green regardless of what you found. + +If the user's prompt states a condition under which the step should **fail** (for example "fail the deployment if the smoke test doesn't pass"), and you determine that condition has been met, you must explicitly signal the failure. The only way Octopus can detect this from the outside is a specific tagged block in your final response. + +## How to signal failure + +Emit this block as part of your **final** message, with the reason between the tags: + + +A short reason describing why the step failed. + + +For example: + + +Smoke test returned HTTP 500 from /health after 3 retries. + + +## Rules + +- Emit the block **only** when the user expressed a failure condition AND you have determined it is met. If the condition was not met, say nothing special and let the step succeed. +- Always write a **complete** block — either a paired block ending in `` or a self-closing ``. A closed tag is how Octopus confirms the message is whole — if you open the block but stop before closing it, the failure will not be detected, so finish the block before ending your turn. +- Put the tags on their **own lines**, as plain text. Do **not** wrap them in backticks, code fences, bold, or any other markdown. +- Emit the block **once**. One block is enough to fail the step. +- Keep the reason **concise and specific** — it is surfaced in the Octopus task log as the failure message, so write it for the operator who will read it. The reason may span multiple lines. +- The reason is optional but strongly encouraged; an empty block — or a self-closing `` — will still fail the step with a generic message. +- If you cannot determine whether the condition was met, do not guess silently — explain what you found. Only emit the block if the user's intent was that an unverifiable outcome should fail the step. \ No newline at end of file diff --git a/source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs b/source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs index 77432cf98e..1f6cc7277f 100644 --- a/source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs +++ b/source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs @@ -7,6 +7,7 @@ using Calamari.Common.Features.Behaviours; using Calamari.Common.Plumbing.Extensions; using Calamari.Common.Plumbing.FileSystem; +using Calamari.Common.Plumbing.Logging; using Calamari.Common.Plumbing.Variables; namespace Calamari.Common.Plumbing.Pipeline @@ -51,6 +52,8 @@ protected virtual IEnumerable OnFinish(OnFinishResolver reso public async Task Execute(ILifetimeScope lifetimeScope, IVariables variables) { var pathToPrimaryPackage = variables.GetPathToPrimaryPackage(lifetimeScope.Resolve(), false); + var log = lifetimeScope.Resolve(); + var deployment = new RunningDeployment(pathToPrimaryPackage, variables); try @@ -67,7 +70,7 @@ public async Task Execute(ILifetimeScope lifetimeScope, IVariables variables) } catch (Exception installException) { - Console.Error.WriteLine("Running rollback behaviours..."); + log.Verbose("Running rollback behaviours..."); deployment.Error(installException); @@ -78,7 +81,7 @@ public async Task Execute(ILifetimeScope lifetimeScope, IVariables variables) } catch (Exception rollbackException) { - Console.Error.WriteLine(rollbackException); + log.Error(rollbackException.PrettyPrint()); } throw;