Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,117 @@ public void ExitZero_SuccessSubtype_WithPermissionDenials_Throws_NamingDeniedToo
act.Should().Throw<CommandException>()
.Which.Message.Should().Contain("denied permission").And.Contain("Bash").And.Contain("WebFetch");
}

[Test]
public void FailureSignal_WithReason_Throws_IncludingReason()
{
var result = new ResultStreamEvent
{
Subtype = "success",
IsError = false,
Result = "<octopus-task-failed>Smoke test returned HTTP 500 from /health after 3 retries.</octopus-task-failed>",
};

Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result);

act.Should().Throw<CommandException>()
.Which.Message.Should().Contain("step should fail").And.Contain("Smoke test returned HTTP 500");
}

[Test]
public void FailureSignal_WithMultiLineReason_Throws_PreservingReason()
{
var result = new ResultStreamEvent
{
Subtype = "success",
IsError = false,
Result = "<octopus-task-failed>\nHealth check failed:\n- /health returned 500\n- /ready timed out\n</octopus-task-failed>",
};

Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result);

act.Should().Throw<CommandException>()
.Which.Message.Should().Contain("/health returned 500").And.Contain("/ready timed out");
}

[Test]
public void FailureSignal_EmptyBlock_Throws_WithGenericMessage()
{
var result = new ResultStreamEvent { Subtype = "success", IsError = false, Result = "<octopus-task-failed></octopus-task-failed>" };

Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result);

act.Should().Throw<CommandException>().WithMessage("*step should fail*");
}

[Test]
public void FailureSignal_SelfClosing_Throws_WithGenericMessage()
{
var result = new ResultStreamEvent { Subtype = "success", IsError = false, Result = "<octopus-task-failed/>" };

Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result);

act.Should().Throw<CommandException>().WithMessage("*step should fail*");
}
Comment thread
zentron marked this conversation as resolved.

[Test]
public void FailureSignal_WithinLargerResult_Throws()
{
var result = new ResultStreamEvent
{
Subtype = "success",
IsError = false,
Result = "I checked the deployment health.\n<octopus-task-failed>Health check is red.</octopus-task-failed>\nDone.",
};

Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result);

act.Should().Throw<CommandException>().Which.Message.Should().Contain("Health check is red");
}

[Test]
public void FailureSignal_TakesPrecedenceOverNonSuccessSubtype()
{
var result = new ResultStreamEvent
{
Subtype = "error_max_turns",
IsError = true,
Result = "<octopus-task-failed>Validation failed.</octopus-task-failed>",
};

Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result);

act.Should().Throw<CommandException>().Which.Message.Should().Contain("Validation failed");
}

[Test]
public void UnclosedFailureSignal_DoesNotThrow()
{
// A truncated message never wrote the closing tag, so we cannot treat it as a deliberate, complete failure.
var result = new ResultStreamEvent
{
Subtype = "success",
IsError = false,
Result = "<octopus-task-failed>Health check is red and then the message was cut off",
};

Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result);

act.Should().NotThrow();
}

[Test]
public void ResultWithoutFailureSignal_DoesNotThrow()
{
var result = new ResultStreamEvent
{
Subtype = "success",
IsError = false,
Result = "The deployment looks healthy. No failure conditions were met.",
};

Action act = () => ClaudeAgentOutcomeEvaluator.EnsureSuccessful(0, result);

act.Should().NotThrow();
}
}
21 changes: 21 additions & 0 deletions source/Calamari.AiAgent.Tests/RunAgentCommandFixture.cs
Original file line number Diff line number Diff line change
Expand Up @@ -166,4 +166,25 @@ public async Task ClaudeCode_AttachesArtifact_WhenExplicitlyAsked()
// (path/name are base64-encoded, so assert on the message verb, not the file name).
result.FullLog.Should().Contain("createArtifact");
}

[Test]
[Category("Integration")]
public async Task ClaudeCode_ResultsInFailure_IfExplicitlyAsked()
{
var prompt = "I want you to analyse the results of the following set of numbers [1,2,3]. Fail this deployment if any of the numbers are greater than 2.";
var result = await CommandTestBuilder.CreateAsync<RunAgentCommand, Program>()
.WithArrange(context =>
{
context.Variables.Add(SpecialVariables.Action.Claude.SandboxMode, nameof(SandboxMode.None));
context.Variables.Add(SpecialVariables.Action.Claude.ApiToken, Environment.GetEnvironmentVariable("ANTHROPIC_TOKEN"));
context.Variables.Add(SpecialVariables.Action.Claude.Prompt, prompt);
context.Variables.Add(SpecialVariables.Action.Claude.Permissions, """{"allow":["Bash", "Read"]}""");
})
.Execute();

result.WasSuccessful.Should().BeFalse();
// NewOctopusArtifact emits an Info "##octopus[createArtifact ...]" service message
// (path/name are base64-encoded, so assert on the message verb, not the file name).
result.FullLog.Should().Contain("createArtifact");
}
}
Original file line number Diff line number Diff line change
@@ -1,27 +1,43 @@
using System;
using System.Linq;
using System.Text.RegularExpressions;
using Calamari.AiAgent.ClaudeCodeBehaviour.JsonResponseModels;
using Calamari.Common.Commands;

namespace Calamari.AiAgent.ClaudeCodeBehaviour
{
public static class ClaudeAgentOutcomeEvaluator
{
// The agent emits this tagged block when a user-specified failure condition has been met
// (see the octopus-fail-deployment skill)
// Matches either a self-closing <octopus-task-failed/> or a paired block ending in </octopus-task-failed>, capturing the contents ass the reason.
static readonly Regex FailureSignal = new(
@"<octopus-task-failed\s*(?:/>|>(?<reason>.*?)</octopus-task-failed>)",
RegexOptions.Singleline | RegexOptions.IgnoreCase | RegexOptions.Compiled);
Comment thread
zentron marked this conversation as resolved.

public static void EnsureSuccessful(int exitCode, ResultStreamEvent? result)
{
if (exitCode != 0)
{
throw new CommandException($"Claude Code exited with code {exitCode}.");
}

// Exit code 0 but no terminal result event: the CLI reported success at the process level, but we
// couldn't observe a result to inspect (output-format drift or an unparseable result line). We have
// no failure signal beyond the exit code, so we defer to it rather than fail on a parsing gap.
if (result == null)
{
return;
}

// An intentional, user-requested failure takes precedence: the run is otherwise "successful" at the
// CLI level, so check the agent's signal before the generic CLI-status checks to surface a clear reason.
if (result.Result is { } text && FailureSignal.Match(text) is { Success: true } match)
{
var reason = match.Groups["reason"].Value.Trim();
throw new CommandException(
string.IsNullOrEmpty(reason)
? "The agent signalled that the step should fail."
: $"The agent signalled that the step should fail: {reason}");
}

if (result.IsError == true || !"success".Equals(result.Subtype, StringComparison.OrdinalIgnoreCase))
{
var subtype = string.IsNullOrEmpty(result.Subtype) ? "<none>" : result.Subtype;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
---
name: octopus-fail-deployment
description: Use when the user's prompt asks for the step to FAIL under some condition — e.g. "fail the deployment if the health check is red", "if X happens, fail this step", "the runbook should fail when Y". By default an agent run always succeeds (the process exits 0), so the ONLY way to make Octopus mark this step as failed is to emit the sentinel described here. Do NOT use when the user has not expressed any failure condition — absence of the sentinel means success.
---
By default this step **succeeds** — when your run finishes normally, Octopus marks the step green regardless of what you found.

If the user's prompt states a condition under which the step should **fail** (for example "fail the deployment if the smoke test doesn't pass"), and you determine that condition has been met, you must explicitly signal the failure. The only way Octopus can detect this from the outside is a specific tagged block in your final response.

## How to signal failure

Emit this block as part of your **final** message, with the reason between the tags:

<octopus-task-failed>
A short reason describing why the step failed.
</octopus-task-failed>

For example:

<octopus-task-failed>
Smoke test returned HTTP 500 from /health after 3 retries.
</octopus-task-failed>

## Rules

- Emit the block **only** when the user expressed a failure condition AND you have determined it is met. If the condition was not met, say nothing special and let the step succeed.
- Always write a **complete** block — either a paired block ending in `</octopus-task-failed>` or a self-closing `<octopus-task-failed/>`. A closed tag is how Octopus confirms the message is whole — if you open the block but stop before closing it, the failure will not be detected, so finish the block before ending your turn.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RE: self closing block

I would be tempted to remove that as an option. Makes this whole thing more consistent and the matching regex simpler.

I'm a bit concerned about removing flexibility that Claude might like to take advantage of however.
I'll leave it with you to decide if you think cutting that down makes things simpler enough to warrant the change.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting you say that. I originally had it not allow closing tags but then Claude pointed out it might make it more likely to use it when there happens to be no specific reason to apply.
Ill leave it in for now, but if it creates any false positives/negatives we can reconsider.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Claude pointed out it might make it more likely to use it when there happens to be no specific reason to apply

That's reason enough if Claude thinks it might be a problem without it.

- Put the tags on their **own lines**, as plain text. Do **not** wrap them in backticks, code fences, bold, or any other markdown.
- Emit the block **once**. One block is enough to fail the step.
- Keep the reason **concise and specific** — it is surfaced in the Octopus task log as the failure message, so write it for the operator who will read it. The reason may span multiple lines.
- The reason is optional but strongly encouraged; an empty block — or a self-closing `<octopus-task-failed/>` — will still fail the step with a generic message.
- If you cannot determine whether the condition was met, do not guess silently — explain what you found. Only emit the block if the user's intent was that an unverifiable outcome should fail the step.
7 changes: 5 additions & 2 deletions source/Calamari.Common/Plumbing/Pipeline/PipelineCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
using Calamari.Common.Features.Behaviours;
using Calamari.Common.Plumbing.Extensions;
using Calamari.Common.Plumbing.FileSystem;
using Calamari.Common.Plumbing.Logging;
using Calamari.Common.Plumbing.Variables;

namespace Calamari.Common.Plumbing.Pipeline
Expand Down Expand Up @@ -51,6 +52,8 @@ protected virtual IEnumerable<IOnFinishBehaviour> OnFinish(OnFinishResolver reso
public async Task Execute(ILifetimeScope lifetimeScope, IVariables variables)
{
var pathToPrimaryPackage = variables.GetPathToPrimaryPackage(lifetimeScope.Resolve<ICalamariFileSystem>(), false);
var log = lifetimeScope.Resolve<ILog>();

var deployment = new RunningDeployment(pathToPrimaryPackage, variables);

try
Expand All @@ -67,7 +70,7 @@ public async Task Execute(ILifetimeScope lifetimeScope, IVariables variables)
}
catch (Exception installException)
{
Console.Error.WriteLine("Running rollback behaviours...");
log.Verbose("Running rollback behaviours...");

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was annoying as hell. There is no rollback behaviour taking place, so why make it so prominent.


Comment thread
zentron marked this conversation as resolved.
deployment.Error(installException);

Expand All @@ -78,7 +81,7 @@ public async Task Execute(ILifetimeScope lifetimeScope, IVariables variables)
}
catch (Exception rollbackException)
{
Console.Error.WriteLine(rollbackException);

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should generally not be writing to Console in Calamari as it makes testing more difficult

log.Error(rollbackException.PrettyPrint());
}
Comment thread
zentron marked this conversation as resolved.

throw;
Expand Down