From 1c511817a4b79add7bb5707a7574eeab87fa84e7 Mon Sep 17 00:00:00 2001 From: James Frasca Date: Fri, 19 Jun 2026 17:07:08 +0000 Subject: [PATCH 01/10] Add minimal RetryHelper and migrate RunnerDotcomServer --- src/Runner.Common/RetryHelper.cs | 116 ++++++++ src/Runner.Common/RunnerDotcomServer.cs | 43 +-- src/Runner.Common/RunnerService.cs | 25 +- src/Test/L0/Listener/RunnerDotcomServerL0.cs | 140 ++++++++++ src/Test/L0/RetryHelperL0.cs | 270 +++++++++++++++++++ 5 files changed, 559 insertions(+), 35 deletions(-) create mode 100644 src/Runner.Common/RetryHelper.cs create mode 100644 src/Test/L0/Listener/RunnerDotcomServerL0.cs create mode 100644 src/Test/L0/RetryHelperL0.cs diff --git a/src/Runner.Common/RetryHelper.cs b/src/Runner.Common/RetryHelper.cs new file mode 100644 index 00000000000..6f9a983264f --- /dev/null +++ b/src/Runner.Common/RetryHelper.cs @@ -0,0 +1,116 @@ +using System; +using System.Diagnostics; +using System.Threading; +using System.Threading.Tasks; +using GitHub.Runner.Sdk; + +namespace GitHub.Runner.Common +{ + public sealed class RetryExecutionContext + { + public RetryExecutionContext(string operationName, int attemptNumber, int maxAttempts, DateTime startTimeUtc) + { + OperationName = operationName; + AttemptNumber = attemptNumber; + MaxAttempts = maxAttempts; + StartTimeUtc = startTimeUtc; + } + + public string OperationName { get; } + public int AttemptNumber { get; } + public int MaxAttempts { get; } + public DateTime StartTimeUtc { get; } + } + + public sealed class RetryStrategy + { + public int MaxAttempts { get; init; } + public Func ShouldRetry { get; init; } + public Func GetBackoff { get; init; } + public Action OnRetry { get; init; } + public Action OnSuccess { get; init; } + public Action OnFailure { get; init; } + } + + public sealed class RetryHelper + { + private readonly Tracing _trace; + private readonly RetryStrategy _strategy; + + public RetryHelper(Tracing trace, RetryStrategy strategy) + { + _trace = trace ?? throw new ArgumentNullException(nameof(trace)); + _strategy = strategy ?? throw new ArgumentNullException(nameof(strategy)); + + ArgUtil.NotNull(strategy.GetBackoff, $"{nameof(strategy)}.{nameof(strategy.GetBackoff)}"); + + if (strategy.MaxAttempts <= 0) + { + throw new ArgumentOutOfRangeException($"{nameof(strategy)}.{nameof(strategy.MaxAttempts)}"); + } + } + + public async Task ExecuteAsync( + Func> operation, + CancellationToken cancellationToken = default) + { + return await ExecuteAsync("operation", operation, cancellationToken); + } + + public async Task ExecuteAsync( + Func operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + await ExecuteAsync( + "operation", + async () => + { + await operation(); + return true; + }, + cancellationToken); + } + + public async Task ExecuteAsync( + string operationName, + Func> operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + + operationName ??= "operation"; + + var attempt = 0; + var startTimeUtc = DateTime.UtcNow; + var stopwatch = Stopwatch.StartNew(); + while (true) + { + attempt++; + cancellationToken.ThrowIfCancellationRequested(); + var context = new RetryExecutionContext(operationName, attempt, _strategy.MaxAttempts, startTimeUtc); + + try + { + var result = await operation(); + _strategy.OnSuccess?.Invoke(context, stopwatch.Elapsed); + return result; + } + catch (Exception ex) + { + var shouldRetry = _strategy.ShouldRetry == null || _strategy.ShouldRetry(ex); + if (attempt >= _strategy.MaxAttempts || !shouldRetry) + { + _trace.Error($"[{operationName}] retry exhausted at attempt {attempt}/{_strategy.MaxAttempts}"); + _strategy.OnFailure?.Invoke(context, ex, stopwatch.Elapsed); + throw; + } + + var backoff = _strategy.GetBackoff(attempt, _strategy.MaxAttempts, ex); + _strategy.OnRetry?.Invoke(context, ex, backoff); + await Task.Delay(backoff, cancellationToken); + } + } + } + } +} diff --git a/src/Runner.Common/RunnerDotcomServer.cs b/src/Runner.Common/RunnerDotcomServer.cs index c021e8bcc23..5bbc5f3d08e 100644 --- a/src/Runner.Common/RunnerDotcomServer.cs +++ b/src/Runner.Common/RunnerDotcomServer.cs @@ -111,19 +111,30 @@ public async Task DeleteRunnerAsync(string githubUrl, string githubToken, ulong private async Task RetryRequest(string githubApiUrl, string githubToken, RequestType requestType, int maxRetryAttemptsCount = 5, string errorMessage = null, StringContent body = null) { - int retry = 0; - while (true) + var responseStatus = System.Net.HttpStatusCode.OK; + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - retry++; - using (var httpClientHandler = HostContext.CreateHttpClientHandler()) - using (var httpClient = new HttpClient(httpClientHandler)) + MaxAttempts = maxRetryAttemptsCount, + ShouldRetry = _ => responseStatus != System.Net.HttpStatusCode.NotFound, + GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), + OnRetry = (context, ex, backoff) => { - httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("RemoteAuth", githubToken); - httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); - - var responseStatus = System.Net.HttpStatusCode.OK; - try + Trace.Error($"{errorMessage} -- Attempt: {context.AttemptNumber}"); + Trace.Error(ex); + Trace.Info($"Retrying in {backoff.Seconds} seconds"); + }, + }); + + return await retryHelper.ExecuteAsync( + operationName: "RunnerDotcomServer.RetryRequest", + operation: async () => + { + using (var httpClientHandler = HostContext.CreateHttpClientHandler()) + using (var httpClient = new HttpClient(httpClientHandler)) { + httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("RemoteAuth", githubToken); + httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); + HttpResponseMessage response = null; switch (requestType) { @@ -163,17 +174,9 @@ private async Task RetryRequest(string githubApiUrl, string githubToken, R } } + throw new InvalidOperationException($"Unable to process response from '{requestType} {githubApiUrl}'."); } - catch (Exception ex) when (retry < maxRetryAttemptsCount && responseStatus != System.Net.HttpStatusCode.NotFound) - { - Trace.Error($"{errorMessage} -- Attempt: {retry}"); - Trace.Error(ex); - } - } - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)); - Trace.Info($"Retrying in {backOff.Seconds} seconds"); - await Task.Delay(backOff); - } + }); } private string GetEntityUrl(string githubUrl) diff --git a/src/Runner.Common/RunnerService.cs b/src/Runner.Common/RunnerService.cs index ccaa83f698f..e79d12b929c 100644 --- a/src/Runner.Common/RunnerService.cs +++ b/src/Runner.Common/RunnerService.cs @@ -88,25 +88,20 @@ protected async Task RetryRequest(Func> func, Func shouldRetry = null ) { - var attempt = 0; - while (true) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - attempt++; - cancellationToken.ThrowIfCancellationRequested(); - try - { - return await func(); - } - // TODO: Add handling of non-retriable exceptions: https://github.com/github/actions-broker/issues/122 - catch (Exception ex) when (attempt < maxAttempts && (shouldRetry == null || shouldRetry(ex))) + MaxAttempts = maxAttempts, + ShouldRetry = shouldRetry, + GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(15)), + OnRetry = (context, ex, backoff) => { Trace.Error("Catch exception during request"); Trace.Error(ex); - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(15)); - Trace.Warning($"Back off {backOff.TotalSeconds} seconds before next retry. {maxAttempts - attempt} attempt left."); - await Task.Delay(backOff, cancellationToken); - } - } + Trace.Warning($"Back off {backoff.TotalSeconds} seconds before next retry. {context.MaxAttempts - context.AttemptNumber} attempt left."); + }, + }); + + return await retryHelper.ExecuteAsync(func, cancellationToken); } } } diff --git a/src/Test/L0/Listener/RunnerDotcomServerL0.cs b/src/Test/L0/Listener/RunnerDotcomServerL0.cs new file mode 100644 index 00000000000..df32aee15b6 --- /dev/null +++ b/src/Test/L0/Listener/RunnerDotcomServerL0.cs @@ -0,0 +1,140 @@ +using System; +using System.Collections.Generic; +using System.Net; +using System.Net.Http; +using System.Threading; +using System.Threading.Tasks; +using GitHub.Runner.Common; +using GitHub.Runner.Sdk; +using Moq; +using Moq.Protected; +using Xunit; + +namespace GitHub.Runner.Common.Tests.Listener +{ + public sealed class RunnerDotcomServerL0 + { + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Runner")] + public async Task GetRunnerByNameAsync_RetriesTransientFailureAndSucceeds() + { + using (var hc = new TestHostContext(this)) + { + var mockHandler = new Mock(); + var expectedUri = new Uri("https://api.github.com/orgs/my-org/actions/runners?name=runner-a"); + mockHandler.Protected() + .SetupSequence>( + "SendAsync", + ItExpr.Is(m => m.Method == HttpMethod.Get && m.RequestUri == expectedUri), + ItExpr.IsAny()) + .ReturnsAsync(new HttpResponseMessage(HttpStatusCode.InternalServerError) + { + Content = new StringContent("{\"message\":\"server error\"}") + }) + .ReturnsAsync(new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new StringContent("{\"total_count\":1,\"runners\":[{\"id\":7,\"name\":\"runner-a\"}]}") + }); + + var mockFactory = new Mock(); + mockFactory + .Setup(x => x.CreateClientHandler(It.IsAny())) + .Returns(mockHandler.Object); + hc.SetSingleton(mockFactory.Object); + + var server = new RunnerDotcomServer(); + server.Initialize(hc); + + List runners = await server.GetRunnerByNameAsync("https://github.com/my-org", "token", "runner-a"); + + Assert.Single(runners); + Assert.Equal(7ul, runners[0].Id); + Assert.Equal("runner-a", runners[0].Name); + + mockHandler.Protected().Verify( + "SendAsync", + Times.Exactly(2), + ItExpr.Is(m => m.Method == HttpMethod.Get && m.RequestUri == expectedUri), + ItExpr.IsAny()); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Runner")] + public async Task GetRunnerByNameAsync_DoesNotRetryNotFound() + { + using (var hc = new TestHostContext(this)) + { + var mockHandler = new Mock(); + var expectedUri = new Uri("https://api.github.com/orgs/my-org/actions/runners?name=missing-runner"); + mockHandler.Protected() + .Setup>( + "SendAsync", + ItExpr.Is(m => m.Method == HttpMethod.Get && m.RequestUri == expectedUri), + ItExpr.IsAny()) + .ReturnsAsync(new HttpResponseMessage(HttpStatusCode.NotFound) + { + Content = new StringContent("{\"message\":\"Not Found\"}") + }); + + var mockFactory = new Mock(); + mockFactory + .Setup(x => x.CreateClientHandler(It.IsAny())) + .Returns(mockHandler.Object); + hc.SetSingleton(mockFactory.Object); + + var server = new RunnerDotcomServer(); + server.Initialize(hc); + + await Assert.ThrowsAsync(() => server.GetRunnerByNameAsync("https://github.com/my-org", "token", "missing-runner")); + + mockHandler.Protected().Verify( + "SendAsync", + Times.Once(), + ItExpr.Is(m => m.Method == HttpMethod.Get && m.RequestUri == expectedUri), + ItExpr.IsAny()); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Runner")] + public async Task GetRunnerByNameAsync_StopsAfterMaxAttempts() + { + using (var hc = new TestHostContext(this)) + { + var mockHandler = new Mock(); + var expectedUri = new Uri("https://api.github.com/orgs/my-org/actions/runners?name=runner-a"); + mockHandler.Protected() + .Setup>( + "SendAsync", + ItExpr.Is(m => m.Method == HttpMethod.Get && m.RequestUri == expectedUri), + ItExpr.IsAny()) + .ReturnsAsync(new HttpResponseMessage(HttpStatusCode.InternalServerError) + { + Content = new StringContent("{\"message\":\"server error\"}") + }); + + var mockFactory = new Mock(); + mockFactory + .Setup(x => x.CreateClientHandler(It.IsAny())) + .Returns(mockHandler.Object); + hc.SetSingleton(mockFactory.Object); + + var server = new RunnerDotcomServer(); + server.Initialize(hc); + + await Assert.ThrowsAsync(() => server.GetRunnerByNameAsync("https://github.com/my-org", "token", "runner-a")); + + mockHandler.Protected().Verify( + "SendAsync", + Times.Exactly(3), + ItExpr.Is(m => m.Method == HttpMethod.Get && m.RequestUri == expectedUri), + ItExpr.IsAny()); + } + } + + } +} diff --git a/src/Test/L0/RetryHelperL0.cs b/src/Test/L0/RetryHelperL0.cs new file mode 100644 index 00000000000..ae2b335267d --- /dev/null +++ b/src/Test/L0/RetryHelperL0.cs @@ -0,0 +1,270 @@ +using System; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; +using Xunit; + +namespace GitHub.Runner.Common.Tests +{ + public sealed class RetryHelperL0 + { + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_SucceedsOnFirstAttempt() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + var result = await helper.ExecuteAsync(() => Task.FromResult(42)); + + Assert.Equal(42, result); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_RetriesOnTransientFailure() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var attempts = 0; + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + var result = await helper.ExecuteAsync(() => + { + attempts++; + if (attempts < 3) + { + throw new InvalidOperationException("transient"); + } + return Task.FromResult(attempts); + }); + + Assert.Equal(3, result); + Assert.Equal(3, attempts); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_ThrowsAfterMaxAttempts() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + await Assert.ThrowsAsync(() => + helper.ExecuteAsync(() => throw new InvalidOperationException("always fails"))); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_InvokesOnRetryCallback() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var onRetryCalls = 0; + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + OnRetry = (_, _, _) => onRetryCalls++, + }; + var helper = new RetryHelper(trace, strategy); + var callCount = 0; + + await helper.ExecuteAsync(() => + { + callCount++; + if (callCount < 3) + { + throw new InvalidOperationException("transient"); + } + return Task.FromResult(0); + }); + + Assert.Equal(2, onRetryCalls); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_StopsRetryingWhenShouldRetryReturnsFalse() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 5, + GetBackoff = (_, _, _) => TimeSpan.Zero, + ShouldRetry = ex => ex is InvalidOperationException, + }; + var helper = new RetryHelper(trace, strategy); + + await Assert.ThrowsAsync(() => + helper.ExecuteAsync(() => throw new ArgumentException("non-retryable"))); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_ThrowsOperationCanceledWhenTokenCancelled() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + using var cts = new CancellationTokenSource(); + var strategy = new RetryStrategy + { + MaxAttempts = 5, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + cts.Cancel(); + + await Assert.ThrowsAnyAsync(() => + helper.ExecuteAsync(() => Task.FromResult(0), cts.Token)); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void Constructor_ThrowsWhenMaxAttemptsIsZero() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 0, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + + Assert.Throws(() => new RetryHelper(trace, strategy)); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void Constructor_ThrowsWhenGetBackoffIsNull() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = null, + }; + + Assert.Throws(() => new RetryHelper(trace, strategy)); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_InvokesOnSuccessCallback() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + RetryExecutionContext capturedContext = null; + TimeSpan capturedDuration = TimeSpan.Zero; + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + OnSuccess = (context, duration) => + { + capturedContext = context; + capturedDuration = duration; + }, + }; + + var helper = new RetryHelper(trace, strategy); + + var result = await helper.ExecuteAsync("test-operation", () => Task.FromResult(7)); + + Assert.Equal(7, result); + Assert.NotNull(capturedContext); + Assert.Equal("test-operation", capturedContext.OperationName); + Assert.Equal(1, capturedContext.AttemptNumber); + Assert.True(capturedDuration >= TimeSpan.Zero); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_InvokesOnFailureForNonRetryableException() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + RetryExecutionContext capturedContext = null; + Exception capturedException = null; + TimeSpan capturedDuration = TimeSpan.Zero; + var strategy = new RetryStrategy + { + MaxAttempts = 5, + GetBackoff = (_, _, _) => TimeSpan.Zero, + ShouldRetry = ex => ex is InvalidOperationException, + OnFailure = (context, ex, duration) => + { + capturedContext = context; + capturedException = ex; + capturedDuration = duration; + }, + }; + + var helper = new RetryHelper(trace, strategy); + + await Assert.ThrowsAsync(() => + helper.ExecuteAsync("non-retryable-op", () => throw new ArgumentException("fail"))); + + Assert.NotNull(capturedContext); + Assert.Equal("non-retryable-op", capturedContext.OperationName); + Assert.Equal(1, capturedContext.AttemptNumber); + Assert.IsType(capturedException); + Assert.True(capturedDuration >= TimeSpan.Zero); + } + } + + private TestHostContext CreateTestContext([CallerMemberName] string testName = "") + { + return new TestHostContext(this, testName); + } + } +} From dec9142aad9c44c820b1c09b5c48955b22f0162b Mon Sep 17 00:00:00 2001 From: James Frasca Date: Fri, 19 Jun 2026 19:39:19 +0000 Subject: [PATCH 02/10] Migrate ConfigurationManager retry loops to RetryHelper --- .../Configuration/ConfigurationManager.cs | 102 +++++++++--------- 1 file changed, 54 insertions(+), 48 deletions(-) diff --git a/src/Runner.Listener/Configuration/ConfigurationManager.cs b/src/Runner.Listener/Configuration/ConfigurationManager.cs index 360728a1c9b..ed2c83e85e6 100644 --- a/src/Runner.Listener/Configuration/ConfigurationManager.cs +++ b/src/Runner.Listener/Configuration/ConfigurationManager.cs @@ -775,21 +775,33 @@ private async Task GetJITRunnerTokenAsync(string gith throw new ArgumentException($"'{githubUrl}' should point to an org or repository."); } - int retryCount = 0; - while (retryCount < 3) + var responseStatus = System.Net.HttpStatusCode.OK; + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - using (var httpClientHandler = HostContext.CreateHttpClientHandler()) - using (var httpClient = new HttpClient(httpClientHandler)) + MaxAttempts = 3, + ShouldRetry = _ => responseStatus != System.Net.HttpStatusCode.NotFound, + GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), + OnRetry = (context, ex, backoff) => { - var base64EncodingToken = Convert.ToBase64String(Encoding.UTF8.GetBytes($"github:{githubToken}")); - HostContext.SecretMasker.AddValue(base64EncodingToken); - httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("basic", base64EncodingToken); - httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); - httpClient.DefaultRequestHeaders.Accept.ParseAdd("application/vnd.github.v3+json"); + Trace.Error($"Failed to get JIT runner token -- Attempt: {context.AttemptNumber}"); + Trace.Error(ex); + Trace.Info($"Retrying in {backoff.Seconds} seconds"); + }, + }); - var responseStatus = System.Net.HttpStatusCode.OK; - try + return await retryHelper.ExecuteAsync( + operationName: "GetJITRunnerTokenAsync", + operation: async () => + { + using (var httpClientHandler = HostContext.CreateHttpClientHandler()) + using (var httpClient = new HttpClient(httpClientHandler)) { + var base64EncodingToken = Convert.ToBase64String(Encoding.UTF8.GetBytes($"github:{githubToken}")); + HostContext.SecretMasker.AddValue(base64EncodingToken); + httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("basic", base64EncodingToken); + httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); + httpClient.DefaultRequestHeaders.Accept.ParseAdd("application/vnd.github.v3+json"); + var response = await httpClient.PostAsync(githubApiUrl, new StringContent(string.Empty)); responseStatus = response.StatusCode; var githubRequestId = UrlUtil.GetGitHubRequestId(response.Headers); @@ -807,19 +819,10 @@ private async Task GetJITRunnerTokenAsync(string gith _term.WriteError(errorResponse); response.EnsureSuccessStatusCode(); } + + throw new InvalidOperationException($"Unable to process response from 'POST {githubApiUrl}'."); } - catch (Exception ex) when (retryCount < 2 && responseStatus != System.Net.HttpStatusCode.NotFound) - { - retryCount++; - Trace.Error($"Failed to get JIT runner token -- Attempt: {retryCount}"); - Trace.Error(ex); - } - } - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)); - Trace.Info($"Retrying in {backOff.Seconds} seconds"); - await Task.Delay(backOff); - } - return null; + }); } private async Task GetTenantCredential(string githubUrl, string githubToken, string runnerEvent) @@ -835,24 +838,36 @@ private async Task GetTenantCredential(string githubUrl, strin githubApiUrl = $"{gitHubUrlBuilder.Scheme}://{gitHubUrlBuilder.Host}/api/v3/actions/runner-registration"; } - int retryCount = 0; - while (retryCount < 3) + var responseStatus = System.Net.HttpStatusCode.OK; + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - using (var httpClientHandler = HostContext.CreateHttpClientHandler()) - using (var httpClient = new HttpClient(httpClientHandler)) + MaxAttempts = 3, + ShouldRetry = _ => responseStatus != System.Net.HttpStatusCode.NotFound, + GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), + OnRetry = (context, ex, backoff) => { - httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("RemoteAuth", githubToken); - httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); + Trace.Error($"Failed to get tenant credentials -- Attempt: {context.AttemptNumber}"); + Trace.Error(ex); + Trace.Info($"Retrying in {backoff.Seconds} seconds"); + }, + }); - var bodyObject = new Dictionary() - { - {"url", githubUrl}, - {"runner_event", runnerEvent} - }; + var bodyObject = new Dictionary() + { + {"url", githubUrl}, + {"runner_event", runnerEvent} + }; - var responseStatus = System.Net.HttpStatusCode.OK; - try + return await retryHelper.ExecuteAsync( + operationName: "GetTenantCredential", + operation: async () => + { + using (var httpClientHandler = HostContext.CreateHttpClientHandler()) + using (var httpClient = new HttpClient(httpClientHandler)) { + httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("RemoteAuth", githubToken); + httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); + var response = await httpClient.PostAsync(githubApiUrl, new StringContent(StringUtil.ConvertToJson(bodyObject), null, "application/json")); responseStatus = response.StatusCode; var githubRequestId = UrlUtil.GetGitHubRequestId(response.Headers); @@ -870,19 +885,10 @@ private async Task GetTenantCredential(string githubUrl, strin _term.WriteError(errorResponse); response.EnsureSuccessStatusCode(); } + + throw new InvalidOperationException($"Unable to process response from 'POST {githubApiUrl}'."); } - catch (Exception ex) when (retryCount < 2 && responseStatus != System.Net.HttpStatusCode.NotFound) - { - retryCount++; - Trace.Error($"Failed to get tenant credentials -- Attempt: {retryCount}"); - Trace.Error(ex); - } - } - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)); - Trace.Info($"Retrying in {backOff.Seconds} seconds"); - await Task.Delay(backOff); - } - return null; + }); } } } From 3134c5034b79ce7433412c21aee072354428bd9e Mon Sep 17 00:00:00 2001 From: James Frasca Date: Fri, 19 Jun 2026 20:01:41 +0000 Subject: [PATCH 03/10] Migrate EstablishVssConnection to RetryHelper --- src/Runner.Common/RunnerService.cs | 31 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/Runner.Common/RunnerService.cs b/src/Runner.Common/RunnerService.cs index e79d12b929c..6280305b7a1 100644 --- a/src/Runner.Common/RunnerService.cs +++ b/src/Runner.Common/RunnerService.cs @@ -46,26 +46,25 @@ protected async Task EstablishVssConnection(Uri serverUrl, VssCre { Trace.Info($"EstablishVssConnection"); Trace.Info($"Establish connection with {timeout.TotalSeconds} seconds timeout."); - int attemptCount = 5; - while (attemptCount-- > 0) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - var connection = VssUtil.CreateConnection(serverUrl, credentials, timeout: timeout); - try - { - await connection.ConnectAsync(); - return connection; - } - catch (Exception ex) when (attemptCount > 0) + MaxAttempts = 5, + GetBackoff = (_, _, _) => TimeSpan.FromMilliseconds(100), + OnRetry = (context, ex, _) => { - Trace.Info($"Catch exception during connect. {attemptCount} attempt left."); + Trace.Info($"Catch exception during connect. {context.MaxAttempts - context.AttemptNumber} attempt left."); Trace.Error(ex); + }, + }); - await HostContext.Delay(TimeSpan.FromMilliseconds(100), CancellationToken.None); - } - } - - // should never reach here. - throw new InvalidOperationException(nameof(EstablishVssConnection)); + return await retryHelper.ExecuteAsync( + operationName: nameof(EstablishVssConnection), + operation: async () => + { + var connection = VssUtil.CreateConnection(serverUrl, credentials, timeout: timeout); + await connection.ConnectAsync(); + return connection; + }); } protected async Task RetryRequest(Func func, From 4f7006e996e7997d4acb4525ef3076d082693886 Mon Sep 17 00:00:00 2001 From: James Frasca Date: Fri, 19 Jun 2026 20:10:00 +0000 Subject: [PATCH 04/10] Migrate LocationServer retries to RetryHelper --- src/Runner.Common/LocationServer.cs | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/Runner.Common/LocationServer.cs b/src/Runner.Common/LocationServer.cs index 25e09228332..ee324727456 100644 --- a/src/Runner.Common/LocationServer.cs +++ b/src/Runner.Common/LocationServer.cs @@ -23,21 +23,25 @@ public sealed class LocationServer : RunnerService, ILocationServer public async Task ConnectAsync(VssConnection jobConnection) { _connection = jobConnection; - int attemptCount = 5; - while (!_connection.HasAuthenticated && attemptCount-- > 0) + if (!_connection.HasAuthenticated) { - try + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - await _connection.ConnectAsync(); - break; - } - catch (Exception ex) when (attemptCount > 0) - { - Trace.Info($"Catch exception during connect. {attemptCount} attempt left."); - Trace.Error(ex); - } + MaxAttempts = 5, + GetBackoff = (_, _, _) => TimeSpan.FromMilliseconds(100), + OnRetry = (context, ex, _) => + { + Trace.Info($"Catch exception during connect. {context.MaxAttempts - context.AttemptNumber} attempt left."); + Trace.Error(ex); + }, + }); - await Task.Delay(100); + await retryHelper.ExecuteAsync( + operationName: nameof(ConnectAsync), + operation: async () => + { + await _connection.ConnectAsync(); + }); } _locationClient = _connection.GetClient(); From 1e9ba050615ebe1064f0e67894c18793fcae4bd0 Mon Sep 17 00:00:00 2001 From: James Frasca Date: Fri, 19 Jun 2026 20:27:42 +0000 Subject: [PATCH 05/10] Add context-aware RetryHelper operation callbacks --- src/Runner.Common/JobServer.cs | 47 ++++++++++++++----------- src/Runner.Common/RetryHelper.cs | 59 ++++++++++++++++++++++++++++++-- src/Test/L0/RetryHelperL0.cs | 33 ++++++++++++++++++ 3 files changed, 117 insertions(+), 22 deletions(-) diff --git a/src/Runner.Common/JobServer.cs b/src/Runner.Common/JobServer.cs index eac20a2b983..5afb10f0892 100644 --- a/src/Runner.Common/JobServer.cs +++ b/src/Runner.Common/JobServer.cs @@ -61,33 +61,40 @@ public sealed class JobServer : RunnerService, IJobServer public async Task ConnectAsync(VssConnection jobConnection) { _connection = jobConnection; - int totalAttempts = 5; - int attemptCount = totalAttempts; var configurationStore = HostContext.GetService(); var runnerSettings = configurationStore.GetSettings(); - while (!_connection.HasAuthenticated && attemptCount-- > 0) + if (!_connection.HasAuthenticated) { - try - { - await _connection.ConnectAsync(); - break; - } - catch (Exception ex) when (attemptCount > 0) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - Trace.Info($"Catch exception during connect. {attemptCount} attempts left."); - Trace.Error(ex); - - if (runnerSettings.IsHostedServer) + MaxAttempts = 5, + GetBackoff = (attempt, _, _) => BackoffTimerHelper.GetExponentialBackoff(attempt, TimeSpan.FromMilliseconds(100), TimeSpan.FromSeconds(3.2), TimeSpan.FromMilliseconds(100)), + OnRetry = (context, ex, _) => { - await CheckNetworkEndpointsAsync(attemptCount); - } - } - - int attempt = totalAttempts - attemptCount; - TimeSpan backoff = BackoffTimerHelper.GetExponentialBackoff(attempt, TimeSpan.FromMilliseconds(100), TimeSpan.FromSeconds(3.2), TimeSpan.FromMilliseconds(100)); + Trace.Info($"Catch exception during connect. {context.MaxAttempts - context.AttemptNumber} attempts left."); + Trace.Error(ex); + }, + }); + + await retryHelper.ExecuteAsync( + operationName: nameof(ConnectAsync), + operation: async context => + { + try + { + await _connection.ConnectAsync(); + } + catch + { + if (runnerSettings.IsHostedServer && context.AttemptNumber < context.MaxAttempts) + { + await CheckNetworkEndpointsAsync(context.MaxAttempts - context.AttemptNumber); + } - await Task.Delay(backoff); + throw; + } + }); } _taskClient = _connection.GetClient(); diff --git a/src/Runner.Common/RetryHelper.cs b/src/Runner.Common/RetryHelper.cs index 6f9a983264f..3669ebb8038 100644 --- a/src/Runner.Common/RetryHelper.cs +++ b/src/Runner.Common/RetryHelper.cs @@ -57,6 +57,13 @@ public async Task ExecuteAsync( return await ExecuteAsync("operation", operation, cancellationToken); } + public async Task ExecuteAsync( + Func> operation, + CancellationToken cancellationToken = default) + { + return await ExecuteAsync("operation", operation, cancellationToken); + } + public async Task ExecuteAsync( Func operation, CancellationToken cancellationToken = default) @@ -64,7 +71,30 @@ public async Task ExecuteAsync( ArgUtil.NotNull(operation, nameof(operation)); await ExecuteAsync( "operation", - async () => + async _ => + { + await operation(); + return true; + }, + cancellationToken); + } + + public async Task ExecuteAsync( + Func operation, + CancellationToken cancellationToken = default) + { + await ExecuteAsync("operation", operation, cancellationToken); + } + + public async Task ExecuteAsync( + string operationName, + Func operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + await ExecuteAsync( + operationName, + async _ => { await operation(); return true; @@ -72,12 +102,37 @@ await ExecuteAsync( cancellationToken); } + public async Task ExecuteAsync( + string operationName, + Func operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + await ExecuteAsync( + operationName, + async context => + { + await operation(context); + return true; + }, + cancellationToken); + } + public async Task ExecuteAsync( string operationName, Func> operation, CancellationToken cancellationToken = default) { ArgUtil.NotNull(operation, nameof(operation)); + return await ExecuteAsync(operationName, _ => operation(), cancellationToken); + } + + public async Task ExecuteAsync( + string operationName, + Func> operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); operationName ??= "operation"; @@ -92,7 +147,7 @@ public async Task ExecuteAsync( try { - var result = await operation(); + var result = await operation(context); _strategy.OnSuccess?.Invoke(context, stopwatch.Elapsed); return result; } diff --git a/src/Test/L0/RetryHelperL0.cs b/src/Test/L0/RetryHelperL0.cs index ae2b335267d..76852cd5a60 100644 --- a/src/Test/L0/RetryHelperL0.cs +++ b/src/Test/L0/RetryHelperL0.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.Runtime.CompilerServices; using System.Threading; using System.Threading.Tasks; @@ -262,6 +263,38 @@ await Assert.ThrowsAsync(() => } } + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_PassesContextToOperationCallback() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var seenAttempts = new List(); + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + + var helper = new RetryHelper(trace, strategy); + var result = await helper.ExecuteAsync("context-op", context => + { + seenAttempts.Add(context.AttemptNumber); + if (context.AttemptNumber < context.MaxAttempts) + { + throw new InvalidOperationException("transient"); + } + + return Task.FromResult(context.AttemptNumber); + }); + + Assert.Equal(3, result); + Assert.Equal(new[] { 1, 2, 3 }, seenAttempts); + } + } + private TestHostContext CreateTestContext([CallerMemberName] string testName = "") { return new TestHostContext(this, testName); From 14772b8edbfe8684a5a57101376b262cfc4ad613 Mon Sep 17 00:00:00 2001 From: James Frasca Date: Fri, 19 Jun 2026 20:42:02 +0000 Subject: [PATCH 06/10] Migrate SelfUpdater download retries to RetryHelper --- src/Runner.Listener/SelfUpdater.cs | 179 +++++++++++++++-------------- 1 file changed, 94 insertions(+), 85 deletions(-) diff --git a/src/Runner.Listener/SelfUpdater.cs b/src/Runner.Listener/SelfUpdater.cs index 9cf6ae8a9e9..0b7c8fa7530 100644 --- a/src/Runner.Listener/SelfUpdater.cs +++ b/src/Runner.Listener/SelfUpdater.cs @@ -269,110 +269,119 @@ private async Task DownLoadRunner(string downloadDirectory, string packa { var stopWatch = Stopwatch.StartNew(); int runnerSuffix = 1; - string archiveFile = null; - bool downloadSucceeded = false; + var retryHelper = new RetryHelper(Trace, new RetryStrategy + { + MaxAttempts = Constants.RunnerDownloadRetryMaxAttempts, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }); - // Download the runner, using multiple attempts in order to be resilient against any networking/CDN issues - for (int attempt = 1; attempt <= Constants.RunnerDownloadRetryMaxAttempts; attempt++) + try { - // Generate an available package name, and do our best effort to clean up stale local zip files - while (true) - { - if (_targetPackage.Platform.StartsWith("win")) + return await retryHelper.ExecuteAsync( + "DownLoadRunner", + async (context) => { - archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.zip"); - } - else - { - archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.tar.gz"); - } + string archiveFile = null; - try - { - // delete .zip file - if (!string.IsNullOrEmpty(archiveFile) && File.Exists(archiveFile)) + // Generate an available package name, and do our best effort to clean up stale local zip files. + while (true) { - Trace.Verbose("Deleting latest runner package zip '{0}'", archiveFile); - IOUtil.DeleteFile(archiveFile); - } - - break; - } - catch (Exception ex) - { - // couldn't delete the file for whatever reason, so generate another name - Trace.Warning("Failed to delete runner package zip '{0}'. Exception: {1}", archiveFile, ex); - runnerSuffix++; - } - } + if (_targetPackage.Platform.StartsWith("win")) + { + archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.zip"); + } + else + { + archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.tar.gz"); + } - // Allow a 15-minute package download timeout, which is good enough to update the runner from a 1 Mbit/s ADSL connection. - if (!int.TryParse(Environment.GetEnvironmentVariable("GITHUB_ACTIONS_RUNNER_DOWNLOAD_TIMEOUT") ?? string.Empty, out int timeoutSeconds)) - { - timeoutSeconds = 15 * 60; - } + try + { + // delete .zip file + if (!string.IsNullOrEmpty(archiveFile) && File.Exists(archiveFile)) + { + Trace.Verbose("Deleting latest runner package zip '{0}'", archiveFile); + IOUtil.DeleteFile(archiveFile); + } + + break; + } + catch (Exception ex) + { + // couldn't delete the file for whatever reason, so generate another name + Trace.Warning("Failed to delete runner package zip '{0}'. Exception: {1}", archiveFile, ex); + runnerSuffix++; + } + } - Trace.Info($"Attempt {attempt}: save latest runner into {archiveFile}."); + // Allow a 15-minute package download timeout, which is good enough to update the runner from a 1 Mbit/s ADSL connection. + if (!int.TryParse(Environment.GetEnvironmentVariable("GITHUB_ACTIONS_RUNNER_DOWNLOAD_TIMEOUT") ?? string.Empty, out int timeoutSeconds)) + { + timeoutSeconds = 15 * 60; + } - using (var downloadTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds))) - using (var downloadCts = CancellationTokenSource.CreateLinkedTokenSource(downloadTimeout.Token, token)) - { - try - { - Trace.Info($"Download runner: begin download"); - long downloadSize = 0; + Trace.Info($"Attempt {context.AttemptNumber}: save latest runner into {archiveFile}."); - //open zip stream in async mode - using (HttpClient httpClient = new(HostContext.CreateHttpClientHandler())) + using (var downloadTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds))) + using (var downloadCts = CancellationTokenSource.CreateLinkedTokenSource(downloadTimeout.Token, token)) { - if (!string.IsNullOrEmpty(_targetPackage.Token)) + try { - Trace.Info($"Adding authorization token ({_targetPackage.Token.Length} chars)"); - httpClient.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", _targetPackage.Token); + Trace.Info($"Download runner: begin download"); + long downloadSize = 0; + + //open zip stream in async mode + using (HttpClient httpClient = new(HostContext.CreateHttpClientHandler())) + { + if (!string.IsNullOrEmpty(_targetPackage.Token)) + { + Trace.Info($"Adding authorization token ({_targetPackage.Token.Length} chars)"); + httpClient.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", _targetPackage.Token); + } + + Trace.Info($"Downloading {packageDownloadUrl}"); + + using (FileStream fs = new(archiveFile, FileMode.Create, FileAccess.Write, FileShare.None, bufferSize: 4096, useAsync: true)) + using (Stream result = await httpClient.GetStreamAsync(packageDownloadUrl)) + { + //81920 is the default used by System.IO.Stream.CopyTo and is under the large object heap threshold (85k). + await result.CopyToAsync(fs, 81920, downloadCts.Token); + await fs.FlushAsync(downloadCts.Token); + downloadSize = fs.Length; + } + } + + Trace.Info($"Download runner: finished download"); + stopWatch.Stop(); + _updateTrace.Enqueue($"PackageDownloadTime: {stopWatch.ElapsedMilliseconds}ms"); + _updateTrace.Enqueue($"Attempts: {context.AttemptNumber}"); + _updateTrace.Enqueue($"PackageSize: {downloadSize / 1024 / 1024}MB"); + return archiveFile; } - - Trace.Info($"Downloading {packageDownloadUrl}"); - - using (FileStream fs = new(archiveFile, FileMode.Create, FileAccess.Write, FileShare.None, bufferSize: 4096, useAsync: true)) - using (Stream result = await httpClient.GetStreamAsync(packageDownloadUrl)) + catch (OperationCanceledException) when (token.IsCancellationRequested) { - //81920 is the default used by System.IO.Stream.CopyTo and is under the large object heap threshold (85k). - await result.CopyToAsync(fs, 81920, downloadCts.Token); - await fs.FlushAsync(downloadCts.Token); - downloadSize = fs.Length; + Trace.Info($"Runner download has been cancelled."); + throw; } - } + catch (Exception ex) + { + if (downloadCts.Token.IsCancellationRequested) + { + Trace.Warning($"Runner download has timed out after {timeoutSeconds} seconds"); + } - Trace.Info($"Download runner: finished download"); - downloadSucceeded = true; - stopWatch.Stop(); - _updateTrace.Enqueue($"PackageDownloadTime: {stopWatch.ElapsedMilliseconds}ms"); - _updateTrace.Enqueue($"Attempts: {attempt}"); - _updateTrace.Enqueue($"PackageSize: {downloadSize / 1024 / 1024}MB"); - break; - } - catch (OperationCanceledException) when (token.IsCancellationRequested) - { - Trace.Info($"Runner download has been cancelled."); - throw; - } - catch (Exception ex) - { - if (downloadCts.Token.IsCancellationRequested) - { - Trace.Warning($"Runner download has timed out after {timeoutSeconds} seconds"); + Trace.Warning($"Failed to get package '{archiveFile}' from '{packageDownloadUrl}'. Exception {ex}"); + throw; + } } - - Trace.Warning($"Failed to get package '{archiveFile}' from '{packageDownloadUrl}'. Exception {ex}"); - } - } + }, + cancellationToken: token); } - - if (downloadSucceeded) + catch (OperationCanceledException) when (token.IsCancellationRequested) { - return archiveFile; + throw; } - else + catch (Exception) { return null; } From dc0a097a129e6dba9745478dae1d260acbda7c6a Mon Sep 17 00:00:00 2001 From: James Frasca Date: Fri, 19 Jun 2026 22:03:40 +0000 Subject: [PATCH 07/10] Refactor retry flow to OperationOutcome model --- src/Runner.Common/ActionsRunServer.cs | 17 +- src/Runner.Common/BrokerServer.cs | 28 ++-- src/Runner.Common/RetryHelper.cs | 68 +++++++- src/Runner.Common/RunServer.cs | 48 ++++-- src/Runner.Common/RunnerDotcomServer.cs | 18 ++- src/Runner.Common/RunnerService.cs | 26 +--- .../Configuration/ConfigurationManager.cs | 40 ++--- src/Test/L0/Listener/RunnerDotcomServerL0.cs | 2 +- src/Test/L0/RetryHelperL0.cs | 147 ++++++++++++++++-- 9 files changed, 303 insertions(+), 91 deletions(-) diff --git a/src/Runner.Common/ActionsRunServer.cs b/src/Runner.Common/ActionsRunServer.cs index 3ded580071d..cc69ed9471b 100644 --- a/src/Runner.Common/ActionsRunServer.cs +++ b/src/Runner.Common/ActionsRunServer.cs @@ -40,12 +40,17 @@ private void CheckConnection() public Task GetJobMessageAsync(string id, CancellationToken cancellationToken) { CheckConnection(); - var jobMessage = RetryRequest(async () => - { - return await _actionsRunServerClient.GetJobMessageAsync(id, cancellationToken); - }, cancellationToken); - - return jobMessage; + return RetryRequest( + async () => + { + try + { + return OperationOutcome.Success( + await _actionsRunServerClient.GetJobMessageAsync(id, cancellationToken)); + } + catch (OperationCanceledException) { throw; } + catch (Exception ex) { return OperationOutcome.TransientFailure(ex.Message); } + }, cancellationToken); } } } diff --git a/src/Runner.Common/BrokerServer.cs b/src/Runner.Common/BrokerServer.cs index c642b4af1cf..d21b179b9af 100644 --- a/src/Runner.Common/BrokerServer.cs +++ b/src/Runner.Common/BrokerServer.cs @@ -66,10 +66,21 @@ public async Task CreateSessionAsync(TaskAgentSession session, public Task GetRunnerMessageAsync(Guid? sessionId, TaskAgentStatus status, string version, string os, string architecture, bool disableUpdate, CancellationToken cancellationToken) { CheckConnection(); - var brokerSession = RetryRequest( - async () => await _brokerHttpClient.GetRunnerMessageAsync(sessionId, version, status, os, architecture, disableUpdate, cancellationToken), cancellationToken, shouldRetry: ShouldRetryException); - - return brokerSession; + return RetryRequest( + async () => + { + try + { + return OperationOutcome.Success( + await _brokerHttpClient.GetRunnerMessageAsync(sessionId, version, status, os, architecture, disableUpdate, cancellationToken)); + } + catch (AccessDeniedException) { throw; } + catch (VssUnauthorizedException) { throw; } + catch (RunnerNotFoundException) { throw; } + catch (HostedRunnerDeprovisionedException) { throw; } + catch (OperationCanceledException) { throw; } + catch (Exception ex) { return OperationOutcome.TransientFailure(ex.Message); } + }, cancellationToken); } public async Task AcknowledgeRunnerRequestAsync(string runnerRequestId, Guid? sessionId, TaskAgentStatus status, string version, string os, string architecture, CancellationToken cancellationToken) @@ -106,14 +117,5 @@ public Task ForceRefreshConnection(VssCredentials credentials) return Task.CompletedTask; } - public bool ShouldRetryException(Exception ex) - { - if (ex is AccessDeniedException || ex is VssUnauthorizedException || ex is RunnerNotFoundException || ex is HostedRunnerDeprovisionedException) - { - return false; - } - - return true; - } } } diff --git a/src/Runner.Common/RetryHelper.cs b/src/Runner.Common/RetryHelper.cs index 3669ebb8038..e564d46e19d 100644 --- a/src/Runner.Common/RetryHelper.cs +++ b/src/Runner.Common/RetryHelper.cs @@ -25,13 +25,25 @@ public RetryExecutionContext(string operationName, int attemptNumber, int maxAtt public sealed class RetryStrategy { public int MaxAttempts { get; init; } - public Func ShouldRetry { get; init; } public Func GetBackoff { get; init; } public Action OnRetry { get; init; } public Action OnSuccess { get; init; } public Action OnFailure { get; init; } } + public abstract record OperationOutcome + { + public sealed record Success(T Value) : OperationOutcome; + public sealed record TransientFailure(string Reason) : OperationOutcome; + } + + public static class OperationOutcome + { + public static OperationOutcome Success(T value) => new OperationOutcome.Success(value); + + public static OperationOutcome TransientFailure(string reason) => new OperationOutcome.TransientFailure(reason); + } + public sealed class RetryHelper { private readonly Tracing _trace; @@ -153,8 +165,7 @@ public async Task ExecuteAsync( } catch (Exception ex) { - var shouldRetry = _strategy.ShouldRetry == null || _strategy.ShouldRetry(ex); - if (attempt >= _strategy.MaxAttempts || !shouldRetry) + if (attempt >= _strategy.MaxAttempts) { _trace.Error($"[{operationName}] retry exhausted at attempt {attempt}/{_strategy.MaxAttempts}"); _strategy.OnFailure?.Invoke(context, ex, stopwatch.Elapsed); @@ -167,5 +178,56 @@ public async Task ExecuteAsync( } } } + public async Task ExecuteAsync( + string operationName, + Func>> operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + return await ExecuteAsync(operationName, _ => operation(), cancellationToken); + } + + public async Task ExecuteAsync( + string operationName, + Func>> operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + + operationName ??= "operation"; + + var attempt = 0; + var startTimeUtc = DateTime.UtcNow; + var stopwatch = Stopwatch.StartNew(); + string lastFailureReason = null; + + while (true) + { + attempt++; + cancellationToken.ThrowIfCancellationRequested(); + var context = new RetryExecutionContext(operationName, attempt, _strategy.MaxAttempts, startTimeUtc); + + var outcome = await operation(context); + + if (outcome is OperationOutcome.Success success) + { + _strategy.OnSuccess?.Invoke(context, stopwatch.Elapsed); + return success.Value; + } + + lastFailureReason = ((OperationOutcome.TransientFailure)outcome).Reason; + + if (attempt >= _strategy.MaxAttempts) + { + _trace.Error($"[{operationName}] retry exhausted at attempt {attempt}/{_strategy.MaxAttempts}: {lastFailureReason}"); + _strategy.OnFailure?.Invoke(context, null, stopwatch.Elapsed); + throw new InvalidOperationException($"[{operationName}] failed after {attempt} attempt(s): {lastFailureReason}"); + } + + var backoff = _strategy.GetBackoff(attempt, _strategy.MaxAttempts, null); + _strategy.OnRetry?.Invoke(context, null, backoff); + await Task.Delay(backoff, cancellationToken); + } + } } } diff --git a/src/Runner.Common/RunServer.cs b/src/Runner.Common/RunServer.cs index cfdb98cb909..3565b4c1c99 100644 --- a/src/Runner.Common/RunServer.cs +++ b/src/Runner.Common/RunServer.cs @@ -64,11 +64,19 @@ public Task GetJobMessageAsync(string id, string billing { CheckConnection(); return RetryRequest( - async () => await _runServiceHttpClient.GetJobMessageAsync(requestUri, id, VarUtil.OS, billingOwnerId, cancellationToken), cancellationToken, - shouldRetry: ex => - ex is not TaskOrchestrationJobNotFoundException && // HTTP status 404 - ex is not TaskOrchestrationJobAlreadyAcquiredException && // HTTP status 409 - ex is not TaskOrchestrationJobUnprocessableException); // HTTP status 422 + async () => + { + try + { + return OperationOutcome.Success( + await _runServiceHttpClient.GetJobMessageAsync(requestUri, id, VarUtil.OS, billingOwnerId, cancellationToken)); + } + catch (TaskOrchestrationJobNotFoundException) { throw; } // HTTP status 404 + catch (TaskOrchestrationJobAlreadyAcquiredException) { throw; } // HTTP status 409 + catch (TaskOrchestrationJobUnprocessableException) { throw; } // HTTP status 422 + catch (OperationCanceledException) { throw; } + catch (Exception ex) { return OperationOutcome.TransientFailure(ex.Message); } + }, cancellationToken); } public Task CompleteJobAsync( @@ -86,19 +94,35 @@ public Task CompleteJobAsync( { CheckConnection(); return RetryRequest( - async () => await _runServiceHttpClient.CompleteJobAsync(requestUri, planId, jobId, result, outputs, stepResults, jobAnnotations, environmentUrl, telemetry, billingOwnerId, infrastructureFailureCategory, cancellationToken), cancellationToken, - shouldRetry: ex => - ex is not VssUnauthorizedException && // HTTP status 401 - ex is not TaskOrchestrationJobNotFoundException); // HTTP status 404 + async () => + { + try + { + await _runServiceHttpClient.CompleteJobAsync(requestUri, planId, jobId, result, outputs, stepResults, jobAnnotations, environmentUrl, telemetry, billingOwnerId, infrastructureFailureCategory, cancellationToken); + return OperationOutcome.Success(true); + } + catch (VssUnauthorizedException) { throw; } // HTTP status 401 + catch (TaskOrchestrationJobNotFoundException) { throw; } // HTTP status 404 + catch (OperationCanceledException) { throw; } + catch (Exception ex) { return OperationOutcome.TransientFailure(ex.Message); } + }, cancellationToken); } public Task RenewJobAsync(Guid planId, Guid jobId, CancellationToken cancellationToken) { CheckConnection(); return RetryRequest( - async () => await _runServiceHttpClient.RenewJobAsync(requestUri, planId, jobId, cancellationToken), cancellationToken, - shouldRetry: ex => - ex is not TaskOrchestrationJobNotFoundException); // HTTP status 404 + async () => + { + try + { + return OperationOutcome.Success( + await _runServiceHttpClient.RenewJobAsync(requestUri, planId, jobId, cancellationToken)); + } + catch (TaskOrchestrationJobNotFoundException) { throw; } // HTTP status 404 + catch (OperationCanceledException) { throw; } + catch (Exception ex) { return OperationOutcome.TransientFailure(ex.Message); } + }, cancellationToken); } } } diff --git a/src/Runner.Common/RunnerDotcomServer.cs b/src/Runner.Common/RunnerDotcomServer.cs index 5bbc5f3d08e..78dd74249a3 100644 --- a/src/Runner.Common/RunnerDotcomServer.cs +++ b/src/Runner.Common/RunnerDotcomServer.cs @@ -111,21 +111,18 @@ public async Task DeleteRunnerAsync(string githubUrl, string githubToken, ulong private async Task RetryRequest(string githubApiUrl, string githubToken, RequestType requestType, int maxRetryAttemptsCount = 5, string errorMessage = null, StringContent body = null) { - var responseStatus = System.Net.HttpStatusCode.OK; var retryHelper = new RetryHelper(Trace, new RetryStrategy { MaxAttempts = maxRetryAttemptsCount, - ShouldRetry = _ => responseStatus != System.Net.HttpStatusCode.NotFound, GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), - OnRetry = (context, ex, backoff) => + OnRetry = (context, _, backoff) => { Trace.Error($"{errorMessage} -- Attempt: {context.AttemptNumber}"); - Trace.Error(ex); Trace.Info($"Retrying in {backoff.Seconds} seconds"); }, }); - return await retryHelper.ExecuteAsync( + return await retryHelper.ExecuteAsync( operationName: "RunnerDotcomServer.RetryRequest", operation: async () => { @@ -156,21 +153,26 @@ private async Task RetryRequest(string githubApiUrl, string githubToken, R if (response != null) { - responseStatus = response.StatusCode; var githubRequestId = UrlUtil.GetGitHubRequestId(response.Headers); if (response.IsSuccessStatusCode) { Trace.Info($"Http response code: {response.StatusCode} from '{requestType.ToString()} {githubApiUrl}' ({githubRequestId})"); var jsonResponse = await response.Content.ReadAsStringAsync(); - return StringUtil.ConvertFromJson(jsonResponse); + return OperationOutcome.Success(StringUtil.ConvertFromJson(jsonResponse)); } else { _term.WriteError($"Http response code: {response.StatusCode} from '{requestType.ToString()} {githubApiUrl}' (Request Id: {githubRequestId})"); var errorResponse = await response.Content.ReadAsStringAsync(); _term.WriteError(errorResponse); - response.EnsureSuccessStatusCode(); + + if (response.StatusCode == System.Net.HttpStatusCode.NotFound) + { + response.EnsureSuccessStatusCode(); + } + + return OperationOutcome.TransientFailure($"Http response code: {response.StatusCode} from '{requestType} {githubApiUrl}'"); } } diff --git a/src/Runner.Common/RunnerService.cs b/src/Runner.Common/RunnerService.cs index 6280305b7a1..6a6b21ae0c5 100644 --- a/src/Runner.Common/RunnerService.cs +++ b/src/Runner.Common/RunnerService.cs @@ -67,40 +67,30 @@ protected async Task EstablishVssConnection(Uri serverUrl, VssCre }); } - protected async Task RetryRequest(Func func, + protected async Task RetryRequest(Func>> func, CancellationToken cancellationToken, - int maxAttempts = 5, - Func shouldRetry = null + int maxAttempts = 5 ) { - async Task wrappedFunc() - { - await func(); - return Unit.Value; - } - await RetryRequest(wrappedFunc, cancellationToken, maxAttempts, shouldRetry); + await RetryRequest(func, cancellationToken, maxAttempts); } - protected async Task RetryRequest(Func> func, + protected async Task RetryRequest(Func>> func, CancellationToken cancellationToken, - int maxAttempts = 5, - Func shouldRetry = null + int maxAttempts = 5 ) { var retryHelper = new RetryHelper(Trace, new RetryStrategy { MaxAttempts = maxAttempts, - ShouldRetry = shouldRetry, GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(15)), - OnRetry = (context, ex, backoff) => + OnRetry = (context, _, backoff) => { - Trace.Error("Catch exception during request"); - Trace.Error(ex); - Trace.Warning($"Back off {backoff.TotalSeconds} seconds before next retry. {context.MaxAttempts - context.AttemptNumber} attempt left."); + Trace.Warning($"Transient failure during request, retrying. Attempt {context.AttemptNumber}/{context.MaxAttempts}. Back off {backoff.TotalSeconds} seconds."); }, }); - return await retryHelper.ExecuteAsync(func, cancellationToken); + return await retryHelper.ExecuteAsync("RetryRequest", func, cancellationToken); } } } diff --git a/src/Runner.Listener/Configuration/ConfigurationManager.cs b/src/Runner.Listener/Configuration/ConfigurationManager.cs index ed2c83e85e6..91a36fee241 100644 --- a/src/Runner.Listener/Configuration/ConfigurationManager.cs +++ b/src/Runner.Listener/Configuration/ConfigurationManager.cs @@ -775,21 +775,18 @@ private async Task GetJITRunnerTokenAsync(string gith throw new ArgumentException($"'{githubUrl}' should point to an org or repository."); } - var responseStatus = System.Net.HttpStatusCode.OK; var retryHelper = new RetryHelper(Trace, new RetryStrategy { MaxAttempts = 3, - ShouldRetry = _ => responseStatus != System.Net.HttpStatusCode.NotFound, GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), - OnRetry = (context, ex, backoff) => + OnRetry = (context, _, backoff) => { Trace.Error($"Failed to get JIT runner token -- Attempt: {context.AttemptNumber}"); - Trace.Error(ex); Trace.Info($"Retrying in {backoff.Seconds} seconds"); }, }); - return await retryHelper.ExecuteAsync( + return await retryHelper.ExecuteAsync( operationName: "GetJITRunnerTokenAsync", operation: async () => { @@ -803,24 +800,27 @@ private async Task GetJITRunnerTokenAsync(string gith httpClient.DefaultRequestHeaders.Accept.ParseAdd("application/vnd.github.v3+json"); var response = await httpClient.PostAsync(githubApiUrl, new StringContent(string.Empty)); - responseStatus = response.StatusCode; var githubRequestId = UrlUtil.GetGitHubRequestId(response.Headers); if (response.IsSuccessStatusCode) { Trace.Info($"Http response code: {response.StatusCode} from 'POST {githubApiUrl}' ({githubRequestId})"); var jsonResponse = await response.Content.ReadAsStringAsync(); - return StringUtil.ConvertFromJson(jsonResponse); + return OperationOutcome.Success(StringUtil.ConvertFromJson(jsonResponse)); } else { _term.WriteError($"Http response code: {response.StatusCode} from 'POST {githubApiUrl}' (Request Id: {githubRequestId})"); var errorResponse = await response.Content.ReadAsStringAsync(); _term.WriteError(errorResponse); - response.EnsureSuccessStatusCode(); - } - throw new InvalidOperationException($"Unable to process response from 'POST {githubApiUrl}'."); + if (response.StatusCode == System.Net.HttpStatusCode.NotFound) + { + response.EnsureSuccessStatusCode(); + } + + return OperationOutcome.TransientFailure($"Http response code: {response.StatusCode} from 'POST {githubApiUrl}'"); + } } }); } @@ -838,16 +838,13 @@ private async Task GetTenantCredential(string githubUrl, strin githubApiUrl = $"{gitHubUrlBuilder.Scheme}://{gitHubUrlBuilder.Host}/api/v3/actions/runner-registration"; } - var responseStatus = System.Net.HttpStatusCode.OK; var retryHelper = new RetryHelper(Trace, new RetryStrategy { MaxAttempts = 3, - ShouldRetry = _ => responseStatus != System.Net.HttpStatusCode.NotFound, GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), - OnRetry = (context, ex, backoff) => + OnRetry = (context, _, backoff) => { Trace.Error($"Failed to get tenant credentials -- Attempt: {context.AttemptNumber}"); - Trace.Error(ex); Trace.Info($"Retrying in {backoff.Seconds} seconds"); }, }); @@ -858,7 +855,7 @@ private async Task GetTenantCredential(string githubUrl, strin {"runner_event", runnerEvent} }; - return await retryHelper.ExecuteAsync( + return await retryHelper.ExecuteAsync( operationName: "GetTenantCredential", operation: async () => { @@ -869,24 +866,27 @@ private async Task GetTenantCredential(string githubUrl, strin httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); var response = await httpClient.PostAsync(githubApiUrl, new StringContent(StringUtil.ConvertToJson(bodyObject), null, "application/json")); - responseStatus = response.StatusCode; var githubRequestId = UrlUtil.GetGitHubRequestId(response.Headers); if (response.IsSuccessStatusCode) { Trace.Info($"Http response code: {response.StatusCode} from 'POST {githubApiUrl}' ({githubRequestId})"); var jsonResponse = await response.Content.ReadAsStringAsync(); - return StringUtil.ConvertFromJson(jsonResponse); + return OperationOutcome.Success(StringUtil.ConvertFromJson(jsonResponse)); } else { _term.WriteError($"Http response code: {response.StatusCode} from 'POST {githubApiUrl}' (Request Id: {githubRequestId})"); var errorResponse = await response.Content.ReadAsStringAsync(); _term.WriteError(errorResponse); - response.EnsureSuccessStatusCode(); - } - throw new InvalidOperationException($"Unable to process response from 'POST {githubApiUrl}'."); + if (response.StatusCode == System.Net.HttpStatusCode.NotFound) + { + response.EnsureSuccessStatusCode(); + } + + return OperationOutcome.TransientFailure($"Http response code: {response.StatusCode} from 'POST {githubApiUrl}'"); + } } }); } diff --git a/src/Test/L0/Listener/RunnerDotcomServerL0.cs b/src/Test/L0/Listener/RunnerDotcomServerL0.cs index df32aee15b6..96df0e74dc8 100644 --- a/src/Test/L0/Listener/RunnerDotcomServerL0.cs +++ b/src/Test/L0/Listener/RunnerDotcomServerL0.cs @@ -126,7 +126,7 @@ public async Task GetRunnerByNameAsync_StopsAfterMaxAttempts() var server = new RunnerDotcomServer(); server.Initialize(hc); - await Assert.ThrowsAsync(() => server.GetRunnerByNameAsync("https://github.com/my-org", "token", "runner-a")); + await Assert.ThrowsAsync(() => server.GetRunnerByNameAsync("https://github.com/my-org", "token", "runner-a")); mockHandler.Protected().Verify( "SendAsync", diff --git a/src/Test/L0/RetryHelperL0.cs b/src/Test/L0/RetryHelperL0.cs index 76852cd5a60..6cfaa7929f6 100644 --- a/src/Test/L0/RetryHelperL0.cs +++ b/src/Test/L0/RetryHelperL0.cs @@ -77,7 +77,7 @@ public async Task ExecuteAsync_ThrowsAfterMaxAttempts() var helper = new RetryHelper(trace, strategy); await Assert.ThrowsAsync(() => - helper.ExecuteAsync(() => throw new InvalidOperationException("always fails"))); + helper.ExecuteAsync(() => Task.FromException(new InvalidOperationException("always fails")))); } } @@ -116,21 +116,34 @@ await helper.ExecuteAsync(() => [Fact] [Trait("Level", "L0")] [Trait("Category", "Common")] - public async Task ExecuteAsync_StopsRetryingWhenShouldRetryReturnsFalse() + public async Task ExecuteAsync_WithOutcome_PermanentFailurePropagatesImmediately() { using (TestHostContext hc = CreateTestContext()) { var trace = hc.GetTrace(); + var calls = 0; var strategy = new RetryStrategy { MaxAttempts = 5, GetBackoff = (_, _, _) => TimeSpan.Zero, - ShouldRetry = ex => ex is InvalidOperationException, }; var helper = new RetryHelper(trace, strategy); + // Throwing directly from an outcome-based operation propagates immediately without retrying await Assert.ThrowsAsync(() => - helper.ExecuteAsync(() => throw new ArgumentException("non-retryable"))); + helper.ExecuteAsync( + "permanent-failure-op", + () => + { + calls++; + if (calls == 1) + { + return Task.FromResult>(new OperationOutcome.TransientFailure("first attempt transient")); + } + throw new ArgumentException("permanent on second attempt"); + })); + + Assert.Equal(2, calls); } } @@ -229,7 +242,7 @@ public async Task ExecuteAsync_InvokesOnSuccessCallback() [Fact] [Trait("Level", "L0")] [Trait("Category", "Common")] - public async Task ExecuteAsync_InvokesOnFailureForNonRetryableException() + public async Task ExecuteAsync_InvokesOnFailureWhenRetriesExhausted() { using (TestHostContext hc = CreateTestContext()) { @@ -239,9 +252,8 @@ public async Task ExecuteAsync_InvokesOnFailureForNonRetryableException() TimeSpan capturedDuration = TimeSpan.Zero; var strategy = new RetryStrategy { - MaxAttempts = 5, + MaxAttempts = 2, GetBackoff = (_, _, _) => TimeSpan.Zero, - ShouldRetry = ex => ex is InvalidOperationException, OnFailure = (context, ex, duration) => { capturedContext = context; @@ -253,11 +265,12 @@ public async Task ExecuteAsync_InvokesOnFailureForNonRetryableException() var helper = new RetryHelper(trace, strategy); await Assert.ThrowsAsync(() => - helper.ExecuteAsync("non-retryable-op", () => throw new ArgumentException("fail"))); + helper.ExecuteAsync("exhausted-op", + () => Task.FromException(new ArgumentException("fail")))); Assert.NotNull(capturedContext); - Assert.Equal("non-retryable-op", capturedContext.OperationName); - Assert.Equal(1, capturedContext.AttemptNumber); + Assert.Equal("exhausted-op", capturedContext.OperationName); + Assert.Equal(2, capturedContext.AttemptNumber); Assert.IsType(capturedException); Assert.True(capturedDuration >= TimeSpan.Zero); } @@ -295,6 +308,120 @@ public async Task ExecuteAsync_PassesContextToOperationCallback() } } + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_WithOutcome_SucceedsOnFirstAttempt() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + var result = await helper.ExecuteAsync( + "outcome-op", + () => Task.FromResult>(new OperationOutcome.Success(99))); + + Assert.Equal(99, result); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_WithOutcome_RetriesOnTransientFailure() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var calls = 0; + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + var result = await helper.ExecuteAsync( + "outcome-op", + () => + { + calls++; + if (calls < 3) + { + return Task.FromResult>(new OperationOutcome.TransientFailure("not ready yet")); + } + return Task.FromResult>(new OperationOutcome.Success(calls)); + }); + + Assert.Equal(3, result); + Assert.Equal(3, calls); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_WithOutcome_ThrowsAfterMaxTransientFailures() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + var ex = await Assert.ThrowsAsync(() => + helper.ExecuteAsync( + "outcome-op", + () => Task.FromResult>(new OperationOutcome.TransientFailure("always failing")))); + + Assert.Contains("always failing", ex.Message); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_WithOutcome_InvokesOnRetryCallback() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var onRetryCalls = 0; + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + OnRetry = (_, _, _) => onRetryCalls++, + }; + var helper = new RetryHelper(trace, strategy); + var calls = 0; + + await helper.ExecuteAsync( + "outcome-op", + () => + { + calls++; + if (calls < 3) + { + return Task.FromResult>(new OperationOutcome.TransientFailure("not ready")); + } + return Task.FromResult>(new OperationOutcome.Success(calls)); + }); + + Assert.Equal(2, onRetryCalls); + } + } + private TestHostContext CreateTestContext([CallerMemberName] string testName = "") { return new TestHostContext(this, testName); From 7dc5ff0ee2247e9cac085e1f5891c6e3fbf6c40e Mon Sep 17 00:00:00 2001 From: James Frasca Date: Fri, 19 Jun 2026 22:12:43 +0000 Subject: [PATCH 08/10] Add standard RetryBackoffs policies --- src/Runner.Common/RetryBackoffs.cs | 117 ++++++++++++++++++ src/Runner.Common/RunnerDotcomServer.cs | 2 +- src/Runner.Common/RunnerService.cs | 4 +- .../Configuration/ConfigurationManager.cs | 4 +- src/Test/L0/RetryBackoffsL0.cs | 90 ++++++++++++++ 5 files changed, 212 insertions(+), 5 deletions(-) create mode 100644 src/Runner.Common/RetryBackoffs.cs create mode 100644 src/Test/L0/RetryBackoffsL0.cs diff --git a/src/Runner.Common/RetryBackoffs.cs b/src/Runner.Common/RetryBackoffs.cs new file mode 100644 index 00000000000..1f08125bb53 --- /dev/null +++ b/src/Runner.Common/RetryBackoffs.cs @@ -0,0 +1,117 @@ +using System; + +namespace GitHub.Runner.Common +{ + public static class RetryBackoffs + { + public static Func Fixed(TimeSpan delay) + { + if (delay < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException(nameof(delay)); + } + + return (_, _, _) => delay; + } + + public static Func Random(TimeSpan minDelay, TimeSpan maxDelay) + { + ValidateRange(minDelay, maxDelay); + + return (_, _, _) => NextDelay(minDelay, maxDelay); + } + + public static Func Exponential(TimeSpan minDelay, TimeSpan maxDelay, TimeSpan deltaDelay) + { + ValidateRange(minDelay, maxDelay); + if (deltaDelay < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException(nameof(deltaDelay)); + } + + return (attempt, _, _) => + { + var exponent = Math.Max(0, attempt); + var additional = (Math.Pow(2.0, exponent) - 1.0) * deltaDelay.TotalMilliseconds; + var delayMs = Math.Min(minDelay.TotalMilliseconds + additional, maxDelay.TotalMilliseconds); + return TimeSpan.FromMilliseconds(Math.Max(0, delayMs)); + }; + } + + public static Func ExponentialFullJitter(TimeSpan minDelay, TimeSpan maxDelay, TimeSpan deltaDelay) + { + ValidateRange(minDelay, maxDelay); + if (deltaDelay < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException(nameof(deltaDelay)); + } + + return (attempt, _, _) => + { + var exponent = Math.Max(0, attempt); + var additional = (Math.Pow(2.0, exponent) - 1.0) * deltaDelay.TotalMilliseconds; + var capMs = Math.Min(minDelay.TotalMilliseconds + additional, maxDelay.TotalMilliseconds); + var floorMs = minDelay.TotalMilliseconds; + + if (capMs <= floorMs) + { + return TimeSpan.FromMilliseconds(floorMs); + } + + var jitterMs = global::System.Random.Shared.NextDouble() * (capMs - floorMs); + return TimeSpan.FromMilliseconds(floorMs + jitterMs); + }; + } + + public static Func DecorrelatedJitter(TimeSpan minDelay, TimeSpan maxDelay) + { + ValidateRange(minDelay, maxDelay); + + var previous = minDelay; + return (_, _, _) => + { + var upperBoundMs = Math.Min(maxDelay.TotalMilliseconds, previous.TotalMilliseconds * 3); + if (upperBoundMs <= minDelay.TotalMilliseconds) + { + previous = minDelay; + return previous; + } + + var nextMs = minDelay.TotalMilliseconds + (global::System.Random.Shared.NextDouble() * (upperBoundMs - minDelay.TotalMilliseconds)); + previous = TimeSpan.FromMilliseconds(nextMs); + return previous; + }; + } + + private static void ValidateRange(TimeSpan minDelay, TimeSpan maxDelay) + { + if (minDelay < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException(nameof(minDelay)); + } + + if (maxDelay < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException(nameof(maxDelay)); + } + + if (maxDelay < minDelay) + { + throw new ArgumentOutOfRangeException(nameof(maxDelay)); + } + } + + private static TimeSpan NextDelay(TimeSpan minDelay, TimeSpan maxDelay) + { + if (minDelay == maxDelay) + { + return minDelay; + } + + var minMs = minDelay.TotalMilliseconds; + var rangeMs = maxDelay.TotalMilliseconds - minMs; + var jitterMs = global::System.Random.Shared.NextDouble() * rangeMs; + return TimeSpan.FromMilliseconds(minMs + jitterMs); + } + } +} diff --git a/src/Runner.Common/RunnerDotcomServer.cs b/src/Runner.Common/RunnerDotcomServer.cs index 78dd74249a3..3d4e2d87ba4 100644 --- a/src/Runner.Common/RunnerDotcomServer.cs +++ b/src/Runner.Common/RunnerDotcomServer.cs @@ -114,7 +114,7 @@ private async Task RetryRequest(string githubApiUrl, string githubToken, R var retryHelper = new RetryHelper(Trace, new RetryStrategy { MaxAttempts = maxRetryAttemptsCount, - GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), + GetBackoff = RetryBackoffs.Random(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), OnRetry = (context, _, backoff) => { Trace.Error($"{errorMessage} -- Attempt: {context.AttemptNumber}"); diff --git a/src/Runner.Common/RunnerService.cs b/src/Runner.Common/RunnerService.cs index 6a6b21ae0c5..d5d308e09ad 100644 --- a/src/Runner.Common/RunnerService.cs +++ b/src/Runner.Common/RunnerService.cs @@ -49,7 +49,7 @@ protected async Task EstablishVssConnection(Uri serverUrl, VssCre var retryHelper = new RetryHelper(Trace, new RetryStrategy { MaxAttempts = 5, - GetBackoff = (_, _, _) => TimeSpan.FromMilliseconds(100), + GetBackoff = RetryBackoffs.Fixed(TimeSpan.FromMilliseconds(100)), OnRetry = (context, ex, _) => { Trace.Info($"Catch exception during connect. {context.MaxAttempts - context.AttemptNumber} attempt left."); @@ -83,7 +83,7 @@ protected async Task RetryRequest(Func>> func, var retryHelper = new RetryHelper(Trace, new RetryStrategy { MaxAttempts = maxAttempts, - GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(15)), + GetBackoff = RetryBackoffs.Random(TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(15)), OnRetry = (context, _, backoff) => { Trace.Warning($"Transient failure during request, retrying. Attempt {context.AttemptNumber}/{context.MaxAttempts}. Back off {backoff.TotalSeconds} seconds."); diff --git a/src/Runner.Listener/Configuration/ConfigurationManager.cs b/src/Runner.Listener/Configuration/ConfigurationManager.cs index 91a36fee241..fd44ab82652 100644 --- a/src/Runner.Listener/Configuration/ConfigurationManager.cs +++ b/src/Runner.Listener/Configuration/ConfigurationManager.cs @@ -778,7 +778,7 @@ private async Task GetJITRunnerTokenAsync(string gith var retryHelper = new RetryHelper(Trace, new RetryStrategy { MaxAttempts = 3, - GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), + GetBackoff = RetryBackoffs.Random(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), OnRetry = (context, _, backoff) => { Trace.Error($"Failed to get JIT runner token -- Attempt: {context.AttemptNumber}"); @@ -841,7 +841,7 @@ private async Task GetTenantCredential(string githubUrl, strin var retryHelper = new RetryHelper(Trace, new RetryStrategy { MaxAttempts = 3, - GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), + GetBackoff = RetryBackoffs.Random(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), OnRetry = (context, _, backoff) => { Trace.Error($"Failed to get tenant credentials -- Attempt: {context.AttemptNumber}"); diff --git a/src/Test/L0/RetryBackoffsL0.cs b/src/Test/L0/RetryBackoffsL0.cs new file mode 100644 index 00000000000..6d1043254d2 --- /dev/null +++ b/src/Test/L0/RetryBackoffsL0.cs @@ -0,0 +1,90 @@ +using System; +using Xunit; + +namespace GitHub.Runner.Common.Tests +{ + public sealed class RetryBackoffsL0 + { + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void Fixed_ReturnsSameDelay() + { + var backoff = RetryBackoffs.Fixed(TimeSpan.FromMilliseconds(250)); + + var first = backoff(1, 5, null); + var second = backoff(4, 5, new InvalidOperationException("ignored")); + + Assert.Equal(TimeSpan.FromMilliseconds(250), first); + Assert.Equal(TimeSpan.FromMilliseconds(250), second); + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void Random_StaysWithinBounds() + { + var min = TimeSpan.FromMilliseconds(100); + var max = TimeSpan.FromMilliseconds(200); + var backoff = RetryBackoffs.Random(min, max); + + for (var i = 0; i < 100; i++) + { + var delay = backoff(i + 1, 100, null); + Assert.InRange(delay, min, max); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void Exponential_IsBoundedByMax() + { + var min = TimeSpan.FromMilliseconds(100); + var max = TimeSpan.FromMilliseconds(500); + var delta = TimeSpan.FromMilliseconds(100); + var backoff = RetryBackoffs.Exponential(min, max, delta); + + var first = backoff(1, 5, null); + var second = backoff(2, 5, null); + var saturated = backoff(10, 10, null); + + Assert.InRange(first, min, max); + Assert.InRange(second, first, max); + Assert.Equal(max, saturated); + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void ExponentialFullJitter_IsWithinBounds() + { + var min = TimeSpan.FromMilliseconds(50); + var max = TimeSpan.FromMilliseconds(500); + var delta = TimeSpan.FromMilliseconds(100); + var backoff = RetryBackoffs.ExponentialFullJitter(min, max, delta); + + for (var attempt = 1; attempt <= 10; attempt++) + { + var delay = backoff(attempt, 10, null); + Assert.InRange(delay, min, max); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void DecorrelatedJitter_IsWithinBounds() + { + var min = TimeSpan.FromMilliseconds(100); + var max = TimeSpan.FromMilliseconds(1000); + var backoff = RetryBackoffs.DecorrelatedJitter(min, max); + + for (var i = 0; i < 100; i++) + { + var delay = backoff(i + 1, 100, null); + Assert.InRange(delay, min, max); + } + } + } +} From ff636eae9a48115f08de716cde750c8ad35afc01 Mon Sep 17 00:00:00 2001 From: James Frasca Date: Fri, 19 Jun 2026 22:45:34 +0000 Subject: [PATCH 09/10] Migrate service-call retries to RetryHelper --- src/Runner.Common/ResultsServer.cs | 55 +++--- src/Runner.Listener/JobDispatcher.cs | 67 ++++--- src/Runner.Listener/SelfUpdaterV2.cs | 171 +++++++++--------- src/Runner.Worker/ActionManager.cs | 250 +++++++++++++++------------ src/Runner.Worker/JobRunner.cs | 147 ++++++++++------ 5 files changed, 396 insertions(+), 294 deletions(-) diff --git a/src/Runner.Common/ResultsServer.cs b/src/Runner.Common/ResultsServer.cs index 8a1b35948c3..c9d592b8f64 100644 --- a/src/Runner.Common/ResultsServer.cs +++ b/src/Runner.Common/ResultsServer.cs @@ -53,6 +53,17 @@ public sealed class ResultServer : RunnerService, IResultsServer private String _liveConsoleFeedUrl; private string _token; + private ClientWebSocket CreateWebSocketClient(string accessToken) + { + var websocketClient = new ClientWebSocket(); + websocketClient.Options.SetRequestHeader("Authorization", $"Bearer {accessToken}"); + var userAgentValues = new List(); + userAgentValues.AddRange(UserAgentUtility.GetDefaultRestUserAgent()); + userAgentValues.AddRange(HostContext.UserAgents); + websocketClient.Options.SetRequestHeader("User-Agent", string.Join(" ", userAgentValues.Select(x => x.ToString()))); + return websocketClient; + } + public void InitializeResultsClient(Uri uri, string liveConsoleFeedUrl, string token, bool useSdk) { this._resultsClient = CreateHttpClient(uri, token, useSdk); @@ -179,12 +190,7 @@ private void InitializeWebsocketClient(string liveConsoleFeedUrl, string accessT } Trace.Info($"Creating websocket client ..." + liveConsoleFeedUrl); - this._websocketClient = new ClientWebSocket(); - this._websocketClient.Options.SetRequestHeader("Authorization", $"Bearer {accessToken}"); - var userAgentValues = new List(); - userAgentValues.AddRange(UserAgentUtility.GetDefaultRestUserAgent()); - userAgentValues.AddRange(HostContext.UserAgents); - this._websocketClient.Options.SetRequestHeader("User-Agent", string.Join(" ", userAgentValues.Select(x => x.ToString()))); + this._websocketClient = CreateWebSocketClient(accessToken); // during initialization, retry upto 3 times to setup connection this._websocketConnectTask = ConnectWebSocketClient(liveConsoleFeedUrl, delay, retryConnection); @@ -192,29 +198,34 @@ private void InitializeWebsocketClient(string liveConsoleFeedUrl, string accessT private async Task ConnectWebSocketClient(string feedStreamUrl, TimeSpan delay, bool retryConnection = false) { - bool connected = false; - int retries = 0; - - do + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - try + MaxAttempts = retryConnection ? 3 : 1, + GetBackoff = (_, _, _) => TimeSpan.Zero, + OnRetry = (_, ex, _) => + { + Trace.Info("Exception caught during websocket client connect, retry connection."); + Trace.Error(ex); + _lastConnectionFailure = DateTime.Now; + this._websocketClient = CreateWebSocketClient(_token); + }, + OnFailure = (_, _, _) => + { + this._websocketClient = null; + _lastConnectionFailure = DateTime.Now; + }, + }); + + await retryHelper.ExecuteAsync( + operationName: nameof(ConnectWebSocketClient), + operation: async () => { Trace.Info($"Attempting to start websocket client with delay {delay}."); await Task.Delay(delay); using var connectTimeoutTokenSource = new CancellationTokenSource(TimeSpan.FromSeconds(30)); await this._websocketClient.ConnectAsync(new Uri(feedStreamUrl), connectTimeoutTokenSource.Token); Trace.Info($"Successfully started websocket client."); - connected = true; - } - catch (Exception ex) - { - Trace.Info("Exception caught during websocket client connect, retry connection."); - Trace.Error(ex); - retries++; - this._websocketClient = null; - _lastConnectionFailure = DateTime.Now; - } - } while (retryConnection && !connected && retries < 3); + }); } public async Task AppendLiveConsoleFeedAsync(Guid scopeIdentifier, string hubName, Guid planId, Guid timelineId, Guid timelineRecordId, Guid stepId, IList lines, long? startLine, CancellationToken cancellationToken) diff --git a/src/Runner.Listener/JobDispatcher.cs b/src/Runner.Listener/JobDispatcher.cs index 10076441eff..e5161154e04 100644 --- a/src/Runner.Listener/JobDispatcher.cs +++ b/src/Runner.Listener/JobDispatcher.cs @@ -1126,38 +1126,57 @@ private async Task CompleteJobRequestAsync(int poolId, Pipelines.AgentJobRequest } var runnerServer = HostContext.GetService(); - int completeJobRequestRetryLimit = 5; - List exceptions = new(); - while (completeJobRequestRetryLimit-- > 0) + var exceptions = new List(); + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - try - { - await runnerServer.FinishAgentRequestAsync(poolId, message.RequestId, lockToken, DateTime.UtcNow, result, CancellationToken.None); - return; - } - catch (TaskAgentJobNotFoundException) - { - Trace.Info($"TaskAgentJobNotFoundException received, job {message.JobId} is no longer valid."); - return; - } - catch (TaskAgentJobTokenExpiredException) - { - Trace.Info($"TaskAgentJobTokenExpiredException received, job {message.JobId} is no longer valid."); - return; - } - catch (Exception ex) + MaxAttempts = 5, + GetBackoff = RetryBackoffs.Fixed(TimeSpan.FromSeconds(5)), + OnRetry = (context, ex, _) => { Trace.Error($"Catch exception during complete runner jobrequest {message.RequestId}."); Trace.Error(ex); exceptions.Add(ex); - } + }, + OnFailure = (_, ex, _) => + { + if (ex != null) + { + exceptions.Add(ex); + } + }, + }); - // delay 5 seconds before next retry. - await Task.Delay(TimeSpan.FromSeconds(5)); + var completed = await retryHelper.ExecuteAsync( + operationName: nameof(CompleteJobRequestAsync), + operation: async _ => + { + try + { + await runnerServer.FinishAgentRequestAsync(poolId, message.RequestId, lockToken, DateTime.UtcNow, result, CancellationToken.None); + return true; + } + catch (TaskAgentJobNotFoundException) + { + Trace.Info($"TaskAgentJobNotFoundException received, job {message.JobId} is no longer valid."); + return false; + } + catch (TaskAgentJobTokenExpiredException) + { + Trace.Info($"TaskAgentJobTokenExpiredException received, job {message.JobId} is no longer valid."); + return false; + } + }, + cancellationToken: CancellationToken.None); + + if (completed) + { + return; } - // rethrow all catched exceptions during retry. - throw new AggregateException(exceptions); + if (exceptions.Count > 0) + { + throw new AggregateException(exceptions); + } } // log an error issue to job level timeline record diff --git a/src/Runner.Listener/SelfUpdaterV2.cs b/src/Runner.Listener/SelfUpdaterV2.cs index 78a2acdd3ad..00daeef3cff 100644 --- a/src/Runner.Listener/SelfUpdaterV2.cs +++ b/src/Runner.Listener/SelfUpdaterV2.cs @@ -234,104 +234,113 @@ private async Task DownLoadRunner(string downloadDirectory, string packa { var stopWatch = Stopwatch.StartNew(); int runnerSuffix = 1; - string archiveFile = null; - bool downloadSucceeded = false; + var retryHelper = new RetryHelper(Trace, new RetryStrategy + { + MaxAttempts = Constants.RunnerDownloadRetryMaxAttempts, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }); - // Download the runner, using multiple attempts in order to be resilient against any networking/CDN issues - for (int attempt = 1; attempt <= Constants.RunnerDownloadRetryMaxAttempts; attempt++) + try { - // Generate an available package name, and do our best effort to clean up stale local zip files - while (true) - { - if (packagePlatform.StartsWith("win")) - { - archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.zip"); - } - else + return await retryHelper.ExecuteAsync( + "DownLoadRunner", + async (context) => { - archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.tar.gz"); - } + string archiveFile = null; - try - { - // delete .zip file - if (!string.IsNullOrEmpty(archiveFile) && File.Exists(archiveFile)) + // Generate an available package name, and do our best effort to clean up stale local zip files. + while (true) { - Trace.Verbose("Deleting latest runner package zip '{0}'", archiveFile); - IOUtil.DeleteFile(archiveFile); - } - - break; - } - catch (Exception ex) - { - // couldn't delete the file for whatever reason, so generate another name - Trace.Warning("Failed to delete runner package zip '{0}'. Exception: {1}", archiveFile, ex); - runnerSuffix++; - } - } + if (packagePlatform.StartsWith("win")) + { + archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.zip"); + } + else + { + archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.tar.gz"); + } - // Allow a 15-minute package download timeout, which is good enough to update the runner from a 1 Mbit/s ADSL connection. - if (!int.TryParse(Environment.GetEnvironmentVariable("GITHUB_ACTIONS_RUNNER_DOWNLOAD_TIMEOUT") ?? string.Empty, out int timeoutSeconds)) - { - timeoutSeconds = 15 * 60; - } + try + { + // delete .zip file + if (!string.IsNullOrEmpty(archiveFile) && File.Exists(archiveFile)) + { + Trace.Verbose("Deleting latest runner package zip '{0}'", archiveFile); + IOUtil.DeleteFile(archiveFile); + } + + break; + } + catch (Exception ex) + { + // couldn't delete the file for whatever reason, so generate another name + Trace.Warning("Failed to delete runner package zip '{0}'. Exception: {1}", archiveFile, ex); + runnerSuffix++; + } + } - Trace.Info($"Attempt {attempt}: save latest runner into {archiveFile}."); + // Allow a 15-minute package download timeout, which is good enough to update the runner from a 1 Mbit/s ADSL connection. + if (!int.TryParse(Environment.GetEnvironmentVariable("GITHUB_ACTIONS_RUNNER_DOWNLOAD_TIMEOUT") ?? string.Empty, out int timeoutSeconds)) + { + timeoutSeconds = 15 * 60; + } - using (var downloadTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds))) - using (var downloadCts = CancellationTokenSource.CreateLinkedTokenSource(downloadTimeout.Token, token)) - { - try - { - Trace.Info($"Download runner: begin download"); - long downloadSize = 0; + Trace.Info($"Attempt {context.AttemptNumber}: save latest runner into {archiveFile}."); - //open zip stream in async mode - using (HttpClient httpClient = new(HostContext.CreateHttpClientHandler())) + using (var downloadTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds))) + using (var downloadCts = CancellationTokenSource.CreateLinkedTokenSource(downloadTimeout.Token, token)) { - Trace.Info($"Downloading {packageDownloadUrl}"); - - using (FileStream fs = new(archiveFile, FileMode.Create, FileAccess.Write, FileShare.None, bufferSize: 4096, useAsync: true)) - using (Stream result = await httpClient.GetStreamAsync(packageDownloadUrl)) + try { - //81920 is the default used by System.IO.Stream.CopyTo and is under the large object heap threshold (85k). - await result.CopyToAsync(fs, 81920, downloadCts.Token); - await fs.FlushAsync(downloadCts.Token); - downloadSize = fs.Length; + Trace.Info($"Download runner: begin download"); + long downloadSize = 0; + + //open zip stream in async mode + using (HttpClient httpClient = new(HostContext.CreateHttpClientHandler())) + { + Trace.Info($"Downloading {packageDownloadUrl}"); + + using (FileStream fs = new(archiveFile, FileMode.Create, FileAccess.Write, FileShare.None, bufferSize: 4096, useAsync: true)) + using (Stream result = await httpClient.GetStreamAsync(packageDownloadUrl)) + { + //81920 is the default used by System.IO.Stream.CopyTo and is under the large object heap threshold (85k). + await result.CopyToAsync(fs, 81920, downloadCts.Token); + await fs.FlushAsync(downloadCts.Token); + downloadSize = fs.Length; + } + } + + Trace.Info($"Download runner: finished download"); + stopWatch.Stop(); + _updateTrace.Enqueue($"PackageDownloadTime: {stopWatch.ElapsedMilliseconds}ms"); + _updateTrace.Enqueue($"Attempts: {context.AttemptNumber}"); + _updateTrace.Enqueue($"PackageSize: {downloadSize / 1024 / 1024}MB"); + return archiveFile; } - } + catch (OperationCanceledException) when (token.IsCancellationRequested) + { + Trace.Info($"Runner download has been cancelled."); + throw; + } + catch (Exception ex) + { + if (downloadCts.Token.IsCancellationRequested) + { + Trace.Warning($"Runner download has timed out after {timeoutSeconds} seconds"); + } - Trace.Info($"Download runner: finished download"); - downloadSucceeded = true; - stopWatch.Stop(); - _updateTrace.Enqueue($"PackageDownloadTime: {stopWatch.ElapsedMilliseconds}ms"); - _updateTrace.Enqueue($"Attempts: {attempt}"); - _updateTrace.Enqueue($"PackageSize: {downloadSize / 1024 / 1024}MB"); - break; - } - catch (OperationCanceledException) when (token.IsCancellationRequested) - { - Trace.Info($"Runner download has been cancelled."); - throw; - } - catch (Exception ex) - { - if (downloadCts.Token.IsCancellationRequested) - { - Trace.Warning($"Runner download has timed out after {timeoutSeconds} seconds"); + Trace.Warning($"Failed to get package '{archiveFile}' from '{packageDownloadUrl}'. Exception {ex}"); + throw; + } } - - Trace.Warning($"Failed to get package '{archiveFile}' from '{packageDownloadUrl}'. Exception {ex}"); - } - } + }, + cancellationToken: token); } - - if (downloadSucceeded) + catch (OperationCanceledException) when (token.IsCancellationRequested) { - return archiveFile; + throw; } - else + catch (Exception) { return null; } diff --git a/src/Runner.Worker/ActionManager.cs b/src/Runner.Worker/ActionManager.cs index dea9e7dc032..0a966204d9b 100644 --- a/src/Runner.Worker/ActionManager.cs +++ b/src/Runner.Worker/ActionManager.cs @@ -889,57 +889,66 @@ private async Task BuildActionContainerAsync(IExecutionContext executionContext, var launchServer = HostContext.GetService(); var jobServer = HostContext.GetService(); var actionDownloadInfos = default(WebApi.ActionDownloadInfoCollection); - for (var attempt = 1; attempt <= 3; attempt++) + Exception nonRetryableResolveException = null; + var resolveInfoRetryHelper = new RetryHelper(Trace, new RetryStrategy { - try + MaxAttempts = 3, + GetBackoff = (_, _, _) => String.IsNullOrEmpty(Environment.GetEnvironmentVariable("_GITHUB_ACTION_DOWNLOAD_NO_BACKOFF")) + ? BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(10), TimeSpan.FromSeconds(30)) + : TimeSpan.Zero, + OnRetry = (_, ex, backoff) => { - if (MessageUtil.IsRunServiceJob(executionContext.Global.Variables.Get(Constants.Variables.System.JobRequestType))) + executionContext.Output($"Failed to resolve action download info. Error: {ex.Message}"); + executionContext.Debug(ex.ToString()); + if (backoff > TimeSpan.Zero) { - var displayHelpfulActionsDownloadErrors = executionContext.Global.Variables.GetBoolean(Constants.Runner.Features.DisplayHelpfulActionsDownloadErrors) ?? false; - actionDownloadInfos = await launchServer.ResolveActionsDownloadInfoAsync(executionContext.Global.Plan.PlanId, executionContext.Root.Id, new WebApi.ActionReferenceList { Actions = actionReferences, Dependencies = dependencies }, executionContext.CancellationToken, displayHelpfulActionsDownloadErrors); + executionContext.Output($"Retrying in {backoff.TotalSeconds} seconds"); } - else - { - actionDownloadInfos = await jobServer.ResolveActionDownloadInfoAsync(executionContext.Global.Plan.ScopeIdentifier, executionContext.Global.Plan.PlanType, executionContext.Global.Plan.PlanId, executionContext.Root.Id, new WebApi.ActionReferenceList { Actions = actionReferences }, executionContext.CancellationToken); - } - break; - } - catch (Exception ex) when (!executionContext.CancellationToken.IsCancellationRequested) // Do not retry if the run is cancelled. + }, + }); + + var resolved = await resolveInfoRetryHelper.ExecuteAsync( + operationName: nameof(GetDownloadInfoAsync), + operation: async () => { - // UnresolvableActionDownloadInfoException is a 422 client error, don't retry - // NonRetryableActionDownloadInfoException is an non-retryable exception from Actions - // Some possible cases are: - // * Repo is rate limited - // * Repo or tag doesn't exist, or isn't public - // * Policy validation failed - if (attempt < 3 && !(ex is WebApi.UnresolvableActionDownloadInfoException) && !(ex is WebApi.NonRetryableActionDownloadInfoException)) + try { - executionContext.Output($"Failed to resolve action download info. Error: {ex.Message}"); - executionContext.Debug(ex.ToString()); - if (String.IsNullOrEmpty(Environment.GetEnvironmentVariable("_GITHUB_ACTION_DOWNLOAD_NO_BACKOFF"))) + if (MessageUtil.IsRunServiceJob(executionContext.Global.Variables.Get(Constants.Variables.System.JobRequestType))) + { + var displayHelpfulActionsDownloadErrors = executionContext.Global.Variables.GetBoolean(Constants.Runner.Features.DisplayHelpfulActionsDownloadErrors) ?? false; + actionDownloadInfos = await launchServer.ResolveActionsDownloadInfoAsync(executionContext.Global.Plan.PlanId, executionContext.Root.Id, new WebApi.ActionReferenceList { Actions = actionReferences, Dependencies = dependencies }, executionContext.CancellationToken, displayHelpfulActionsDownloadErrors); + } + else { - var backoff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(10), TimeSpan.FromSeconds(30)); - executionContext.Output($"Retrying in {backoff.TotalSeconds} seconds"); - await Task.Delay(backoff); + actionDownloadInfos = await jobServer.ResolveActionDownloadInfoAsync(executionContext.Global.Plan.ScopeIdentifier, executionContext.Global.Plan.PlanType, executionContext.Global.Plan.PlanId, executionContext.Root.Id, new WebApi.ActionReferenceList { Actions = actionReferences }, executionContext.CancellationToken); } + return OperationOutcome.Success(true); } - else + catch (Exception ex) when (!executionContext.CancellationToken.IsCancellationRequested) { - // Some possible cases are: - // * Repo is rate limited - // * Repo or tag doesn't exist, or isn't public - // * Policy validation failed + // UnresolvableActionDownloadInfoException is a 422 client error, don't retry. + // NonRetryableActionDownloadInfoException is a non-retryable exception from Actions. if (ex is WebApi.UnresolvableActionDownloadInfoException) { - throw; + nonRetryableResolveException = ex; + return OperationOutcome.Success(false); } - else + + if (ex is WebApi.NonRetryableActionDownloadInfoException) { - // This exception will be traced as an infrastructure failure - throw new WebApi.FailedToResolveActionDownloadInfoException("Failed to resolve action download info.", ex); + // This exception will be traced as an infrastructure failure. + nonRetryableResolveException = new WebApi.FailedToResolveActionDownloadInfoException("Failed to resolve action download info.", ex); + return OperationOutcome.Success(false); } + + return OperationOutcome.TransientFailure(ex.Message); } - } + }, + cancellationToken: executionContext.CancellationToken); + + if (!resolved) + { + throw nonRetryableResolveException; } ArgUtil.NotNull(actionDownloadInfos, nameof(actionDownloadInfos)); @@ -1400,115 +1409,132 @@ private AuthenticationHeaderValue CreateAuthHeader(IExecutionContext executionCo private async Task DownloadRepositoryArchive(IExecutionContext executionContext, string downloadUrl, string downloadAuthToken, string archiveFile) { Trace.Info($"Save archive '{downloadUrl}' into {archiveFile}."); - int retryCount = 0; + int attemptNumber = 0; + Exception nonRetryableDownloadException = null; // Allow up to 20 * 60s for any action to be downloaded from github graph. int timeoutSeconds = 20 * 60; try { - while (retryCount < 3) + var downloadArchiveRetryHelper = new RetryHelper(Trace, new RetryStrategy { - string requestId = string.Empty; - using (var actionDownloadTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds))) - using (var actionDownloadCancellation = CancellationTokenSource.CreateLinkedTokenSource(actionDownloadTimeout.Token, executionContext.CancellationToken)) + MaxAttempts = 3, + GetBackoff = (_, _, _) => String.IsNullOrEmpty(Environment.GetEnvironmentVariable("_GITHUB_ACTION_DOWNLOAD_NO_BACKOFF")) + ? BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(10), TimeSpan.FromSeconds(30)) + : TimeSpan.Zero, + OnRetry = (context, _, backoff) => { - try + if (backoff > TimeSpan.Zero) { - //open zip stream in async mode - using (FileStream fs = new(archiveFile, FileMode.Create, FileAccess.Write, FileShare.None, bufferSize: _defaultFileStreamBufferSize, useAsync: true)) - using (var httpClientHandler = HostContext.CreateHttpClientHandler()) - using (var httpClient = new HttpClient(httpClientHandler)) - { - httpClient.DefaultRequestHeaders.Authorization = CreateAuthHeader(executionContext, downloadUrl, downloadAuthToken); + executionContext.Warning($"Back off {backoff.TotalSeconds} seconds before retry."); + } + }, + }); - httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); - using (var response = await httpClient.GetAsync(downloadUrl)) + var downloaded = await downloadArchiveRetryHelper.ExecuteAsync( + operationName: nameof(DownloadRepositoryArchive), + operation: async (context) => + { + attemptNumber = context.AttemptNumber; + string requestId = string.Empty; + using (var actionDownloadTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds))) + using (var actionDownloadCancellation = CancellationTokenSource.CreateLinkedTokenSource(actionDownloadTimeout.Token, executionContext.CancellationToken)) + { + try + { + //open zip stream in async mode + using (FileStream fs = new(archiveFile, FileMode.Create, FileAccess.Write, FileShare.None, bufferSize: _defaultFileStreamBufferSize, useAsync: true)) + using (var httpClientHandler = HostContext.CreateHttpClientHandler()) + using (var httpClient = new HttpClient(httpClientHandler)) { - requestId = UrlUtil.GetGitHubRequestId(response.Headers); - if (!string.IsNullOrEmpty(requestId)) - { - Trace.Info($"Request URL: {downloadUrl} X-GitHub-Request-Id: {requestId} Http Status: {response.StatusCode}"); - } + httpClient.DefaultRequestHeaders.Authorization = CreateAuthHeader(executionContext, downloadUrl, downloadAuthToken); - if (response.IsSuccessStatusCode) + httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); + using (var response = await httpClient.GetAsync(downloadUrl)) { - using (var result = await response.Content.ReadAsStreamAsync()) + requestId = UrlUtil.GetGitHubRequestId(response.Headers); + if (!string.IsNullOrEmpty(requestId)) { - await result.CopyToAsync(fs, _defaultCopyBufferSize, actionDownloadCancellation.Token); - await fs.FlushAsync(actionDownloadCancellation.Token); + Trace.Info($"Request URL: {downloadUrl} X-GitHub-Request-Id: {requestId} Http Status: {response.StatusCode}"); + } - // download succeed, break out the retry loop. - break; + if (response.IsSuccessStatusCode) + { + using (var result = await response.Content.ReadAsStreamAsync()) + { + await result.CopyToAsync(fs, _defaultCopyBufferSize, actionDownloadCancellation.Token); + await fs.FlushAsync(actionDownloadCancellation.Token); + return OperationOutcome.Success(true); + } } - } - else if (response.StatusCode == HttpStatusCode.NotFound) - { - // It doesn't make sense to retry in this case, so just stop - throw new ActionNotFoundException(new Uri(downloadUrl), requestId); - } - else if (response.StatusCode == HttpStatusCode.Forbidden) - { - // It doesn't make sense to retry in this case, so just stop - throw new AccessDeniedException($"Access denied to '{downloadUrl}' ({requestId})"); - } - else - { - // Something else bad happened, let's go to our retry logic + + if (response.StatusCode == HttpStatusCode.NotFound) + { + nonRetryableDownloadException = new ActionNotFoundException(new Uri(downloadUrl), requestId); + return OperationOutcome.Success(false); + } + + if (response.StatusCode == HttpStatusCode.Forbidden) + { + nonRetryableDownloadException = new AccessDeniedException($"Access denied to '{downloadUrl}' ({requestId})"); + return OperationOutcome.Success(false); + } + response.EnsureSuccessStatusCode(); + return OperationOutcome.Success(true); } } } - } - catch (OperationCanceledException) when (executionContext.CancellationToken.IsCancellationRequested) - { - Trace.Info("Action download has been cancelled."); - throw; - } - catch (OperationCanceledException ex) when (!executionContext.CancellationToken.IsCancellationRequested && retryCount >= 2) - { - Trace.Info($"Action download final retry timeout after {timeoutSeconds} seconds."); - throw new TimeoutException($"Action '{downloadUrl}' download has timed out. Error: {ex.Message} {requestId}"); - } - catch (ActionNotFoundException) - { - Trace.Info($"The action at '{downloadUrl}' does not exist"); - throw; - } - catch (AccessDeniedException) - { - Trace.Info($"Access denied to '{downloadUrl}'"); - throw; - } - catch (Exception ex) when (retryCount < 2) - { - retryCount++; - Trace.Error($"Fail to download archive '{downloadUrl}' -- Attempt: {retryCount}"); - Trace.Error(ex); - if (actionDownloadTimeout.Token.IsCancellationRequested) + catch (OperationCanceledException) when (executionContext.CancellationToken.IsCancellationRequested) { - // action download didn't finish within timeout - executionContext.Warning($"Action '{downloadUrl}' didn't finish download within {timeoutSeconds} seconds. {requestId}"); + Trace.Info("Action download has been cancelled."); + throw; } - else + catch (OperationCanceledException ex) when (!executionContext.CancellationToken.IsCancellationRequested && context.AttemptNumber >= context.MaxAttempts) { - executionContext.Warning($"Failed to download action '{downloadUrl}'. Error: {ex.Message} {requestId}"); + Trace.Info($"Action download final retry timeout after {timeoutSeconds} seconds."); + nonRetryableDownloadException = new TimeoutException($"Action '{downloadUrl}' download has timed out. Error: {ex.Message} {requestId}"); + return OperationOutcome.Success(false); + } + catch (Exception ex) + { + Trace.Error($"Fail to download archive '{downloadUrl}' -- Attempt: {context.AttemptNumber}"); + Trace.Error(ex); + if (actionDownloadTimeout.Token.IsCancellationRequested) + { + // action download didn't finish within timeout + executionContext.Warning($"Action '{downloadUrl}' didn't finish download within {timeoutSeconds} seconds. {requestId}"); + } + else + { + executionContext.Warning($"Failed to download action '{downloadUrl}'. Error: {ex.Message} {requestId}"); + } + + return OperationOutcome.TransientFailure(ex.Message); } } - } + }, + cancellationToken: executionContext.CancellationToken); - if (String.IsNullOrEmpty(Environment.GetEnvironmentVariable("_GITHUB_ACTION_DOWNLOAD_NO_BACKOFF"))) + if (!downloaded) + { + if (nonRetryableDownloadException is ActionNotFoundException) { - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(10), TimeSpan.FromSeconds(30)); - executionContext.Warning($"Back off {backOff.TotalSeconds} seconds before retry."); - await Task.Delay(backOff); + Trace.Info($"The action at '{downloadUrl}' does not exist"); + } + else if (nonRetryableDownloadException is AccessDeniedException) + { + Trace.Info($"Access denied to '{downloadUrl}'"); } + + throw nonRetryableDownloadException; } } catch (Exception ex) when (!(ex is AccessDeniedException) && !(ex is OperationCanceledException) && !executionContext.CancellationToken.IsCancellationRequested) { - Trace.Error($"Failed to download archive '{downloadUrl}' after {retryCount + 1} attempts."); + Trace.Error($"Failed to download archive '{downloadUrl}' after {attemptNumber} attempts."); Trace.Error(ex); - throw new FailedToDownloadActionException($"Failed to download archive '{downloadUrl}' after {retryCount + 1} attempts.", ex); + throw new FailedToDownloadActionException($"Failed to download archive '{downloadUrl}' after {attemptNumber} attempts.", ex); } ArgUtil.NotNullOrEmpty(archiveFile, nameof(archiveFile)); diff --git a/src/Runner.Worker/JobRunner.cs b/src/Runner.Worker/JobRunner.cs index 3c4799a29e0..d94331201ed 100644 --- a/src/Runner.Worker/JobRunner.cs +++ b/src/Runner.Worker/JobRunner.cs @@ -323,41 +323,57 @@ private async Task CompleteJobAsync(IRunServer runServer, IExecution } Trace.Info($"Raising job completed against run service"); - var completeJobRetryLimit = 5; var exceptions = new List(); - while (completeJobRetryLimit-- > 0) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - try - { - await runServer.CompleteJobAsync(message.Plan.PlanId, message.JobId, result, jobContext.JobOutputs, jobContext.Global.StepsResult, jobContext.Global.JobAnnotations, environmentUrl, telemetry, billingOwnerId: message.BillingOwnerId, infrastructureFailureCategory: jobContext.Global.InfrastructureFailureCategory, default); - return result; - } - catch (VssUnauthorizedException ex) + MaxAttempts = 5, + GetBackoff = RetryBackoffs.Fixed(TimeSpan.FromSeconds(5)), + OnRetry = (_, ex, _) => { Trace.Error($"Catch exception while attempting to complete job {message.JobId}, job request {message.RequestId}."); Trace.Error(ex); exceptions.Add(ex); - break; - } - catch (TaskOrchestrationJobNotFoundException ex) + }, + OnFailure = (_, ex, _) => { - Trace.Error($"Catch exception while attempting to complete job {message.JobId}, job request {message.RequestId}."); - Trace.Error(ex); - exceptions.Add(ex); - break; - } - catch (Exception ex) + if (ex != null) + { + exceptions.Add(ex); + } + }, + }); + + var completed = await retryHelper.ExecuteAsync( + operationName: "CompleteJobAsync", + operation: async _ => { - Trace.Error($"Catch exception while attempting to complete job {message.JobId}, job request {message.RequestId}."); - Trace.Error(ex); - exceptions.Add(ex); - } + try + { + await runServer.CompleteJobAsync(message.Plan.PlanId, message.JobId, result, jobContext.JobOutputs, jobContext.Global.StepsResult, jobContext.Global.JobAnnotations, environmentUrl, telemetry, billingOwnerId: message.BillingOwnerId, infrastructureFailureCategory: jobContext.Global.InfrastructureFailureCategory, default); + return true; + } + catch (VssUnauthorizedException ex) + { + Trace.Error($"Catch exception while attempting to complete job {message.JobId}, job request {message.RequestId}."); + Trace.Error(ex); + exceptions.Add(ex); + return false; + } + catch (TaskOrchestrationJobNotFoundException ex) + { + Trace.Error($"Catch exception while attempting to complete job {message.JobId}, job request {message.RequestId}."); + Trace.Error(ex); + exceptions.Add(ex); + return false; + } + }, + cancellationToken: default); - // delay 5 seconds before next retry. - await Task.Delay(TimeSpan.FromSeconds(5)); + if (completed) + { + return result; } - // rethrow exceptions from all attempts. throw new AggregateException(exceptions); } @@ -406,46 +422,67 @@ private async Task CompleteJobAsync(IJobServer jobServer, IExecution Trace.Info($"Raising job completed event"); var jobCompletedEvent = new JobCompletedEvent(message.RequestId, message.JobId, result, jobContext.JobOutputs, jobContext.ActionsEnvironment, jobContext.Global.StepsTelemetry, jobContext.Global.JobTelemetry); - var completeJobRetryLimit = 5; var exceptions = new List(); - while (completeJobRetryLimit-- > 0) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - try - { - await jobServer.RaisePlanEventAsync(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, jobCompletedEvent, default(CancellationToken)); - return result; - } - catch (TaskOrchestrationPlanNotFoundException ex) - { - Trace.Error($"TaskOrchestrationPlanNotFoundException received, while attempting to raise JobCompletedEvent for job {message.JobId}."); - Trace.Error(ex); - return TaskResult.Failed; - } - catch (TaskOrchestrationPlanSecurityException ex) - { - Trace.Error($"TaskOrchestrationPlanSecurityException received, while attempting to raise JobCompletedEvent for job {message.JobId}."); - Trace.Error(ex); - return TaskResult.Failed; - } - catch (TaskOrchestrationPlanTerminatedException ex) - { - Trace.Error($"TaskOrchestrationPlanTerminatedException received, while attempting to raise JobCompletedEvent for job {message.JobId}."); - Trace.Error(ex); - return TaskResult.Failed; - } - catch (Exception ex) + MaxAttempts = 5, + GetBackoff = RetryBackoffs.Fixed(TimeSpan.FromSeconds(5)), + OnRetry = (_, ex, _) => { Trace.Error($"Catch exception while attempting to raise JobCompletedEvent for job {message.JobId}, job request {message.RequestId}."); Trace.Error(ex); exceptions.Add(ex); - } + }, + OnFailure = (_, ex, _) => + { + if (ex != null) + { + exceptions.Add(ex); + } + }, + }); + + var completed = await retryHelper.ExecuteAsync( + operationName: "RaiseJobCompletedEvent", + operation: async _ => + { + try + { + await jobServer.RaisePlanEventAsync(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, jobCompletedEvent, default(CancellationToken)); + return true; + } + catch (TaskOrchestrationPlanNotFoundException ex) + { + Trace.Error($"TaskOrchestrationPlanNotFoundException received, while attempting to raise JobCompletedEvent for job {message.JobId}."); + Trace.Error(ex); + return false; + } + catch (TaskOrchestrationPlanSecurityException ex) + { + Trace.Error($"TaskOrchestrationPlanSecurityException received, while attempting to raise JobCompletedEvent for job {message.JobId}."); + Trace.Error(ex); + return false; + } + catch (TaskOrchestrationPlanTerminatedException ex) + { + Trace.Error($"TaskOrchestrationPlanTerminatedException received, while attempting to raise JobCompletedEvent for job {message.JobId}."); + Trace.Error(ex); + return false; + } + }, + cancellationToken: default); - // delay 5 seconds before next retry. - await Task.Delay(TimeSpan.FromSeconds(5)); + if (completed) + { + return result; } - // rethrow exceptions from all attempts. - throw new AggregateException(exceptions); + if (exceptions.Count > 0) + { + throw new AggregateException(exceptions); + } + + return TaskResult.Failed; } private void MaskTelemetrySecrets(List jobTelemetry) From 2a1616a0563a8928a1cbc4549db8daba6ecbf214 Mon Sep 17 00:00:00 2001 From: James Frasca Date: Fri, 19 Jun 2026 23:21:29 +0000 Subject: [PATCH 10/10] Migrate worker container retries to RetryHelper --- src/Runner.Worker/ActionManager.cs | 81 +++++++++--------- .../ContainerOperationProvider.cs | 85 ++++++++++--------- 2 files changed, 86 insertions(+), 80 deletions(-) diff --git a/src/Runner.Worker/ActionManager.cs b/src/Runner.Worker/ActionManager.cs index 0a966204d9b..ddd023113e3 100644 --- a/src/Runner.Worker/ActionManager.cs +++ b/src/Runner.Worker/ActionManager.cs @@ -767,29 +767,30 @@ private async Task PullActionContainerAsync(IExecutionContext executionContext, // Pull down docker image with retry up to 3 times var dockerManager = HostContext.GetService(); - int retryCount = 0; int pullExitCode = 0; - while (retryCount < 3) + var pullRetryHelper = new RetryHelper(Trace, new RetryStrategy { - pullExitCode = await dockerManager.DockerPull(executionContext, setupInfo.Container.Image); - if (pullExitCode == 0) + MaxAttempts = 3, + GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)), + OnRetry = (_, _, backoff) => { - break; - } - else + executionContext.Warning($"Docker pull failed with exit code {pullExitCode}, back off {backoff.TotalSeconds} seconds before retry."); + }, + }); + + await pullRetryHelper.ExecuteAsync( + operationName: nameof(PullActionContainerAsync), + operation: async () => { - retryCount++; - if (retryCount < 3) - { - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)); - executionContext.Warning($"Docker pull failed with exit code {pullExitCode}, back off {backOff.TotalSeconds} seconds before retry."); - await Task.Delay(backOff); - } - } - } + pullExitCode = await dockerManager.DockerPull(executionContext, setupInfo.Container.Image); + return pullExitCode == 0 + ? OperationOutcome.Success(true) + : OperationOutcome.TransientFailure($"Docker pull failed with exit code {pullExitCode}"); + }, + cancellationToken: executionContext.CancellationToken); executionContext.Output("##[endgroup]"); - if (retryCount == 3 && pullExitCode != 0) + if (pullExitCode != 0) { throw new InvalidOperationException($"Docker pull failed with exit code {pullExitCode}"); } @@ -811,35 +812,37 @@ private async Task BuildActionContainerAsync(IExecutionContext executionContext, // Build docker image with retry up to 3 times var dockerManager = HostContext.GetService(); - int retryCount = 0; int buildExitCode = 0; var imageName = $"{dockerManager.DockerInstanceLabel}:{Guid.NewGuid().ToString("N")}"; - while (retryCount < 3) + var buildRetryHelper = new RetryHelper(Trace, new RetryStrategy { - buildExitCode = await dockerManager.DockerBuild( - executionContext, - setupInfo.Container.WorkingDirectory, - setupInfo.Container.Dockerfile, - Directory.GetParent(setupInfo.Container.Dockerfile).FullName, - imageName); - if (buildExitCode == 0) + MaxAttempts = 3, + GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)), + OnRetry = (_, _, backoff) => { - break; - } - else + executionContext.Warning($"Docker build failed with exit code {buildExitCode}, back off {backoff.TotalSeconds} seconds before retry."); + }, + }); + + await buildRetryHelper.ExecuteAsync( + operationName: nameof(BuildActionContainerAsync), + operation: async () => { - retryCount++; - if (retryCount < 3) - { - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)); - executionContext.Warning($"Docker build failed with exit code {buildExitCode}, back off {backOff.TotalSeconds} seconds before retry."); - await Task.Delay(backOff); - } - } - } + buildExitCode = await dockerManager.DockerBuild( + executionContext, + setupInfo.Container.WorkingDirectory, + setupInfo.Container.Dockerfile, + Directory.GetParent(setupInfo.Container.Dockerfile).FullName, + imageName); + + return buildExitCode == 0 + ? OperationOutcome.Success(true) + : OperationOutcome.TransientFailure($"Docker build failed with exit code {buildExitCode}"); + }, + cancellationToken: executionContext.CancellationToken); executionContext.Output("##[endgroup]"); - if (retryCount == 3 && buildExitCode != 0) + if (buildExitCode != 0) { throw new InvalidOperationException($"Docker build failed with exit code {buildExitCode}"); } diff --git a/src/Runner.Worker/ContainerOperationProvider.cs b/src/Runner.Worker/ContainerOperationProvider.cs index c5cccb77ef0..53f2940cf99 100644 --- a/src/Runner.Worker/ContainerOperationProvider.cs +++ b/src/Runner.Worker/ContainerOperationProvider.cs @@ -196,31 +196,32 @@ private async Task StartContainerAsync(IExecutionContext executionContext, Conta var configLocation = await ContainerRegistryLogin(executionContext, container); // Pull down docker image with retry up to 3 times - int retryCount = 0; int pullExitCode = 0; - while (retryCount < 3) + var pullRetryHelper = new RetryHelper(Trace, new RetryStrategy { - pullExitCode = await _dockerManager.DockerPull(executionContext, container.ContainerImage, configLocation); - if (pullExitCode == 0) + MaxAttempts = 3, + GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)), + OnRetry = (_, _, backoff) => { - break; - } - else + executionContext.Warning($"Docker pull failed with exit code {pullExitCode}, back off {backoff.TotalSeconds} seconds before retry."); + }, + }); + + await pullRetryHelper.ExecuteAsync( + operationName: nameof(StartContainerAsync), + operation: async () => { - retryCount++; - if (retryCount < 3) - { - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)); - executionContext.Warning($"Docker pull failed with exit code {pullExitCode}, back off {backOff.TotalSeconds} seconds before retry."); - await Task.Delay(backOff); - } - } - } + pullExitCode = await _dockerManager.DockerPull(executionContext, container.ContainerImage, configLocation); + return pullExitCode == 0 + ? OperationOutcome.Success(true) + : OperationOutcome.TransientFailure($"Docker pull failed with exit code {pullExitCode}"); + }, + cancellationToken: executionContext.CancellationToken); // Remove credentials after pulling ContainerRegistryLogout(configLocation); - if (retryCount == 3 && pullExitCode != 0) + if (pullExitCode != 0) { throw new InvalidOperationException($"Docker pull failed with exit code {pullExitCode}"); } @@ -468,33 +469,35 @@ private async Task ContainerRegistryLogin(IExecutionContext executionCon } // Login docker with retry up to 3 times - int retryCount = 0; int loginExitCode = 0; - while (retryCount < 3) - { - loginExitCode = await _dockerManager.DockerLogin( - executionContext, - configLocation, - container.RegistryServer, - container.RegistryAuthUsername, - container.RegistryAuthPassword); - if (loginExitCode == 0) - { - break; - } - else + var loginRetryHelper = new RetryHelper(Trace, new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)), + OnRetry = (_, _, backoff) => { - retryCount++; - if (retryCount < 3) - { - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)); - executionContext.Warning($"Docker login for '{container.RegistryServer}' failed with exit code {loginExitCode}, back off {backOff.TotalSeconds} seconds before retry."); - await Task.Delay(backOff); - } - } - } + executionContext.Warning($"Docker login for '{container.RegistryServer}' failed with exit code {loginExitCode}, back off {backoff.TotalSeconds} seconds before retry."); + }, + }); - if (retryCount == 3 && loginExitCode != 0) + await loginRetryHelper.ExecuteAsync( + operationName: nameof(ContainerRegistryLogin), + operation: async () => + { + loginExitCode = await _dockerManager.DockerLogin( + executionContext, + configLocation, + container.RegistryServer, + container.RegistryAuthUsername, + container.RegistryAuthPassword); + + return loginExitCode == 0 + ? OperationOutcome.Success(true) + : OperationOutcome.TransientFailure($"Docker login for '{container.RegistryServer}' failed with exit code {loginExitCode}"); + }, + cancellationToken: executionContext.CancellationToken); + + if (loginExitCode != 0) { throw new InvalidOperationException($"Docker login for '{container.RegistryServer}' failed with exit code {loginExitCode}"); }