diff --git a/src/Runner.Common/ActionsRunServer.cs b/src/Runner.Common/ActionsRunServer.cs index 3ded580071d..cc69ed9471b 100644 --- a/src/Runner.Common/ActionsRunServer.cs +++ b/src/Runner.Common/ActionsRunServer.cs @@ -40,12 +40,17 @@ private void CheckConnection() public Task GetJobMessageAsync(string id, CancellationToken cancellationToken) { CheckConnection(); - var jobMessage = RetryRequest(async () => - { - return await _actionsRunServerClient.GetJobMessageAsync(id, cancellationToken); - }, cancellationToken); - - return jobMessage; + return RetryRequest( + async () => + { + try + { + return OperationOutcome.Success( + await _actionsRunServerClient.GetJobMessageAsync(id, cancellationToken)); + } + catch (OperationCanceledException) { throw; } + catch (Exception ex) { return OperationOutcome.TransientFailure(ex.Message); } + }, cancellationToken); } } } diff --git a/src/Runner.Common/BrokerServer.cs b/src/Runner.Common/BrokerServer.cs index c642b4af1cf..d21b179b9af 100644 --- a/src/Runner.Common/BrokerServer.cs +++ b/src/Runner.Common/BrokerServer.cs @@ -66,10 +66,21 @@ public async Task CreateSessionAsync(TaskAgentSession session, public Task GetRunnerMessageAsync(Guid? sessionId, TaskAgentStatus status, string version, string os, string architecture, bool disableUpdate, CancellationToken cancellationToken) { CheckConnection(); - var brokerSession = RetryRequest( - async () => await _brokerHttpClient.GetRunnerMessageAsync(sessionId, version, status, os, architecture, disableUpdate, cancellationToken), cancellationToken, shouldRetry: ShouldRetryException); - - return brokerSession; + return RetryRequest( + async () => + { + try + { + return OperationOutcome.Success( + await _brokerHttpClient.GetRunnerMessageAsync(sessionId, version, status, os, architecture, disableUpdate, cancellationToken)); + } + catch (AccessDeniedException) { throw; } + catch (VssUnauthorizedException) { throw; } + catch (RunnerNotFoundException) { throw; } + catch (HostedRunnerDeprovisionedException) { throw; } + catch (OperationCanceledException) { throw; } + catch (Exception ex) { return OperationOutcome.TransientFailure(ex.Message); } + }, cancellationToken); } public async Task AcknowledgeRunnerRequestAsync(string runnerRequestId, Guid? sessionId, TaskAgentStatus status, string version, string os, string architecture, CancellationToken cancellationToken) @@ -106,14 +117,5 @@ public Task ForceRefreshConnection(VssCredentials credentials) return Task.CompletedTask; } - public bool ShouldRetryException(Exception ex) - { - if (ex is AccessDeniedException || ex is VssUnauthorizedException || ex is RunnerNotFoundException || ex is HostedRunnerDeprovisionedException) - { - return false; - } - - return true; - } } } diff --git a/src/Runner.Common/JobServer.cs b/src/Runner.Common/JobServer.cs index eac20a2b983..5afb10f0892 100644 --- a/src/Runner.Common/JobServer.cs +++ b/src/Runner.Common/JobServer.cs @@ -61,33 +61,40 @@ public sealed class JobServer : RunnerService, IJobServer public async Task ConnectAsync(VssConnection jobConnection) { _connection = jobConnection; - int totalAttempts = 5; - int attemptCount = totalAttempts; var configurationStore = HostContext.GetService(); var runnerSettings = configurationStore.GetSettings(); - while (!_connection.HasAuthenticated && attemptCount-- > 0) + if (!_connection.HasAuthenticated) { - try - { - await _connection.ConnectAsync(); - break; - } - catch (Exception ex) when (attemptCount > 0) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - Trace.Info($"Catch exception during connect. {attemptCount} attempts left."); - Trace.Error(ex); - - if (runnerSettings.IsHostedServer) + MaxAttempts = 5, + GetBackoff = (attempt, _, _) => BackoffTimerHelper.GetExponentialBackoff(attempt, TimeSpan.FromMilliseconds(100), TimeSpan.FromSeconds(3.2), TimeSpan.FromMilliseconds(100)), + OnRetry = (context, ex, _) => { - await CheckNetworkEndpointsAsync(attemptCount); - } - } - - int attempt = totalAttempts - attemptCount; - TimeSpan backoff = BackoffTimerHelper.GetExponentialBackoff(attempt, TimeSpan.FromMilliseconds(100), TimeSpan.FromSeconds(3.2), TimeSpan.FromMilliseconds(100)); + Trace.Info($"Catch exception during connect. {context.MaxAttempts - context.AttemptNumber} attempts left."); + Trace.Error(ex); + }, + }); + + await retryHelper.ExecuteAsync( + operationName: nameof(ConnectAsync), + operation: async context => + { + try + { + await _connection.ConnectAsync(); + } + catch + { + if (runnerSettings.IsHostedServer && context.AttemptNumber < context.MaxAttempts) + { + await CheckNetworkEndpointsAsync(context.MaxAttempts - context.AttemptNumber); + } - await Task.Delay(backoff); + throw; + } + }); } _taskClient = _connection.GetClient(); diff --git a/src/Runner.Common/LocationServer.cs b/src/Runner.Common/LocationServer.cs index 25e09228332..ee324727456 100644 --- a/src/Runner.Common/LocationServer.cs +++ b/src/Runner.Common/LocationServer.cs @@ -23,21 +23,25 @@ public sealed class LocationServer : RunnerService, ILocationServer public async Task ConnectAsync(VssConnection jobConnection) { _connection = jobConnection; - int attemptCount = 5; - while (!_connection.HasAuthenticated && attemptCount-- > 0) + if (!_connection.HasAuthenticated) { - try + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - await _connection.ConnectAsync(); - break; - } - catch (Exception ex) when (attemptCount > 0) - { - Trace.Info($"Catch exception during connect. {attemptCount} attempt left."); - Trace.Error(ex); - } + MaxAttempts = 5, + GetBackoff = (_, _, _) => TimeSpan.FromMilliseconds(100), + OnRetry = (context, ex, _) => + { + Trace.Info($"Catch exception during connect. {context.MaxAttempts - context.AttemptNumber} attempt left."); + Trace.Error(ex); + }, + }); - await Task.Delay(100); + await retryHelper.ExecuteAsync( + operationName: nameof(ConnectAsync), + operation: async () => + { + await _connection.ConnectAsync(); + }); } _locationClient = _connection.GetClient(); diff --git a/src/Runner.Common/ResultsServer.cs b/src/Runner.Common/ResultsServer.cs index 8a1b35948c3..c9d592b8f64 100644 --- a/src/Runner.Common/ResultsServer.cs +++ b/src/Runner.Common/ResultsServer.cs @@ -53,6 +53,17 @@ public sealed class ResultServer : RunnerService, IResultsServer private String _liveConsoleFeedUrl; private string _token; + private ClientWebSocket CreateWebSocketClient(string accessToken) + { + var websocketClient = new ClientWebSocket(); + websocketClient.Options.SetRequestHeader("Authorization", $"Bearer {accessToken}"); + var userAgentValues = new List(); + userAgentValues.AddRange(UserAgentUtility.GetDefaultRestUserAgent()); + userAgentValues.AddRange(HostContext.UserAgents); + websocketClient.Options.SetRequestHeader("User-Agent", string.Join(" ", userAgentValues.Select(x => x.ToString()))); + return websocketClient; + } + public void InitializeResultsClient(Uri uri, string liveConsoleFeedUrl, string token, bool useSdk) { this._resultsClient = CreateHttpClient(uri, token, useSdk); @@ -179,12 +190,7 @@ private void InitializeWebsocketClient(string liveConsoleFeedUrl, string accessT } Trace.Info($"Creating websocket client ..." + liveConsoleFeedUrl); - this._websocketClient = new ClientWebSocket(); - this._websocketClient.Options.SetRequestHeader("Authorization", $"Bearer {accessToken}"); - var userAgentValues = new List(); - userAgentValues.AddRange(UserAgentUtility.GetDefaultRestUserAgent()); - userAgentValues.AddRange(HostContext.UserAgents); - this._websocketClient.Options.SetRequestHeader("User-Agent", string.Join(" ", userAgentValues.Select(x => x.ToString()))); + this._websocketClient = CreateWebSocketClient(accessToken); // during initialization, retry upto 3 times to setup connection this._websocketConnectTask = ConnectWebSocketClient(liveConsoleFeedUrl, delay, retryConnection); @@ -192,29 +198,34 @@ private void InitializeWebsocketClient(string liveConsoleFeedUrl, string accessT private async Task ConnectWebSocketClient(string feedStreamUrl, TimeSpan delay, bool retryConnection = false) { - bool connected = false; - int retries = 0; - - do + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - try + MaxAttempts = retryConnection ? 3 : 1, + GetBackoff = (_, _, _) => TimeSpan.Zero, + OnRetry = (_, ex, _) => + { + Trace.Info("Exception caught during websocket client connect, retry connection."); + Trace.Error(ex); + _lastConnectionFailure = DateTime.Now; + this._websocketClient = CreateWebSocketClient(_token); + }, + OnFailure = (_, _, _) => + { + this._websocketClient = null; + _lastConnectionFailure = DateTime.Now; + }, + }); + + await retryHelper.ExecuteAsync( + operationName: nameof(ConnectWebSocketClient), + operation: async () => { Trace.Info($"Attempting to start websocket client with delay {delay}."); await Task.Delay(delay); using var connectTimeoutTokenSource = new CancellationTokenSource(TimeSpan.FromSeconds(30)); await this._websocketClient.ConnectAsync(new Uri(feedStreamUrl), connectTimeoutTokenSource.Token); Trace.Info($"Successfully started websocket client."); - connected = true; - } - catch (Exception ex) - { - Trace.Info("Exception caught during websocket client connect, retry connection."); - Trace.Error(ex); - retries++; - this._websocketClient = null; - _lastConnectionFailure = DateTime.Now; - } - } while (retryConnection && !connected && retries < 3); + }); } public async Task AppendLiveConsoleFeedAsync(Guid scopeIdentifier, string hubName, Guid planId, Guid timelineId, Guid timelineRecordId, Guid stepId, IList lines, long? startLine, CancellationToken cancellationToken) diff --git a/src/Runner.Common/RetryBackoffs.cs b/src/Runner.Common/RetryBackoffs.cs new file mode 100644 index 00000000000..1f08125bb53 --- /dev/null +++ b/src/Runner.Common/RetryBackoffs.cs @@ -0,0 +1,117 @@ +using System; + +namespace GitHub.Runner.Common +{ + public static class RetryBackoffs + { + public static Func Fixed(TimeSpan delay) + { + if (delay < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException(nameof(delay)); + } + + return (_, _, _) => delay; + } + + public static Func Random(TimeSpan minDelay, TimeSpan maxDelay) + { + ValidateRange(minDelay, maxDelay); + + return (_, _, _) => NextDelay(minDelay, maxDelay); + } + + public static Func Exponential(TimeSpan minDelay, TimeSpan maxDelay, TimeSpan deltaDelay) + { + ValidateRange(minDelay, maxDelay); + if (deltaDelay < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException(nameof(deltaDelay)); + } + + return (attempt, _, _) => + { + var exponent = Math.Max(0, attempt); + var additional = (Math.Pow(2.0, exponent) - 1.0) * deltaDelay.TotalMilliseconds; + var delayMs = Math.Min(minDelay.TotalMilliseconds + additional, maxDelay.TotalMilliseconds); + return TimeSpan.FromMilliseconds(Math.Max(0, delayMs)); + }; + } + + public static Func ExponentialFullJitter(TimeSpan minDelay, TimeSpan maxDelay, TimeSpan deltaDelay) + { + ValidateRange(minDelay, maxDelay); + if (deltaDelay < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException(nameof(deltaDelay)); + } + + return (attempt, _, _) => + { + var exponent = Math.Max(0, attempt); + var additional = (Math.Pow(2.0, exponent) - 1.0) * deltaDelay.TotalMilliseconds; + var capMs = Math.Min(minDelay.TotalMilliseconds + additional, maxDelay.TotalMilliseconds); + var floorMs = minDelay.TotalMilliseconds; + + if (capMs <= floorMs) + { + return TimeSpan.FromMilliseconds(floorMs); + } + + var jitterMs = global::System.Random.Shared.NextDouble() * (capMs - floorMs); + return TimeSpan.FromMilliseconds(floorMs + jitterMs); + }; + } + + public static Func DecorrelatedJitter(TimeSpan minDelay, TimeSpan maxDelay) + { + ValidateRange(minDelay, maxDelay); + + var previous = minDelay; + return (_, _, _) => + { + var upperBoundMs = Math.Min(maxDelay.TotalMilliseconds, previous.TotalMilliseconds * 3); + if (upperBoundMs <= minDelay.TotalMilliseconds) + { + previous = minDelay; + return previous; + } + + var nextMs = minDelay.TotalMilliseconds + (global::System.Random.Shared.NextDouble() * (upperBoundMs - minDelay.TotalMilliseconds)); + previous = TimeSpan.FromMilliseconds(nextMs); + return previous; + }; + } + + private static void ValidateRange(TimeSpan minDelay, TimeSpan maxDelay) + { + if (minDelay < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException(nameof(minDelay)); + } + + if (maxDelay < TimeSpan.Zero) + { + throw new ArgumentOutOfRangeException(nameof(maxDelay)); + } + + if (maxDelay < minDelay) + { + throw new ArgumentOutOfRangeException(nameof(maxDelay)); + } + } + + private static TimeSpan NextDelay(TimeSpan minDelay, TimeSpan maxDelay) + { + if (minDelay == maxDelay) + { + return minDelay; + } + + var minMs = minDelay.TotalMilliseconds; + var rangeMs = maxDelay.TotalMilliseconds - minMs; + var jitterMs = global::System.Random.Shared.NextDouble() * rangeMs; + return TimeSpan.FromMilliseconds(minMs + jitterMs); + } + } +} diff --git a/src/Runner.Common/RetryHelper.cs b/src/Runner.Common/RetryHelper.cs new file mode 100644 index 00000000000..e564d46e19d --- /dev/null +++ b/src/Runner.Common/RetryHelper.cs @@ -0,0 +1,233 @@ +using System; +using System.Diagnostics; +using System.Threading; +using System.Threading.Tasks; +using GitHub.Runner.Sdk; + +namespace GitHub.Runner.Common +{ + public sealed class RetryExecutionContext + { + public RetryExecutionContext(string operationName, int attemptNumber, int maxAttempts, DateTime startTimeUtc) + { + OperationName = operationName; + AttemptNumber = attemptNumber; + MaxAttempts = maxAttempts; + StartTimeUtc = startTimeUtc; + } + + public string OperationName { get; } + public int AttemptNumber { get; } + public int MaxAttempts { get; } + public DateTime StartTimeUtc { get; } + } + + public sealed class RetryStrategy + { + public int MaxAttempts { get; init; } + public Func GetBackoff { get; init; } + public Action OnRetry { get; init; } + public Action OnSuccess { get; init; } + public Action OnFailure { get; init; } + } + + public abstract record OperationOutcome + { + public sealed record Success(T Value) : OperationOutcome; + public sealed record TransientFailure(string Reason) : OperationOutcome; + } + + public static class OperationOutcome + { + public static OperationOutcome Success(T value) => new OperationOutcome.Success(value); + + public static OperationOutcome TransientFailure(string reason) => new OperationOutcome.TransientFailure(reason); + } + + public sealed class RetryHelper + { + private readonly Tracing _trace; + private readonly RetryStrategy _strategy; + + public RetryHelper(Tracing trace, RetryStrategy strategy) + { + _trace = trace ?? throw new ArgumentNullException(nameof(trace)); + _strategy = strategy ?? throw new ArgumentNullException(nameof(strategy)); + + ArgUtil.NotNull(strategy.GetBackoff, $"{nameof(strategy)}.{nameof(strategy.GetBackoff)}"); + + if (strategy.MaxAttempts <= 0) + { + throw new ArgumentOutOfRangeException($"{nameof(strategy)}.{nameof(strategy.MaxAttempts)}"); + } + } + + public async Task ExecuteAsync( + Func> operation, + CancellationToken cancellationToken = default) + { + return await ExecuteAsync("operation", operation, cancellationToken); + } + + public async Task ExecuteAsync( + Func> operation, + CancellationToken cancellationToken = default) + { + return await ExecuteAsync("operation", operation, cancellationToken); + } + + public async Task ExecuteAsync( + Func operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + await ExecuteAsync( + "operation", + async _ => + { + await operation(); + return true; + }, + cancellationToken); + } + + public async Task ExecuteAsync( + Func operation, + CancellationToken cancellationToken = default) + { + await ExecuteAsync("operation", operation, cancellationToken); + } + + public async Task ExecuteAsync( + string operationName, + Func operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + await ExecuteAsync( + operationName, + async _ => + { + await operation(); + return true; + }, + cancellationToken); + } + + public async Task ExecuteAsync( + string operationName, + Func operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + await ExecuteAsync( + operationName, + async context => + { + await operation(context); + return true; + }, + cancellationToken); + } + + public async Task ExecuteAsync( + string operationName, + Func> operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + return await ExecuteAsync(operationName, _ => operation(), cancellationToken); + } + + public async Task ExecuteAsync( + string operationName, + Func> operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + + operationName ??= "operation"; + + var attempt = 0; + var startTimeUtc = DateTime.UtcNow; + var stopwatch = Stopwatch.StartNew(); + while (true) + { + attempt++; + cancellationToken.ThrowIfCancellationRequested(); + var context = new RetryExecutionContext(operationName, attempt, _strategy.MaxAttempts, startTimeUtc); + + try + { + var result = await operation(context); + _strategy.OnSuccess?.Invoke(context, stopwatch.Elapsed); + return result; + } + catch (Exception ex) + { + if (attempt >= _strategy.MaxAttempts) + { + _trace.Error($"[{operationName}] retry exhausted at attempt {attempt}/{_strategy.MaxAttempts}"); + _strategy.OnFailure?.Invoke(context, ex, stopwatch.Elapsed); + throw; + } + + var backoff = _strategy.GetBackoff(attempt, _strategy.MaxAttempts, ex); + _strategy.OnRetry?.Invoke(context, ex, backoff); + await Task.Delay(backoff, cancellationToken); + } + } + } + public async Task ExecuteAsync( + string operationName, + Func>> operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + return await ExecuteAsync(operationName, _ => operation(), cancellationToken); + } + + public async Task ExecuteAsync( + string operationName, + Func>> operation, + CancellationToken cancellationToken = default) + { + ArgUtil.NotNull(operation, nameof(operation)); + + operationName ??= "operation"; + + var attempt = 0; + var startTimeUtc = DateTime.UtcNow; + var stopwatch = Stopwatch.StartNew(); + string lastFailureReason = null; + + while (true) + { + attempt++; + cancellationToken.ThrowIfCancellationRequested(); + var context = new RetryExecutionContext(operationName, attempt, _strategy.MaxAttempts, startTimeUtc); + + var outcome = await operation(context); + + if (outcome is OperationOutcome.Success success) + { + _strategy.OnSuccess?.Invoke(context, stopwatch.Elapsed); + return success.Value; + } + + lastFailureReason = ((OperationOutcome.TransientFailure)outcome).Reason; + + if (attempt >= _strategy.MaxAttempts) + { + _trace.Error($"[{operationName}] retry exhausted at attempt {attempt}/{_strategy.MaxAttempts}: {lastFailureReason}"); + _strategy.OnFailure?.Invoke(context, null, stopwatch.Elapsed); + throw new InvalidOperationException($"[{operationName}] failed after {attempt} attempt(s): {lastFailureReason}"); + } + + var backoff = _strategy.GetBackoff(attempt, _strategy.MaxAttempts, null); + _strategy.OnRetry?.Invoke(context, null, backoff); + await Task.Delay(backoff, cancellationToken); + } + } + } +} diff --git a/src/Runner.Common/RunServer.cs b/src/Runner.Common/RunServer.cs index cfdb98cb909..3565b4c1c99 100644 --- a/src/Runner.Common/RunServer.cs +++ b/src/Runner.Common/RunServer.cs @@ -64,11 +64,19 @@ public Task GetJobMessageAsync(string id, string billing { CheckConnection(); return RetryRequest( - async () => await _runServiceHttpClient.GetJobMessageAsync(requestUri, id, VarUtil.OS, billingOwnerId, cancellationToken), cancellationToken, - shouldRetry: ex => - ex is not TaskOrchestrationJobNotFoundException && // HTTP status 404 - ex is not TaskOrchestrationJobAlreadyAcquiredException && // HTTP status 409 - ex is not TaskOrchestrationJobUnprocessableException); // HTTP status 422 + async () => + { + try + { + return OperationOutcome.Success( + await _runServiceHttpClient.GetJobMessageAsync(requestUri, id, VarUtil.OS, billingOwnerId, cancellationToken)); + } + catch (TaskOrchestrationJobNotFoundException) { throw; } // HTTP status 404 + catch (TaskOrchestrationJobAlreadyAcquiredException) { throw; } // HTTP status 409 + catch (TaskOrchestrationJobUnprocessableException) { throw; } // HTTP status 422 + catch (OperationCanceledException) { throw; } + catch (Exception ex) { return OperationOutcome.TransientFailure(ex.Message); } + }, cancellationToken); } public Task CompleteJobAsync( @@ -86,19 +94,35 @@ public Task CompleteJobAsync( { CheckConnection(); return RetryRequest( - async () => await _runServiceHttpClient.CompleteJobAsync(requestUri, planId, jobId, result, outputs, stepResults, jobAnnotations, environmentUrl, telemetry, billingOwnerId, infrastructureFailureCategory, cancellationToken), cancellationToken, - shouldRetry: ex => - ex is not VssUnauthorizedException && // HTTP status 401 - ex is not TaskOrchestrationJobNotFoundException); // HTTP status 404 + async () => + { + try + { + await _runServiceHttpClient.CompleteJobAsync(requestUri, planId, jobId, result, outputs, stepResults, jobAnnotations, environmentUrl, telemetry, billingOwnerId, infrastructureFailureCategory, cancellationToken); + return OperationOutcome.Success(true); + } + catch (VssUnauthorizedException) { throw; } // HTTP status 401 + catch (TaskOrchestrationJobNotFoundException) { throw; } // HTTP status 404 + catch (OperationCanceledException) { throw; } + catch (Exception ex) { return OperationOutcome.TransientFailure(ex.Message); } + }, cancellationToken); } public Task RenewJobAsync(Guid planId, Guid jobId, CancellationToken cancellationToken) { CheckConnection(); return RetryRequest( - async () => await _runServiceHttpClient.RenewJobAsync(requestUri, planId, jobId, cancellationToken), cancellationToken, - shouldRetry: ex => - ex is not TaskOrchestrationJobNotFoundException); // HTTP status 404 + async () => + { + try + { + return OperationOutcome.Success( + await _runServiceHttpClient.RenewJobAsync(requestUri, planId, jobId, cancellationToken)); + } + catch (TaskOrchestrationJobNotFoundException) { throw; } // HTTP status 404 + catch (OperationCanceledException) { throw; } + catch (Exception ex) { return OperationOutcome.TransientFailure(ex.Message); } + }, cancellationToken); } } } diff --git a/src/Runner.Common/RunnerDotcomServer.cs b/src/Runner.Common/RunnerDotcomServer.cs index c021e8bcc23..3d4e2d87ba4 100644 --- a/src/Runner.Common/RunnerDotcomServer.cs +++ b/src/Runner.Common/RunnerDotcomServer.cs @@ -111,19 +111,27 @@ public async Task DeleteRunnerAsync(string githubUrl, string githubToken, ulong private async Task RetryRequest(string githubApiUrl, string githubToken, RequestType requestType, int maxRetryAttemptsCount = 5, string errorMessage = null, StringContent body = null) { - int retry = 0; - while (true) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - retry++; - using (var httpClientHandler = HostContext.CreateHttpClientHandler()) - using (var httpClient = new HttpClient(httpClientHandler)) + MaxAttempts = maxRetryAttemptsCount, + GetBackoff = RetryBackoffs.Random(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), + OnRetry = (context, _, backoff) => { - httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("RemoteAuth", githubToken); - httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); - - var responseStatus = System.Net.HttpStatusCode.OK; - try + Trace.Error($"{errorMessage} -- Attempt: {context.AttemptNumber}"); + Trace.Info($"Retrying in {backoff.Seconds} seconds"); + }, + }); + + return await retryHelper.ExecuteAsync( + operationName: "RunnerDotcomServer.RetryRequest", + operation: async () => + { + using (var httpClientHandler = HostContext.CreateHttpClientHandler()) + using (var httpClient = new HttpClient(httpClientHandler)) { + httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("RemoteAuth", githubToken); + httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); + HttpResponseMessage response = null; switch (requestType) { @@ -145,35 +153,32 @@ private async Task RetryRequest(string githubApiUrl, string githubToken, R if (response != null) { - responseStatus = response.StatusCode; var githubRequestId = UrlUtil.GetGitHubRequestId(response.Headers); if (response.IsSuccessStatusCode) { Trace.Info($"Http response code: {response.StatusCode} from '{requestType.ToString()} {githubApiUrl}' ({githubRequestId})"); var jsonResponse = await response.Content.ReadAsStringAsync(); - return StringUtil.ConvertFromJson(jsonResponse); + return OperationOutcome.Success(StringUtil.ConvertFromJson(jsonResponse)); } else { _term.WriteError($"Http response code: {response.StatusCode} from '{requestType.ToString()} {githubApiUrl}' (Request Id: {githubRequestId})"); var errorResponse = await response.Content.ReadAsStringAsync(); _term.WriteError(errorResponse); - response.EnsureSuccessStatusCode(); + + if (response.StatusCode == System.Net.HttpStatusCode.NotFound) + { + response.EnsureSuccessStatusCode(); + } + + return OperationOutcome.TransientFailure($"Http response code: {response.StatusCode} from '{requestType} {githubApiUrl}'"); } } + throw new InvalidOperationException($"Unable to process response from '{requestType} {githubApiUrl}'."); } - catch (Exception ex) when (retry < maxRetryAttemptsCount && responseStatus != System.Net.HttpStatusCode.NotFound) - { - Trace.Error($"{errorMessage} -- Attempt: {retry}"); - Trace.Error(ex); - } - } - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)); - Trace.Info($"Retrying in {backOff.Seconds} seconds"); - await Task.Delay(backOff); - } + }); } private string GetEntityUrl(string githubUrl) diff --git a/src/Runner.Common/RunnerService.cs b/src/Runner.Common/RunnerService.cs index ccaa83f698f..d5d308e09ad 100644 --- a/src/Runner.Common/RunnerService.cs +++ b/src/Runner.Common/RunnerService.cs @@ -46,67 +46,51 @@ protected async Task EstablishVssConnection(Uri serverUrl, VssCre { Trace.Info($"EstablishVssConnection"); Trace.Info($"Establish connection with {timeout.TotalSeconds} seconds timeout."); - int attemptCount = 5; - while (attemptCount-- > 0) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - var connection = VssUtil.CreateConnection(serverUrl, credentials, timeout: timeout); - try + MaxAttempts = 5, + GetBackoff = RetryBackoffs.Fixed(TimeSpan.FromMilliseconds(100)), + OnRetry = (context, ex, _) => { - await connection.ConnectAsync(); - return connection; - } - catch (Exception ex) when (attemptCount > 0) - { - Trace.Info($"Catch exception during connect. {attemptCount} attempt left."); + Trace.Info($"Catch exception during connect. {context.MaxAttempts - context.AttemptNumber} attempt left."); Trace.Error(ex); + }, + }); - await HostContext.Delay(TimeSpan.FromMilliseconds(100), CancellationToken.None); - } - } - - // should never reach here. - throw new InvalidOperationException(nameof(EstablishVssConnection)); + return await retryHelper.ExecuteAsync( + operationName: nameof(EstablishVssConnection), + operation: async () => + { + var connection = VssUtil.CreateConnection(serverUrl, credentials, timeout: timeout); + await connection.ConnectAsync(); + return connection; + }); } - protected async Task RetryRequest(Func func, + protected async Task RetryRequest(Func>> func, CancellationToken cancellationToken, - int maxAttempts = 5, - Func shouldRetry = null + int maxAttempts = 5 ) { - async Task wrappedFunc() - { - await func(); - return Unit.Value; - } - await RetryRequest(wrappedFunc, cancellationToken, maxAttempts, shouldRetry); + await RetryRequest(func, cancellationToken, maxAttempts); } - protected async Task RetryRequest(Func> func, + protected async Task RetryRequest(Func>> func, CancellationToken cancellationToken, - int maxAttempts = 5, - Func shouldRetry = null + int maxAttempts = 5 ) { - var attempt = 0; - while (true) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - attempt++; - cancellationToken.ThrowIfCancellationRequested(); - try - { - return await func(); - } - // TODO: Add handling of non-retriable exceptions: https://github.com/github/actions-broker/issues/122 - catch (Exception ex) when (attempt < maxAttempts && (shouldRetry == null || shouldRetry(ex))) + MaxAttempts = maxAttempts, + GetBackoff = RetryBackoffs.Random(TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(15)), + OnRetry = (context, _, backoff) => { - Trace.Error("Catch exception during request"); - Trace.Error(ex); - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(5), TimeSpan.FromSeconds(15)); - Trace.Warning($"Back off {backOff.TotalSeconds} seconds before next retry. {maxAttempts - attempt} attempt left."); - await Task.Delay(backOff, cancellationToken); - } - } + Trace.Warning($"Transient failure during request, retrying. Attempt {context.AttemptNumber}/{context.MaxAttempts}. Back off {backoff.TotalSeconds} seconds."); + }, + }); + + return await retryHelper.ExecuteAsync("RetryRequest", func, cancellationToken); } } } diff --git a/src/Runner.Listener/Configuration/ConfigurationManager.cs b/src/Runner.Listener/Configuration/ConfigurationManager.cs index 360728a1c9b..fd44ab82652 100644 --- a/src/Runner.Listener/Configuration/ConfigurationManager.cs +++ b/src/Runner.Listener/Configuration/ConfigurationManager.cs @@ -775,51 +775,54 @@ private async Task GetJITRunnerTokenAsync(string gith throw new ArgumentException($"'{githubUrl}' should point to an org or repository."); } - int retryCount = 0; - while (retryCount < 3) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - using (var httpClientHandler = HostContext.CreateHttpClientHandler()) - using (var httpClient = new HttpClient(httpClientHandler)) + MaxAttempts = 3, + GetBackoff = RetryBackoffs.Random(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), + OnRetry = (context, _, backoff) => { - var base64EncodingToken = Convert.ToBase64String(Encoding.UTF8.GetBytes($"github:{githubToken}")); - HostContext.SecretMasker.AddValue(base64EncodingToken); - httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("basic", base64EncodingToken); - httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); - httpClient.DefaultRequestHeaders.Accept.ParseAdd("application/vnd.github.v3+json"); + Trace.Error($"Failed to get JIT runner token -- Attempt: {context.AttemptNumber}"); + Trace.Info($"Retrying in {backoff.Seconds} seconds"); + }, + }); - var responseStatus = System.Net.HttpStatusCode.OK; - try + return await retryHelper.ExecuteAsync( + operationName: "GetJITRunnerTokenAsync", + operation: async () => + { + using (var httpClientHandler = HostContext.CreateHttpClientHandler()) + using (var httpClient = new HttpClient(httpClientHandler)) { + var base64EncodingToken = Convert.ToBase64String(Encoding.UTF8.GetBytes($"github:{githubToken}")); + HostContext.SecretMasker.AddValue(base64EncodingToken); + httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("basic", base64EncodingToken); + httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); + httpClient.DefaultRequestHeaders.Accept.ParseAdd("application/vnd.github.v3+json"); + var response = await httpClient.PostAsync(githubApiUrl, new StringContent(string.Empty)); - responseStatus = response.StatusCode; var githubRequestId = UrlUtil.GetGitHubRequestId(response.Headers); if (response.IsSuccessStatusCode) { Trace.Info($"Http response code: {response.StatusCode} from 'POST {githubApiUrl}' ({githubRequestId})"); var jsonResponse = await response.Content.ReadAsStringAsync(); - return StringUtil.ConvertFromJson(jsonResponse); + return OperationOutcome.Success(StringUtil.ConvertFromJson(jsonResponse)); } else { _term.WriteError($"Http response code: {response.StatusCode} from 'POST {githubApiUrl}' (Request Id: {githubRequestId})"); var errorResponse = await response.Content.ReadAsStringAsync(); _term.WriteError(errorResponse); - response.EnsureSuccessStatusCode(); + + if (response.StatusCode == System.Net.HttpStatusCode.NotFound) + { + response.EnsureSuccessStatusCode(); + } + + return OperationOutcome.TransientFailure($"Http response code: {response.StatusCode} from 'POST {githubApiUrl}'"); } } - catch (Exception ex) when (retryCount < 2 && responseStatus != System.Net.HttpStatusCode.NotFound) - { - retryCount++; - Trace.Error($"Failed to get JIT runner token -- Attempt: {retryCount}"); - Trace.Error(ex); - } - } - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)); - Trace.Info($"Retrying in {backOff.Seconds} seconds"); - await Task.Delay(backOff); - } - return null; + }); } private async Task GetTenantCredential(string githubUrl, string githubToken, string runnerEvent) @@ -835,54 +838,57 @@ private async Task GetTenantCredential(string githubUrl, strin githubApiUrl = $"{gitHubUrlBuilder.Scheme}://{gitHubUrlBuilder.Host}/api/v3/actions/runner-registration"; } - int retryCount = 0; - while (retryCount < 3) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - using (var httpClientHandler = HostContext.CreateHttpClientHandler()) - using (var httpClient = new HttpClient(httpClientHandler)) + MaxAttempts = 3, + GetBackoff = RetryBackoffs.Random(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)), + OnRetry = (context, _, backoff) => { - httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("RemoteAuth", githubToken); - httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); + Trace.Error($"Failed to get tenant credentials -- Attempt: {context.AttemptNumber}"); + Trace.Info($"Retrying in {backoff.Seconds} seconds"); + }, + }); - var bodyObject = new Dictionary() - { - {"url", githubUrl}, - {"runner_event", runnerEvent} - }; + var bodyObject = new Dictionary() + { + {"url", githubUrl}, + {"runner_event", runnerEvent} + }; - var responseStatus = System.Net.HttpStatusCode.OK; - try + return await retryHelper.ExecuteAsync( + operationName: "GetTenantCredential", + operation: async () => + { + using (var httpClientHandler = HostContext.CreateHttpClientHandler()) + using (var httpClient = new HttpClient(httpClientHandler)) { + httpClient.DefaultRequestHeaders.Authorization = new AuthenticationHeaderValue("RemoteAuth", githubToken); + httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); + var response = await httpClient.PostAsync(githubApiUrl, new StringContent(StringUtil.ConvertToJson(bodyObject), null, "application/json")); - responseStatus = response.StatusCode; var githubRequestId = UrlUtil.GetGitHubRequestId(response.Headers); if (response.IsSuccessStatusCode) { Trace.Info($"Http response code: {response.StatusCode} from 'POST {githubApiUrl}' ({githubRequestId})"); var jsonResponse = await response.Content.ReadAsStringAsync(); - return StringUtil.ConvertFromJson(jsonResponse); + return OperationOutcome.Success(StringUtil.ConvertFromJson(jsonResponse)); } else { _term.WriteError($"Http response code: {response.StatusCode} from 'POST {githubApiUrl}' (Request Id: {githubRequestId})"); var errorResponse = await response.Content.ReadAsStringAsync(); _term.WriteError(errorResponse); - response.EnsureSuccessStatusCode(); + + if (response.StatusCode == System.Net.HttpStatusCode.NotFound) + { + response.EnsureSuccessStatusCode(); + } + + return OperationOutcome.TransientFailure($"Http response code: {response.StatusCode} from 'POST {githubApiUrl}'"); } } - catch (Exception ex) when (retryCount < 2 && responseStatus != System.Net.HttpStatusCode.NotFound) - { - retryCount++; - Trace.Error($"Failed to get tenant credentials -- Attempt: {retryCount}"); - Trace.Error(ex); - } - } - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(5)); - Trace.Info($"Retrying in {backOff.Seconds} seconds"); - await Task.Delay(backOff); - } - return null; + }); } } } diff --git a/src/Runner.Listener/JobDispatcher.cs b/src/Runner.Listener/JobDispatcher.cs index 10076441eff..e5161154e04 100644 --- a/src/Runner.Listener/JobDispatcher.cs +++ b/src/Runner.Listener/JobDispatcher.cs @@ -1126,38 +1126,57 @@ private async Task CompleteJobRequestAsync(int poolId, Pipelines.AgentJobRequest } var runnerServer = HostContext.GetService(); - int completeJobRequestRetryLimit = 5; - List exceptions = new(); - while (completeJobRequestRetryLimit-- > 0) + var exceptions = new List(); + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - try - { - await runnerServer.FinishAgentRequestAsync(poolId, message.RequestId, lockToken, DateTime.UtcNow, result, CancellationToken.None); - return; - } - catch (TaskAgentJobNotFoundException) - { - Trace.Info($"TaskAgentJobNotFoundException received, job {message.JobId} is no longer valid."); - return; - } - catch (TaskAgentJobTokenExpiredException) - { - Trace.Info($"TaskAgentJobTokenExpiredException received, job {message.JobId} is no longer valid."); - return; - } - catch (Exception ex) + MaxAttempts = 5, + GetBackoff = RetryBackoffs.Fixed(TimeSpan.FromSeconds(5)), + OnRetry = (context, ex, _) => { Trace.Error($"Catch exception during complete runner jobrequest {message.RequestId}."); Trace.Error(ex); exceptions.Add(ex); - } + }, + OnFailure = (_, ex, _) => + { + if (ex != null) + { + exceptions.Add(ex); + } + }, + }); - // delay 5 seconds before next retry. - await Task.Delay(TimeSpan.FromSeconds(5)); + var completed = await retryHelper.ExecuteAsync( + operationName: nameof(CompleteJobRequestAsync), + operation: async _ => + { + try + { + await runnerServer.FinishAgentRequestAsync(poolId, message.RequestId, lockToken, DateTime.UtcNow, result, CancellationToken.None); + return true; + } + catch (TaskAgentJobNotFoundException) + { + Trace.Info($"TaskAgentJobNotFoundException received, job {message.JobId} is no longer valid."); + return false; + } + catch (TaskAgentJobTokenExpiredException) + { + Trace.Info($"TaskAgentJobTokenExpiredException received, job {message.JobId} is no longer valid."); + return false; + } + }, + cancellationToken: CancellationToken.None); + + if (completed) + { + return; } - // rethrow all catched exceptions during retry. - throw new AggregateException(exceptions); + if (exceptions.Count > 0) + { + throw new AggregateException(exceptions); + } } // log an error issue to job level timeline record diff --git a/src/Runner.Listener/SelfUpdater.cs b/src/Runner.Listener/SelfUpdater.cs index 9cf6ae8a9e9..0b7c8fa7530 100644 --- a/src/Runner.Listener/SelfUpdater.cs +++ b/src/Runner.Listener/SelfUpdater.cs @@ -269,110 +269,119 @@ private async Task DownLoadRunner(string downloadDirectory, string packa { var stopWatch = Stopwatch.StartNew(); int runnerSuffix = 1; - string archiveFile = null; - bool downloadSucceeded = false; + var retryHelper = new RetryHelper(Trace, new RetryStrategy + { + MaxAttempts = Constants.RunnerDownloadRetryMaxAttempts, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }); - // Download the runner, using multiple attempts in order to be resilient against any networking/CDN issues - for (int attempt = 1; attempt <= Constants.RunnerDownloadRetryMaxAttempts; attempt++) + try { - // Generate an available package name, and do our best effort to clean up stale local zip files - while (true) - { - if (_targetPackage.Platform.StartsWith("win")) + return await retryHelper.ExecuteAsync( + "DownLoadRunner", + async (context) => { - archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.zip"); - } - else - { - archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.tar.gz"); - } + string archiveFile = null; - try - { - // delete .zip file - if (!string.IsNullOrEmpty(archiveFile) && File.Exists(archiveFile)) + // Generate an available package name, and do our best effort to clean up stale local zip files. + while (true) { - Trace.Verbose("Deleting latest runner package zip '{0}'", archiveFile); - IOUtil.DeleteFile(archiveFile); - } - - break; - } - catch (Exception ex) - { - // couldn't delete the file for whatever reason, so generate another name - Trace.Warning("Failed to delete runner package zip '{0}'. Exception: {1}", archiveFile, ex); - runnerSuffix++; - } - } + if (_targetPackage.Platform.StartsWith("win")) + { + archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.zip"); + } + else + { + archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.tar.gz"); + } - // Allow a 15-minute package download timeout, which is good enough to update the runner from a 1 Mbit/s ADSL connection. - if (!int.TryParse(Environment.GetEnvironmentVariable("GITHUB_ACTIONS_RUNNER_DOWNLOAD_TIMEOUT") ?? string.Empty, out int timeoutSeconds)) - { - timeoutSeconds = 15 * 60; - } + try + { + // delete .zip file + if (!string.IsNullOrEmpty(archiveFile) && File.Exists(archiveFile)) + { + Trace.Verbose("Deleting latest runner package zip '{0}'", archiveFile); + IOUtil.DeleteFile(archiveFile); + } + + break; + } + catch (Exception ex) + { + // couldn't delete the file for whatever reason, so generate another name + Trace.Warning("Failed to delete runner package zip '{0}'. Exception: {1}", archiveFile, ex); + runnerSuffix++; + } + } - Trace.Info($"Attempt {attempt}: save latest runner into {archiveFile}."); + // Allow a 15-minute package download timeout, which is good enough to update the runner from a 1 Mbit/s ADSL connection. + if (!int.TryParse(Environment.GetEnvironmentVariable("GITHUB_ACTIONS_RUNNER_DOWNLOAD_TIMEOUT") ?? string.Empty, out int timeoutSeconds)) + { + timeoutSeconds = 15 * 60; + } - using (var downloadTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds))) - using (var downloadCts = CancellationTokenSource.CreateLinkedTokenSource(downloadTimeout.Token, token)) - { - try - { - Trace.Info($"Download runner: begin download"); - long downloadSize = 0; + Trace.Info($"Attempt {context.AttemptNumber}: save latest runner into {archiveFile}."); - //open zip stream in async mode - using (HttpClient httpClient = new(HostContext.CreateHttpClientHandler())) + using (var downloadTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds))) + using (var downloadCts = CancellationTokenSource.CreateLinkedTokenSource(downloadTimeout.Token, token)) { - if (!string.IsNullOrEmpty(_targetPackage.Token)) + try { - Trace.Info($"Adding authorization token ({_targetPackage.Token.Length} chars)"); - httpClient.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", _targetPackage.Token); + Trace.Info($"Download runner: begin download"); + long downloadSize = 0; + + //open zip stream in async mode + using (HttpClient httpClient = new(HostContext.CreateHttpClientHandler())) + { + if (!string.IsNullOrEmpty(_targetPackage.Token)) + { + Trace.Info($"Adding authorization token ({_targetPackage.Token.Length} chars)"); + httpClient.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Bearer", _targetPackage.Token); + } + + Trace.Info($"Downloading {packageDownloadUrl}"); + + using (FileStream fs = new(archiveFile, FileMode.Create, FileAccess.Write, FileShare.None, bufferSize: 4096, useAsync: true)) + using (Stream result = await httpClient.GetStreamAsync(packageDownloadUrl)) + { + //81920 is the default used by System.IO.Stream.CopyTo and is under the large object heap threshold (85k). + await result.CopyToAsync(fs, 81920, downloadCts.Token); + await fs.FlushAsync(downloadCts.Token); + downloadSize = fs.Length; + } + } + + Trace.Info($"Download runner: finished download"); + stopWatch.Stop(); + _updateTrace.Enqueue($"PackageDownloadTime: {stopWatch.ElapsedMilliseconds}ms"); + _updateTrace.Enqueue($"Attempts: {context.AttemptNumber}"); + _updateTrace.Enqueue($"PackageSize: {downloadSize / 1024 / 1024}MB"); + return archiveFile; } - - Trace.Info($"Downloading {packageDownloadUrl}"); - - using (FileStream fs = new(archiveFile, FileMode.Create, FileAccess.Write, FileShare.None, bufferSize: 4096, useAsync: true)) - using (Stream result = await httpClient.GetStreamAsync(packageDownloadUrl)) + catch (OperationCanceledException) when (token.IsCancellationRequested) { - //81920 is the default used by System.IO.Stream.CopyTo and is under the large object heap threshold (85k). - await result.CopyToAsync(fs, 81920, downloadCts.Token); - await fs.FlushAsync(downloadCts.Token); - downloadSize = fs.Length; + Trace.Info($"Runner download has been cancelled."); + throw; } - } + catch (Exception ex) + { + if (downloadCts.Token.IsCancellationRequested) + { + Trace.Warning($"Runner download has timed out after {timeoutSeconds} seconds"); + } - Trace.Info($"Download runner: finished download"); - downloadSucceeded = true; - stopWatch.Stop(); - _updateTrace.Enqueue($"PackageDownloadTime: {stopWatch.ElapsedMilliseconds}ms"); - _updateTrace.Enqueue($"Attempts: {attempt}"); - _updateTrace.Enqueue($"PackageSize: {downloadSize / 1024 / 1024}MB"); - break; - } - catch (OperationCanceledException) when (token.IsCancellationRequested) - { - Trace.Info($"Runner download has been cancelled."); - throw; - } - catch (Exception ex) - { - if (downloadCts.Token.IsCancellationRequested) - { - Trace.Warning($"Runner download has timed out after {timeoutSeconds} seconds"); + Trace.Warning($"Failed to get package '{archiveFile}' from '{packageDownloadUrl}'. Exception {ex}"); + throw; + } } - - Trace.Warning($"Failed to get package '{archiveFile}' from '{packageDownloadUrl}'. Exception {ex}"); - } - } + }, + cancellationToken: token); } - - if (downloadSucceeded) + catch (OperationCanceledException) when (token.IsCancellationRequested) { - return archiveFile; + throw; } - else + catch (Exception) { return null; } diff --git a/src/Runner.Listener/SelfUpdaterV2.cs b/src/Runner.Listener/SelfUpdaterV2.cs index 78a2acdd3ad..00daeef3cff 100644 --- a/src/Runner.Listener/SelfUpdaterV2.cs +++ b/src/Runner.Listener/SelfUpdaterV2.cs @@ -234,104 +234,113 @@ private async Task DownLoadRunner(string downloadDirectory, string packa { var stopWatch = Stopwatch.StartNew(); int runnerSuffix = 1; - string archiveFile = null; - bool downloadSucceeded = false; + var retryHelper = new RetryHelper(Trace, new RetryStrategy + { + MaxAttempts = Constants.RunnerDownloadRetryMaxAttempts, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }); - // Download the runner, using multiple attempts in order to be resilient against any networking/CDN issues - for (int attempt = 1; attempt <= Constants.RunnerDownloadRetryMaxAttempts; attempt++) + try { - // Generate an available package name, and do our best effort to clean up stale local zip files - while (true) - { - if (packagePlatform.StartsWith("win")) - { - archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.zip"); - } - else + return await retryHelper.ExecuteAsync( + "DownLoadRunner", + async (context) => { - archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.tar.gz"); - } + string archiveFile = null; - try - { - // delete .zip file - if (!string.IsNullOrEmpty(archiveFile) && File.Exists(archiveFile)) + // Generate an available package name, and do our best effort to clean up stale local zip files. + while (true) { - Trace.Verbose("Deleting latest runner package zip '{0}'", archiveFile); - IOUtil.DeleteFile(archiveFile); - } - - break; - } - catch (Exception ex) - { - // couldn't delete the file for whatever reason, so generate another name - Trace.Warning("Failed to delete runner package zip '{0}'. Exception: {1}", archiveFile, ex); - runnerSuffix++; - } - } + if (packagePlatform.StartsWith("win")) + { + archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.zip"); + } + else + { + archiveFile = Path.Combine(downloadDirectory, $"runner{runnerSuffix}.tar.gz"); + } - // Allow a 15-minute package download timeout, which is good enough to update the runner from a 1 Mbit/s ADSL connection. - if (!int.TryParse(Environment.GetEnvironmentVariable("GITHUB_ACTIONS_RUNNER_DOWNLOAD_TIMEOUT") ?? string.Empty, out int timeoutSeconds)) - { - timeoutSeconds = 15 * 60; - } + try + { + // delete .zip file + if (!string.IsNullOrEmpty(archiveFile) && File.Exists(archiveFile)) + { + Trace.Verbose("Deleting latest runner package zip '{0}'", archiveFile); + IOUtil.DeleteFile(archiveFile); + } + + break; + } + catch (Exception ex) + { + // couldn't delete the file for whatever reason, so generate another name + Trace.Warning("Failed to delete runner package zip '{0}'. Exception: {1}", archiveFile, ex); + runnerSuffix++; + } + } - Trace.Info($"Attempt {attempt}: save latest runner into {archiveFile}."); + // Allow a 15-minute package download timeout, which is good enough to update the runner from a 1 Mbit/s ADSL connection. + if (!int.TryParse(Environment.GetEnvironmentVariable("GITHUB_ACTIONS_RUNNER_DOWNLOAD_TIMEOUT") ?? string.Empty, out int timeoutSeconds)) + { + timeoutSeconds = 15 * 60; + } - using (var downloadTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds))) - using (var downloadCts = CancellationTokenSource.CreateLinkedTokenSource(downloadTimeout.Token, token)) - { - try - { - Trace.Info($"Download runner: begin download"); - long downloadSize = 0; + Trace.Info($"Attempt {context.AttemptNumber}: save latest runner into {archiveFile}."); - //open zip stream in async mode - using (HttpClient httpClient = new(HostContext.CreateHttpClientHandler())) + using (var downloadTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds))) + using (var downloadCts = CancellationTokenSource.CreateLinkedTokenSource(downloadTimeout.Token, token)) { - Trace.Info($"Downloading {packageDownloadUrl}"); - - using (FileStream fs = new(archiveFile, FileMode.Create, FileAccess.Write, FileShare.None, bufferSize: 4096, useAsync: true)) - using (Stream result = await httpClient.GetStreamAsync(packageDownloadUrl)) + try { - //81920 is the default used by System.IO.Stream.CopyTo and is under the large object heap threshold (85k). - await result.CopyToAsync(fs, 81920, downloadCts.Token); - await fs.FlushAsync(downloadCts.Token); - downloadSize = fs.Length; + Trace.Info($"Download runner: begin download"); + long downloadSize = 0; + + //open zip stream in async mode + using (HttpClient httpClient = new(HostContext.CreateHttpClientHandler())) + { + Trace.Info($"Downloading {packageDownloadUrl}"); + + using (FileStream fs = new(archiveFile, FileMode.Create, FileAccess.Write, FileShare.None, bufferSize: 4096, useAsync: true)) + using (Stream result = await httpClient.GetStreamAsync(packageDownloadUrl)) + { + //81920 is the default used by System.IO.Stream.CopyTo and is under the large object heap threshold (85k). + await result.CopyToAsync(fs, 81920, downloadCts.Token); + await fs.FlushAsync(downloadCts.Token); + downloadSize = fs.Length; + } + } + + Trace.Info($"Download runner: finished download"); + stopWatch.Stop(); + _updateTrace.Enqueue($"PackageDownloadTime: {stopWatch.ElapsedMilliseconds}ms"); + _updateTrace.Enqueue($"Attempts: {context.AttemptNumber}"); + _updateTrace.Enqueue($"PackageSize: {downloadSize / 1024 / 1024}MB"); + return archiveFile; } - } + catch (OperationCanceledException) when (token.IsCancellationRequested) + { + Trace.Info($"Runner download has been cancelled."); + throw; + } + catch (Exception ex) + { + if (downloadCts.Token.IsCancellationRequested) + { + Trace.Warning($"Runner download has timed out after {timeoutSeconds} seconds"); + } - Trace.Info($"Download runner: finished download"); - downloadSucceeded = true; - stopWatch.Stop(); - _updateTrace.Enqueue($"PackageDownloadTime: {stopWatch.ElapsedMilliseconds}ms"); - _updateTrace.Enqueue($"Attempts: {attempt}"); - _updateTrace.Enqueue($"PackageSize: {downloadSize / 1024 / 1024}MB"); - break; - } - catch (OperationCanceledException) when (token.IsCancellationRequested) - { - Trace.Info($"Runner download has been cancelled."); - throw; - } - catch (Exception ex) - { - if (downloadCts.Token.IsCancellationRequested) - { - Trace.Warning($"Runner download has timed out after {timeoutSeconds} seconds"); + Trace.Warning($"Failed to get package '{archiveFile}' from '{packageDownloadUrl}'. Exception {ex}"); + throw; + } } - - Trace.Warning($"Failed to get package '{archiveFile}' from '{packageDownloadUrl}'. Exception {ex}"); - } - } + }, + cancellationToken: token); } - - if (downloadSucceeded) + catch (OperationCanceledException) when (token.IsCancellationRequested) { - return archiveFile; + throw; } - else + catch (Exception) { return null; } diff --git a/src/Runner.Worker/ActionManager.cs b/src/Runner.Worker/ActionManager.cs index dea9e7dc032..ddd023113e3 100644 --- a/src/Runner.Worker/ActionManager.cs +++ b/src/Runner.Worker/ActionManager.cs @@ -767,29 +767,30 @@ private async Task PullActionContainerAsync(IExecutionContext executionContext, // Pull down docker image with retry up to 3 times var dockerManager = HostContext.GetService(); - int retryCount = 0; int pullExitCode = 0; - while (retryCount < 3) + var pullRetryHelper = new RetryHelper(Trace, new RetryStrategy { - pullExitCode = await dockerManager.DockerPull(executionContext, setupInfo.Container.Image); - if (pullExitCode == 0) + MaxAttempts = 3, + GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)), + OnRetry = (_, _, backoff) => { - break; - } - else + executionContext.Warning($"Docker pull failed with exit code {pullExitCode}, back off {backoff.TotalSeconds} seconds before retry."); + }, + }); + + await pullRetryHelper.ExecuteAsync( + operationName: nameof(PullActionContainerAsync), + operation: async () => { - retryCount++; - if (retryCount < 3) - { - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)); - executionContext.Warning($"Docker pull failed with exit code {pullExitCode}, back off {backOff.TotalSeconds} seconds before retry."); - await Task.Delay(backOff); - } - } - } + pullExitCode = await dockerManager.DockerPull(executionContext, setupInfo.Container.Image); + return pullExitCode == 0 + ? OperationOutcome.Success(true) + : OperationOutcome.TransientFailure($"Docker pull failed with exit code {pullExitCode}"); + }, + cancellationToken: executionContext.CancellationToken); executionContext.Output("##[endgroup]"); - if (retryCount == 3 && pullExitCode != 0) + if (pullExitCode != 0) { throw new InvalidOperationException($"Docker pull failed with exit code {pullExitCode}"); } @@ -811,35 +812,37 @@ private async Task BuildActionContainerAsync(IExecutionContext executionContext, // Build docker image with retry up to 3 times var dockerManager = HostContext.GetService(); - int retryCount = 0; int buildExitCode = 0; var imageName = $"{dockerManager.DockerInstanceLabel}:{Guid.NewGuid().ToString("N")}"; - while (retryCount < 3) + var buildRetryHelper = new RetryHelper(Trace, new RetryStrategy { - buildExitCode = await dockerManager.DockerBuild( - executionContext, - setupInfo.Container.WorkingDirectory, - setupInfo.Container.Dockerfile, - Directory.GetParent(setupInfo.Container.Dockerfile).FullName, - imageName); - if (buildExitCode == 0) + MaxAttempts = 3, + GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)), + OnRetry = (_, _, backoff) => { - break; - } - else + executionContext.Warning($"Docker build failed with exit code {buildExitCode}, back off {backoff.TotalSeconds} seconds before retry."); + }, + }); + + await buildRetryHelper.ExecuteAsync( + operationName: nameof(BuildActionContainerAsync), + operation: async () => { - retryCount++; - if (retryCount < 3) - { - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)); - executionContext.Warning($"Docker build failed with exit code {buildExitCode}, back off {backOff.TotalSeconds} seconds before retry."); - await Task.Delay(backOff); - } - } - } + buildExitCode = await dockerManager.DockerBuild( + executionContext, + setupInfo.Container.WorkingDirectory, + setupInfo.Container.Dockerfile, + Directory.GetParent(setupInfo.Container.Dockerfile).FullName, + imageName); + + return buildExitCode == 0 + ? OperationOutcome.Success(true) + : OperationOutcome.TransientFailure($"Docker build failed with exit code {buildExitCode}"); + }, + cancellationToken: executionContext.CancellationToken); executionContext.Output("##[endgroup]"); - if (retryCount == 3 && buildExitCode != 0) + if (buildExitCode != 0) { throw new InvalidOperationException($"Docker build failed with exit code {buildExitCode}"); } @@ -889,57 +892,66 @@ private async Task BuildActionContainerAsync(IExecutionContext executionContext, var launchServer = HostContext.GetService(); var jobServer = HostContext.GetService(); var actionDownloadInfos = default(WebApi.ActionDownloadInfoCollection); - for (var attempt = 1; attempt <= 3; attempt++) + Exception nonRetryableResolveException = null; + var resolveInfoRetryHelper = new RetryHelper(Trace, new RetryStrategy { - try + MaxAttempts = 3, + GetBackoff = (_, _, _) => String.IsNullOrEmpty(Environment.GetEnvironmentVariable("_GITHUB_ACTION_DOWNLOAD_NO_BACKOFF")) + ? BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(10), TimeSpan.FromSeconds(30)) + : TimeSpan.Zero, + OnRetry = (_, ex, backoff) => { - if (MessageUtil.IsRunServiceJob(executionContext.Global.Variables.Get(Constants.Variables.System.JobRequestType))) + executionContext.Output($"Failed to resolve action download info. Error: {ex.Message}"); + executionContext.Debug(ex.ToString()); + if (backoff > TimeSpan.Zero) { - var displayHelpfulActionsDownloadErrors = executionContext.Global.Variables.GetBoolean(Constants.Runner.Features.DisplayHelpfulActionsDownloadErrors) ?? false; - actionDownloadInfos = await launchServer.ResolveActionsDownloadInfoAsync(executionContext.Global.Plan.PlanId, executionContext.Root.Id, new WebApi.ActionReferenceList { Actions = actionReferences, Dependencies = dependencies }, executionContext.CancellationToken, displayHelpfulActionsDownloadErrors); + executionContext.Output($"Retrying in {backoff.TotalSeconds} seconds"); } - else - { - actionDownloadInfos = await jobServer.ResolveActionDownloadInfoAsync(executionContext.Global.Plan.ScopeIdentifier, executionContext.Global.Plan.PlanType, executionContext.Global.Plan.PlanId, executionContext.Root.Id, new WebApi.ActionReferenceList { Actions = actionReferences }, executionContext.CancellationToken); - } - break; - } - catch (Exception ex) when (!executionContext.CancellationToken.IsCancellationRequested) // Do not retry if the run is cancelled. + }, + }); + + var resolved = await resolveInfoRetryHelper.ExecuteAsync( + operationName: nameof(GetDownloadInfoAsync), + operation: async () => { - // UnresolvableActionDownloadInfoException is a 422 client error, don't retry - // NonRetryableActionDownloadInfoException is an non-retryable exception from Actions - // Some possible cases are: - // * Repo is rate limited - // * Repo or tag doesn't exist, or isn't public - // * Policy validation failed - if (attempt < 3 && !(ex is WebApi.UnresolvableActionDownloadInfoException) && !(ex is WebApi.NonRetryableActionDownloadInfoException)) + try { - executionContext.Output($"Failed to resolve action download info. Error: {ex.Message}"); - executionContext.Debug(ex.ToString()); - if (String.IsNullOrEmpty(Environment.GetEnvironmentVariable("_GITHUB_ACTION_DOWNLOAD_NO_BACKOFF"))) + if (MessageUtil.IsRunServiceJob(executionContext.Global.Variables.Get(Constants.Variables.System.JobRequestType))) { - var backoff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(10), TimeSpan.FromSeconds(30)); - executionContext.Output($"Retrying in {backoff.TotalSeconds} seconds"); - await Task.Delay(backoff); + var displayHelpfulActionsDownloadErrors = executionContext.Global.Variables.GetBoolean(Constants.Runner.Features.DisplayHelpfulActionsDownloadErrors) ?? false; + actionDownloadInfos = await launchServer.ResolveActionsDownloadInfoAsync(executionContext.Global.Plan.PlanId, executionContext.Root.Id, new WebApi.ActionReferenceList { Actions = actionReferences, Dependencies = dependencies }, executionContext.CancellationToken, displayHelpfulActionsDownloadErrors); } + else + { + actionDownloadInfos = await jobServer.ResolveActionDownloadInfoAsync(executionContext.Global.Plan.ScopeIdentifier, executionContext.Global.Plan.PlanType, executionContext.Global.Plan.PlanId, executionContext.Root.Id, new WebApi.ActionReferenceList { Actions = actionReferences }, executionContext.CancellationToken); + } + return OperationOutcome.Success(true); } - else + catch (Exception ex) when (!executionContext.CancellationToken.IsCancellationRequested) { - // Some possible cases are: - // * Repo is rate limited - // * Repo or tag doesn't exist, or isn't public - // * Policy validation failed + // UnresolvableActionDownloadInfoException is a 422 client error, don't retry. + // NonRetryableActionDownloadInfoException is a non-retryable exception from Actions. if (ex is WebApi.UnresolvableActionDownloadInfoException) { - throw; + nonRetryableResolveException = ex; + return OperationOutcome.Success(false); } - else + + if (ex is WebApi.NonRetryableActionDownloadInfoException) { - // This exception will be traced as an infrastructure failure - throw new WebApi.FailedToResolveActionDownloadInfoException("Failed to resolve action download info.", ex); + // This exception will be traced as an infrastructure failure. + nonRetryableResolveException = new WebApi.FailedToResolveActionDownloadInfoException("Failed to resolve action download info.", ex); + return OperationOutcome.Success(false); } + + return OperationOutcome.TransientFailure(ex.Message); } - } + }, + cancellationToken: executionContext.CancellationToken); + + if (!resolved) + { + throw nonRetryableResolveException; } ArgUtil.NotNull(actionDownloadInfos, nameof(actionDownloadInfos)); @@ -1400,115 +1412,132 @@ private AuthenticationHeaderValue CreateAuthHeader(IExecutionContext executionCo private async Task DownloadRepositoryArchive(IExecutionContext executionContext, string downloadUrl, string downloadAuthToken, string archiveFile) { Trace.Info($"Save archive '{downloadUrl}' into {archiveFile}."); - int retryCount = 0; + int attemptNumber = 0; + Exception nonRetryableDownloadException = null; // Allow up to 20 * 60s for any action to be downloaded from github graph. int timeoutSeconds = 20 * 60; try { - while (retryCount < 3) + var downloadArchiveRetryHelper = new RetryHelper(Trace, new RetryStrategy { - string requestId = string.Empty; - using (var actionDownloadTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds))) - using (var actionDownloadCancellation = CancellationTokenSource.CreateLinkedTokenSource(actionDownloadTimeout.Token, executionContext.CancellationToken)) + MaxAttempts = 3, + GetBackoff = (_, _, _) => String.IsNullOrEmpty(Environment.GetEnvironmentVariable("_GITHUB_ACTION_DOWNLOAD_NO_BACKOFF")) + ? BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(10), TimeSpan.FromSeconds(30)) + : TimeSpan.Zero, + OnRetry = (context, _, backoff) => { - try + if (backoff > TimeSpan.Zero) { - //open zip stream in async mode - using (FileStream fs = new(archiveFile, FileMode.Create, FileAccess.Write, FileShare.None, bufferSize: _defaultFileStreamBufferSize, useAsync: true)) - using (var httpClientHandler = HostContext.CreateHttpClientHandler()) - using (var httpClient = new HttpClient(httpClientHandler)) - { - httpClient.DefaultRequestHeaders.Authorization = CreateAuthHeader(executionContext, downloadUrl, downloadAuthToken); + executionContext.Warning($"Back off {backoff.TotalSeconds} seconds before retry."); + } + }, + }); - httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); - using (var response = await httpClient.GetAsync(downloadUrl)) + var downloaded = await downloadArchiveRetryHelper.ExecuteAsync( + operationName: nameof(DownloadRepositoryArchive), + operation: async (context) => + { + attemptNumber = context.AttemptNumber; + string requestId = string.Empty; + using (var actionDownloadTimeout = new CancellationTokenSource(TimeSpan.FromSeconds(timeoutSeconds))) + using (var actionDownloadCancellation = CancellationTokenSource.CreateLinkedTokenSource(actionDownloadTimeout.Token, executionContext.CancellationToken)) + { + try + { + //open zip stream in async mode + using (FileStream fs = new(archiveFile, FileMode.Create, FileAccess.Write, FileShare.None, bufferSize: _defaultFileStreamBufferSize, useAsync: true)) + using (var httpClientHandler = HostContext.CreateHttpClientHandler()) + using (var httpClient = new HttpClient(httpClientHandler)) { - requestId = UrlUtil.GetGitHubRequestId(response.Headers); - if (!string.IsNullOrEmpty(requestId)) - { - Trace.Info($"Request URL: {downloadUrl} X-GitHub-Request-Id: {requestId} Http Status: {response.StatusCode}"); - } + httpClient.DefaultRequestHeaders.Authorization = CreateAuthHeader(executionContext, downloadUrl, downloadAuthToken); - if (response.IsSuccessStatusCode) + httpClient.DefaultRequestHeaders.UserAgent.AddRange(HostContext.UserAgents); + using (var response = await httpClient.GetAsync(downloadUrl)) { - using (var result = await response.Content.ReadAsStreamAsync()) + requestId = UrlUtil.GetGitHubRequestId(response.Headers); + if (!string.IsNullOrEmpty(requestId)) { - await result.CopyToAsync(fs, _defaultCopyBufferSize, actionDownloadCancellation.Token); - await fs.FlushAsync(actionDownloadCancellation.Token); + Trace.Info($"Request URL: {downloadUrl} X-GitHub-Request-Id: {requestId} Http Status: {response.StatusCode}"); + } - // download succeed, break out the retry loop. - break; + if (response.IsSuccessStatusCode) + { + using (var result = await response.Content.ReadAsStreamAsync()) + { + await result.CopyToAsync(fs, _defaultCopyBufferSize, actionDownloadCancellation.Token); + await fs.FlushAsync(actionDownloadCancellation.Token); + return OperationOutcome.Success(true); + } } - } - else if (response.StatusCode == HttpStatusCode.NotFound) - { - // It doesn't make sense to retry in this case, so just stop - throw new ActionNotFoundException(new Uri(downloadUrl), requestId); - } - else if (response.StatusCode == HttpStatusCode.Forbidden) - { - // It doesn't make sense to retry in this case, so just stop - throw new AccessDeniedException($"Access denied to '{downloadUrl}' ({requestId})"); - } - else - { - // Something else bad happened, let's go to our retry logic + + if (response.StatusCode == HttpStatusCode.NotFound) + { + nonRetryableDownloadException = new ActionNotFoundException(new Uri(downloadUrl), requestId); + return OperationOutcome.Success(false); + } + + if (response.StatusCode == HttpStatusCode.Forbidden) + { + nonRetryableDownloadException = new AccessDeniedException($"Access denied to '{downloadUrl}' ({requestId})"); + return OperationOutcome.Success(false); + } + response.EnsureSuccessStatusCode(); + return OperationOutcome.Success(true); } } } - } - catch (OperationCanceledException) when (executionContext.CancellationToken.IsCancellationRequested) - { - Trace.Info("Action download has been cancelled."); - throw; - } - catch (OperationCanceledException ex) when (!executionContext.CancellationToken.IsCancellationRequested && retryCount >= 2) - { - Trace.Info($"Action download final retry timeout after {timeoutSeconds} seconds."); - throw new TimeoutException($"Action '{downloadUrl}' download has timed out. Error: {ex.Message} {requestId}"); - } - catch (ActionNotFoundException) - { - Trace.Info($"The action at '{downloadUrl}' does not exist"); - throw; - } - catch (AccessDeniedException) - { - Trace.Info($"Access denied to '{downloadUrl}'"); - throw; - } - catch (Exception ex) when (retryCount < 2) - { - retryCount++; - Trace.Error($"Fail to download archive '{downloadUrl}' -- Attempt: {retryCount}"); - Trace.Error(ex); - if (actionDownloadTimeout.Token.IsCancellationRequested) + catch (OperationCanceledException) when (executionContext.CancellationToken.IsCancellationRequested) { - // action download didn't finish within timeout - executionContext.Warning($"Action '{downloadUrl}' didn't finish download within {timeoutSeconds} seconds. {requestId}"); + Trace.Info("Action download has been cancelled."); + throw; } - else + catch (OperationCanceledException ex) when (!executionContext.CancellationToken.IsCancellationRequested && context.AttemptNumber >= context.MaxAttempts) { - executionContext.Warning($"Failed to download action '{downloadUrl}'. Error: {ex.Message} {requestId}"); + Trace.Info($"Action download final retry timeout after {timeoutSeconds} seconds."); + nonRetryableDownloadException = new TimeoutException($"Action '{downloadUrl}' download has timed out. Error: {ex.Message} {requestId}"); + return OperationOutcome.Success(false); + } + catch (Exception ex) + { + Trace.Error($"Fail to download archive '{downloadUrl}' -- Attempt: {context.AttemptNumber}"); + Trace.Error(ex); + if (actionDownloadTimeout.Token.IsCancellationRequested) + { + // action download didn't finish within timeout + executionContext.Warning($"Action '{downloadUrl}' didn't finish download within {timeoutSeconds} seconds. {requestId}"); + } + else + { + executionContext.Warning($"Failed to download action '{downloadUrl}'. Error: {ex.Message} {requestId}"); + } + + return OperationOutcome.TransientFailure(ex.Message); } } - } + }, + cancellationToken: executionContext.CancellationToken); - if (String.IsNullOrEmpty(Environment.GetEnvironmentVariable("_GITHUB_ACTION_DOWNLOAD_NO_BACKOFF"))) + if (!downloaded) + { + if (nonRetryableDownloadException is ActionNotFoundException) { - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(10), TimeSpan.FromSeconds(30)); - executionContext.Warning($"Back off {backOff.TotalSeconds} seconds before retry."); - await Task.Delay(backOff); + Trace.Info($"The action at '{downloadUrl}' does not exist"); } + else if (nonRetryableDownloadException is AccessDeniedException) + { + Trace.Info($"Access denied to '{downloadUrl}'"); + } + + throw nonRetryableDownloadException; } } catch (Exception ex) when (!(ex is AccessDeniedException) && !(ex is OperationCanceledException) && !executionContext.CancellationToken.IsCancellationRequested) { - Trace.Error($"Failed to download archive '{downloadUrl}' after {retryCount + 1} attempts."); + Trace.Error($"Failed to download archive '{downloadUrl}' after {attemptNumber} attempts."); Trace.Error(ex); - throw new FailedToDownloadActionException($"Failed to download archive '{downloadUrl}' after {retryCount + 1} attempts.", ex); + throw new FailedToDownloadActionException($"Failed to download archive '{downloadUrl}' after {attemptNumber} attempts.", ex); } ArgUtil.NotNullOrEmpty(archiveFile, nameof(archiveFile)); diff --git a/src/Runner.Worker/ContainerOperationProvider.cs b/src/Runner.Worker/ContainerOperationProvider.cs index c5cccb77ef0..53f2940cf99 100644 --- a/src/Runner.Worker/ContainerOperationProvider.cs +++ b/src/Runner.Worker/ContainerOperationProvider.cs @@ -196,31 +196,32 @@ private async Task StartContainerAsync(IExecutionContext executionContext, Conta var configLocation = await ContainerRegistryLogin(executionContext, container); // Pull down docker image with retry up to 3 times - int retryCount = 0; int pullExitCode = 0; - while (retryCount < 3) + var pullRetryHelper = new RetryHelper(Trace, new RetryStrategy { - pullExitCode = await _dockerManager.DockerPull(executionContext, container.ContainerImage, configLocation); - if (pullExitCode == 0) + MaxAttempts = 3, + GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)), + OnRetry = (_, _, backoff) => { - break; - } - else + executionContext.Warning($"Docker pull failed with exit code {pullExitCode}, back off {backoff.TotalSeconds} seconds before retry."); + }, + }); + + await pullRetryHelper.ExecuteAsync( + operationName: nameof(StartContainerAsync), + operation: async () => { - retryCount++; - if (retryCount < 3) - { - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)); - executionContext.Warning($"Docker pull failed with exit code {pullExitCode}, back off {backOff.TotalSeconds} seconds before retry."); - await Task.Delay(backOff); - } - } - } + pullExitCode = await _dockerManager.DockerPull(executionContext, container.ContainerImage, configLocation); + return pullExitCode == 0 + ? OperationOutcome.Success(true) + : OperationOutcome.TransientFailure($"Docker pull failed with exit code {pullExitCode}"); + }, + cancellationToken: executionContext.CancellationToken); // Remove credentials after pulling ContainerRegistryLogout(configLocation); - if (retryCount == 3 && pullExitCode != 0) + if (pullExitCode != 0) { throw new InvalidOperationException($"Docker pull failed with exit code {pullExitCode}"); } @@ -468,33 +469,35 @@ private async Task ContainerRegistryLogin(IExecutionContext executionCon } // Login docker with retry up to 3 times - int retryCount = 0; int loginExitCode = 0; - while (retryCount < 3) - { - loginExitCode = await _dockerManager.DockerLogin( - executionContext, - configLocation, - container.RegistryServer, - container.RegistryAuthUsername, - container.RegistryAuthPassword); - if (loginExitCode == 0) - { - break; - } - else + var loginRetryHelper = new RetryHelper(Trace, new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)), + OnRetry = (_, _, backoff) => { - retryCount++; - if (retryCount < 3) - { - var backOff = BackoffTimerHelper.GetRandomBackoff(TimeSpan.FromSeconds(1), TimeSpan.FromSeconds(10)); - executionContext.Warning($"Docker login for '{container.RegistryServer}' failed with exit code {loginExitCode}, back off {backOff.TotalSeconds} seconds before retry."); - await Task.Delay(backOff); - } - } - } + executionContext.Warning($"Docker login for '{container.RegistryServer}' failed with exit code {loginExitCode}, back off {backoff.TotalSeconds} seconds before retry."); + }, + }); - if (retryCount == 3 && loginExitCode != 0) + await loginRetryHelper.ExecuteAsync( + operationName: nameof(ContainerRegistryLogin), + operation: async () => + { + loginExitCode = await _dockerManager.DockerLogin( + executionContext, + configLocation, + container.RegistryServer, + container.RegistryAuthUsername, + container.RegistryAuthPassword); + + return loginExitCode == 0 + ? OperationOutcome.Success(true) + : OperationOutcome.TransientFailure($"Docker login for '{container.RegistryServer}' failed with exit code {loginExitCode}"); + }, + cancellationToken: executionContext.CancellationToken); + + if (loginExitCode != 0) { throw new InvalidOperationException($"Docker login for '{container.RegistryServer}' failed with exit code {loginExitCode}"); } diff --git a/src/Runner.Worker/JobRunner.cs b/src/Runner.Worker/JobRunner.cs index 3c4799a29e0..d94331201ed 100644 --- a/src/Runner.Worker/JobRunner.cs +++ b/src/Runner.Worker/JobRunner.cs @@ -323,41 +323,57 @@ private async Task CompleteJobAsync(IRunServer runServer, IExecution } Trace.Info($"Raising job completed against run service"); - var completeJobRetryLimit = 5; var exceptions = new List(); - while (completeJobRetryLimit-- > 0) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - try - { - await runServer.CompleteJobAsync(message.Plan.PlanId, message.JobId, result, jobContext.JobOutputs, jobContext.Global.StepsResult, jobContext.Global.JobAnnotations, environmentUrl, telemetry, billingOwnerId: message.BillingOwnerId, infrastructureFailureCategory: jobContext.Global.InfrastructureFailureCategory, default); - return result; - } - catch (VssUnauthorizedException ex) + MaxAttempts = 5, + GetBackoff = RetryBackoffs.Fixed(TimeSpan.FromSeconds(5)), + OnRetry = (_, ex, _) => { Trace.Error($"Catch exception while attempting to complete job {message.JobId}, job request {message.RequestId}."); Trace.Error(ex); exceptions.Add(ex); - break; - } - catch (TaskOrchestrationJobNotFoundException ex) + }, + OnFailure = (_, ex, _) => { - Trace.Error($"Catch exception while attempting to complete job {message.JobId}, job request {message.RequestId}."); - Trace.Error(ex); - exceptions.Add(ex); - break; - } - catch (Exception ex) + if (ex != null) + { + exceptions.Add(ex); + } + }, + }); + + var completed = await retryHelper.ExecuteAsync( + operationName: "CompleteJobAsync", + operation: async _ => { - Trace.Error($"Catch exception while attempting to complete job {message.JobId}, job request {message.RequestId}."); - Trace.Error(ex); - exceptions.Add(ex); - } + try + { + await runServer.CompleteJobAsync(message.Plan.PlanId, message.JobId, result, jobContext.JobOutputs, jobContext.Global.StepsResult, jobContext.Global.JobAnnotations, environmentUrl, telemetry, billingOwnerId: message.BillingOwnerId, infrastructureFailureCategory: jobContext.Global.InfrastructureFailureCategory, default); + return true; + } + catch (VssUnauthorizedException ex) + { + Trace.Error($"Catch exception while attempting to complete job {message.JobId}, job request {message.RequestId}."); + Trace.Error(ex); + exceptions.Add(ex); + return false; + } + catch (TaskOrchestrationJobNotFoundException ex) + { + Trace.Error($"Catch exception while attempting to complete job {message.JobId}, job request {message.RequestId}."); + Trace.Error(ex); + exceptions.Add(ex); + return false; + } + }, + cancellationToken: default); - // delay 5 seconds before next retry. - await Task.Delay(TimeSpan.FromSeconds(5)); + if (completed) + { + return result; } - // rethrow exceptions from all attempts. throw new AggregateException(exceptions); } @@ -406,46 +422,67 @@ private async Task CompleteJobAsync(IJobServer jobServer, IExecution Trace.Info($"Raising job completed event"); var jobCompletedEvent = new JobCompletedEvent(message.RequestId, message.JobId, result, jobContext.JobOutputs, jobContext.ActionsEnvironment, jobContext.Global.StepsTelemetry, jobContext.Global.JobTelemetry); - var completeJobRetryLimit = 5; var exceptions = new List(); - while (completeJobRetryLimit-- > 0) + var retryHelper = new RetryHelper(Trace, new RetryStrategy { - try - { - await jobServer.RaisePlanEventAsync(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, jobCompletedEvent, default(CancellationToken)); - return result; - } - catch (TaskOrchestrationPlanNotFoundException ex) - { - Trace.Error($"TaskOrchestrationPlanNotFoundException received, while attempting to raise JobCompletedEvent for job {message.JobId}."); - Trace.Error(ex); - return TaskResult.Failed; - } - catch (TaskOrchestrationPlanSecurityException ex) - { - Trace.Error($"TaskOrchestrationPlanSecurityException received, while attempting to raise JobCompletedEvent for job {message.JobId}."); - Trace.Error(ex); - return TaskResult.Failed; - } - catch (TaskOrchestrationPlanTerminatedException ex) - { - Trace.Error($"TaskOrchestrationPlanTerminatedException received, while attempting to raise JobCompletedEvent for job {message.JobId}."); - Trace.Error(ex); - return TaskResult.Failed; - } - catch (Exception ex) + MaxAttempts = 5, + GetBackoff = RetryBackoffs.Fixed(TimeSpan.FromSeconds(5)), + OnRetry = (_, ex, _) => { Trace.Error($"Catch exception while attempting to raise JobCompletedEvent for job {message.JobId}, job request {message.RequestId}."); Trace.Error(ex); exceptions.Add(ex); - } + }, + OnFailure = (_, ex, _) => + { + if (ex != null) + { + exceptions.Add(ex); + } + }, + }); + + var completed = await retryHelper.ExecuteAsync( + operationName: "RaiseJobCompletedEvent", + operation: async _ => + { + try + { + await jobServer.RaisePlanEventAsync(message.Plan.ScopeIdentifier, message.Plan.PlanType, message.Plan.PlanId, jobCompletedEvent, default(CancellationToken)); + return true; + } + catch (TaskOrchestrationPlanNotFoundException ex) + { + Trace.Error($"TaskOrchestrationPlanNotFoundException received, while attempting to raise JobCompletedEvent for job {message.JobId}."); + Trace.Error(ex); + return false; + } + catch (TaskOrchestrationPlanSecurityException ex) + { + Trace.Error($"TaskOrchestrationPlanSecurityException received, while attempting to raise JobCompletedEvent for job {message.JobId}."); + Trace.Error(ex); + return false; + } + catch (TaskOrchestrationPlanTerminatedException ex) + { + Trace.Error($"TaskOrchestrationPlanTerminatedException received, while attempting to raise JobCompletedEvent for job {message.JobId}."); + Trace.Error(ex); + return false; + } + }, + cancellationToken: default); - // delay 5 seconds before next retry. - await Task.Delay(TimeSpan.FromSeconds(5)); + if (completed) + { + return result; } - // rethrow exceptions from all attempts. - throw new AggregateException(exceptions); + if (exceptions.Count > 0) + { + throw new AggregateException(exceptions); + } + + return TaskResult.Failed; } private void MaskTelemetrySecrets(List jobTelemetry) diff --git a/src/Test/L0/Listener/RunnerDotcomServerL0.cs b/src/Test/L0/Listener/RunnerDotcomServerL0.cs new file mode 100644 index 00000000000..96df0e74dc8 --- /dev/null +++ b/src/Test/L0/Listener/RunnerDotcomServerL0.cs @@ -0,0 +1,140 @@ +using System; +using System.Collections.Generic; +using System.Net; +using System.Net.Http; +using System.Threading; +using System.Threading.Tasks; +using GitHub.Runner.Common; +using GitHub.Runner.Sdk; +using Moq; +using Moq.Protected; +using Xunit; + +namespace GitHub.Runner.Common.Tests.Listener +{ + public sealed class RunnerDotcomServerL0 + { + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Runner")] + public async Task GetRunnerByNameAsync_RetriesTransientFailureAndSucceeds() + { + using (var hc = new TestHostContext(this)) + { + var mockHandler = new Mock(); + var expectedUri = new Uri("https://api.github.com/orgs/my-org/actions/runners?name=runner-a"); + mockHandler.Protected() + .SetupSequence>( + "SendAsync", + ItExpr.Is(m => m.Method == HttpMethod.Get && m.RequestUri == expectedUri), + ItExpr.IsAny()) + .ReturnsAsync(new HttpResponseMessage(HttpStatusCode.InternalServerError) + { + Content = new StringContent("{\"message\":\"server error\"}") + }) + .ReturnsAsync(new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new StringContent("{\"total_count\":1,\"runners\":[{\"id\":7,\"name\":\"runner-a\"}]}") + }); + + var mockFactory = new Mock(); + mockFactory + .Setup(x => x.CreateClientHandler(It.IsAny())) + .Returns(mockHandler.Object); + hc.SetSingleton(mockFactory.Object); + + var server = new RunnerDotcomServer(); + server.Initialize(hc); + + List runners = await server.GetRunnerByNameAsync("https://github.com/my-org", "token", "runner-a"); + + Assert.Single(runners); + Assert.Equal(7ul, runners[0].Id); + Assert.Equal("runner-a", runners[0].Name); + + mockHandler.Protected().Verify( + "SendAsync", + Times.Exactly(2), + ItExpr.Is(m => m.Method == HttpMethod.Get && m.RequestUri == expectedUri), + ItExpr.IsAny()); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Runner")] + public async Task GetRunnerByNameAsync_DoesNotRetryNotFound() + { + using (var hc = new TestHostContext(this)) + { + var mockHandler = new Mock(); + var expectedUri = new Uri("https://api.github.com/orgs/my-org/actions/runners?name=missing-runner"); + mockHandler.Protected() + .Setup>( + "SendAsync", + ItExpr.Is(m => m.Method == HttpMethod.Get && m.RequestUri == expectedUri), + ItExpr.IsAny()) + .ReturnsAsync(new HttpResponseMessage(HttpStatusCode.NotFound) + { + Content = new StringContent("{\"message\":\"Not Found\"}") + }); + + var mockFactory = new Mock(); + mockFactory + .Setup(x => x.CreateClientHandler(It.IsAny())) + .Returns(mockHandler.Object); + hc.SetSingleton(mockFactory.Object); + + var server = new RunnerDotcomServer(); + server.Initialize(hc); + + await Assert.ThrowsAsync(() => server.GetRunnerByNameAsync("https://github.com/my-org", "token", "missing-runner")); + + mockHandler.Protected().Verify( + "SendAsync", + Times.Once(), + ItExpr.Is(m => m.Method == HttpMethod.Get && m.RequestUri == expectedUri), + ItExpr.IsAny()); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Runner")] + public async Task GetRunnerByNameAsync_StopsAfterMaxAttempts() + { + using (var hc = new TestHostContext(this)) + { + var mockHandler = new Mock(); + var expectedUri = new Uri("https://api.github.com/orgs/my-org/actions/runners?name=runner-a"); + mockHandler.Protected() + .Setup>( + "SendAsync", + ItExpr.Is(m => m.Method == HttpMethod.Get && m.RequestUri == expectedUri), + ItExpr.IsAny()) + .ReturnsAsync(new HttpResponseMessage(HttpStatusCode.InternalServerError) + { + Content = new StringContent("{\"message\":\"server error\"}") + }); + + var mockFactory = new Mock(); + mockFactory + .Setup(x => x.CreateClientHandler(It.IsAny())) + .Returns(mockHandler.Object); + hc.SetSingleton(mockFactory.Object); + + var server = new RunnerDotcomServer(); + server.Initialize(hc); + + await Assert.ThrowsAsync(() => server.GetRunnerByNameAsync("https://github.com/my-org", "token", "runner-a")); + + mockHandler.Protected().Verify( + "SendAsync", + Times.Exactly(3), + ItExpr.Is(m => m.Method == HttpMethod.Get && m.RequestUri == expectedUri), + ItExpr.IsAny()); + } + } + + } +} diff --git a/src/Test/L0/RetryBackoffsL0.cs b/src/Test/L0/RetryBackoffsL0.cs new file mode 100644 index 00000000000..6d1043254d2 --- /dev/null +++ b/src/Test/L0/RetryBackoffsL0.cs @@ -0,0 +1,90 @@ +using System; +using Xunit; + +namespace GitHub.Runner.Common.Tests +{ + public sealed class RetryBackoffsL0 + { + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void Fixed_ReturnsSameDelay() + { + var backoff = RetryBackoffs.Fixed(TimeSpan.FromMilliseconds(250)); + + var first = backoff(1, 5, null); + var second = backoff(4, 5, new InvalidOperationException("ignored")); + + Assert.Equal(TimeSpan.FromMilliseconds(250), first); + Assert.Equal(TimeSpan.FromMilliseconds(250), second); + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void Random_StaysWithinBounds() + { + var min = TimeSpan.FromMilliseconds(100); + var max = TimeSpan.FromMilliseconds(200); + var backoff = RetryBackoffs.Random(min, max); + + for (var i = 0; i < 100; i++) + { + var delay = backoff(i + 1, 100, null); + Assert.InRange(delay, min, max); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void Exponential_IsBoundedByMax() + { + var min = TimeSpan.FromMilliseconds(100); + var max = TimeSpan.FromMilliseconds(500); + var delta = TimeSpan.FromMilliseconds(100); + var backoff = RetryBackoffs.Exponential(min, max, delta); + + var first = backoff(1, 5, null); + var second = backoff(2, 5, null); + var saturated = backoff(10, 10, null); + + Assert.InRange(first, min, max); + Assert.InRange(second, first, max); + Assert.Equal(max, saturated); + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void ExponentialFullJitter_IsWithinBounds() + { + var min = TimeSpan.FromMilliseconds(50); + var max = TimeSpan.FromMilliseconds(500); + var delta = TimeSpan.FromMilliseconds(100); + var backoff = RetryBackoffs.ExponentialFullJitter(min, max, delta); + + for (var attempt = 1; attempt <= 10; attempt++) + { + var delay = backoff(attempt, 10, null); + Assert.InRange(delay, min, max); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void DecorrelatedJitter_IsWithinBounds() + { + var min = TimeSpan.FromMilliseconds(100); + var max = TimeSpan.FromMilliseconds(1000); + var backoff = RetryBackoffs.DecorrelatedJitter(min, max); + + for (var i = 0; i < 100; i++) + { + var delay = backoff(i + 1, 100, null); + Assert.InRange(delay, min, max); + } + } + } +} diff --git a/src/Test/L0/RetryHelperL0.cs b/src/Test/L0/RetryHelperL0.cs new file mode 100644 index 00000000000..6cfaa7929f6 --- /dev/null +++ b/src/Test/L0/RetryHelperL0.cs @@ -0,0 +1,430 @@ +using System; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; +using Xunit; + +namespace GitHub.Runner.Common.Tests +{ + public sealed class RetryHelperL0 + { + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_SucceedsOnFirstAttempt() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + var result = await helper.ExecuteAsync(() => Task.FromResult(42)); + + Assert.Equal(42, result); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_RetriesOnTransientFailure() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var attempts = 0; + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + var result = await helper.ExecuteAsync(() => + { + attempts++; + if (attempts < 3) + { + throw new InvalidOperationException("transient"); + } + return Task.FromResult(attempts); + }); + + Assert.Equal(3, result); + Assert.Equal(3, attempts); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_ThrowsAfterMaxAttempts() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + await Assert.ThrowsAsync(() => + helper.ExecuteAsync(() => Task.FromException(new InvalidOperationException("always fails")))); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_InvokesOnRetryCallback() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var onRetryCalls = 0; + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + OnRetry = (_, _, _) => onRetryCalls++, + }; + var helper = new RetryHelper(trace, strategy); + var callCount = 0; + + await helper.ExecuteAsync(() => + { + callCount++; + if (callCount < 3) + { + throw new InvalidOperationException("transient"); + } + return Task.FromResult(0); + }); + + Assert.Equal(2, onRetryCalls); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_WithOutcome_PermanentFailurePropagatesImmediately() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var calls = 0; + var strategy = new RetryStrategy + { + MaxAttempts = 5, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + // Throwing directly from an outcome-based operation propagates immediately without retrying + await Assert.ThrowsAsync(() => + helper.ExecuteAsync( + "permanent-failure-op", + () => + { + calls++; + if (calls == 1) + { + return Task.FromResult>(new OperationOutcome.TransientFailure("first attempt transient")); + } + throw new ArgumentException("permanent on second attempt"); + })); + + Assert.Equal(2, calls); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_ThrowsOperationCanceledWhenTokenCancelled() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + using var cts = new CancellationTokenSource(); + var strategy = new RetryStrategy + { + MaxAttempts = 5, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + cts.Cancel(); + + await Assert.ThrowsAnyAsync(() => + helper.ExecuteAsync(() => Task.FromResult(0), cts.Token)); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void Constructor_ThrowsWhenMaxAttemptsIsZero() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 0, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + + Assert.Throws(() => new RetryHelper(trace, strategy)); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public void Constructor_ThrowsWhenGetBackoffIsNull() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = null, + }; + + Assert.Throws(() => new RetryHelper(trace, strategy)); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_InvokesOnSuccessCallback() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + RetryExecutionContext capturedContext = null; + TimeSpan capturedDuration = TimeSpan.Zero; + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + OnSuccess = (context, duration) => + { + capturedContext = context; + capturedDuration = duration; + }, + }; + + var helper = new RetryHelper(trace, strategy); + + var result = await helper.ExecuteAsync("test-operation", () => Task.FromResult(7)); + + Assert.Equal(7, result); + Assert.NotNull(capturedContext); + Assert.Equal("test-operation", capturedContext.OperationName); + Assert.Equal(1, capturedContext.AttemptNumber); + Assert.True(capturedDuration >= TimeSpan.Zero); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_InvokesOnFailureWhenRetriesExhausted() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + RetryExecutionContext capturedContext = null; + Exception capturedException = null; + TimeSpan capturedDuration = TimeSpan.Zero; + var strategy = new RetryStrategy + { + MaxAttempts = 2, + GetBackoff = (_, _, _) => TimeSpan.Zero, + OnFailure = (context, ex, duration) => + { + capturedContext = context; + capturedException = ex; + capturedDuration = duration; + }, + }; + + var helper = new RetryHelper(trace, strategy); + + await Assert.ThrowsAsync(() => + helper.ExecuteAsync("exhausted-op", + () => Task.FromException(new ArgumentException("fail")))); + + Assert.NotNull(capturedContext); + Assert.Equal("exhausted-op", capturedContext.OperationName); + Assert.Equal(2, capturedContext.AttemptNumber); + Assert.IsType(capturedException); + Assert.True(capturedDuration >= TimeSpan.Zero); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_PassesContextToOperationCallback() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var seenAttempts = new List(); + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + + var helper = new RetryHelper(trace, strategy); + var result = await helper.ExecuteAsync("context-op", context => + { + seenAttempts.Add(context.AttemptNumber); + if (context.AttemptNumber < context.MaxAttempts) + { + throw new InvalidOperationException("transient"); + } + + return Task.FromResult(context.AttemptNumber); + }); + + Assert.Equal(3, result); + Assert.Equal(new[] { 1, 2, 3 }, seenAttempts); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_WithOutcome_SucceedsOnFirstAttempt() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + var result = await helper.ExecuteAsync( + "outcome-op", + () => Task.FromResult>(new OperationOutcome.Success(99))); + + Assert.Equal(99, result); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_WithOutcome_RetriesOnTransientFailure() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var calls = 0; + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + var result = await helper.ExecuteAsync( + "outcome-op", + () => + { + calls++; + if (calls < 3) + { + return Task.FromResult>(new OperationOutcome.TransientFailure("not ready yet")); + } + return Task.FromResult>(new OperationOutcome.Success(calls)); + }); + + Assert.Equal(3, result); + Assert.Equal(3, calls); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_WithOutcome_ThrowsAfterMaxTransientFailures() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + }; + var helper = new RetryHelper(trace, strategy); + + var ex = await Assert.ThrowsAsync(() => + helper.ExecuteAsync( + "outcome-op", + () => Task.FromResult>(new OperationOutcome.TransientFailure("always failing")))); + + Assert.Contains("always failing", ex.Message); + } + } + + [Fact] + [Trait("Level", "L0")] + [Trait("Category", "Common")] + public async Task ExecuteAsync_WithOutcome_InvokesOnRetryCallback() + { + using (TestHostContext hc = CreateTestContext()) + { + var trace = hc.GetTrace(); + var onRetryCalls = 0; + var strategy = new RetryStrategy + { + MaxAttempts = 3, + GetBackoff = (_, _, _) => TimeSpan.Zero, + OnRetry = (_, _, _) => onRetryCalls++, + }; + var helper = new RetryHelper(trace, strategy); + var calls = 0; + + await helper.ExecuteAsync( + "outcome-op", + () => + { + calls++; + if (calls < 3) + { + return Task.FromResult>(new OperationOutcome.TransientFailure("not ready")); + } + return Task.FromResult>(new OperationOutcome.Success(calls)); + }); + + Assert.Equal(2, onRetryCalls); + } + } + + private TestHostContext CreateTestContext([CallerMemberName] string testName = "") + { + return new TestHostContext(this, testName); + } + } +}