From 76059197b35ad1046fe194d37a60ceb75c64a948 Mon Sep 17 00:00:00 2001 From: Brian Hill Date: Wed, 13 May 2026 15:05:28 -0400 Subject: [PATCH 1/2] Bound FlowLogger summary length and aggregate per-domain failures Discovery against an appliance with 235 domains produced a 50+ KB FlowLogger summary - two lines per failed domain ([FAIL] ListFileStore-X followed by [SKIP] Domain-X, both echoing the full HTTP 401 body). When the orchestrator returned that as JobResult.FailureMessage, Command's SQL update against AgentJobStatus.Message failed with "String or binary data would be truncated" and the entire job result was lost. The trace showed the job ran fine, but the customer saw a failed job. Two changes: - FlowLogger.GetSummary() now caps output at 3500 chars and appends a "[truncated, N chars omitted; check orchestrator log for full breadcrumb]" marker. Defense in depth - even if a future job somehow blows up the summary again, the result update won't fail. Trace log still carries the full picture. - Discovery aggregates per-domain failures by error signature instead of emitting one FAIL + one SKIP per failed domain. Identical 401s across N domains collapse into one [SKIP] DomainsFailed[HTTP 401 Unauthorized: Authentication failure.] - 114 domain(s): AashishSandbox, ADT_*, ADT_*, ADT_*, ADT_* (+109 more). The /_links self URL (which varies per domain) is stripped from the group key so the grouping actually works. Successful domains still emit a per-path Discovered-X step so operators can see what was found. Net effect on the user's 235-domain appliance: summary drops from ~50 KB / 235+ step lines to ~1 KB / ~10 step lines. Co-Authored-By: Claude Opus 4.7 (1M context) --- DataPower/FlowLogger.cs | 15 ++++- DataPower/Jobs/Discovery.cs | 116 ++++++++++++++++++++++++++---------- 2 files changed, 98 insertions(+), 33 deletions(-) diff --git a/DataPower/FlowLogger.cs b/DataPower/FlowLogger.cs index 917f45c..8d67677 100644 --- a/DataPower/FlowLogger.cs +++ b/DataPower/FlowLogger.cs @@ -172,6 +172,12 @@ public void EndBranch() public bool HasFailures => _steps.Any(s => s.Status == StepStatus.Failed); + // Command's AgentJobStatus.Message column has a hard length cap (typically + // NVARCHAR(4000)). Truncate aggressively so a job result update never fails + // with "String or binary data would be truncated". Leaves headroom for the + // FailureMessage prefix the caller adds. + private const int MaxSummaryChars = 3500; + public string GetSummary() { var hasFailures = HasFailures; @@ -202,7 +208,14 @@ public string GetSummary() } sb.Append("----------------------------------------"); - return sb.ToString(); + var summary = sb.ToString(); + if (summary.Length > MaxSummaryChars) + { + var omitted = summary.Length - MaxSummaryChars; + summary = summary.Substring(0, MaxSummaryChars) + + $"\n... [truncated, {omitted} chars omitted; check orchestrator log for full breadcrumb]"; + } + return summary; } public void Dispose() diff --git a/DataPower/Jobs/Discovery.cs b/DataPower/Jobs/Discovery.cs index f7750c9..71dd91b 100644 --- a/DataPower/Jobs/Discovery.cs +++ b/DataPower/Jobs/Discovery.cs @@ -15,6 +15,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Text.RegularExpressions; using Keyfactor.Extensions.Orchestrator.DataPower.Client; using Keyfactor.Logging; using Keyfactor.Orchestrators.Common.Enums; @@ -53,6 +54,29 @@ public class Discovery : JobBase, IDiscoveryJobExtension // exact casing has shifted across Command versions. private static readonly string[] DirsToSearchKeys = { "dirs", "Dirs", "directories", "Directories", "DirsToSearch" }; + // Extracts a stable group key for an exception thrown by a per-domain + // ListFileStoreDirectories call. Strips domain-specific bits (the /_links/self/href + // URL changes per domain) so identical RBAC failures across hundreds of domains + // collapse into a single error group. + private static readonly Regex ErrorMessageRegex = new Regex( + "\"error\"\\s*:\\s*\\[\\s*\"([^\"]+)\"", + RegexOptions.Compiled | RegexOptions.Singleline); + + private static string ErrorSignatureOf(Exception ex) + { + var apiEx = DataPowerApiException.Find(ex); + if (apiEx != null) + { + var m = ErrorMessageRegex.Match(apiEx.ResponseBody ?? string.Empty); + if (m.Success) + return $"HTTP {(int)apiEx.StatusCode} {apiEx.StatusCode}: {m.Groups[1].Value}"; + return $"HTTP {(int)apiEx.StatusCode} {apiEx.StatusCode}"; + } + + var msg = ex?.Message ?? "(no message)"; + return msg.Length > 80 ? msg.Substring(0, 80) + "..." : msg; + } + private static (HashSet Dirs, string Source) ResolveDirsToSearch(DiscoveryJobConfiguration config) { if (config?.JobProperties != null) @@ -175,53 +199,81 @@ private JobResult PerformDiscovery(DiscoveryJobConfiguration config, SubmitDisco flow.Branch($"PerDomain (count={domainCount})"); try { + var listedOk = 0; + var emptyNameSkipped = 0; + + // Group per-domain failures by error signature instead of emitting a + // FAIL+SKIP line per failed domain. On appliances with 200+ domains and + // a non-trivial RBAC story, that easily produces a 50 KB summary which + // overflows Command's AgentJobStatus.Message column. One aggregated SKIP + // line per error class scales fine and is far more readable. + var errorGroups = new Dictionary>(StringComparer.Ordinal); + foreach (var domain in domains) { if (string.IsNullOrWhiteSpace(domain?.Name)) { - flow.Skip("Domain", "empty domain name"); + emptyNameSkipped++; continue; } + List directories; try { - List directories = null; - flow.Step($"ListFileStore-{domain.Name}", () => - { - directories = apiClient.ListFileStoreDirectories(domain.Name); - }); - - // DataPower's filestore location names carry a trailing colon - // (e.g. "cert:" / "pubcert:" / "sharedcert:"). Strip it before - // matching and before composing the store path. - var certDirectories = directories - .Select(d => d?.TrimEnd(':')) - .Where(d => !string.IsNullOrEmpty(d) && certStoreDirectories.Contains(d)) - .ToList(); - - var isDefault = string.Equals(domain.Name, DefaultDomainName, StringComparison.OrdinalIgnoreCase); - foreach (var dir in certDirectories) - { - if (ApplianceWideDirectories.Contains(dir) && !isDefault) - { - flow.Skip($"{domain.Name}\\{dir}", "appliance-wide; emitted only under default"); - continue; - } - - var storePath = $"{domain.Name}\\{dir}"; - discoveredLocations.Add(storePath); - flow.Step($"Discovered-{storePath}"); - } + directories = apiClient.ListFileStoreDirectories(domain.Name); } catch (Exception ex) { // Resilient by design: one inaccessible domain should not abort discovery - var inner = DescribeException(ex); - flow.Skip($"Domain-{domain.Name}", $"unable to list directories: {inner}"); - Logger.LogWarning(ex, "Unable to list filestore directories for domain {DomainName}: {ErrorMessage}", - domain.Name, inner); + var signature = ErrorSignatureOf(ex); + if (!errorGroups.TryGetValue(signature, out var list)) + { + list = new List(); + errorGroups[signature] = list; + } + list.Add(domain.Name); + Logger.LogWarning(ex, + "Unable to list filestore directories for domain {DomainName}: {ErrorMessage}", + domain.Name, DescribeException(ex)); + continue; + } + + listedOk++; + + // DataPower's filestore location names carry a trailing colon + // (e.g. "cert:" / "pubcert:" / "sharedcert:"). Strip it before + // matching and before composing the store path. + var certDirectories = directories + .Select(d => d?.TrimEnd(':')) + .Where(d => !string.IsNullOrEmpty(d) && certStoreDirectories.Contains(d)) + .ToList(); + + var isDefault = string.Equals(domain.Name, DefaultDomainName, StringComparison.OrdinalIgnoreCase); + foreach (var dir in certDirectories) + { + if (ApplianceWideDirectories.Contains(dir) && !isDefault) + continue; // appliance-wide; emitted only under default + + var storePath = $"{domain.Name}\\{dir}"; + discoveredLocations.Add(storePath); + flow.Step($"Discovered-{storePath}"); } } + + var listedDetail = $"listed={listedOk}/{domainCount}"; + if (emptyNameSkipped > 0) + listedDetail += $", emptyName={emptyNameSkipped}"; + if (errorGroups.Count > 0) + listedDetail += $", failed={errorGroups.Values.Sum(v => v.Count)}"; + flow.Step("ListFileStore", listedDetail); + + foreach (var kvp in errorGroups.OrderByDescending(g => g.Value.Count)) + { + var sample = kvp.Value.Take(5).ToList(); + var more = kvp.Value.Count - sample.Count; + var sampleStr = string.Join(", ", sample) + (more > 0 ? $" (+{more} more)" : ""); + flow.Skip($"DomainsFailed[{kvp.Key}]", $"{kvp.Value.Count} domain(s): {sampleStr}"); + } } finally { From 0cd878feeb0c551c936dbe2d4c7a9bab65928f8c Mon Sep 17 00:00:00 2001 From: Brian Hill Date: Tue, 19 May 2026 13:12:59 -0400 Subject: [PATCH 2/2] Add 1.2.1 CHANGELOG entry for FlowLogger truncation fix --- CHANGELOG.md | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7190db1..f0427d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,25 @@ -## 1.2.0 - unreleased +## 1.2.1 + +### Fixed +* Discovery against appliances with a large number of application domains + (200+) where the orchestrator's API user lacks read access to many of them + no longer produces a SQL truncation failure in Keyfactor Command. Two + related fixes: + * `FlowLogger.GetSummary()` output is now capped at 3500 characters with + a `[truncated, N chars omitted; check orchestrator log for full + breadcrumb]` marker. Defense-in-depth against Command's + `AgentJobStatus.Message` column overflow (NVARCHAR(4000)). + * Discovery aggregates per-domain failures by error signature instead of + emitting one `FAIL` and one `SKIP` line per failed domain. Identical + HTTP errors across N domains collapse into one summary line with a + count and a 5-domain sample. Successful domains still emit one + `Discovered-` step each. + + Net effect on a 235-domain appliance with 114 inaccessible domains: the + breadcrumb summary drops from ~50 KB and 235+ step lines to ~1 KB and + ~10 step lines, well under Command's column cap. + +## 1.2.0 ### Added * Discovery job: automatically enumerates all application domains on a DataPower