From 37406c140bed7c011cb280aad62b29aa1ec55b52 Mon Sep 17 00:00:00 2001 From: Ammar Date: Fri, 13 Feb 2026 10:08:43 -0600 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=A4=96=20fix:=20harden=20agent=20skil?= =?UTF-8?q?l=20discovery=20against=20transient=20SSH=20failures?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Treat transient SKILL.md stat/read failures as transport errors instead of invalid skill diagnostics, then reuse the last good diagnostics snapshot for list/listDiagnostics when discovery fails transiently. - add AgentSkillTransientDiscoveryError classification in agentSkillsService - add ORPC diagnostics cache + transient fallback helper - route both agentSkills.list and listDiagnostics through shared diagnostics loader - add service and cache tests for transient and non-transient failure paths --- _Generated with `mux` • Model: `openai:gpt-5.3-codex` • Thinking: `xhigh` • Cost: `$0.00`_ --- .../orpc/agentSkillsDiagnosticsCache.test.ts | 96 +++++++++++++++++++ src/node/orpc/agentSkillsDiagnosticsCache.ts | 50 ++++++++++ src/node/orpc/router.ts | 24 ++++- .../agentSkills/agentSkillsService.test.ts | 71 +++++++++++++- .../agentSkills/agentSkillsService.ts | 55 ++++++++++- 5 files changed, 291 insertions(+), 5 deletions(-) create mode 100644 src/node/orpc/agentSkillsDiagnosticsCache.test.ts create mode 100644 src/node/orpc/agentSkillsDiagnosticsCache.ts diff --git a/src/node/orpc/agentSkillsDiagnosticsCache.test.ts b/src/node/orpc/agentSkillsDiagnosticsCache.test.ts new file mode 100644 index 0000000000..b3be708442 --- /dev/null +++ b/src/node/orpc/agentSkillsDiagnosticsCache.test.ts @@ -0,0 +1,96 @@ +import { describe, expect, test } from "bun:test"; + +import { + AgentSkillTransientDiscoveryError, + type DiscoverAgentSkillsDiagnosticsResult, +} from "@/node/services/agentSkills/agentSkillsService"; +import { + getAgentSkillsDiscoveryCacheKey, + loadAgentSkillsDiagnosticsWithFallback, +} from "./agentSkillsDiagnosticsCache"; + +describe("agentSkillsDiagnosticsCache", () => { + test("reuses cached diagnostics after a transient discovery failure", async () => { + const cache = new Map(); + const cacheKey = getAgentSkillsDiscoveryCacheKey({ + workspaceId: "workspace-1", + disableWorkspaceAgents: true, + }); + + const freshDiagnostics = { + skills: [ + { + name: "pull-requests", + description: "PR workflow", + scope: "project" as const, + }, + ], + invalidSkills: [], + }; + + const seeded = await loadAgentSkillsDiagnosticsWithFallback({ + cache, + cacheKey, + discover: () => Promise.resolve(freshDiagnostics), + }); + expect(seeded).toBe(freshDiagnostics); + + const fallback = await loadAgentSkillsDiagnosticsWithFallback({ + cache, + cacheKey, + discover: () => + Promise.reject( + new AgentSkillTransientDiscoveryError("SSH connection to host is in backoff") + ), + }); + + expect(fallback).toBe(freshDiagnostics); + }); + + test("does not hide non-transient discovery failures", async () => { + const cache = new Map(); + const cacheKey = getAgentSkillsDiscoveryCacheKey({ projectPath: "/repo" }); + + await loadAgentSkillsDiagnosticsWithFallback({ + cache, + cacheKey, + discover: () => Promise.resolve({ skills: [], invalidSkills: [] }), + }); + + let caught: unknown; + try { + await loadAgentSkillsDiagnosticsWithFallback({ + cache, + cacheKey, + discover: () => Promise.reject(new Error("SKILL.md has invalid frontmatter")), + }); + } catch (error) { + caught = error; + } + + expect(caught).toBeInstanceOf(Error); + if (!(caught instanceof Error)) { + throw new Error("expected an error to be thrown"); + } + expect(caught.message).toContain("invalid frontmatter"); + }); + + test("rethrows transient discovery errors when no cache exists", async () => { + const cache = new Map(); + const cacheKey = getAgentSkillsDiscoveryCacheKey({ projectPath: "/repo" }); + + let caught: unknown; + try { + await loadAgentSkillsDiagnosticsWithFallback({ + cache, + cacheKey, + discover: () => + Promise.reject(new AgentSkillTransientDiscoveryError("Connection timed out")), + }); + } catch (error) { + caught = error; + } + + expect(caught).toBeInstanceOf(AgentSkillTransientDiscoveryError); + }); +}); diff --git a/src/node/orpc/agentSkillsDiagnosticsCache.ts b/src/node/orpc/agentSkillsDiagnosticsCache.ts new file mode 100644 index 0000000000..d715489cb9 --- /dev/null +++ b/src/node/orpc/agentSkillsDiagnosticsCache.ts @@ -0,0 +1,50 @@ +import { + AgentSkillTransientDiscoveryError, + type DiscoverAgentSkillsDiagnosticsResult, +} from "@/node/services/agentSkills/agentSkillsService"; +import { log } from "@/node/services/log"; + +export interface AgentSkillsDiscoveryCacheInput { + projectPath?: string; + workspaceId?: string; + disableWorkspaceAgents?: boolean; +} + +export function getAgentSkillsDiscoveryCacheKey(input: AgentSkillsDiscoveryCacheInput): string { + const disableWorkspaceAgents = input.disableWorkspaceAgents === true ? "1" : "0"; + + if (input.workspaceId) { + return `workspace:${input.workspaceId}:disableWorkspaceAgents:${disableWorkspaceAgents}`; + } + + if (input.projectPath) { + return `project:${input.projectPath}:disableWorkspaceAgents:${disableWorkspaceAgents}`; + } + + throw new Error("Either projectPath or workspaceId must be provided"); +} + +export async function loadAgentSkillsDiagnosticsWithFallback(args: { + cache: Map; + cacheKey: string; + discover: () => Promise; +}): Promise { + try { + const diagnostics = await args.discover(); + args.cache.set(args.cacheKey, diagnostics); + return diagnostics; + } catch (error) { + if (error instanceof AgentSkillTransientDiscoveryError) { + const cached = args.cache.get(args.cacheKey); + // During SSH hiccups we prefer a stale-but-correct snapshot over surfacing false invalid-skill diagnostics. + if (cached) { + log.warn( + `Agent skill diagnostics discovery transiently failed for ${args.cacheKey}; using cached result: ${error.message}` + ); + return cached; + } + } + + throw error; + } +} diff --git a/src/node/orpc/router.ts b/src/node/orpc/router.ts index a14984f887..23a52c5b40 100644 --- a/src/node/orpc/router.ts +++ b/src/node/orpc/router.ts @@ -42,10 +42,14 @@ import { normalizeTaskSettings, } from "@/common/types/tasks"; import { - discoverAgentSkills, discoverAgentSkillsDiagnostics, + type DiscoverAgentSkillsDiagnosticsResult, readAgentSkill, } from "@/node/services/agentSkills/agentSkillsService"; +import { + getAgentSkillsDiscoveryCacheKey, + loadAgentSkillsDiagnosticsWithFallback, +} from "./agentSkillsDiagnosticsCache"; import { discoverAgentDefinitions, readAgentDefinition, @@ -107,6 +111,8 @@ async function resolveAgentDiscoveryContext( return { runtime, discoveryPath: input.projectPath! }; } +const agentSkillsDiagnosticsCache = new Map(); + function isErrnoWithCode(error: unknown, code: string): boolean { return Boolean(error && typeof error === "object" && "code" in error && error.code === code); } @@ -839,7 +845,14 @@ export const router = (authToken?: string) => { await context.aiService.waitForInit(input.workspaceId); } const { runtime, discoveryPath } = await resolveAgentDiscoveryContext(context, input); - return discoverAgentSkills(runtime, discoveryPath); + const cacheKey = getAgentSkillsDiscoveryCacheKey(input); + const diagnostics = await loadAgentSkillsDiagnosticsWithFallback({ + cache: agentSkillsDiagnosticsCache, + cacheKey, + discover: () => discoverAgentSkillsDiagnostics(runtime, discoveryPath), + }); + + return diagnostics.skills; }), listDiagnostics: t .input(schemas.agentSkills.listDiagnostics.input) @@ -850,7 +863,12 @@ export const router = (authToken?: string) => { await context.aiService.waitForInit(input.workspaceId); } const { runtime, discoveryPath } = await resolveAgentDiscoveryContext(context, input); - return discoverAgentSkillsDiagnostics(runtime, discoveryPath); + const cacheKey = getAgentSkillsDiscoveryCacheKey(input); + return loadAgentSkillsDiagnosticsWithFallback({ + cache: agentSkillsDiagnosticsCache, + cacheKey, + discover: () => discoverAgentSkillsDiagnostics(runtime, discoveryPath), + }); }), get: t .input(schemas.agentSkills.get.input) diff --git a/src/node/services/agentSkills/agentSkillsService.test.ts b/src/node/services/agentSkills/agentSkillsService.test.ts index 7970aab156..292185aeae 100644 --- a/src/node/services/agentSkills/agentSkillsService.test.ts +++ b/src/node/services/agentSkills/agentSkillsService.test.ts @@ -1,12 +1,13 @@ import * as fs from "node:fs/promises"; import * as path from "node:path"; -import { describe, expect, test } from "bun:test"; +import { describe, expect, spyOn, test } from "bun:test"; import { SkillNameSchema } from "@/common/orpc/schemas"; import { LocalRuntime } from "@/node/runtime/LocalRuntime"; import { DisposableTempDir } from "@/node/services/tempDir"; import { + AgentSkillTransientDiscoveryError, discoverAgentSkills, discoverAgentSkillsDiagnostics, readAgentSkill, @@ -225,6 +226,74 @@ describe("agentSkillsService", () => { ).toContain("must match directory name"); }); + test("discoverAgentSkillsDiagnostics rethrows transient stat errors", async () => { + using project = new DisposableTempDir("agent-skills-transient-stat"); + using global = new DisposableTempDir("agent-skills-global"); + + const projectSkillsRoot = path.join(project.path, ".mux", "skills"); + const globalSkillsRoot = global.path; + + await writeSkill(projectSkillsRoot, "foo", "valid"); + + const roots = { projectRoot: projectSkillsRoot, globalRoot: globalSkillsRoot }; + const runtime = new LocalRuntime(project.path); + const originalStat = runtime.stat.bind(runtime); + + spyOn(runtime, "stat").mockImplementation( + async (targetPath: string, abortSignal?: AbortSignal) => { + if (targetPath.endsWith(path.join("foo", "SKILL.md"))) { + throw new Error( + "SSH connection to test-host is in backoff for 2s. Last error: connection timed out" + ); + } + + return originalStat(targetPath, abortSignal); + } + ); + + let caught: unknown; + try { + await discoverAgentSkillsDiagnostics(runtime, project.path, { roots }); + } catch (error) { + caught = error; + } + + expect(caught).toBeInstanceOf(AgentSkillTransientDiscoveryError); + }); + + test("discoverAgentSkillsDiagnostics rethrows transient read errors", async () => { + using project = new DisposableTempDir("agent-skills-transient-read"); + using global = new DisposableTempDir("agent-skills-global"); + + const projectSkillsRoot = path.join(project.path, ".mux", "skills"); + const globalSkillsRoot = global.path; + + await writeSkill(projectSkillsRoot, "foo", "valid"); + + const roots = { projectRoot: projectSkillsRoot, globalRoot: globalSkillsRoot }; + const runtime = new LocalRuntime(project.path); + const originalReadFile = runtime.readFile.bind(runtime); + + spyOn(runtime, "readFile").mockImplementation( + (targetPath: string, abortSignal?: AbortSignal) => { + if (targetPath.endsWith(path.join("foo", "SKILL.md"))) { + throw new Error("kex_exchange_identification: Connection closed by remote host"); + } + + return originalReadFile(targetPath, abortSignal); + } + ); + + let caught: unknown; + try { + await discoverAgentSkillsDiagnostics(runtime, project.path, { roots }); + } catch (error) { + caught = error; + } + + expect(caught).toBeInstanceOf(AgentSkillTransientDiscoveryError); + }); + test("discovers symlinked skill directories", async () => { using project = new DisposableTempDir("agent-skills-symlink"); using skillSource = new DisposableTempDir("agent-skills-source"); diff --git a/src/node/services/agentSkills/agentSkillsService.ts b/src/node/services/agentSkills/agentSkillsService.ts index 7302f3b369..bbb598d6c8 100644 --- a/src/node/services/agentSkills/agentSkillsService.ts +++ b/src/node/services/agentSkills/agentSkillsService.ts @@ -59,6 +59,46 @@ function formatError(error: unknown): string { return error instanceof Error ? error.message : String(error); } +// Keep transient transport failures distinct from deterministic SKILL.md issues. +// Callers can safely reuse stale diagnostics for these errors instead of surfacing false "missing/unreadable" problems. +export class AgentSkillTransientDiscoveryError extends Error { + constructor(message: string) { + super(message); + this.name = "AgentSkillTransientDiscoveryError"; + } +} + +const NON_TRANSIENT_SKILL_IO_ERROR_PATTERNS: readonly RegExp[] = [ + /no such file or directory/i, + /not a directory/i, + /permission denied/i, + /is a directory/i, +]; + +const TRANSIENT_SKILL_IO_ERROR_PATTERNS: readonly RegExp[] = [ + /ssh connection to .* is in backoff/i, + /ssh connection to .* did not become healthy/i, + /ssh probe timed out/i, + /operation aborted/i, + /timed out/i, + /connection reset by peer/i, + /connection (?:to .* )?closed/i, + /connection refused/i, + /network is unreachable/i, + /no route to host/i, + /broken pipe/i, + /kex_exchange_identification/i, + /resource temporarily unavailable/i, +]; + +function isLikelyTransientSkillIoError(message: string): boolean { + if (NON_TRANSIENT_SKILL_IO_ERROR_PATTERNS.some((pattern) => pattern.test(message))) { + return false; + } + + return TRANSIENT_SKILL_IO_ERROR_PATTERNS.some((pattern) => pattern.test(message)); +} + async function listSkillDirectoriesFromLocalFs(root: string): Promise { try { const entries = await fs.readdir(root, { withFileTypes: true }); @@ -111,7 +151,14 @@ async function readSkillDescriptorFromDir( let stat; try { stat = await runtime.stat(skillFilePath); - } catch { + } catch (err) { + const message = formatError(err); + if (isLikelyTransientSkillIoError(message)) { + throw new AgentSkillTransientDiscoveryError( + `Transient error while stat-ing ${skillFilePath}: ${message}` + ); + } + options?.invalidSkills?.push({ directoryName, scope, @@ -152,6 +199,12 @@ async function readSkillDescriptorFromDir( content = await readFileString(runtime, skillFilePath); } catch (err) { const message = formatError(err); + if (isLikelyTransientSkillIoError(message)) { + throw new AgentSkillTransientDiscoveryError( + `Transient error while reading ${skillFilePath}: ${message}` + ); + } + log.warn(`Failed to read SKILL.md for ${directoryName}: ${message}`); options?.invalidSkills?.push({ directoryName, From 80ab4ac086510facc619ad8e8aee73fafd5aa2e7 Mon Sep 17 00:00:00 2001 From: Ammar Date: Fri, 13 Feb 2026 11:29:50 -0600 Subject: [PATCH 2/3] fix: avoid aborting non-diagnostic skill discovery on transient errors --- .../agentSkills/agentSkillsService.test.ts | 31 +++++++++++++++++++ .../agentSkills/agentSkillsService.ts | 22 +++++++++---- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/src/node/services/agentSkills/agentSkillsService.test.ts b/src/node/services/agentSkills/agentSkillsService.test.ts index 292185aeae..f4af3637be 100644 --- a/src/node/services/agentSkills/agentSkillsService.test.ts +++ b/src/node/services/agentSkills/agentSkillsService.test.ts @@ -294,6 +294,37 @@ describe("agentSkillsService", () => { expect(caught).toBeInstanceOf(AgentSkillTransientDiscoveryError); }); + test("discoverAgentSkills skips transient skill I/O failures and returns other skills", async () => { + using project = new DisposableTempDir("agent-skills-transient-discovery"); + using global = new DisposableTempDir("agent-skills-global"); + + const projectSkillsRoot = path.join(project.path, ".mux", "skills"); + const globalSkillsRoot = global.path; + + await writeSkill(projectSkillsRoot, "foo", "valid"); + await writeSkill(projectSkillsRoot, "bar", "also valid"); + + const roots = { projectRoot: projectSkillsRoot, globalRoot: globalSkillsRoot }; + const runtime = new LocalRuntime(project.path); + const originalStat = runtime.stat.bind(runtime); + + spyOn(runtime, "stat").mockImplementation( + async (targetPath: string, abortSignal?: AbortSignal) => { + if (targetPath.endsWith(path.join("foo", "SKILL.md"))) { + throw new Error( + "SSH connection to test-host is in backoff for 2s. Last error: connection timed out" + ); + } + + return originalStat(targetPath, abortSignal); + } + ); + + const skills = await discoverAgentSkills(runtime, project.path, { roots }); + + expect(skills.map((skill) => skill.name)).toEqual(["bar", "init", "mux-docs"]); + }); + test("discovers symlinked skill directories", async () => { using project = new DisposableTempDir("agent-skills-symlink"); using skillSource = new DisposableTempDir("agent-skills-source"); diff --git a/src/node/services/agentSkills/agentSkillsService.ts b/src/node/services/agentSkills/agentSkillsService.ts index bbb598d6c8..44775821cb 100644 --- a/src/node/services/agentSkills/agentSkillsService.ts +++ b/src/node/services/agentSkills/agentSkillsService.ts @@ -305,12 +305,22 @@ export async function discoverAgentSkills( } const skillDir = runtime.normalizePath(directoryName, resolvedRoot); - const descriptor = await readSkillDescriptorFromDir( - runtime, - skillDir, - directoryName, - scan.scope - ); + let descriptor: AgentSkillDescriptor | null; + try { + descriptor = await readSkillDescriptorFromDir(runtime, skillDir, directoryName, scan.scope); + } catch (error) { + if (error instanceof AgentSkillTransientDiscoveryError) { + // Non-diagnostic callers (for example stream context hints) should degrade per-skill + // rather than dropping all discovered skills when SSH transport briefly flaps. + log.warn( + `Skipping ${scan.scope} skill '${directoryName}' after transient discovery error: ${error.message}` + ); + continue; + } + + throw error; + } + if (!descriptor) continue; // Precedence: project overwrites global. From cfaafd44ab9564c559e45cc36d3616f0719d0135 Mon Sep 17 00:00:00 2001 From: Ammar Date: Mon, 16 Feb 2026 19:29:30 -0600 Subject: [PATCH 3/3] fix: keep agentSkills.list on non-diagnostic discovery --- src/node/orpc/router.ts | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/node/orpc/router.ts b/src/node/orpc/router.ts index 23a52c5b40..9beddde866 100644 --- a/src/node/orpc/router.ts +++ b/src/node/orpc/router.ts @@ -42,6 +42,7 @@ import { normalizeTaskSettings, } from "@/common/types/tasks"; import { + discoverAgentSkills, discoverAgentSkillsDiagnostics, type DiscoverAgentSkillsDiagnosticsResult, readAgentSkill, @@ -845,14 +846,9 @@ export const router = (authToken?: string) => { await context.aiService.waitForInit(input.workspaceId); } const { runtime, discoveryPath } = await resolveAgentDiscoveryContext(context, input); - const cacheKey = getAgentSkillsDiscoveryCacheKey(input); - const diagnostics = await loadAgentSkillsDiagnosticsWithFallback({ - cache: agentSkillsDiagnosticsCache, - cacheKey, - discover: () => discoverAgentSkillsDiagnostics(runtime, discoveryPath), - }); - - return diagnostics.skills; + // Keep list resilient on first-load transient SSH hiccups by using non-diagnostic + // discovery, which skips only the affected skill instead of failing the whole request. + return discoverAgentSkills(runtime, discoveryPath); }), listDiagnostics: t .input(schemas.agentSkills.listDiagnostics.input)