From b6a3bce458e3bef8c8cba87dd41b6b92f58ea9f4 Mon Sep 17 00:00:00 2001
From: Nikhil Kala <nikhil.kala@orkes.io>
Date: Thu, 21 May 2026 09:03:04 -0700
Subject: [PATCH 1/6] feat(sdk): push provider env vars to server credential
 store at boot

When AgentRuntime() targets localhost, sync every known provider env var
into the server's credential store via PUT /api/credentials/{name} on
construction. This lets a corrected ANTHROPIC_API_KEY (e.g. after a
.zshrc fix) reach the long-lived server JVM without restarting it.

Also adds a credentials-API fallback to _ensure_model so the existing
AGENTSPAN_INTEGRATIONS_AUTO_REGISTER escape hatch works against OSS
Conductor, which doesn't expose /api/integrations/provider/*.

Remote servers are skipped to avoid clobbering UI-managed credentials.
---
 .../agents/_internal/provider_registry.py     |  22 ++
 .../src/agentspan/agents/runtime/runtime.py   |  88 +++++-
 .../unit/test_runtime_credentials_sync.py     | 256 ++++++++++++++++++
 3 files changed, 359 insertions(+), 7 deletions(-)
 create mode 100644 sdk/python/tests/unit/test_runtime_credentials_sync.py

diff --git a/sdk/python/src/agentspan/agents/_internal/provider_registry.py b/sdk/python/src/agentspan/agents/_internal/provider_registry.py
index d99ca809d..ba02f2ae7 100644
--- a/sdk/python/src/agentspan/agents/_internal/provider_registry.py
+++ b/sdk/python/src/agentspan/agents/_internal/provider_registry.py
@@ -58,3 +58,25 @@ def get_provider_spec(provider_name: str) -> Optional[ProviderSpec]:
     Returns ``None`` if the provider is not in the registry.
     """
     return PROVIDER_REGISTRY.get(provider_name)
+
+
+# Mirrors server-side CredentialEnvSeeder.KNOWN_ENV_VARS — the full set of
+# provider API key env vars the runtime will sync into the local server's
+# credential store on boot. Keep in sync with the Java list.
+KNOWN_PROVIDER_ENV_VARS: frozenset[str] = frozenset(
+    {
+        "OPENAI_API_KEY",
+        "ANTHROPIC_API_KEY",
+        "GEMINI_API_KEY",
+        "GOOGLE_API_KEY",
+        "MISTRAL_API_KEY",
+        "COHERE_API_KEY",
+        "XAI_API_KEY",
+        "PERPLEXITY_API_KEY",
+        "AZURE_OPENAI_API_KEY",
+        "HUGGINGFACE_API_KEY",
+        "GROQ_API_KEY",
+        "DEEPSEEK_API_KEY",
+        "TOGETHER_API_KEY",
+    }
+)
diff --git a/sdk/python/src/agentspan/agents/runtime/runtime.py b/sdk/python/src/agentspan/agents/runtime/runtime.py
index 1a3286aa7..d44aa8253 100644
--- a/sdk/python/src/agentspan/agents/runtime/runtime.py
+++ b/sdk/python/src/agentspan/agents/runtime/runtime.py
@@ -41,6 +41,16 @@
 logger = logging.getLogger("agentspan.agents.runtime")
 
 
+def _is_local_server(server_url: str) -> bool:
+    """Return True if *server_url* points to a loopback address."""
+    from urllib.parse import urlparse
+
+    if not server_url:
+        return False
+    host = (urlparse(server_url).hostname or "").lower()
+    return host in ("localhost", "127.0.0.1", "::1", "0.0.0.0")
+
+
 _RETRY_POLICY_MAP = {
     "fixed": "FIXED",
     "linear_backoff": "LINEAR_BACKOFF",
@@ -378,6 +388,12 @@ def __init__(
 
         logger.info("AgentRuntime initialized (server=%s)", self._config.server_url)
 
+        # Push shell env vars into the local server's credential store so a
+        # corrected key reaches the running JVM without a restart. Skipped
+        # for remote — would clobber UI-managed credentials.
+        if _is_local_server(self._config.server_url):
+            self._sync_provider_env_to_server()
+
     # ── Sync/async bridge ────────────────────────────────────────────
 
     @staticmethod
@@ -2072,13 +2088,26 @@ def _ensure_model(self, model_string: str) -> None:
 
             self._integration_api_available = True
         except Exception as e:
-            if self._integration_api_available is None:
-                # First failure — likely OSS Conductor without integration API
-                logger.warning(
-                    "Integration API not available (OSS Conductor?). "
-                    "Auto-registration disabled: %s",
-                    e,
-                )
+            first_failure = self._integration_api_available is None
+            # First failure → assume OSS Conductor (no integration API); push
+            # the key via /api/credentials instead. Later failures are per-model.
+            if first_failure:
+                try:
+                    self._push_credential_to_server(spec.api_key_env, api_key)
+                    logger.info(
+                        "Integration API not available (OSS Conductor?). "
+                        "Pushed %s via /api/credentials instead: %s",
+                        spec.api_key_env,
+                        e,
+                    )
+                except Exception as cred_err:
+                    logger.warning(
+                        "Auto-registration failed for '%s' on both integration "
+                        "API (%s) and credentials API (%s).",
+                        model_string,
+                        e,
+                        cred_err,
+                    )
                 self._integration_api_available = False
             else:
                 logger.warning(
@@ -2089,6 +2118,51 @@ def _ensure_model(self, model_string: str) -> None:
 
         self._ensured_models.add(model_string)
 
+    # ── Credential push (Agentspan-native /api/credentials) ────────────
+
+    def _push_credential_to_server(self, name: str, value: str) -> None:
+        """Upsert a credential on the Agentspan server via PUT /api/credentials/{name}.
+
+        Used as a fallback when the Conductor integration API is unavailable
+        (e.g. the local OSS server). Raises on HTTP error so callers can decide
+        whether to swallow the failure.
+        """
+        import httpx
+
+        base = self._config.server_url.rstrip("/")
+        url = f"{base}/credentials/{name}"
+
+        headers: Dict[str, str] = {}
+        if self._config.api_key:
+            headers["Authorization"] = f"Bearer {self._config.api_key}"
+        elif self._config.auth_key:
+            headers["X-Auth-Key"] = self._config.auth_key
+            if self._config.auth_secret:
+                headers["X-Auth-Secret"] = self._config.auth_secret
+
+        resp = httpx.put(url, json={"value": value}, headers=headers, timeout=5.0)
+        resp.raise_for_status()
+
+    def _sync_provider_env_to_server(self) -> None:
+        """Push every known provider env var into the server's credential store.
+
+        Runs once at AgentRuntime construction (localhost target only). Each
+        non-empty env var is upserted via :meth:`_push_credential_to_server`.
+        Failures are logged at debug and swallowed — this is a best-effort
+        convenience and should never block a runtime from starting.
+        """
+        from agentspan.agents._internal.provider_registry import KNOWN_PROVIDER_ENV_VARS
+
+        for name in sorted(KNOWN_PROVIDER_ENV_VARS):
+            value = os.environ.get(name)
+            if not value:
+                continue
+            try:
+                self._push_credential_to_server(name, value)
+                logger.debug("Synced %s into local server credential store", name)
+            except Exception as e:
+                logger.debug("Could not sync %s into local server: %s", name, e)
+
     def _ensure_models_for_agent(self, agent: Agent) -> None:
         """Walk the agent tree and ensure all referenced models are registered."""
         seen: set = set()
diff --git a/sdk/python/tests/unit/test_runtime_credentials_sync.py b/sdk/python/tests/unit/test_runtime_credentials_sync.py
new file mode 100644
index 000000000..24c8ab1e2
--- /dev/null
+++ b/sdk/python/tests/unit/test_runtime_credentials_sync.py
@@ -0,0 +1,256 @@
+# Copyright (c) 2025 Agentspan
+# Licensed under the MIT License. See LICENSE file in the project root for details.
+
+"""Tests for the credentials-API fallback used by auto-register & env hot-reload.
+
+Context: the local Agentspan OSS server does not expose the Orkes
+``/api/integrations/provider/*`` endpoints, so the original auto-register
+escape hatch silently no-oped against a localhost server. The runtime now
+falls back to ``PUT /api/credentials/{name}`` (the Agentspan-native credential
+store) whenever the integration API call fails, and also pushes known provider
+env vars into the store on boot when targeting localhost.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+# ── helpers ────────────────────────────────────────────────────────────
+
+
+def _make_runtime(*, server_url: str = "http://localhost:6767/api", auto_register: bool = False):
+    """Construct an AgentRuntime with heavy I/O mocked out."""
+    with (
+        patch("conductor.client.orkes_clients.OrkesClients"),
+        patch("agentspan.agents.runtime.worker_manager.TaskHandler", create=True),
+        patch("agentspan.agents.runtime.server.ensure_server_running"),
+        patch("agentspan.agents.runtime.server._is_server_ready", return_value=True),
+    ):
+        from agentspan.agents.runtime.config import AgentConfig
+        from agentspan.agents.runtime.runtime import AgentRuntime
+
+        config = AgentConfig(
+            server_url=server_url,
+            auto_start_workers=False,
+            auto_start_server=False,
+            auto_register_integrations=auto_register,
+        )
+        return AgentRuntime(config=config)
+
+
+def _api_exception(status: int, reason: str = ""):
+    """Build a conductor ApiException with the given status code."""
+    from conductor.client.http.rest import ApiException
+
+    exc = ApiException(status=status, reason=reason or f"status {status}")
+    return exc
+
+
+# ── Fix 1: credentials API fallback for _ensure_model ─────────────────
+
+
+class TestEnsureModelCredentialsFallback:
+    """When the integration API returns 404 (OSS Conductor), _ensure_model must
+    fall back to pushing the provider API key via /api/credentials/{name}."""
+
+    def test_falls_back_to_credentials_on_integration_api_404(self, monkeypatch):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-test-real-key")
+        rt = _make_runtime(auto_register=True)
+
+        # Integration API raises 404 (mimics OSS Conductor)
+        fake_integration_client = MagicMock()
+        fake_integration_client.save_integration.side_effect = _api_exception(404, "Not Found")
+        rt._integration_client_instance = fake_integration_client
+
+        pushed = []
+        rt._push_credential_to_server = lambda name, value: pushed.append((name, value))
+
+        rt._ensure_model("anthropic/claude-3-5-sonnet")
+
+        # Integration API was tried, then credentials fallback used.
+        assert fake_integration_client.save_integration.called
+        assert pushed == [("ANTHROPIC_API_KEY", "sk-ant-test-real-key")]
+
+    def test_skips_fallback_when_integration_api_succeeds(self, monkeypatch):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-test-key")
+        rt = _make_runtime(auto_register=True)
+
+        fake_integration_client = MagicMock()
+        # save_integration returns successfully
+        rt._integration_client_instance = fake_integration_client
+
+        pushed = []
+        rt._push_credential_to_server = lambda name, value: pushed.append((name, value))
+
+        rt._ensure_model("anthropic/claude-3-5-sonnet")
+
+        assert fake_integration_client.save_integration.called
+        assert fake_integration_client.save_integration_api.called
+        # Credentials fallback NOT invoked when the integration API works.
+        assert pushed == []
+
+    def test_no_push_when_api_key_env_missing(self, monkeypatch):
+        monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
+        rt = _make_runtime(auto_register=True)
+
+        fake_integration_client = MagicMock()
+        rt._integration_client_instance = fake_integration_client
+
+        pushed = []
+        rt._push_credential_to_server = lambda name, value: pushed.append((name, value))
+
+        rt._ensure_model("anthropic/claude-3-5-sonnet")
+
+        # Without an env var there is nothing to push.
+        assert not fake_integration_client.save_integration.called
+        assert pushed == []
+
+    def test_does_not_push_blank_key(self, monkeypatch):
+        # An empty string env var (e.g. the .zshrc typo case) must not be
+        # forwarded to the server — that's exactly the bug we're protecting against.
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "")
+        rt = _make_runtime(auto_register=True)
+
+        fake_integration_client = MagicMock()
+        rt._integration_client_instance = fake_integration_client
+
+        pushed = []
+        rt._push_credential_to_server = lambda name, value: pushed.append((name, value))
+
+        rt._ensure_model("anthropic/claude-3-5-sonnet")
+
+        assert not fake_integration_client.save_integration.called
+        assert pushed == []
+
+
+# ── Fix 2: hot-reload env vars on AgentRuntime boot (localhost) ───────
+
+
+class TestBootEnvCredentialSync:
+    """When targeting a localhost server, AgentRuntime should sync provider
+    env vars into the server's credential store on construction so the running
+    JVM picks up a corrected ANTHROPIC_API_KEY without a restart."""
+
+    def test_localhost_sync_pushes_env_vars(self, monkeypatch):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-fresh")
+        monkeypatch.setenv("OPENAI_API_KEY", "sk-openai-fresh")
+        # Make sure other known vars are unset so they don't leak in from CI env.
+        for v in ("GEMINI_API_KEY", "MISTRAL_API_KEY", "COHERE_API_KEY"):
+            monkeypatch.delenv(v, raising=False)
+
+        pushed = {}
+
+        def fake_push(name, value):
+            pushed[name] = value
+
+        with patch.object(
+            __import__("agentspan.agents.runtime.runtime", fromlist=["AgentRuntime"]).AgentRuntime,
+            "_push_credential_to_server",
+            new=lambda self, name, value: fake_push(name, value),
+        ):
+            _make_runtime(server_url="http://localhost:6767/api")
+
+        assert pushed.get("ANTHROPIC_API_KEY") == "sk-ant-fresh"
+        assert pushed.get("OPENAI_API_KEY") == "sk-openai-fresh"
+
+    def test_remote_server_does_not_auto_sync(self, monkeypatch):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-fresh")
+
+        pushed = {}
+
+        def fake_push(name, value):
+            pushed[name] = value
+
+        with patch.object(
+            __import__("agentspan.agents.runtime.runtime", fromlist=["AgentRuntime"]).AgentRuntime,
+            "_push_credential_to_server",
+            new=lambda self, name, value: fake_push(name, value),
+        ):
+            _make_runtime(server_url="https://hosted.example.com/api")
+
+        # Remote: never auto-clobber UI-managed credentials.
+        assert pushed == {}
+
+    def test_blank_env_vars_skipped(self, monkeypatch):
+        # The exact reproducer of the .zshrc-typo bug: env var is exported but empty.
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "")
+        monkeypatch.setenv("OPENAI_API_KEY", "sk-openai-fresh")
+
+        pushed = {}
+
+        def fake_push(name, value):
+            pushed[name] = value
+
+        with patch.object(
+            __import__("agentspan.agents.runtime.runtime", fromlist=["AgentRuntime"]).AgentRuntime,
+            "_push_credential_to_server",
+            new=lambda self, name, value: fake_push(name, value),
+        ):
+            _make_runtime(server_url="http://127.0.0.1:6767/api")
+
+        assert "ANTHROPIC_API_KEY" not in pushed
+        assert pushed.get("OPENAI_API_KEY") == "sk-openai-fresh"
+
+
+# ── _push_credential_to_server itself ─────────────────────────────────
+
+
+class TestPushCredentialToServer:
+    """The HTTP method/URL/headers used to push a credential."""
+
+    def test_calls_put_credentials_endpoint(self, monkeypatch):
+        rt = _make_runtime(server_url="http://localhost:6767/api")
+
+        captured = {}
+
+        class FakeResp:
+            status_code = 200
+
+            def raise_for_status(self):
+                return None
+
+        def fake_put(url, json=None, headers=None, timeout=None):
+            captured["url"] = url
+            captured["json"] = json
+            captured["headers"] = headers or {}
+            return FakeResp()
+
+        with patch("httpx.put", side_effect=fake_put):
+            rt._push_credential_to_server("ANTHROPIC_API_KEY", "sk-ant-xyz")
+
+        assert captured["url"].endswith("/api/credentials/ANTHROPIC_API_KEY")
+        assert captured["json"] == {"value": "sk-ant-xyz"}
+
+    def test_sends_auth_header_when_api_key_configured(self):
+        from agentspan.agents.runtime.config import AgentConfig
+        from agentspan.agents.runtime.runtime import AgentRuntime
+
+        with (
+            patch("conductor.client.orkes_clients.OrkesClients"),
+            patch("agentspan.agents.runtime.worker_manager.TaskHandler", create=True),
+            patch("agentspan.agents.runtime.server.ensure_server_running"),
+        ):
+            cfg = AgentConfig(
+                server_url="http://localhost:6767/api",
+                api_key="bearer-token-abc",
+                auto_start_workers=False,
+                auto_start_server=False,
+            )
+            rt = AgentRuntime(config=cfg)
+
+        captured = {}
+
+        class FakeResp:
+            status_code = 200
+
+            def raise_for_status(self):
+                return None
+
+        def fake_put(url, json=None, headers=None, timeout=None):
+            captured["headers"] = headers or {}
+            return FakeResp()
+
+        with patch("httpx.put", side_effect=fake_put):
+            rt._push_credential_to_server("OPENAI_API_KEY", "sk-x")
+
+        assert captured["headers"].get("Authorization") == "Bearer bearer-token-abc"

From 16b1c9cf8cf30c50eedae770cade957018c409ec Mon Sep 17 00:00:00 2001
From: Nikhil Kala <nikhil.kala@orkes.io>
Date: Thu, 21 May 2026 09:03:14 -0700
Subject: [PATCH 2/6] feat(server): fail fast when provider API key is empty at
 startup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, an env var set to "" (e.g. a .zshrc typo) silently configured
Spring AI with an empty key. The actual error surfaced mid-stream as
"cannot retry due to server authentication" — misleading and hard to
diagnose.

Now AgentspanAIModelProvider.getModel throws IllegalStateException
before the network call when no per-user credential exists AND the env
var is null or blank, naming the specific env var and three remediation
paths (env + restart / PUT /api/credentials / Credentials UI).

Also normalises blank per-user credentials to missing so they take the
same fail-fast path. Adds a package-private lookupEnv() indirection over
System.getenv for testability.
---
 .../runtime/ai/AgentspanAIModelProvider.java  |  41 ++++-
 .../ai/AgentspanAIModelProviderTest.java      | 174 ++++++++++++++++++
 2 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 server/src/test/java/dev/agentspan/runtime/ai/AgentspanAIModelProviderTest.java

diff --git a/server/src/main/java/dev/agentspan/runtime/ai/AgentspanAIModelProvider.java b/server/src/main/java/dev/agentspan/runtime/ai/AgentspanAIModelProvider.java
index fb1f75914..03238deb3 100644
--- a/server/src/main/java/dev/agentspan/runtime/ai/AgentspanAIModelProvider.java
+++ b/server/src/main/java/dev/agentspan/runtime/ai/AgentspanAIModelProvider.java
@@ -102,12 +102,22 @@ public AIModel getModel(LLMWorkerInput input) {
         log.debug("getModel called for provider='{}' model='{}'", provider, input.getModel());
         String userApiKey = resolveUserApiKey(provider);
         log.debug("resolveUserApiKey('{}') returned: {}", provider, userApiKey != null ? "key found" : "null");
+
+        // A blank credential is worse than no credential — it would be passed
+        // to Spring AI and produce a 401 with a misleading "cannot retry due
+        // to server authentication" message. Treat blank as missing.
+        if (userApiKey != null && userApiKey.isBlank()) {
+            log.warn("Per-user credential for '{}' resolved to a blank value — ignoring.", provider);
+            userApiKey = null;
+        }
+
         if (userApiKey != null || baseUrl != null) {
             try {
                 // If we have a base URL but no user key, try the server-wide key
                 if (userApiKey == null) {
                     String envVar = PROVIDER_TO_ENV_VAR.get(provider.toLowerCase());
-                    userApiKey = envVar != null ? System.getenv(envVar) : null;
+                    String envValue = envVar != null ? lookupEnv(envVar) : null;
+                    userApiKey = (envValue != null && !envValue.isBlank()) ? envValue : null;
                 }
                 if (userApiKey != null) {
                     AIModel model = createModelWithKey(provider, userApiKey, baseUrl);
@@ -122,10 +132,39 @@ public AIModel getModel(LLMWorkerInput input) {
             }
         }
 
+        // Before falling back to the server-wide bean: if the env var is empty/missing,
+        // the bean was configured with "" at startup — fail fast instead of letting Spring AI emit a misleading mid-stream 401.
+        String envVar = PROVIDER_TO_ENV_VAR.get(provider.toLowerCase());
+        if (envVar != null) {
+            String envValue = lookupEnv(envVar);
+            if (envValue == null || envValue.isBlank()) {
+                throw new IllegalStateException(
+                        "No API key configured for provider '"
+                                + provider
+                                + "'. The server started with an empty "
+                                + envVar
+                                + ", and no credential exists in the store. "
+                                + "Set "
+                                + envVar
+                                + " in the environment before starting the server, "
+                                + "push it via PUT /api/credentials/"
+                                + envVar
+                                + ", or save it via the Credentials UI.");
+            }
+        }
+
         // Fall back to server-wide model
         return super.getModel(input);
     }
 
+    /**
+     * Indirection over {@link System#getenv(String)} so tests can inject env vars.
+     * Package-private; production code calls {@code System.getenv} directly.
+     */
+    String lookupEnv(String name) {
+        return System.getenv(name);
+    }
+
     /**
      * Resolve a per-user API key for the given LLM provider.
      *
diff --git a/server/src/test/java/dev/agentspan/runtime/ai/AgentspanAIModelProviderTest.java b/server/src/test/java/dev/agentspan/runtime/ai/AgentspanAIModelProviderTest.java
new file mode 100644
index 000000000..b418a56c4
--- /dev/null
+++ b/server/src/test/java/dev/agentspan/runtime/ai/AgentspanAIModelProviderTest.java
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2025 AgentSpan
+ * Licensed under the MIT License.
+ */
+package dev.agentspan.runtime.ai;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.mockito.ArgumentMatchers.anyString;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.conductoross.conductor.ai.AIModel;
+import org.conductoross.conductor.ai.ModelConfiguration;
+import org.conductoross.conductor.ai.models.LLMWorkerInput;
+import org.junit.jupiter.api.Test;
+import org.springframework.core.env.Environment;
+
+import dev.agentspan.runtime.credentials.CredentialResolutionService;
+import dev.agentspan.runtime.credentials.ExecutionTokenService;
+
+/**
+ * Unit tests for the empty-key fail-fast path added to
+ * {@link AgentspanAIModelProvider#getModel(LLMWorkerInput)}.
+ *
+ * <p>Background: when the server was started with an EMPTY env var (e.g. due
+ * to a {@code .zshrc} typo), Spring AI silently configured the provider bean
+ * with {@code ""} and the provider would later return 401 mid-stream with the
+ * misleading "cannot retry due to server authentication" message. The new
+ * code throws {@link IllegalStateException} before making the doomed call.</p>
+ */
+class AgentspanAIModelProviderTest {
+
+    /** Mockable provider that lets the test inject a fake env lookup. */
+    static class TestProvider extends AgentspanAIModelProvider {
+        private final Map<String, String> env;
+
+        TestProvider(
+                CredentialResolutionService resolutionService,
+                ExecutionTokenService tokenService,
+                Map<String, String> env) {
+            super(List.<ModelConfiguration<? extends AIModel>>of(), mock(Environment.class), resolutionService, tokenService);
+            this.env = env;
+        }
+
+        @Override
+        String lookupEnv(String name) {
+            return env.get(name);
+        }
+    }
+
+    private CredentialResolutionService mockResolutionService(String userKey) {
+        CredentialResolutionService svc = mock(CredentialResolutionService.class);
+        try {
+            if (userKey == null) {
+                when(svc.resolve(anyString(), anyString())).thenReturn(null);
+            } else {
+                when(svc.resolve(anyString(), anyString())).thenReturn(userKey);
+            }
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+        return svc;
+    }
+
+    private LLMWorkerInput input(String provider, String model) {
+        LLMWorkerInput in = new LLMWorkerInput();
+        in.setLlmProvider(provider);
+        in.setModel(model);
+        return in;
+    }
+
+    @Test
+    void throwsWhenEnvVarEmptyAndNoCredential() {
+        // .zshrc-typo reproducer: env var is set but EMPTY, no per-user credential.
+        Map<String, String> env = new HashMap<>();
+        env.put("ANTHROPIC_API_KEY", ""); // empty, like Spring's ${ANTHROPIC_API_KEY:} default
+
+        TestProvider provider = new TestProvider(
+                mockResolutionService(null), mock(ExecutionTokenService.class), env);
+
+        assertThatThrownBy(() -> provider.getModel(input("anthropic", "claude-3-5-sonnet")))
+                .isInstanceOf(IllegalStateException.class)
+                .hasMessageContaining("anthropic")
+                .hasMessageContaining("ANTHROPIC_API_KEY")
+                .hasMessageContaining("PUT /api/credentials");
+    }
+
+    @Test
+    void throwsWhenEnvVarMissingAndNoCredential() {
+        // Env var completely absent (System.getenv returns null), no per-user credential.
+        Map<String, String> env = new HashMap<>(); // ANTHROPIC_API_KEY not present
+
+        TestProvider provider = new TestProvider(
+                mockResolutionService(null), mock(ExecutionTokenService.class), env);
+
+        assertThatThrownBy(() -> provider.getModel(input("anthropic", "claude-3-5-sonnet")))
+                .isInstanceOf(IllegalStateException.class)
+                .hasMessageContaining("ANTHROPIC_API_KEY");
+    }
+
+    @Test
+    void doesNotThrowWhenBlankCredentialButValidEnvVar() {
+        // Per-user credential is blank (would be ignored), env var has a real key.
+        // Spring AI bean is properly configured, fall-through to super is safe.
+        // (We don't reach super in this stub — but the absence of IllegalStateException
+        // proves the fail-fast did not fire.)
+        Map<String, String> env = new HashMap<>();
+        env.put("ANTHROPIC_API_KEY", "sk-ant-real-key");
+
+        TestProvider provider = new TestProvider(
+                mockResolutionService("   "), mock(ExecutionTokenService.class), env);
+
+        // The provider may return null from super.getModel (since we didn't wire one),
+        // but the important behavior is that no IllegalStateException is thrown.
+        try {
+            provider.getModel(input("anthropic", "claude-3-5-sonnet"));
+        } catch (IllegalStateException e) {
+            throw new AssertionError("Should not have thrown — env var is set", e);
+        } catch (Exception ignored) {
+            // super.getModel may throw something else (model not registered) — that's fine.
+        }
+    }
+
+    @Test
+    void doesNotThrowForUnknownProvider() {
+        // Provider not in PROVIDER_TO_ENV_VAR map (no envVar resolved) — skip the
+        // fail-fast so unknown/custom providers fall through to super untouched.
+        Map<String, String> env = new HashMap<>();
+
+        TestProvider provider = new TestProvider(
+                mockResolutionService(null), mock(ExecutionTokenService.class), env);
+
+        try {
+            provider.getModel(input("some-custom-provider", "some-model"));
+        } catch (IllegalStateException e) {
+            throw new AssertionError("Should not throw for unknown provider", e);
+        } catch (Exception ignored) {
+            // super may throw — that's not our concern here.
+        }
+    }
+
+    @Test
+    void blankCredentialIsTreatedAsMissing() {
+        // Per-user credential exists but is blank ("  "). The provider should NOT
+        // try to build a model with it. Combined with a valid env var → no throw.
+        // Combined with a missing env var → throws the fail-fast.
+        Map<String, String> env = new HashMap<>(); // no env var
+        TestProvider provider = new TestProvider(
+                mockResolutionService("   "), mock(ExecutionTokenService.class), env);
+
+        assertThatThrownBy(() -> provider.getModel(input("anthropic", "claude-3-5-sonnet")))
+                .isInstanceOf(IllegalStateException.class)
+                .hasMessageContaining("ANTHROPIC_API_KEY");
+    }
+
+    @Test
+    void errorMessageNamesCorrectEnvVarPerProvider() {
+        Map<String, String> env = new HashMap<>();
+        TestProvider provider = new TestProvider(
+                mockResolutionService(null), mock(ExecutionTokenService.class), env);
+
+        // OpenAI → OPENAI_API_KEY
+        assertThatThrownBy(() -> provider.getModel(input("openai", "gpt-4o")))
+                .hasMessageContaining("OPENAI_API_KEY");
+        // Mistral → MISTRAL_API_KEY
+        assertThatThrownBy(() -> provider.getModel(input("mistral", "mistral-large")))
+                .hasMessageContaining("MISTRAL_API_KEY");
+    }
+}

From 949533958af05de8ed2b411f260ad54316cd8ba5 Mon Sep 17 00:00:00 2001
From: Nikhil Kala <nikhil.kala@orkes.io>
Date: Thu, 21 May 2026 09:06:51 -0700
Subject: [PATCH 3/6] feat(server): clarify provider auth errors for invalid
 keys

The empty-key fail-fast covers only the "env var set to empty" case.
A non-empty but invalid key (typo, expired, revoked) still produced
Spring AI's misleading "cannot retry due to server authentication"
streaming error.

Adds an AuthClarifyingChatModel decorator that catches authentication
errors (401 / Unauthorized / invalid_api_key in the exception or its
cause chain) and rethrows as IllegalStateException with a clear message
naming the provider, env var, and update paths. Non-auth errors flow
through unchanged.

AuthClarifyingAIModel wraps the whole AIModel and returns the wrapped
ChatModel from getChatModel(); AgentspanAIModelProvider.createModelWithKey
applies the wrapper to every model it constructs.
---
 .../runtime/ai/AgentspanAIModelProvider.java  |  12 ++
 .../runtime/ai/AuthClarifyingAIModel.java     | 109 ++++++++++++++++
 .../runtime/ai/AuthClarifyingChatModel.java   |  69 ++++++++++
 .../runtime/ai/AuthErrorMessageMapper.java    |  62 +++++++++
 .../ai/AuthClarifyingChatModelTest.java       | 123 ++++++++++++++++++
 .../ai/AuthErrorMessageMapperTest.java        |  91 +++++++++++++
 6 files changed, 466 insertions(+)
 create mode 100644 server/src/main/java/dev/agentspan/runtime/ai/AuthClarifyingAIModel.java
 create mode 100644 server/src/main/java/dev/agentspan/runtime/ai/AuthClarifyingChatModel.java
 create mode 100644 server/src/main/java/dev/agentspan/runtime/ai/AuthErrorMessageMapper.java
 create mode 100644 server/src/test/java/dev/agentspan/runtime/ai/AuthClarifyingChatModelTest.java
 create mode 100644 server/src/test/java/dev/agentspan/runtime/ai/AuthErrorMessageMapperTest.java

diff --git a/server/src/main/java/dev/agentspan/runtime/ai/AgentspanAIModelProvider.java b/server/src/main/java/dev/agentspan/runtime/ai/AgentspanAIModelProvider.java
index 03238deb3..513f9aa24 100644
--- a/server/src/main/java/dev/agentspan/runtime/ai/AgentspanAIModelProvider.java
+++ b/server/src/main/java/dev/agentspan/runtime/ai/AgentspanAIModelProvider.java
@@ -298,8 +298,20 @@ private String resolveBaseUrl(String provider) {
 
     /**
      * Create a fresh AIModel instance with a per-user API key and optional base URL.
+     *
+     * <p>The returned model is wrapped in {@link AuthClarifyingAIModel} so a
+     * 401 from the upstream provider (caused by a typo, expired, or revoked
+     * key) surfaces with a clear remediation message instead of Spring AI's
+     * misleading "cannot retry due to server authentication" mid-stream error.</p>
      */
     private AIModel createModelWithKey(String provider, String apiKey, String baseUrl) {
+        AIModel raw = createRawModelWithKey(provider, apiKey, baseUrl);
+        if (raw == null) return null;
+        String envVar = PROVIDER_TO_ENV_VAR.getOrDefault(provider.toLowerCase(), "<api key>");
+        return new AuthClarifyingAIModel(raw, provider, envVar);
+    }
+
+    private AIModel createRawModelWithKey(String provider, String apiKey, String baseUrl) {
         ModelConfiguration<? extends AIModel> config =
                 switch (provider.toLowerCase()) {
                     case "openai" -> new OpenAIConfiguration(apiKey, baseUrl, null);
diff --git a/server/src/main/java/dev/agentspan/runtime/ai/AuthClarifyingAIModel.java b/server/src/main/java/dev/agentspan/runtime/ai/AuthClarifyingAIModel.java
new file mode 100644
index 000000000..f73a36fa6
--- /dev/null
+++ b/server/src/main/java/dev/agentspan/runtime/ai/AuthClarifyingAIModel.java
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2025 AgentSpan
+ * Licensed under the MIT License.
+ */
+package dev.agentspan.runtime.ai;
+
+import java.util.List;
+
+import org.conductoross.conductor.ai.AIModel;
+import org.conductoross.conductor.ai.models.AudioGenRequest;
+import org.conductoross.conductor.ai.models.ChatCompletion;
+import org.conductoross.conductor.ai.models.EmbeddingGenRequest;
+import org.conductoross.conductor.ai.models.ImageGenRequest;
+import org.conductoross.conductor.ai.models.LLMResponse;
+import org.conductoross.conductor.ai.models.VideoGenRequest;
+import org.conductoross.conductor.ai.video.VideoModel;
+import org.conductoross.conductor.ai.video.VideoOptions;
+import org.springframework.ai.chat.model.ChatModel;
+import org.springframework.ai.chat.prompt.ChatOptions;
+import org.springframework.ai.image.ImageModel;
+import org.springframework.ai.image.ImageOptions;
+import org.springframework.ai.tool.ToolCallback;
+
+/**
+ * Delegating {@link AIModel} that returns an {@link AuthClarifyingChatModel}
+ * from {@link #getChatModel()}. All other methods forward to the wrapped
+ * model. Used in {@link AgentspanAIModelProvider#createModelWithKey} to
+ * surface clear errors when a non-empty but invalid API key is rejected
+ * by the upstream provider.
+ */
+final class AuthClarifyingAIModel implements AIModel {
+
+    private final AIModel delegate;
+    private final String provider;
+    private final String envVar;
+
+    AuthClarifyingAIModel(AIModel delegate, String provider, String envVar) {
+        this.delegate = delegate;
+        this.provider = provider;
+        this.envVar = envVar;
+    }
+
+    @Override
+    public ChatModel getChatModel() {
+        return new AuthClarifyingChatModel(delegate.getChatModel(), provider, envVar);
+    }
+
+    // ── pure delegation ───────────────────────────────────────────────
+
+    @Override
+    public String getModelProvider() {
+        return delegate.getModelProvider();
+    }
+
+    @Override
+    public List<String> getProviderAliases() {
+        return delegate.getProviderAliases();
+    }
+
+    @Override
+    public List<Float> generateEmbeddings(EmbeddingGenRequest request) {
+        return delegate.generateEmbeddings(request);
+    }
+
+    @Override
+    public ChatOptions getChatOptions(ChatCompletion input) {
+        return delegate.getChatOptions(input);
+    }
+
+    @Override
+    public ImageOptions getImageOptions(ImageGenRequest input) {
+        return delegate.getImageOptions(input);
+    }
+
+    @Override
+    public ImageModel getImageModel() {
+        return delegate.getImageModel();
+    }
+
+    @Override
+    public VideoOptions getVideoOptions(VideoGenRequest input) {
+        return delegate.getVideoOptions(input);
+    }
+
+    @Override
+    public VideoModel getVideoModel() {
+        return delegate.getVideoModel();
+    }
+
+    @Override
+    public LLMResponse generateVideo(VideoGenRequest request) {
+        return delegate.generateVideo(request);
+    }
+
+    @Override
+    public LLMResponse checkVideoStatus(VideoGenRequest request) {
+        return delegate.checkVideoStatus(request);
+    }
+
+    @Override
+    public LLMResponse generateAudio(AudioGenRequest request) {
+        return delegate.generateAudio(request);
+    }
+
+    @Override
+    public List<ToolCallback> getToolCallback(ChatCompletion input) {
+        return delegate.getToolCallback(input);
+    }
+}
diff --git a/server/src/main/java/dev/agentspan/runtime/ai/AuthClarifyingChatModel.java b/server/src/main/java/dev/agentspan/runtime/ai/AuthClarifyingChatModel.java
new file mode 100644
index 000000000..7c5c0dcac
--- /dev/null
+++ b/server/src/main/java/dev/agentspan/runtime/ai/AuthClarifyingChatModel.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2025 AgentSpan
+ * Licensed under the MIT License.
+ */
+package dev.agentspan.runtime.ai;
+
+import org.springframework.ai.chat.model.ChatModel;
+import org.springframework.ai.chat.model.ChatResponse;
+import org.springframework.ai.chat.prompt.ChatOptions;
+import org.springframework.ai.chat.prompt.Prompt;
+
+import reactor.core.publisher.Flux;
+
+/**
+ * Delegating {@link ChatModel} that translates upstream provider
+ * authentication failures into a clear {@link IllegalStateException}.
+ *
+ * <p>Catches both synchronous (from {@link #call}) and reactive (from
+ * {@link #stream}) auth errors using {@link AuthErrorMessageMapper}.
+ * Non-auth errors are passed through unchanged.</p>
+ */
+final class AuthClarifyingChatModel implements ChatModel {
+
+    private final ChatModel delegate;
+    private final String provider;
+    private final String envVar;
+
+    AuthClarifyingChatModel(ChatModel delegate, String provider, String envVar) {
+        this.delegate = delegate;
+        this.provider = provider;
+        this.envVar = envVar;
+    }
+
+    @Override
+    public ChatResponse call(Prompt prompt) {
+        try {
+            return delegate.call(prompt);
+        } catch (RuntimeException e) {
+            if (AuthErrorMessageMapper.isAuthFailure(e)) {
+                throw new IllegalStateException(
+                        AuthErrorMessageMapper.buildMessage(provider, envVar), e);
+            }
+            throw e;
+        }
+    }
+
+    @Override
+    public Flux<ChatResponse> stream(Prompt prompt) {
+        Flux<ChatResponse> upstream;
+        try {
+            upstream = delegate.stream(prompt);
+        } catch (RuntimeException e) {
+            if (AuthErrorMessageMapper.isAuthFailure(e)) {
+                throw new IllegalStateException(
+                        AuthErrorMessageMapper.buildMessage(provider, envVar), e);
+            }
+            throw e;
+        }
+        return upstream.onErrorMap(
+                AuthErrorMessageMapper::isAuthFailure,
+                e -> new IllegalStateException(
+                        AuthErrorMessageMapper.buildMessage(provider, envVar), e));
+    }
+
+    @Override
+    public ChatOptions getDefaultOptions() {
+        return delegate.getDefaultOptions();
+    }
+}
diff --git a/server/src/main/java/dev/agentspan/runtime/ai/AuthErrorMessageMapper.java b/server/src/main/java/dev/agentspan/runtime/ai/AuthErrorMessageMapper.java
new file mode 100644
index 000000000..738b3ded5
--- /dev/null
+++ b/server/src/main/java/dev/agentspan/runtime/ai/AuthErrorMessageMapper.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2025 AgentSpan
+ * Licensed under the MIT License.
+ */
+package dev.agentspan.runtime.ai;
+
+/**
+ * Recognises authentication failures from upstream LLM providers and
+ * formats a clear, actionable error message.
+ *
+ * <p>The fail-fast in {@link AgentspanAIModelProvider#getModel} catches the
+ * "empty key at JVM startup" case. This mapper handles the harder case: a
+ * non-empty but invalid key (typo, expired, revoked) where the only signal
+ * is a 401 returned by the provider's HTTP endpoint, often surfaced as a
+ * {@code NonTransientAiException} mid-stream with the generic message
+ * "cannot retry due to server authentication".</p>
+ */
+final class AuthErrorMessageMapper {
+
+    private AuthErrorMessageMapper() {}
+
+    /**
+     * Return true if the throwable (or any cause in its chain) looks like a
+     * provider authentication failure. Matches "401", "Unauthorized", and
+     * "invalid_api_key" / "invalid api key" in the message text.
+     */
+    static boolean isAuthFailure(Throwable t) {
+        Throwable cur = t;
+        while (cur != null) {
+            String msg = cur.getMessage();
+            if (msg != null) {
+                String lower = msg.toLowerCase();
+                if (lower.contains("401")
+                        || lower.contains("unauthorized")
+                        || lower.contains("invalid_api_key")
+                        || lower.contains("invalid api key")) {
+                    return true;
+                }
+            }
+            cur = cur.getCause();
+        }
+        return false;
+    }
+
+    /**
+     * Build a user-facing error message naming the provider, the env var,
+     * and the three remediation paths (env, credentials API, UI).
+     */
+    static String buildMessage(String provider, String envVar) {
+        return "Provider rejected the API key for '"
+                + provider
+                + "' (401). "
+                + envVar
+                + " is set but invalid, expired, or revoked. "
+                + "Update "
+                + envVar
+                + " in the server's environment and restart, "
+                + "push a fresh value via PUT /api/credentials/"
+                + envVar
+                + ", or save it via the Credentials UI.";
+    }
+}
diff --git a/server/src/test/java/dev/agentspan/runtime/ai/AuthClarifyingChatModelTest.java b/server/src/test/java/dev/agentspan/runtime/ai/AuthClarifyingChatModelTest.java
new file mode 100644
index 000000000..c45431913
--- /dev/null
+++ b/server/src/test/java/dev/agentspan/runtime/ai/AuthClarifyingChatModelTest.java
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2025 AgentSpan
+ * Licensed under the MIT License.
+ */
+package dev.agentspan.runtime.ai;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+import org.springframework.ai.chat.messages.UserMessage;
+import org.springframework.ai.chat.model.ChatModel;
+import org.springframework.ai.chat.model.ChatResponse;
+import org.springframework.ai.chat.prompt.Prompt;
+import org.springframework.ai.retry.NonTransientAiException;
+
+import reactor.core.publisher.Flux;
+
+/**
+ * Unit tests for {@link AuthClarifyingChatModel}.
+ *
+ * <p>Wraps an upstream Spring AI {@link ChatModel}: when {@code call()} or
+ * {@code stream()} fail with an authentication error (typically a 401 from
+ * the provider), the wrapper rethrows as {@link IllegalStateException} with
+ * a message naming the env var and remediation paths. Non-auth errors flow
+ * through unchanged.</p>
+ */
+class AuthClarifyingChatModelTest {
+
+    private static final Prompt PROMPT = new Prompt(new UserMessage("hello"));
+
+    /** Tiny stub ChatModel whose call/stream behavior the test controls. */
+    static class StubChatModel implements ChatModel {
+        Throwable callError;
+        Throwable streamError;
+        ChatResponse callResult;
+        Flux<ChatResponse> streamResult = Flux.empty();
+        List<Prompt> callsReceived = new ArrayList<>();
+
+        @Override
+        public ChatResponse call(Prompt prompt) {
+            callsReceived.add(prompt);
+            if (callError != null) throwUnchecked(callError);
+            return callResult;
+        }
+
+        @Override
+        public Flux<ChatResponse> stream(Prompt prompt) {
+            if (streamError != null) {
+                return Flux.error(streamError);
+            }
+            return streamResult;
+        }
+
+        private static void throwUnchecked(Throwable t) {
+            if (t instanceof RuntimeException re) throw re;
+            throw new RuntimeException(t);
+        }
+    }
+
+    private AuthClarifyingChatModel wrap(ChatModel delegate) {
+        return new AuthClarifyingChatModel(delegate, "anthropic", "ANTHROPIC_API_KEY");
+    }
+
+    @Test
+    void callRethrowsAuthErrorAsIllegalStateExceptionWithClearMessage() {
+        StubChatModel stub = new StubChatModel();
+        stub.callError = new NonTransientAiException("HTTP 401 Unauthorized: Invalid x-api-key");
+
+        assertThatThrownBy(() -> wrap(stub).call(PROMPT))
+                .isInstanceOf(IllegalStateException.class)
+                .hasMessageContaining("anthropic")
+                .hasMessageContaining("ANTHROPIC_API_KEY");
+    }
+
+    @Test
+    void callPassesNonAuthErrorsThrough() {
+        StubChatModel stub = new StubChatModel();
+        stub.callError = new RuntimeException("HTTP 500 Internal Server Error");
+
+        assertThatThrownBy(() -> wrap(stub).call(PROMPT))
+                .isInstanceOf(RuntimeException.class)
+                // NOT wrapped — original exception bubbles up unchanged.
+                .isNotInstanceOf(IllegalStateException.class)
+                .hasMessage("HTTP 500 Internal Server Error");
+    }
+
+    @Test
+    void callForwardsSuccessfulResponse() {
+        StubChatModel stub = new StubChatModel();
+        ChatResponse expected = new ChatResponse(List.of());
+        stub.callResult = expected;
+
+        ChatResponse actual = wrap(stub).call(PROMPT);
+
+        assertThat(actual).isSameAs(expected);
+        assertThat(stub.callsReceived).containsExactly(PROMPT);
+    }
+
+    @Test
+    void streamMapsAuthErrorsMidStream() {
+        StubChatModel stub = new StubChatModel();
+        stub.streamError = new NonTransientAiException("401 Unauthorized");
+
+        // blockLast() rethrows the terminal error from the Flux.
+        assertThatThrownBy(() -> wrap(stub).stream(PROMPT).blockLast())
+                .isInstanceOf(IllegalStateException.class)
+                .hasMessageContaining("ANTHROPIC_API_KEY");
+    }
+
+    @Test
+    void streamPassesNonAuthErrorsThrough() {
+        StubChatModel stub = new StubChatModel();
+        stub.streamError = new RuntimeException("HTTP 503");
+
+        assertThatThrownBy(() -> wrap(stub).stream(PROMPT).blockLast())
+                .isNotInstanceOf(IllegalStateException.class)
+                .hasMessage("HTTP 503");
+    }
+}
diff --git a/server/src/test/java/dev/agentspan/runtime/ai/AuthErrorMessageMapperTest.java b/server/src/test/java/dev/agentspan/runtime/ai/AuthErrorMessageMapperTest.java
new file mode 100644
index 000000000..4266e82a9
--- /dev/null
+++ b/server/src/test/java/dev/agentspan/runtime/ai/AuthErrorMessageMapperTest.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2025 AgentSpan
+ * Licensed under the MIT License.
+ */
+package dev.agentspan.runtime.ai;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import org.junit.jupiter.api.Test;
+import org.springframework.ai.retry.NonTransientAiException;
+
+/**
+ * Unit tests for {@link AuthErrorMessageMapper}.
+ *
+ * <p>The mapper recognises authentication failures from upstream LLM providers
+ * (typically surfaced by Spring AI as {@link NonTransientAiException} or via
+ * HTTP client errors carrying "401" / "Unauthorized" / "invalid_api_key" in
+ * the message) and produces a clear, actionable error message naming the
+ * provider, env var, and remediation paths.</p>
+ */
+class AuthErrorMessageMapperTest {
+
+    @Test
+    void detects401InTopLevelException() {
+        Throwable t = new NonTransientAiException(
+                "HTTP 401 Unauthorized: Invalid x-api-key");
+        assertThat(AuthErrorMessageMapper.isAuthFailure(t)).isTrue();
+    }
+
+    @Test
+    void detectsUnauthorizedInMessage() {
+        Throwable t = new RuntimeException("Unauthorized: bad token");
+        assertThat(AuthErrorMessageMapper.isAuthFailure(t)).isTrue();
+    }
+
+    @Test
+    void detectsInvalidApiKeyInMessage() {
+        Throwable t = new RuntimeException("error code: invalid_api_key");
+        assertThat(AuthErrorMessageMapper.isAuthFailure(t)).isTrue();
+    }
+
+    @Test
+    void traversesCauseChain() {
+        Throwable root = new NonTransientAiException("401 Unauthorized");
+        Throwable mid = new RuntimeException("downstream failed", root);
+        Throwable top = new RuntimeException("workflow task failed", mid);
+        assertThat(AuthErrorMessageMapper.isAuthFailure(top)).isTrue();
+    }
+
+    @Test
+    void ignoresRateLimitErrors() {
+        Throwable t = new NonTransientAiException("HTTP 429 Too Many Requests");
+        assertThat(AuthErrorMessageMapper.isAuthFailure(t)).isFalse();
+    }
+
+    @Test
+    void ignoresServerErrors() {
+        Throwable t = new RuntimeException("HTTP 500 Internal Server Error");
+        assertThat(AuthErrorMessageMapper.isAuthFailure(t)).isFalse();
+    }
+
+    @Test
+    void ignoresTimeoutErrors() {
+        Throwable t = new RuntimeException("Read timeout after 30s");
+        assertThat(AuthErrorMessageMapper.isAuthFailure(t)).isFalse();
+    }
+
+    @Test
+    void ignoresNullThrowable() {
+        assertThat(AuthErrorMessageMapper.isAuthFailure(null)).isFalse();
+    }
+
+    @Test
+    void messageNamesProviderAndEnvVar() {
+        String msg = AuthErrorMessageMapper.buildMessage("anthropic", "ANTHROPIC_API_KEY");
+        assertThat(msg)
+                .contains("anthropic")
+                .contains("ANTHROPIC_API_KEY")
+                .contains("PUT /api/credentials/ANTHROPIC_API_KEY");
+    }
+
+    @Test
+    void messageMentionsRemediationPaths() {
+        String msg = AuthErrorMessageMapper.buildMessage("openai", "OPENAI_API_KEY");
+        // Names the three ways to fix it: env, credentials API, UI.
+        assertThat(msg)
+                .containsIgnoringCase("environment")
+                .contains("/api/credentials/")
+                .containsIgnoringCase("UI");
+    }
+}

From 82f4c1e56c1fe46353ed427e742c3f9054f3f906 Mon Sep 17 00:00:00 2001
From: Nikhil Kala <nikhil.kala@orkes.io>
Date: Thu, 21 May 2026 09:07:16 -0700
Subject: [PATCH 4/6] feat(server): add /api/info endpoint with per-JVM
 instance UUID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Returns {"instance_id": "<uuid>"} where the UUID is generated once at
bean construction and stays stable for the life of the JVM. SDK clients
use it to detect server restarts and gate boot-time work (env sync)
accordingly.

Exempt from auth — body contains only a UUID and is needed pre-auth on
hosted deployments.
---
 .../agentspan/runtime/auth/AuthFilter.java    |  4 +-
 .../runtime/controller/InfoController.java    | 33 +++++++++++
 .../controller/InfoControllerTest.java        | 58 +++++++++++++++++++
 3 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 server/src/main/java/dev/agentspan/runtime/controller/InfoController.java
 create mode 100644 server/src/test/java/dev/agentspan/runtime/controller/InfoControllerTest.java

diff --git a/server/src/main/java/dev/agentspan/runtime/auth/AuthFilter.java b/server/src/main/java/dev/agentspan/runtime/auth/AuthFilter.java
index b17dfbb04..991b62b03 100644
--- a/server/src/main/java/dev/agentspan/runtime/auth/AuthFilter.java
+++ b/server/src/main/java/dev/agentspan/runtime/auth/AuthFilter.java
@@ -70,7 +70,9 @@ public AuthFilter(
     @Override
     protected boolean shouldNotFilter(HttpServletRequest request) {
         String path = request.getServletPath();
-        return "/api/auth/login".equals(path);
+        // /api/info exposes only a per-JVM instance UUID (used by SDK clients
+        // to detect server restarts) — safe to probe without auth.
+        return "/api/auth/login".equals(path) || "/api/info".equals(path);
     }
 
     @Override
diff --git a/server/src/main/java/dev/agentspan/runtime/controller/InfoController.java b/server/src/main/java/dev/agentspan/runtime/controller/InfoController.java
new file mode 100644
index 000000000..6698a0dec
--- /dev/null
+++ b/server/src/main/java/dev/agentspan/runtime/controller/InfoController.java
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2025 AgentSpan
+ * Licensed under the MIT License.
+ */
+package dev.agentspan.runtime.controller;
+
+import java.util.Map;
+import java.util.UUID;
+
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.GetMapping;
+import org.springframework.web.bind.annotation.RequestMapping;
+import org.springframework.web.bind.annotation.RestController;
+
+/**
+ * GET /api/info — returns a per-JVM ``instance_id``.
+ *
+ * <p>The id is generated once when the bean is constructed and stays stable
+ * for the life of the process. SDK clients use it to detect server restarts
+ * and gate the boot-time credential sync — same id → already synced this
+ * JVM, skip; different id → JVM is new, re-sync.</p>
+ */
+@RestController
+@RequestMapping("/api/info")
+public class InfoController {
+
+    private final String instanceId = UUID.randomUUID().toString();
+
+    @GetMapping
+    public ResponseEntity<Map<String, String>> info() {
+        return ResponseEntity.ok(Map.of("instance_id", instanceId));
+    }
+}
diff --git a/server/src/test/java/dev/agentspan/runtime/controller/InfoControllerTest.java b/server/src/test/java/dev/agentspan/runtime/controller/InfoControllerTest.java
new file mode 100644
index 000000000..343be7d10
--- /dev/null
+++ b/server/src/test/java/dev/agentspan/runtime/controller/InfoControllerTest.java
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2025 AgentSpan
+ * Licensed under the MIT License.
+ */
+package dev.agentspan.runtime.controller;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.Map;
+
+import org.junit.jupiter.api.Test;
+import org.springframework.http.ResponseEntity;
+
+/**
+ * Unit tests for {@link InfoController}.
+ *
+ * <p>The controller exposes a per-JVM ``instance_id`` so SDK clients can
+ * detect server restarts and avoid re-syncing env vars on every script run.
+ * The id must be stable for the life of the controller (same value across
+ * calls) and present in the response body.</p>
+ */
+class InfoControllerTest {
+
+    @Test
+    void returnsInstanceIdInBody() {
+        InfoController controller = new InfoController();
+
+        ResponseEntity<Map<String, String>> resp = controller.info();
+
+        assertThat(resp.getStatusCode().is2xxSuccessful()).isTrue();
+        assertThat(resp.getBody()).isNotNull();
+        assertThat(resp.getBody().get("instance_id"))
+                .isNotNull()
+                .isNotBlank();
+    }
+
+    @Test
+    void instanceIdIsStableAcrossCalls() {
+        InfoController controller = new InfoController();
+
+        String first = controller.info().getBody().get("instance_id");
+        String second = controller.info().getBody().get("instance_id");
+
+        assertThat(second).isEqualTo(first);
+    }
+
+    @Test
+    void differentControllersHaveDifferentInstanceIds() {
+        // Each JVM start (≈ each controller construction in tests) gets a fresh id.
+        InfoController a = new InfoController();
+        InfoController b = new InfoController();
+
+        String idA = a.info().getBody().get("instance_id");
+        String idB = b.info().getBody().get("instance_id");
+
+        assertThat(idA).isNotEqualTo(idB);
+    }
+}

From a61641240b0ee8c8d4e157ebd46f5f41242e3d6f Mon Sep 17 00:00:00 2001
From: Nikhil Kala <nikhil.kala@orkes.io>
Date: Thu, 21 May 2026 09:09:10 -0700
Subject: [PATCH 5/6] feat(sdk): gate boot env sync by server instance +
 env-hash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Caches (instance_id, env_hash) in ~/.agentspan/sync-marker.json. Skip
sync iff BOTH match — meaning the same JVM AND the same shell env we
already pushed. Re-sync when either changes.

The env_hash half closes the cached-credential bug: a user who fixes a
typo'd API key and re-runs without restarting the server now gets the
corrected value into the credential store, because the env hash
differs even though the JVM didn't restart.

Marker entries can be either {instance_id, env_hash} dicts (current
schema) or bare instance_id strings (legacy); the latter is treated
as a non-match so it forces a re-sync.
---
 .../src/agentspan/agents/runtime/runtime.py   | 106 ++++++-
 .../unit/test_runtime_sync_fingerprint.py     | 300 ++++++++++++++++++
 2 files changed, 402 insertions(+), 4 deletions(-)
 create mode 100644 sdk/python/tests/unit/test_runtime_sync_fingerprint.py

diff --git a/sdk/python/src/agentspan/agents/runtime/runtime.py b/sdk/python/src/agentspan/agents/runtime/runtime.py
index d44aa8253..7cbcea6f4 100644
--- a/sdk/python/src/agentspan/agents/runtime/runtime.py
+++ b/sdk/python/src/agentspan/agents/runtime/runtime.py
@@ -51,6 +51,39 @@ def _is_local_server(server_url: str) -> bool:
     return host in ("localhost", "127.0.0.1", "::1", "0.0.0.0")
 
 
+def _sync_marker_path():
+    """Filesystem location of the env-sync marker file.
+
+    Maps ``server_url`` → ``{"instance_id", "env_hash"}`` so we sync iff the
+    server JVM has restarted **or** the local env values have changed.
+    Module-level so tests can monkeypatch it to a tmp_path.
+    """
+    from pathlib import Path
+
+    return Path.home() / ".agentspan" / "sync-marker.json"
+
+
+def _compute_env_hash() -> str:
+    """Stable SHA-256 hex of all known provider env vars.
+
+    Used in the sync marker to detect env-var changes between SDK invocations
+    even when the server JVM hasn't restarted. Without this, a user who fixes
+    a typo'd API key in their shell can't get the corrected value into the
+    server without restarting the JVM.
+    """
+    import hashlib
+
+    from agentspan.agents._internal.provider_registry import KNOWN_PROVIDER_ENV_VARS
+
+    digest = hashlib.sha256()
+    for name in sorted(KNOWN_PROVIDER_ENV_VARS):
+        digest.update(name.encode("utf-8"))
+        digest.update(b"=")
+        digest.update((os.environ.get(name, "") or "").encode("utf-8"))
+        digest.update(b"\n")
+    return digest.hexdigest()
+
+
 _RETRY_POLICY_MAP = {
     "fixed": "FIXED",
     "linear_backoff": "LINEAR_BACKOFF",
@@ -2146,13 +2179,30 @@ def _push_credential_to_server(self, name: str, value: str) -> None:
     def _sync_provider_env_to_server(self) -> None:
         """Push every known provider env var into the server's credential store.
 
-        Runs once at AgentRuntime construction (localhost target only). Each
-        non-empty env var is upserted via :meth:`_push_credential_to_server`.
-        Failures are logged at debug and swallowed — this is a best-effort
-        convenience and should never block a runtime from starting.
+        Gated by ``(instance_id, env_hash)`` cached in
+        ``~/.agentspan/sync-marker.json``. Skip iff BOTH match — meaning the
+        same JVM AND the same shell env we already synced. Re-sync when the
+        server restarts OR the user corrects an env var, even within the same
+        JVM. Failures are logged at debug and swallowed.
         """
         from agentspan.agents._internal.provider_registry import KNOWN_PROVIDER_ENV_VARS
 
+        current_instance = self._fetch_server_instance_id()
+        current_env_hash = _compute_env_hash()
+        marker = self._read_sync_marker()
+        entry = marker.get(self._config.server_url)
+
+        # Legacy schema tolerance: entry used to be a bare string instance_id.
+        if isinstance(entry, dict) and current_instance is not None:
+            if (
+                entry.get("instance_id") == current_instance
+                and entry.get("env_hash") == current_env_hash
+            ):
+                logger.debug(
+                    "Skipping env sync — instance %s and env unchanged.", current_instance
+                )
+                return
+
         for name in sorted(KNOWN_PROVIDER_ENV_VARS):
             value = os.environ.get(name)
             if not value:
@@ -2163,6 +2213,54 @@ def _sync_provider_env_to_server(self) -> None:
             except Exception as e:
                 logger.debug("Could not sync %s into local server: %s", name, e)
 
+        if current_instance is not None:
+            marker[self._config.server_url] = {
+                "instance_id": current_instance,
+                "env_hash": current_env_hash,
+            }
+            self._write_sync_marker(marker)
+
+    def _fetch_server_instance_id(self) -> "Optional[str]":
+        """Return the running server's ``instance_id`` or ``None`` on any error.
+
+        Older servers without ``/api/info`` return ``None``; the caller falls
+        back to an unconditional sync (best-effort).
+        """
+        import httpx
+
+        base = self._config.server_url.rstrip("/")
+        try:
+            resp = httpx.get(f"{base}/info", timeout=2.0)
+            resp.raise_for_status()
+            return resp.json().get("instance_id")
+        except Exception as e:
+            logger.debug("Could not fetch /api/info: %s", e)
+            return None
+
+    @staticmethod
+    def _read_sync_marker() -> Dict[str, Any]:
+        """Return the marker file as a dict; empty dict if missing/corrupt.
+
+        Values can be either ``{"instance_id", "env_hash"}`` dicts (current
+        schema) or bare instance_id strings (legacy). The legacy form is
+        treated as a non-match so it forces a re-sync.
+        """
+        path = _sync_marker_path()
+        try:
+            return json.loads(path.read_text())
+        except Exception:
+            return {}
+
+    @staticmethod
+    def _write_sync_marker(marker: Dict[str, Any]) -> None:
+        """Persist the marker, creating ``~/.agentspan`` if necessary."""
+        path = _sync_marker_path()
+        try:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path.write_text(json.dumps(marker))
+        except Exception as e:
+            logger.debug("Could not write sync marker: %s", e)
+
     def _ensure_models_for_agent(self, agent: Agent) -> None:
         """Walk the agent tree and ensure all referenced models are registered."""
         seen: set = set()
diff --git a/sdk/python/tests/unit/test_runtime_sync_fingerprint.py b/sdk/python/tests/unit/test_runtime_sync_fingerprint.py
new file mode 100644
index 000000000..41592fb70
--- /dev/null
+++ b/sdk/python/tests/unit/test_runtime_sync_fingerprint.py
@@ -0,0 +1,300 @@
+# Copyright (c) 2025 Agentspan
+# Licensed under the MIT License. See LICENSE file in the project root for details.
+
+"""Tests for the server-instance + env-hash gate on _sync_provider_env_to_server.
+
+Marker schema is::
+
+    {
+      "<server_url>": {"instance_id": "<uuid>", "env_hash": "<sha256>"}
+    }
+
+Sync runs iff EITHER the instance_id is new (server JVM restarted) OR the
+env_hash has changed (user fixed a shell-config typo and re-ran without
+restarting the server). This closes a regression from the original
+"gate by instance_id only" design that re-introduced the cached-bad-key bug.
+"""
+
+from __future__ import annotations
+
+import json
+from unittest.mock import MagicMock, patch
+
+
+def _make_runtime(server_url: str = "http://localhost:6767/api"):
+    """Construct an AgentRuntime against localhost with heavy I/O mocked out."""
+    with (
+        patch("conductor.client.orkes_clients.OrkesClients"),
+        patch("agentspan.agents.runtime.worker_manager.TaskHandler", create=True),
+        patch("agentspan.agents.runtime.server.ensure_server_running"),
+        patch("agentspan.agents.runtime.server._is_server_ready", return_value=True),
+    ):
+        from agentspan.agents.runtime.config import AgentConfig
+        from agentspan.agents.runtime.runtime import AgentRuntime
+
+        config = AgentConfig(
+            server_url=server_url,
+            auto_start_workers=False,
+            auto_start_server=False,
+        )
+        return AgentRuntime(config=config)
+
+
+def _mock_info_response(instance_id: str):
+    resp = MagicMock()
+    resp.status_code = 200
+    resp.json.return_value = {"instance_id": instance_id}
+    resp.raise_for_status.return_value = None
+    return resp
+
+
+def _put_response():
+    resp = MagicMock()
+    resp.status_code = 200
+    resp.raise_for_status.return_value = None
+    return resp
+
+
+def _current_env_hash() -> str:
+    """Same hash the runtime computes — used to build matching markers in tests."""
+    from agentspan.agents.runtime.runtime import _compute_env_hash
+
+    return _compute_env_hash()
+
+
+class TestFingerprintGate:
+    """Marker stops repeated syncs from clobbering the server for the same JVM
+    + same env, and re-runs sync when either changes."""
+
+    def test_skips_sync_when_both_instance_and_env_match(self, monkeypatch, tmp_path):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant")
+        marker = tmp_path / "sync-marker.json"
+        monkeypatch.setattr(
+            "agentspan.agents.runtime.runtime._sync_marker_path", lambda: marker
+        )
+        marker.write_text(
+            json.dumps(
+                {
+                    "http://localhost:6767/api": {
+                        "instance_id": "instance-A",
+                        "env_hash": _current_env_hash(),
+                    }
+                }
+            )
+        )
+
+        with patch("httpx.get", return_value=_mock_info_response("instance-A")) as g, patch(
+            "httpx.put", return_value=_put_response()
+        ) as p:
+            _make_runtime()
+
+        assert g.called  # info probed
+        assert not p.called  # no credential PUT
+
+    def test_syncs_when_instance_id_changes(self, monkeypatch, tmp_path):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant")
+        marker = tmp_path / "sync-marker.json"
+        monkeypatch.setattr(
+            "agentspan.agents.runtime.runtime._sync_marker_path", lambda: marker
+        )
+        marker.write_text(
+            json.dumps(
+                {
+                    "http://localhost:6767/api": {
+                        "instance_id": "instance-OLD",
+                        "env_hash": _current_env_hash(),  # env unchanged
+                    }
+                }
+            )
+        )
+
+        with patch("httpx.get", return_value=_mock_info_response("instance-NEW")), patch(
+            "httpx.put", return_value=_put_response()
+        ) as p:
+            _make_runtime()
+
+        assert p.called
+        updated = json.loads(marker.read_text())["http://localhost:6767/api"]
+        assert updated["instance_id"] == "instance-NEW"
+
+    def test_syncs_when_env_hash_changes_even_if_instance_same(self, monkeypatch, tmp_path):
+        # THE BUG THIS GUARDS AGAINST: user fixed a typo'd API key and re-ran
+        # without restarting the server. instance_id stays the same; env_hash
+        # changes; we MUST re-sync so the corrected key reaches the store.
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-CORRECTED")
+        marker = tmp_path / "sync-marker.json"
+        monkeypatch.setattr(
+            "agentspan.agents.runtime.runtime._sync_marker_path", lambda: marker
+        )
+        # Marker reflects the previous (typo'd) env hash.
+        marker.write_text(
+            json.dumps(
+                {
+                    "http://localhost:6767/api": {
+                        "instance_id": "instance-A",
+                        "env_hash": "stale-typo-hash",
+                    }
+                }
+            )
+        )
+
+        with patch("httpx.get", return_value=_mock_info_response("instance-A")), patch(
+            "httpx.put", return_value=_put_response()
+        ) as p:
+            _make_runtime()
+
+        assert p.called  # sync ran despite same instance_id
+        # Marker was updated with the new env_hash.
+        updated = json.loads(marker.read_text())["http://localhost:6767/api"]
+        assert updated["env_hash"] == _current_env_hash()
+        assert updated["env_hash"] != "stale-typo-hash"
+
+    def test_syncs_when_marker_missing(self, monkeypatch, tmp_path):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant")
+        marker = tmp_path / "sync-marker.json"  # not created
+        monkeypatch.setattr(
+            "agentspan.agents.runtime.runtime._sync_marker_path", lambda: marker
+        )
+
+        with patch("httpx.get", return_value=_mock_info_response("instance-A")), patch(
+            "httpx.put", return_value=_put_response()
+        ) as p:
+            _make_runtime()
+
+        assert p.called
+        assert marker.exists()
+        entry = json.loads(marker.read_text())["http://localhost:6767/api"]
+        assert entry["instance_id"] == "instance-A"
+        assert entry["env_hash"] == _current_env_hash()
+
+    def test_syncs_when_marker_corrupt(self, monkeypatch, tmp_path):
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant")
+        marker = tmp_path / "sync-marker.json"
+        marker.write_text("{not-valid-json")
+        monkeypatch.setattr(
+            "agentspan.agents.runtime.runtime._sync_marker_path", lambda: marker
+        )
+
+        with patch("httpx.get", return_value=_mock_info_response("instance-A")), patch(
+            "httpx.put", return_value=_put_response()
+        ) as p:
+            _make_runtime()
+
+        assert p.called
+        entry = json.loads(marker.read_text())["http://localhost:6767/api"]
+        assert entry["instance_id"] == "instance-A"
+
+    def test_legacy_marker_string_format_triggers_resync(self, monkeypatch, tmp_path):
+        # Old marker schema (server_url → instance_id string) must trigger a
+        # re-sync, not crash. After sync, the marker is rewritten in the new
+        # schema.
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant")
+        marker = tmp_path / "sync-marker.json"
+        marker.write_text(json.dumps({"http://localhost:6767/api": "instance-A"}))
+        monkeypatch.setattr(
+            "agentspan.agents.runtime.runtime._sync_marker_path", lambda: marker
+        )
+
+        with patch("httpx.get", return_value=_mock_info_response("instance-A")), patch(
+            "httpx.put", return_value=_put_response()
+        ) as p:
+            _make_runtime()
+
+        assert p.called
+        entry = json.loads(marker.read_text())["http://localhost:6767/api"]
+        assert isinstance(entry, dict)
+        assert entry["instance_id"] == "instance-A"
+        assert entry["env_hash"] == _current_env_hash()
+
+    def test_syncs_when_info_endpoint_unreachable(self, monkeypatch, tmp_path):
+        # Old server without /api/info → fall back to unconditional sync.
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant")
+        marker = tmp_path / "sync-marker.json"
+        monkeypatch.setattr(
+            "agentspan.agents.runtime.runtime._sync_marker_path", lambda: marker
+        )
+        marker.write_text(
+            json.dumps(
+                {
+                    "http://localhost:6767/api": {
+                        "instance_id": "instance-A",
+                        "env_hash": _current_env_hash(),
+                    }
+                }
+            )
+        )
+
+        def fake_get(*args, **kwargs):
+            raise RuntimeError("connection refused")
+
+        with patch("httpx.get", side_effect=fake_get), patch(
+            "httpx.put", return_value=_put_response()
+        ) as p:
+            _make_runtime()
+
+        assert p.called
+
+    def test_marker_isolated_per_server_url(self, monkeypatch, tmp_path):
+        # Two servers on different ports → independent fingerprints.
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant")
+        marker = tmp_path / "sync-marker.json"
+        monkeypatch.setattr(
+            "agentspan.agents.runtime.runtime._sync_marker_path", lambda: marker
+        )
+        marker.write_text(
+            json.dumps(
+                {
+                    "http://localhost:6767/api": {
+                        "instance_id": "instance-A",
+                        "env_hash": _current_env_hash(),
+                    },
+                    "http://localhost:7777/api": {
+                        "instance_id": "instance-X",
+                        "env_hash": _current_env_hash(),
+                    },
+                }
+            )
+        )
+
+        with patch("httpx.get", return_value=_mock_info_response("instance-X")), patch(
+            "httpx.put", return_value=_put_response()
+        ) as p:
+            _make_runtime(server_url="http://localhost:7777/api")
+
+        assert not p.called
+        # 6767's entry untouched.
+        kept = json.loads(marker.read_text())["http://localhost:6767/api"]
+        assert kept["instance_id"] == "instance-A"
+
+
+class TestComputeEnvHash:
+    """The env-hash function itself."""
+
+    def test_same_env_produces_same_hash(self, monkeypatch):
+        from agentspan.agents.runtime.runtime import _compute_env_hash
+
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant")
+        monkeypatch.setenv("OPENAI_API_KEY", "sk-openai")
+        h1 = _compute_env_hash()
+        h2 = _compute_env_hash()
+        assert h1 == h2
+
+    def test_different_env_produces_different_hash(self, monkeypatch):
+        from agentspan.agents.runtime.runtime import _compute_env_hash
+
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-old")
+        h_old = _compute_env_hash()
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-new")
+        h_new = _compute_env_hash()
+        assert h_old != h_new
+
+    def test_unset_env_var_is_treated_as_empty_string(self, monkeypatch):
+        # Removing a var changes the hash — important because going from
+        # "ANTHROPIC_API_KEY=bad" to unset is a meaningful change.
+        from agentspan.agents.runtime.runtime import _compute_env_hash
+
+        monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant")
+        h_set = _compute_env_hash()
+        monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
+        h_unset = _compute_env_hash()
+        assert h_set != h_unset

From bb87b3dd7af7743394b2d95f1e54e04cfffd8061 Mon Sep 17 00:00:00 2001
From: Nikhil Kala <nikhil.kala@orkes.io>
Date: Thu, 21 May 2026 09:10:41 -0700
Subject: [PATCH 6/6] feat(server): surface stored-vs-env credential mismatch
 at startup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The credential seeder previously logged a generic "already exists in
store" warning on EVERY existing credential, even when the env value
matched. That made the rare-but-painful case (env value differs from
stored value, stored value wins, user has no idea) invisible.

Now the seeder compares values:
  - equal   → debug log, silent
  - differ  → recorded in lastMismatchedNames + warn naming the
              specific remediation paths

AgentRuntime prints a visible banner after the configuration block
listing every mismatched env var, so a user who's re-exported a fixed
key sees on the next server start that the cached value is still
winning. @Order annotations ensure the seeder runs before the banner.
---
 .../dev/agentspan/runtime/AgentRuntime.java   |  35 ++++++
 .../credentials/CredentialEnvSeeder.java      |  43 +++++++-
 .../CredentialEnvSeederMismatchTest.java      | 100 ++++++++++++++++++
 3 files changed, 174 insertions(+), 4 deletions(-)
 create mode 100644 server/src/test/java/dev/agentspan/runtime/credentials/CredentialEnvSeederMismatchTest.java

diff --git a/server/src/main/java/dev/agentspan/runtime/AgentRuntime.java b/server/src/main/java/dev/agentspan/runtime/AgentRuntime.java
index 06d7ef54f..5059875a7 100644
--- a/server/src/main/java/dev/agentspan/runtime/AgentRuntime.java
+++ b/server/src/main/java/dev/agentspan/runtime/AgentRuntime.java
@@ -17,10 +17,15 @@
 import org.springframework.boot.autoconfigure.data.mongo.MongoDataAutoConfiguration;
 import org.springframework.boot.autoconfigure.jdbc.DataSourceAutoConfiguration;
 import org.springframework.boot.autoconfigure.mongo.MongoAutoConfiguration;
+import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.context.annotation.ComponentScan;
+import org.springframework.core.Ordered;
+import org.springframework.core.annotation.Order;
 import org.springframework.core.env.Environment;
 import org.springframework.scheduling.annotation.EnableScheduling;
 
+import dev.agentspan.runtime.credentials.CredentialEnvSeeder;
+
 import lombok.RequiredArgsConstructor;
 
 @SpringBootApplication(
@@ -34,12 +39,16 @@
             "dev.agentspan.runtime"
         })
 @RequiredArgsConstructor
+@Order(Ordered.HIGHEST_PRECEDENCE + 1000)
 public class AgentRuntime implements ApplicationRunner {
 
     private final Logger log = LoggerFactory.getLogger(AgentRuntime.class);
 
     private final Environment environment;
 
+    @Autowired(required = false)
+    private CredentialEnvSeeder credentialEnvSeeder;
+
     public static void main(String[] args) {
         SpringApplication.run(AgentRuntime.class, args);
     }
@@ -75,6 +84,32 @@ public void run(ApplicationArguments args) {
         log.info("\n\n\n");
 
         checkAIProviders(environment);
+        warnOnStoredCredentialMismatch();
+    }
+
+    /**
+     * Surface any env-vs-stored credential mismatch detected by the seeder so
+     * a user who's fat-fingered an API key and re-exported it sees on the
+     * very next server start that the cached value is still winning.
+     */
+    private void warnOnStoredCredentialMismatch() {
+        if (credentialEnvSeeder == null) return;
+        var mismatched = credentialEnvSeeder.getLastMismatchedNames();
+        if (mismatched.isEmpty()) return;
+
+        log.warn("┌─────────────────────────────────────────────────────────────────┐");
+        log.warn("│  STORED CREDENTIAL DIFFERS FROM ENVIRONMENT                     │");
+        log.warn("│                                                                 │");
+        log.warn("│  The following env vars are SET but differ from the cached      │");
+        log.warn("│  credential the server is using:                                │");
+        for (String name : mismatched) {
+            log.warn("│    - {}", padRight(name, 58) + "│");
+        }
+        log.warn("│                                                                 │");
+        log.warn("│  The CACHED value is being used. To update the stored value:    │");
+        log.warn("│    agentspan credentials set NAME \"$NAME\"                     │");
+        log.warn("│  or PUT /api/credentials/NAME or use the Credentials UI.        │");
+        log.warn("└─────────────────────────────────────────────────────────────────┘");
     }
 
     private void checkAIProviders(Environment env) {
diff --git a/server/src/main/java/dev/agentspan/runtime/credentials/CredentialEnvSeeder.java b/server/src/main/java/dev/agentspan/runtime/credentials/CredentialEnvSeeder.java
index cfea7764d..5035a4512 100644
--- a/server/src/main/java/dev/agentspan/runtime/credentials/CredentialEnvSeeder.java
+++ b/server/src/main/java/dev/agentspan/runtime/credentials/CredentialEnvSeeder.java
@@ -15,6 +15,8 @@
 import org.springframework.beans.factory.annotation.Value;
 import org.springframework.boot.ApplicationArguments;
 import org.springframework.boot.ApplicationRunner;
+import org.springframework.core.Ordered;
+import org.springframework.core.annotation.Order;
 import org.springframework.stereotype.Component;
 
 /**
@@ -32,6 +34,7 @@
  * (Vault, AWS SM, etc.) manage their own secrets.</p>
  */
 @Component
+@Order(Ordered.HIGHEST_PRECEDENCE + 100)
 public class CredentialEnvSeeder implements ApplicationRunner {
 
     private static final Logger log = LoggerFactory.getLogger(CredentialEnvSeeder.class);
@@ -107,6 +110,15 @@ public class CredentialEnvSeeder implements ApplicationRunner {
     private final CredentialStoreProvider storeProvider;
     private final Function<String, String> envLookup;
 
+    /**
+     * Env vars whose VALUE differs from what's stored under the same name.
+     * Populated by {@link #run(ApplicationArguments)} and surfaced via the
+     * startup banner ({@code AgentRuntime}) so a user who's fat-fingered an
+     * API key and re-exported it can see immediately that the server is
+     * still using the cached value.
+     */
+    private final java.util.List<String> lastMismatchedNames = new java.util.ArrayList<>();
+
     @Value("${agentspan.credentials.store:built-in}")
     private String credentialsStore;
 
@@ -124,6 +136,8 @@ public CredentialEnvSeeder(CredentialStoreProvider storeProvider) {
 
     @Override
     public void run(ApplicationArguments args) {
+        lastMismatchedNames.clear();
+
         if (!"built-in".equals(credentialsStore)) {
             log.debug("Credential env seeding skipped — store={} is not built-in", credentialsStore);
             return;
@@ -161,10 +175,22 @@ public void run(ApplicationArguments args) {
             }
 
             if (existing != null) {
-                log.warn(
-                        "Credential '{}' already exists in store — skipping env import. "
-                                + "To update the value, use the Credentials UI.",
-                        name);
+                if (!existing.equals(value)) {
+                    // The user-facing pain point: env value differs from the
+                    // stored value but the stored value wins. Surface loudly
+                    // so the user doesn't lose an hour debugging.
+                    lastMismatchedNames.add(name);
+                    log.warn(
+                            "Credential '{}' in environment DIFFERS from stored value — "
+                                    + "stored value will be used. To update the stored value, "
+                                    + "run `agentspan credentials set {} \"$" + name + "\"`, "
+                                    + "push via PUT /api/credentials/{}, or use the Credentials UI.",
+                            name,
+                            name,
+                            name);
+                } else {
+                    log.debug("Credential '{}' matches env — leaving stored value in place", name);
+                }
                 skipped++;
                 continue;
             }
@@ -182,4 +208,13 @@ public void run(ApplicationArguments args) {
             log.info("Credential env seeding complete: {} created, {} already existed (skipped)", created, skipped);
         }
     }
+
+    /**
+     * Names of env vars whose value differed from the stored value on the
+     * most recent {@link #run(ApplicationArguments)}. Empty when env matches
+     * stored or no env vars were set.
+     */
+    public java.util.List<String> getLastMismatchedNames() {
+        return java.util.List.copyOf(lastMismatchedNames);
+    }
 }
diff --git a/server/src/test/java/dev/agentspan/runtime/credentials/CredentialEnvSeederMismatchTest.java b/server/src/test/java/dev/agentspan/runtime/credentials/CredentialEnvSeederMismatchTest.java
new file mode 100644
index 000000000..aed361d30
--- /dev/null
+++ b/server/src/test/java/dev/agentspan/runtime/credentials/CredentialEnvSeederMismatchTest.java
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2025 AgentSpan
+ * Licensed under the MIT License.
+ */
+package dev.agentspan.runtime.credentials;
+
+import static dev.agentspan.runtime.credentials.CredentialEnvSeeder.ANONYMOUS_USER_ID;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.mockito.ArgumentMatchers.anyString;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.util.function.Function;
+
+import org.junit.jupiter.api.Test;
+import org.springframework.boot.DefaultApplicationArguments;
+
+/**
+ * Unit tests for the env-vs-stored mismatch detection added to
+ * {@link CredentialEnvSeeder}.
+ *
+ * <p>Background: before this change the seeder logged "Credential already
+ * exists in store — skipping env import" at WARN on every startup, even when
+ * the env value EQUALED the stored value. Users had no signal of a real
+ * mismatch — and worse, fixing the env and restarting the server did nothing
+ * because the stored value wasn't compared, just preserved.</p>
+ *
+ * <p>After: the seeder tracks {@code lastMismatchedNames} (env value differs
+ * from stored value) so callers and startup banners can surface the problem
+ * visibly. Tests assert on this field instead of inspecting log output.</p>
+ */
+class CredentialEnvSeederMismatchTest {
+
+    private void setStoreField(CredentialEnvSeeder seeder) throws Exception {
+        var field = CredentialEnvSeeder.class.getDeclaredField("credentialsStore");
+        field.setAccessible(true);
+        field.set(seeder, "built-in");
+    }
+
+    @Test
+    void recordsMismatchWhenEnvDiffersFromStored() throws Exception {
+        CredentialStoreProvider store = mock(CredentialStoreProvider.class);
+        when(store.get(eq(ANONYMOUS_USER_ID), eq("ANTHROPIC_API_KEY"))).thenReturn("sk-OLD-stored");
+
+        Function<String, String> env = name -> "ANTHROPIC_API_KEY".equals(name) ? "sk-NEW-from-env" : null;
+
+        CredentialEnvSeeder seeder = new CredentialEnvSeeder(store, env);
+        setStoreField(seeder);
+        seeder.run(new DefaultApplicationArguments());
+
+        assertThat(seeder.getLastMismatchedNames()).contains("ANTHROPIC_API_KEY");
+    }
+
+    @Test
+    void doesNotRecordMismatchWhenEnvMatchesStored() throws Exception {
+        CredentialStoreProvider store = mock(CredentialStoreProvider.class);
+        when(store.get(eq(ANONYMOUS_USER_ID), eq("ANTHROPIC_API_KEY"))).thenReturn("sk-SAME-value");
+
+        Function<String, String> env = name -> "ANTHROPIC_API_KEY".equals(name) ? "sk-SAME-value" : null;
+
+        CredentialEnvSeeder seeder = new CredentialEnvSeeder(store, env);
+        setStoreField(seeder);
+        seeder.run(new DefaultApplicationArguments());
+
+        assertThat(seeder.getLastMismatchedNames()).doesNotContain("ANTHROPIC_API_KEY");
+    }
+
+    @Test
+    void recordsMismatchAcrossMultipleProviders() throws Exception {
+        CredentialStoreProvider store = mock(CredentialStoreProvider.class);
+        when(store.get(anyString(), eq("OPENAI_API_KEY"))).thenReturn("sk-old-openai");
+        when(store.get(anyString(), eq("ANTHROPIC_API_KEY"))).thenReturn("sk-old-anthropic");
+
+        Function<String, String> env = name -> switch (name) {
+            case "OPENAI_API_KEY" -> "sk-new-openai";
+            case "ANTHROPIC_API_KEY" -> "sk-old-anthropic"; // matches — should NOT be recorded
+            default -> null;
+        };
+
+        CredentialEnvSeeder seeder = new CredentialEnvSeeder(store, env);
+        setStoreField(seeder);
+        seeder.run(new DefaultApplicationArguments());
+
+        assertThat(seeder.getLastMismatchedNames())
+                .contains("OPENAI_API_KEY")
+                .doesNotContain("ANTHROPIC_API_KEY");
+    }
+
+    @Test
+    void mismatchNamesEmptyWhenNoEnvVarsAreSet() throws Exception {
+        CredentialStoreProvider store = mock(CredentialStoreProvider.class);
+
+        CredentialEnvSeeder seeder = new CredentialEnvSeeder(store, name -> null);
+        setStoreField(seeder);
+        seeder.run(new DefaultApplicationArguments());
+
+        assertThat(seeder.getLastMismatchedNames()).isEmpty();
+    }
+}