4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,10 @@ The format is based on Keep a Changelog, and Esh follows Semantic Versioning.

## [Unreleased]

### Added
- Backend capability reports for MLX and llama.cpp runtime feature detection.
- Normalized prompt cache keys on new cache manifests for future cache lookup and reuse policy.

## [0.1.37] - 2026-04-30

### Added
5 changes: 3 additions & 2 deletions README.md
@@ -127,7 +127,7 @@ Inspect required and optional engines:

### External callers

Use `esh capabilities` to get a JSON map of supported backends, installed models, and whether each path supports direct inference, cache build, and cache load.
Use `esh capabilities` to get a JSON map of supported backends, installed models, and whether each path supports direct inference, cache build, and cache load. Internally, backends also expose capability reports for runtime readiness and feature support; MLX currently reports direct inference, token streaming, and prompt cache build/load, while llama.cpp reports direct inference and token streaming with GGUF cache features marked unavailable.
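A minimal sketch of how an external caller might consume `esh capabilities`. The exact JSON schema is not spelled out in this diff, so the sketch shells out to the CLI and only inspects the top-level keys; it assumes the `esh` binary is on `PATH`.

```swift
import Foundation

// Sketch: run `esh capabilities` and list the top-level JSON keys.
// The payload schema is not shown in this diff, so the output is inspected
// generically rather than decoded into a fixed type.
func printCapabilitySections() throws {
    let process = Process()
    process.executableURL = URL(fileURLWithPath: "/usr/bin/env")
    process.arguments = ["esh", "capabilities"]
    let stdout = Pipe()
    process.standardOutput = stdout
    try process.run()
    process.waitUntilExit()

    let data = stdout.fileHandleForReading.readDataToEndOfFile()
    if let payload = try JSONSerialization.jsonObject(with: data) as? [String: Any] {
        print("capability sections:", payload.keys.sorted())
    }
}
```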

Use `esh infer` for machine-friendly inference. It returns JSON for both MLX and GGUF models, and MLX cache load stays optional rather than being the only supported integration path.

@@ -184,7 +184,7 @@ Supported routes in v1:

Notes:
- unsupported request fields are ignored when safe
- `stream` is not supported yet
- `stream: true` is supported for OpenAI-compatible chat/responses and Anthropic-compatible messages; backend token streaming remains runtime-dependent
- text inputs are supported for chat/responses in v1
- `/v1/models` includes installed text models only for strict OpenAI-compatible clients such as Xcode
- `/v1/audio/models` returns the reusable MLX TTS model catalog with voices, languages, output formats, and capabilities so external agents can present and reuse voice choices
@@ -509,6 +509,7 @@ Resume from a saved cache:
Important:
- cache artifacts are backend-specific
- cache artifacts are model-specific
- new cache artifacts include a normalized prompt cache key that is backend-, model-, tokenizer-, runtime-, and tool-signature-aware
- Esh reuses one cache pipeline, but artifacts are not portable across runtimes/models

## Typical Use Cases
61 changes: 59 additions & 2 deletions Sources/EshCore/Backends/GGUF/LlamaCppBackend.swift
@@ -4,9 +4,16 @@ public struct LlamaCppBackend: InferenceBackend, Sendable {
public let kind: BackendKind = .gguf
public let runtimeVersion: String
public static let runtimeNotFoundMessage = "llama.cpp runtime not found. Install it with `brew install llama.cpp`, or set ESH_LLAMA_CPP_CLI to your `llama-cli` path."
private let executableResolver: @Sendable () throws -> URL

public init(runtimeVersion: String = "llama.cpp-cli-v1") {
public init(
runtimeVersion: String = "llama.cpp-cli-v1",
executableResolver: (@Sendable () throws -> URL)? = nil
) {
self.runtimeVersion = runtimeVersion
self.executableResolver = executableResolver ?? {
try LlamaCppBackend.defaultResolveExecutable()
}
}

public func loadRuntime(for install: ModelInstall) async throws -> BackendRuntime {
@@ -26,6 +33,53 @@ public struct LlamaCppBackend: InferenceBackend, Sendable {
return nil
}

public func capabilityReport(for install: ModelInstall) -> BackendCapabilityReport {
var warnings: [String] = []
var unavailable: [UnavailableBackendFeature] = [
.init(
feature: .promptCacheBuild,
reason: "GGUF cache build is not supported by the llama.cpp backend yet."
),
.init(
feature: .promptCacheLoad,
reason: "GGUF cache load is not supported by the llama.cpp backend yet."
),
.init(
feature: .promptCacheBenchmark,
reason: "GGUF cache benchmarking hooks are not implemented yet."
)
]

do {
_ = try locateModelFile(for: install)
_ = try resolveExecutable()
} catch {
let reason = error.localizedDescription
warnings.append(reason)
unavailable.append(.init(feature: .directInference, reason: reason))
unavailable.append(.init(feature: .tokenStreaming, reason: reason))
return BackendCapabilityReport(
backend: kind,
runtimeVersion: runtimeVersion,
ready: false,
supportedFeatures: [],
unavailableFeatures: unavailable,
warnings: warnings
)
}

return BackendCapabilityReport(
backend: kind,
runtimeVersion: runtimeVersion,
ready: true,
supportedFeatures: [
.directInference,
.tokenStreaming
],
unavailableFeatures: unavailable
)
}

public func makeCompatibilityChecker(for install: ModelInstall) -> CompatibilityChecking {
LlamaCppCompatibilityChecker(install: install, runtimeVersion: runtimeVersion)
}
@@ -53,6 +107,10 @@ public struct LlamaCppBackend: InferenceBackend, Sendable {
}

func resolveExecutable() throws -> URL {
try executableResolver()
}

private static func defaultResolveExecutable() throws -> URL {
let env = ProcessInfo.processInfo.environment
let executable = URL(fileURLWithPath: CommandLine.arguments[0]).resolvingSymlinksInPath()
let bundledCandidate = executable
@@ -85,7 +143,6 @@ public struct LlamaCppBackend: InferenceBackend, Sendable {

throw StoreError.invalidManifest(Self.runtimeNotFoundMessage)
}

}

private struct LlamaCppCompatibilityChecker: CompatibilityChecking, Sendable {
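One way to read the new `executableResolver` hook: tests, or embedders with a non-standard install, can pin the backend to a known `llama-cli` binary instead of relying on the default environment/bundle discovery. A minimal sketch, assuming the package module is imported as `EshCore` (per the `Sources/EshCore` layout) and using a placeholder path:

```swift
import EshCore   // module name assumed from the Sources/EshCore layout
import Foundation

// Pin the resolver to a fixed binary; the path below is a placeholder.
let pinned = LlamaCppBackend(
    runtimeVersion: "llama.cpp-cli-v1",
    executableResolver: {
        URL(fileURLWithPath: "/opt/llama.cpp/bin/llama-cli")
    }
)

// Omitting the argument keeps the default discovery, which consults
// ESH_LLAMA_CPP_CLI and the bundled candidates in defaultResolveExecutable().
let discovered = LlamaCppBackend()
```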
38 changes: 38 additions & 0 deletions Sources/EshCore/Backends/MLX/MLXBackend.swift
@@ -21,6 +21,44 @@ public struct MLXBackend: InferenceBackend, RemoteModelConfigValidating, Sendable {
return MLXRuntime(bridge: bridge, install: install)
}

public func capabilityReport(for install: ModelInstall) -> BackendCapabilityReport {
do {
_ = try locator.resolveModelPath(for: install)
return BackendCapabilityReport(
backend: kind,
runtimeVersion: runtimeVersion,
ready: true,
supportedFeatures: [
.directInference,
.tokenStreaming,
.promptCacheBuild,
.promptCacheLoad
],
unavailableFeatures: [
UnavailableBackendFeature(
feature: .promptCacheBenchmark,
reason: "MLX prompt cache benchmarking is not exposed through the backend capability API yet."
)
]
)
} catch {
let reason = error.localizedDescription
return BackendCapabilityReport(
backend: kind,
runtimeVersion: runtimeVersion,
ready: false,
supportedFeatures: [],
unavailableFeatures: [
.init(feature: .directInference, reason: reason),
.init(feature: .tokenStreaming, reason: reason),
.init(feature: .promptCacheBuild, reason: reason),
.init(feature: .promptCacheLoad, reason: reason)
],
warnings: [reason]
)
}
}

public func validateChatModel(for install: ModelInstall) throws -> String? {
let path = try locator.resolveModelPath(for: install)
let response: MLXModelValidationResponse = try bridge.run(
59 changes: 59 additions & 0 deletions Sources/EshCore/Domain/BackendCapabilities.swift
@@ -0,0 +1,59 @@
import Foundation

public enum BackendRuntimeFeature: String, Codable, Hashable, Sendable, CaseIterable {
case directInference = "direct-inference"
case tokenStreaming = "token-streaming"
case promptCacheBuild = "prompt-cache-build"
case promptCacheLoad = "prompt-cache-load"
case promptCacheBenchmark = "prompt-cache-benchmark"
case toolMessages = "tool-messages"
case multimodalInput = "multimodal-input"
}

public struct UnavailableBackendFeature: Codable, Hashable, Sendable {
public var feature: BackendRuntimeFeature
public var reason: String

public init(feature: BackendRuntimeFeature, reason: String) {
self.feature = feature
self.reason = reason
}
}

public struct BackendCapabilityReport: Codable, Hashable, Sendable {
public var backend: BackendKind
public var runtimeVersion: String
public var ready: Bool
public var supportedFeatures: [BackendRuntimeFeature]
public var unavailableFeatures: [UnavailableBackendFeature]
public var warnings: [String]

public init(
backend: BackendKind,
runtimeVersion: String,
ready: Bool,
supportedFeatures: [BackendRuntimeFeature],
unavailableFeatures: [UnavailableBackendFeature] = [],
warnings: [String] = []
) {
self.backend = backend
self.runtimeVersion = runtimeVersion
self.ready = ready
self.supportedFeatures = orderedUnique(supportedFeatures)
self.unavailableFeatures = unavailableFeatures
self.warnings = orderedUnique(warnings)
}

public func supports(_ feature: BackendRuntimeFeature) -> Bool {
supportedFeatures.contains(feature)
}

public func unavailableFeature(_ feature: BackendRuntimeFeature) -> UnavailableBackendFeature? {
unavailableFeatures.first { $0.feature == feature }
}
}

private func orderedUnique<T: Hashable>(_ values: [T]) -> [T] {
var seen: Set<T> = []
return values.filter { seen.insert($0).inserted }
}
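A small sketch of how the report reads at a call site, using only the types above with illustrative values: duplicate features collapse through `orderedUnique`, and a missing feature can be explained via `unavailableFeature(_:)`.

```swift
import EshCore   // module name assumed from the Sources/EshCore layout

// Illustrative values only.
let report = BackendCapabilityReport(
    backend: .gguf,
    runtimeVersion: "llama.cpp-cli-v1",
    ready: true,
    supportedFeatures: [.directInference, .tokenStreaming, .directInference],
    unavailableFeatures: [
        UnavailableBackendFeature(
            feature: .promptCacheBuild,
            reason: "GGUF cache build is not supported by the llama.cpp backend yet."
        )
    ]
)

print(report.supportedFeatures.map(\.rawValue))
// ["direct-inference", "token-streaming"] — the duplicate collapses via orderedUnique
print(report.supports(.promptCacheBuild))                          // false
print(report.unavailableFeature(.promptCacheBuild)?.reason ?? "supported")
```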
5 changes: 4 additions & 1 deletion Sources/EshCore/Domain/CacheManifest.swift
@@ -18,6 +18,7 @@ public struct CacheManifest: Codable, Hashable, Sendable {
public var contextFileCount: Int?
public var contextReused: Bool?
public var policyReason: String?
public var promptCacheKey: PromptCacheKey?

public init(
backend: BackendKind,
@@ -36,7 +37,8 @@
contextTaskFingerprint: String? = nil,
contextFileCount: Int? = nil,
contextReused: Bool? = nil,
policyReason: String? = nil
policyReason: String? = nil,
promptCacheKey: PromptCacheKey? = nil
) {
self.backend = backend
self.modelID = modelID
@@ -55,5 +57,6 @@
self.contextFileCount = contextFileCount
self.contextReused = contextReused
self.policyReason = policyReason
self.promptCacheKey = promptCacheKey
}
}
89 changes: 89 additions & 0 deletions Sources/EshCore/Domain/PromptCacheKey.swift
@@ -0,0 +1,89 @@
import Foundation

public struct PromptCacheKey: Codable, Hashable, Sendable {
public static let schemaVersion = "esh.prompt-cache-key.v1"

public var schemaVersion: String
public var hash: String
public var backend: BackendKind
public var modelID: String
public var tokenizerID: String?
public var runtimeVersion: String
public var toolSignature: String
public var normalizedMessageCount: Int

public init(
schemaVersion: String = PromptCacheKey.schemaVersion,
hash: String,
backend: BackendKind,
modelID: String,
tokenizerID: String? = nil,
runtimeVersion: String,
toolSignature: String,
normalizedMessageCount: Int
) {
self.schemaVersion = schemaVersion
self.hash = hash
self.backend = backend
self.modelID = modelID
self.tokenizerID = tokenizerID
self.runtimeVersion = runtimeVersion
self.toolSignature = toolSignature
self.normalizedMessageCount = normalizedMessageCount
}

static func make(
backend: BackendKind,
modelID: String,
tokenizerID: String?,
runtimeVersion: String,
toolSignature: String?,
messages: [Message]
) -> PromptCacheKey {
let effectiveToolSignature = toolSignature?.trimmingCharacters(in: .whitespacesAndNewlines).nonEmpty
?? "tools:none"
let payload = PromptCacheKeyPayload(
schemaVersion: schemaVersion,
backend: backend,
modelID: modelID,
tokenizerID: tokenizerID,
runtimeVersion: runtimeVersion,
toolSignature: effectiveToolSignature,
messages: messages.map { message in
PromptCacheKeyMessage(role: message.role, text: message.text)
}
)
let data = (try? JSONCoding.encoder.encode(payload)) ?? Data()
let canonical = String(decoding: data, as: UTF8.self)
return PromptCacheKey(
hash: Fingerprint.sha256([canonical]),
backend: backend,
modelID: modelID,
tokenizerID: tokenizerID,
runtimeVersion: runtimeVersion,
toolSignature: effectiveToolSignature,
normalizedMessageCount: messages.count
)
}
}

private struct PromptCacheKeyPayload: Codable, Hashable, Sendable {
var schemaVersion: String
var backend: BackendKind
var modelID: String
var tokenizerID: String?
var runtimeVersion: String
var toolSignature: String
var messages: [PromptCacheKeyMessage]
}

private struct PromptCacheKeyMessage: Codable, Hashable, Sendable {
var role: Message.Role
var text: String
}

private extension String {
var nonEmpty: String? {
isEmpty ? nil : self
}
}
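Since the key's `hash` is a SHA-256 over a canonical JSON payload (backend, model, tokenizer, runtime, tool signature, and the normalized messages), comparing `schemaVersion` plus `hash` is enough to decide reuse. A sketch of the kind of lookup the CHANGELOG hints at, using only the public `CacheManifest`/`PromptCacheKey` surface; the helper name is hypothetical:

```swift
import EshCore   // module name assumed from the Sources/EshCore layout

/// Hypothetical helper: pick stored manifests whose normalized prompt cache key
/// matches the key computed for an incoming session. Manifests written before
/// this change carry no key and are never matched.
func manifestsReusable(for key: PromptCacheKey, in manifests: [CacheManifest]) -> [CacheManifest] {
    manifests.filter { manifest in
        guard let stored = manifest.promptCacheKey else { return false }
        return stored.schemaVersion == key.schemaVersion && stored.hash == key.hash
    }
}
```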
13 changes: 13 additions & 0 deletions Sources/EshCore/Protocols/InferenceBackend.swift
@@ -4,6 +4,19 @@ public protocol InferenceBackend: Sendable {
var kind: BackendKind { get }
var runtimeVersion: String { get }

func capabilityReport(for install: ModelInstall) -> BackendCapabilityReport
func loadRuntime(for install: ModelInstall) async throws -> BackendRuntime
func makeCompatibilityChecker(for install: ModelInstall) -> CompatibilityChecking
}

public extension InferenceBackend {
func capabilityReport(for install: ModelInstall) -> BackendCapabilityReport {
_ = install
return BackendCapabilityReport(
backend: kind,
runtimeVersion: runtimeVersion,
ready: true,
supportedFeatures: [.directInference]
)
}
}
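With the default implementation above, backend conformances outside this diff keep compiling and advertise direct inference only, so call sites can gate feature-specific paths on the report rather than on a concrete backend type. A minimal sketch; the helper is hypothetical:

```swift
import EshCore   // module name assumed from the Sources/EshCore layout

/// Hypothetical gate: only attempt a prompt cache build when the backend
/// reports itself ready and lists the feature as supported.
func shouldBuildPromptCache(_ backend: any InferenceBackend, for install: ModelInstall) -> Bool {
    let report = backend.capabilityReport(for: install)
    return report.ready && report.supports(.promptCacheBuild)
}
```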
13 changes: 11 additions & 2 deletions Sources/EshCore/Services/CacheService.swift
@@ -55,12 +55,20 @@ public struct CacheService: Sendable {
let snapshot = try await runtime.exportRuntimeCache()
let encodedSnapshot = try codec.encode(snapshot: snapshot)
let compression = try await compressor.compress(encodedSnapshot)
let manifestRuntimeVersion = install.runtimeVersion ?? "mlx-vlm-0.4.3+mlx-lm-bridge-v2"
let promptCacheKey = PromptSessionNormalizer().promptCacheKey(
for: session,
backend: runtime.backend,
modelID: install.id,
tokenizerID: install.spec.tokenizerID,
runtimeVersion: manifestRuntimeVersion
)
let manifest = CacheManifest(
backend: runtime.backend,
modelID: install.id,
tokenizerID: install.spec.tokenizerID,
architectureFingerprint: install.spec.architectureFingerprint ?? Fingerprint.sha256([install.id, install.backendFormat]),
runtimeVersion: install.runtimeVersion ?? "mlx-vlm-0.4.3+mlx-lm-bridge-v2",
runtimeVersion: manifestRuntimeVersion,
cacheFormatVersion: codec.formatVersion,
compressorVersion: compressor.version,
cacheMode: artifactMode ?? compressor.mode,
Expand All @@ -71,7 +79,8 @@ public struct CacheService: Sendable {
contextTaskFingerprint: context?.taskFingerprint,
contextFileCount: context?.fileCount,
contextReused: context?.reused,
policyReason: context?.policyReason
policyReason: context?.policyReason,
promptCacheKey: promptCacheKey
)
let artifact = CacheArtifact(
manifest: manifest,