4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,10 @@ The format is based on Keep a Changelog, and Esh follows Semantic Versioning.

## [Unreleased]

### Added
- Backend capability reports for MLX and llama.cpp runtime feature detection.
- Normalized prompt cache keys on new cache manifests for future cache lookup and reuse policy.

## [0.1.37] - 2026-04-30

### Added
5 changes: 3 additions & 2 deletions README.md
@@ -127,7 +127,7 @@ Inspect required and optional engines:

### External callers

Use `esh capabilities` to get a JSON map of supported backends, installed models, and whether each path supports direct inference, cache build, and cache load.
Use `esh capabilities` to get a JSON map of supported backends, installed models, and whether each path supports direct inference, cache build, and cache load. Internally, backends also expose capability reports for runtime readiness and feature support; MLX currently reports direct inference, token streaming, and prompt cache build/load, while llama.cpp reports direct inference and token streaming with GGUF cache features marked unavailable.
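A minimal sketch of how an external caller might consume `esh capabilities`. The exact JSON schema is not spelled out in this diff, so the sketch shells out to the CLI and only inspects the top-level keys; it assumes the `esh` binary is on `PATH`.

```swift
import Foundation

// Sketch: run `esh capabilities` and list the top-level JSON keys.
// The payload schema is not shown in this diff, so the output is inspected
// generically rather than decoded into a fixed type.
func printCapabilitySections() throws {
    let process = Process()
    process.executableURL = URL(fileURLWithPath: "/usr/bin/env")
    process.arguments = ["esh", "capabilities"]
    let stdout = Pipe()
    process.standardOutput = stdout
    try process.run()
    process.waitUntilExit()

    let data = stdout.fileHandleForReading.readDataToEndOfFile()
    if let payload = try JSONSerialization.jsonObject(with: data) as? [String: Any] {
        print("capability sections:", payload.keys.sorted())
    }
}
```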

Use `esh infer` for machine-friendly inference. It returns JSON for both MLX and GGUF models, and MLX cache load stays optional rather than being the only supported integration path.

@@ -184,7 +184,7 @@ Supported routes in v1:

Notes:
- unsupported request fields are ignored when safe
- `stream` is not supported yet
- `stream: true` is supported for OpenAI-compatible chat/responses and Anthropic-compatible messages; backend token streaming remains runtime-dependent
- text inputs are supported for chat/responses in v1
- `/v1/models` includes installed text models only for strict OpenAI-compatible clients such as Xcode
- `/v1/audio/models` returns the reusable MLX TTS model catalog with voices, languages, output formats, and capabilities so external agents can present and reuse voice choices
@@ -509,6 +509,7 @@ Resume from a saved cache:
Important:
- cache artifacts are backend-specific
- cache artifacts are model-specific
- new cache artifacts include a normalized prompt cache key that is backend-, model-, tokenizer-, runtime-, and tool-signature-aware
- Esh reuses one cache pipeline, but artifacts are not portable across runtimes/models

## Typical Use Cases
61 changes: 59 additions & 2 deletions Sources/EshCore/Backends/GGUF/LlamaCppBackend.swift
@@ -4,9 +4,16 @@ public struct LlamaCppBackend: InferenceBackend, Sendable {
public let kind: BackendKind = .gguf
public let runtimeVersion: String
public static let runtimeNotFoundMessage = "llama.cpp runtime not found. Install it with `brew install llama.cpp`, or set ESH_LLAMA_CPP_CLI to your `llama-cli` path."
private let executableResolver: @Sendable () throws -> URL

public init(runtimeVersion: String = "llama.cpp-cli-v1") {
public init(
runtimeVersion: String = "llama.cpp-cli-v1",
executableResolver: (@Sendable () throws -> URL)? = nil
) {
self.runtimeVersion = runtimeVersion
self.executableResolver = executableResolver ?? {
try LlamaCppBackend.defaultResolveExecutable()
}
}

public func loadRuntime(for install: ModelInstall) async throws -> BackendRuntime {
@@ -26,6 +33,53 @@ public struct LlamaCppBackend: InferenceBackend, Sendable {
return nil
}

public func capabilityReport(for install: ModelInstall) -> BackendCapabilityReport {
var warnings: [String] = []
var unavailable: [UnavailableBackendFeature] = [
.init(
feature: .promptCacheBuild,
reason: "GGUF cache build is not supported by the llama.cpp backend yet."
),
.init(
feature: .promptCacheLoad,
reason: "GGUF cache load is not supported by the llama.cpp backend yet."
),
.init(
feature: .promptCacheBenchmark,
reason: "GGUF cache benchmarking hooks are not implemented yet."
)
]

do {
_ = try locateModelFile(for: install)
_ = try resolveExecutable()
} catch {
let reason = error.localizedDescription
warnings.append(reason)
unavailable.append(.init(feature: .directInference, reason: reason))
unavailable.append(.init(feature: .tokenStreaming, reason: reason))
return BackendCapabilityReport(
backend: kind,
runtimeVersion: runtimeVersion,
ready: false,
supportedFeatures: [],
unavailableFeatures: unavailable,
warnings: warnings
)
}

return BackendCapabilityReport(
backend: kind,
runtimeVersion: runtimeVersion,
ready: true,
supportedFeatures: [
.directInference,
.tokenStreaming
],
unavailableFeatures: unavailable
)
}

public func makeCompatibilityChecker(for install: ModelInstall) -> CompatibilityChecking {
LlamaCppCompatibilityChecker(install: install, runtimeVersion: runtimeVersion)
}
@@ -53,6 +107,10 @@ public struct LlamaCppBackend: InferenceBackend, Sendable {
}

func resolveExecutable() throws -> URL {
try executableResolver()
}

private static func defaultResolveExecutable() throws -> URL {
let env = ProcessInfo.processInfo.environment
let executable = URL(fileURLWithPath: CommandLine.arguments[0]).resolvingSymlinksInPath()
let bundledCandidate = executable
@@ -85,7 +143,6 @@ public struct LlamaCppBackend: InferenceBackend, Sendable {

throw StoreError.invalidManifest(Self.runtimeNotFoundMessage)
}

}

private struct LlamaCppCompatibilityChecker: CompatibilityChecking, Sendable {
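One way to read the new `executableResolver` hook: tests, or embedders with a non-standard install, can pin the backend to a known `llama-cli` binary instead of relying on the default environment/bundle discovery. A minimal sketch, assuming the package module is imported as `EshCore` (per the `Sources/EshCore` layout) and using a placeholder path:

```swift
import EshCore   // module name assumed from the Sources/EshCore layout
import Foundation

// Pin the resolver to a fixed binary; the path below is a placeholder.
let pinned = LlamaCppBackend(
    runtimeVersion: "llama.cpp-cli-v1",
    executableResolver: {
        URL(fileURLWithPath: "/opt/llama.cpp/bin/llama-cli")
    }
)

// Omitting the argument keeps the default discovery, which consults
// ESH_LLAMA_CPP_CLI and the bundled candidates in defaultResolveExecutable().
let discovered = LlamaCppBackend()
```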
38 changes: 38 additions & 0 deletions Sources/EshCore/Backends/MLX/MLXBackend.swift
@@ -21,6 +21,44 @@ public struct MLXBackend: InferenceBackend, RemoteModelConfigValidating, Sendable {
return MLXRuntime(bridge: bridge, install: install)
}

public func capabilityReport(for install: ModelInstall) -> BackendCapabilityReport {
do {
_ = try locator.resolveModelPath(for: install)
return BackendCapabilityReport(
backend: kind,
runtimeVersion: runtimeVersion,
ready: true,
supportedFeatures: [
.directInference,
.tokenStreaming,
.promptCacheBuild,
.promptCacheLoad
],
unavailableFeatures: [
UnavailableBackendFeature(
feature: .promptCacheBenchmark,
reason: "MLX prompt cache benchmarking is not exposed through the backend capability API yet."
)
]
)
} catch {
let reason = error.localizedDescription
return BackendCapabilityReport(
backend: kind,
runtimeVersion: runtimeVersion,
ready: false,
supportedFeatures: [],
unavailableFeatures: [
.init(feature: .directInference, reason: reason),
.init(feature: .tokenStreaming, reason: reason),
.init(feature: .promptCacheBuild, reason: reason),
.init(feature: .promptCacheLoad, reason: reason)
],
warnings: [reason]
)
}
}

public func validateChatModel(for install: ModelInstall) throws -> String? {
let path = try locator.resolveModelPath(for: install)
let response: MLXModelValidationResponse = try bridge.run(
59 changes: 59 additions & 0 deletions Sources/EshCore/Domain/BackendCapabilities.swift
@@ -0,0 +1,59 @@
import Foundation

public enum BackendRuntimeFeature: String, Codable, Hashable, Sendable, CaseIterable {
case directInference = "direct-inference"
case tokenStreaming = "token-streaming"
case promptCacheBuild = "prompt-cache-build"
case promptCacheLoad = "prompt-cache-load"
case promptCacheBenchmark = "prompt-cache-benchmark"
case toolMessages = "tool-messages"
case multimodalInput = "multimodal-input"
}

public struct UnavailableBackendFeature: Codable, Hashable, Sendable {
public var feature: BackendRuntimeFeature
public var reason: String

public init(feature: BackendRuntimeFeature, reason: String) {
self.feature = feature
self.reason = reason
}
}

public struct BackendCapabilityReport: Codable, Hashable, Sendable {
public var backend: BackendKind
public var runtimeVersion: String
public var ready: Bool
public var supportedFeatures: [BackendRuntimeFeature]
public var unavailableFeatures: [UnavailableBackendFeature]
public var warnings: [String]

public init(
backend: BackendKind,
runtimeVersion: String,
ready: Bool,
supportedFeatures: [BackendRuntimeFeature],
unavailableFeatures: [UnavailableBackendFeature] = [],
warnings: [String] = []
) {
self.backend = backend
self.runtimeVersion = runtimeVersion
self.ready = ready
self.supportedFeatures = orderedUnique(supportedFeatures)
self.unavailableFeatures = unavailableFeatures
self.warnings = orderedUnique(warnings)
}

public func supports(_ feature: BackendRuntimeFeature) -> Bool {
supportedFeatures.contains(feature)
}

public func unavailableFeature(_ feature: BackendRuntimeFeature) -> UnavailableBackendFeature? {
unavailableFeatures.first { $0.feature == feature }
}
}

private func orderedUnique<T: Hashable>(_ values: [T]) -> [T] {
var seen: Set<T> = []
return values.filter { seen.insert($0).inserted }
}
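A small sketch of how the report reads at a call site, using only the types above with illustrative values: duplicate features collapse through `orderedUnique`, and a missing feature can be explained via `unavailableFeature(_:)`.

```swift
import EshCore   // module name assumed from the Sources/EshCore layout

// Illustrative values only.
let report = BackendCapabilityReport(
    backend: .gguf,
    runtimeVersion: "llama.cpp-cli-v1",
    ready: true,
    supportedFeatures: [.directInference, .tokenStreaming, .directInference],
    unavailableFeatures: [
        UnavailableBackendFeature(
            feature: .promptCacheBuild,
            reason: "GGUF cache build is not supported by the llama.cpp backend yet."
        )
    ]
)

print(report.supportedFeatures.map(\.rawValue))
// ["direct-inference", "token-streaming"] — the duplicate collapses via orderedUnique
print(report.supports(.promptCacheBuild))                          // false
print(report.unavailableFeature(.promptCacheBuild)?.reason ?? "supported")
```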
5 changes: 4 additions & 1 deletion Sources/EshCore/Domain/CacheManifest.swift
@@ -18,6 +18,7 @@ public struct CacheManifest: Codable, Hashable, Sendable {
public var contextFileCount: Int?
public var contextReused: Bool?
public var policyReason: String?
public var promptCacheKey: PromptCacheKey?

public init(
backend: BackendKind,
@@ -36,7 +37,8 @@
contextTaskFingerprint: String? = nil,
contextFileCount: Int? = nil,
contextReused: Bool? = nil,
policyReason: String? = nil
policyReason: String? = nil,
promptCacheKey: PromptCacheKey? = nil
) {
self.backend = backend
self.modelID = modelID
@@ -55,5 +57,6 @@
self.contextFileCount = contextFileCount
self.contextReused = contextReused
self.policyReason = policyReason
self.promptCacheKey = promptCacheKey
}
}
89 changes: 89 additions & 0 deletions Sources/EshCore/Domain/PromptCacheKey.swift
@@ -0,0 +1,89 @@
import Foundation

public struct PromptCacheKey: Codable, Hashable, Sendable {
public static let schemaVersion = "esh.prompt-cache-key.v1"

public var schemaVersion: String
public var hash: String
public var backend: BackendKind
public var modelID: String
public var tokenizerID: String?
public var runtimeVersion: String
public var toolSignature: String
public var normalizedMessageCount: Int

public init(
schemaVersion: String = PromptCacheKey.schemaVersion,
hash: String,
backend: BackendKind,
modelID: String,
tokenizerID: String? = nil,
runtimeVersion: String,
toolSignature: String,
normalizedMessageCount: Int
) {
self.schemaVersion = schemaVersion
self.hash = hash
self.backend = backend
self.modelID = modelID
self.tokenizerID = tokenizerID
self.runtimeVersion = runtimeVersion
self.toolSignature = toolSignature
self.normalizedMessageCount = normalizedMessageCount
}

static func make(
backend: BackendKind,
modelID: String,
tokenizerID: String?,
runtimeVersion: String,
toolSignature: String?,
messages: [Message]
) -> PromptCacheKey {
let effectiveToolSignature = toolSignature?.trimmingCharacters(in: .whitespacesAndNewlines).nonEmpty
?? "tools:none"
let payload = PromptCacheKeyPayload(
schemaVersion: schemaVersion,
backend: backend,
modelID: modelID,
tokenizerID: tokenizerID,
runtimeVersion: runtimeVersion,
toolSignature: effectiveToolSignature,
messages: messages.map { message in
PromptCacheKeyMessage(role: message.role, text: message.text)
}
)
let data = (try? JSONCoding.encoder.encode(payload)) ?? Data()
let canonical = String(decoding: data, as: UTF8.self)
return PromptCacheKey(
hash: Fingerprint.sha256([canonical]),
backend: backend,
modelID: modelID,
tokenizerID: tokenizerID,
runtimeVersion: runtimeVersion,
toolSignature: effectiveToolSignature,
normalizedMessageCount: messages.count
)
}
}

private struct PromptCacheKeyPayload: Codable, Hashable, Sendable {
var schemaVersion: String
var backend: BackendKind
var modelID: String
var tokenizerID: String?
var runtimeVersion: String
var toolSignature: String
var messages: [PromptCacheKeyMessage]
}

private struct PromptCacheKeyMessage: Codable, Hashable, Sendable {
var role: Message.Role
var text: String
}

private extension String {
var nonEmpty: String? {
isEmpty ? nil : self
}
}
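Since the key's `hash` is a SHA-256 over a canonical JSON payload (backend, model, tokenizer, runtime, tool signature, and the normalized messages), comparing `schemaVersion` plus `hash` is enough to decide reuse. A sketch of the kind of lookup the CHANGELOG hints at, using only the public `CacheManifest`/`PromptCacheKey` surface; the helper name is hypothetical:

```swift
import EshCore   // module name assumed from the Sources/EshCore layout

/// Hypothetical helper: pick stored manifests whose normalized prompt cache key
/// matches the key computed for an incoming session. Manifests written before
/// this change carry no key and are never matched.
func manifestsReusable(for key: PromptCacheKey, in manifests: [CacheManifest]) -> [CacheManifest] {
    manifests.filter { manifest in
        guard let stored = manifest.promptCacheKey else { return false }
        return stored.schemaVersion == key.schemaVersion && stored.hash == key.hash
    }
}
```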
13 changes: 13 additions & 0 deletions Sources/EshCore/Protocols/InferenceBackend.swift
@@ -4,6 +4,19 @@ public protocol InferenceBackend: Sendable {
var kind: BackendKind { get }
var runtimeVersion: String { get }

func capabilityReport(for install: ModelInstall) -> BackendCapabilityReport
func loadRuntime(for install: ModelInstall) async throws -> BackendRuntime
func makeCompatibilityChecker(for install: ModelInstall) -> CompatibilityChecking
}

public extension InferenceBackend {
func capabilityReport(for install: ModelInstall) -> BackendCapabilityReport {
_ = install
return BackendCapabilityReport(
backend: kind,
runtimeVersion: runtimeVersion,
ready: true,
supportedFeatures: [.directInference]
)
}
}
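With the default implementation above, backend conformances outside this diff keep compiling and advertise direct inference only, so call sites can gate feature-specific paths on the report rather than on a concrete backend type. A minimal sketch; the helper is hypothetical:

```swift
import EshCore   // module name assumed from the Sources/EshCore layout

/// Hypothetical gate: only attempt a prompt cache build when the backend
/// reports itself ready and lists the feature as supported.
func shouldBuildPromptCache(_ backend: any InferenceBackend, for install: ModelInstall) -> Bool {
    let report = backend.capabilityReport(for: install)
    return report.ready && report.supports(.promptCacheBuild)
}
```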
13 changes: 11 additions & 2 deletions Sources/EshCore/Services/CacheService.swift
@@ -55,12 +55,20 @@ public struct CacheService: Sendable {
let snapshot = try await runtime.exportRuntimeCache()
let encodedSnapshot = try codec.encode(snapshot: snapshot)
let compression = try await compressor.compress(encodedSnapshot)
let manifestRuntimeVersion = install.runtimeVersion ?? "mlx-vlm-0.4.3+mlx-lm-bridge-v2"
let promptCacheKey = PromptSessionNormalizer().promptCacheKey(
for: session,
backend: runtime.backend,
modelID: install.id,
tokenizerID: install.spec.tokenizerID,
runtimeVersion: manifestRuntimeVersion
)
let manifest = CacheManifest(
backend: runtime.backend,
modelID: install.id,
tokenizerID: install.spec.tokenizerID,
architectureFingerprint: install.spec.architectureFingerprint ?? Fingerprint.sha256([install.id, install.backendFormat]),
runtimeVersion: install.runtimeVersion ?? "mlx-vlm-0.4.3+mlx-lm-bridge-v2",
runtimeVersion: manifestRuntimeVersion,
cacheFormatVersion: codec.formatVersion,
compressorVersion: compressor.version,
cacheMode: artifactMode ?? compressor.mode,
Expand All @@ -71,7 +79,8 @@ public struct CacheService: Sendable {
contextTaskFingerprint: context?.taskFingerprint,
contextFileCount: context?.fileCount,
contextReused: context?.reused,
policyReason: context?.policyReason
policyReason: context?.policyReason,
promptCacheKey: promptCacheKey
)
let artifact = CacheArtifact(
manifest: manifest,