diff --git a/Package.swift b/Package.swift index 442aa8b9a..bf3f7d958 100644 --- a/Package.swift +++ b/Package.swift @@ -203,6 +203,7 @@ let package = Package( name: "ContainerAPIServiceTests", dependencies: [ .product(name: "Containerization", package: "containerization"), + "ContainerAPIService", "ContainerResource", "ContainerRuntimeLinuxClient", "ContainerRuntimeClient", diff --git a/Sources/ContainerCommands/Container/ContainerRun.swift b/Sources/ContainerCommands/Container/ContainerRun.swift index 2957fe473..9f23adbe8 100644 --- a/Sources/ContainerCommands/Container/ContainerRun.swift +++ b/Sources/ContainerCommands/Container/ContainerRun.swift @@ -176,5 +176,35 @@ extension Application { } throw ArgumentParser.ExitCode(exitCode) } + /// Converts a textual restart policy into a `RestartPolicy`. + /// + /// Accepted spellings are `no`, `always`, `unless-stopped`, `on-failure`, and + /// `on-failure:<count>`, where `<count>` is a non-negative integer. + /// + /// - Parameter raw: The policy text; `nil` or empty means no policy was given. + /// - Returns: The parsed policy, or `nil` when `raw` is missing or is not one of + /// the accepted spellings. + static func parseRestartPolicy(_ raw: String?) -> RestartPolicy? { + guard let raw, !raw.isEmpty else { return nil } + switch raw { + case "no": + return RestartPolicy.none + case "always": + return RestartPolicy(mode: .always) + case "unless-stopped": + return RestartPolicy(mode: .unlessStopped) + default: + // Match the mode name exactly: a bare prefix test would also accept + // malformed values such as "on-failures" or "on-failure-3". + let parts = raw.split(separator: ":", maxSplits: 1) + guard let mode = parts.first, mode == "on-failure" else { return nil } + guard parts.count > 1 else { + return RestartPolicy(mode: .onFailure, maxRetries: 0) + } + // Reject an unparsable or negative count instead of silently using 0. + guard let retries = Int(parts[1]), retries >= 0 else { return nil } + return RestartPolicy(mode: .onFailure, maxRetries: retries) + } + } } } diff --git a/Sources/ContainerResource/Container/ContainerConfiguration.swift b/Sources/ContainerResource/Container/ContainerConfiguration.swift index 4dc75b9ad..99e898afb 100644 --- a/Sources/ContainerResource/Container/ContainerConfiguration.swift +++ b/Sources/ContainerResource/Container/ContainerConfiguration.swift @@ -61,6 +61,10 @@ public struct ContainerConfiguration: Sendable, Codable { public var shmSize: UInt64? /// Signal to send to the container process on stop (from image config). public var stopSignal: String? + /// Optional periodic healthcheck spec. 
When set and not effectively + /// disabled, the API server starts a per-container observer that runs + /// the configured probe and updates ``ContainerSnapshot/health``. + public var healthcheck: Healthcheck? enum CodingKeys: String, CodingKey { case id @@ -85,6 +89,7 @@ public struct ContainerConfiguration: Sendable, Codable { case capDrop case shmSize case stopSignal + case healthcheck } /// Create a configuration from the supplied Decoder, initializing missing @@ -120,6 +125,7 @@ public struct ContainerConfiguration: Sendable, Codable { capDrop = try container.decodeIfPresent([String].self, forKey: .capDrop) ?? [] shmSize = try container.decodeIfPresent(UInt64.self, forKey: .shmSize) stopSignal = try container.decodeIfPresent(String.self, forKey: .stopSignal) + healthcheck = try container.decodeIfPresent(Healthcheck.self, forKey: .healthcheck) } public struct DNSConfiguration: Sendable, Codable { diff --git a/Sources/ContainerResource/Container/ContainerSnapshot.swift b/Sources/ContainerResource/Container/ContainerSnapshot.swift index bae992423..0168b7a2e 100644 --- a/Sources/ContainerResource/Container/ContainerSnapshot.swift +++ b/Sources/ContainerResource/Container/ContainerSnapshot.swift @@ -39,16 +39,25 @@ public struct ContainerSnapshot: Codable, Sendable { public var networks: [Attachment] /// When the container was started. public var startedDate: Date? + /// The most recently observed health of the container. + /// + /// Written by the API server's healthcheck observer while the container is + /// running. Stays `nil` when ``ContainerConfiguration/healthcheck`` is unset; + /// a disabled healthcheck is reported as ``HealthStatus/none``. Downstream + /// tools (e.g. `compose`) can poll this field to wait for a healthy service. + public var health: HealthStatus? public init( configuration: ContainerConfiguration, status: RuntimeStatus, networks: [Attachment], - startedDate: Date? = nil + startedDate: Date? = nil, + health: HealthStatus? 
= nil ) { self.configuration = configuration self.status = status self.networks = networks self.startedDate = startedDate + self.health = health } } diff --git a/Sources/ContainerResource/Container/HealthStatus.swift b/Sources/ContainerResource/Container/HealthStatus.swift new file mode 100644 index 000000000..3f13a4ffc --- /dev/null +++ b/Sources/ContainerResource/Container/HealthStatus.swift @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the container project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import Foundation + +/// The observed health status of a container, as derived from a periodic +/// healthcheck probe. +/// +/// The API server's healthcheck observer publishes this value through +/// ``ContainerSnapshot/health``, which stays `nil` for containers that have +/// no healthcheck configured. Downstream tools (e.g. `compose`) can read it +/// to decide when a dependency is ready. +public enum HealthStatus: String, CaseIterable, Sendable, Codable { + /// The configured healthcheck is disabled, so no probe is ever run. + case none + /// The healthcheck is running but has not yet produced a successful probe. + case starting + /// The most recent probe(s) reported the container as healthy. 
+ case healthy + /// The most recent probe(s) reported the container as unhealthy. + case unhealthy +} diff --git a/Sources/ContainerResource/Container/Healthcheck.swift b/Sources/ContainerResource/Container/Healthcheck.swift new file mode 100644 index 000000000..c443f8a4b --- /dev/null +++ b/Sources/ContainerResource/Container/Healthcheck.swift @@ -0,0 +1,214 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the container project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import ContainerizationError +import Foundation + +/// Configuration for a periodic, container-level healthcheck. +/// +/// The shape mirrors the Docker / compose-spec healthcheck schema so that +/// downstream tools (the canonical use case is a compose-spec orchestrator +/// implementing `depends_on.condition: service_healthy`) can populate this +/// type directly from a `docker-compose.yml` `healthcheck:` block. +/// +/// Semantics applied by the daemon's healthcheck observer: +/// +/// 1. When the observer starts and the healthcheck is enabled, the +/// container's ``ContainerSnapshot/health`` is set to +/// ``HealthStatus/starting``. +/// 2. While the wall-clock age of the container is within ``startPeriod``, +/// failed probes do not advance the consecutive failure counter. 
+/// Successful probes during the grace period transition the container +/// immediately to ``HealthStatus/healthy``. +/// 3. After the grace period elapses, ``retries`` consecutive failed probes +/// transition the container to ``HealthStatus/unhealthy``. A subsequent +/// successful probe resets the counter and transitions back to +/// ``HealthStatus/healthy`` without requiring a restart. +/// 4. A probe that does not return within ``timeout`` counts as a failed +/// probe. +/// 5. ``test`` of `["NONE"]` and ``disable`` set to `true` both skip the +/// probe loop; ``ContainerSnapshot/health`` is set to ``HealthStatus/none``. +public struct Healthcheck: Codable, Sendable, Equatable { + /// The probe specification. + /// + /// Compatible shapes: + /// - `["NONE"]` — disable any healthcheck inherited from the image. + /// - `["CMD", "executable", "arg1", ...]` — run `executable` with the + /// supplied arguments directly inside the container. Exit code `0` + /// means healthy, any other exit code means unhealthy. + /// - `["CMD-SHELL", "shell command string"]` — run the entire command + /// string through the container's default shell (`/bin/sh -c`). + public let test: [String] + + /// Time between consecutive probes, in seconds. Defaults to 30 seconds. + public let interval: TimeInterval + + /// Per-probe deadline, in seconds. A probe that does not return within + /// this window counts as a failed probe. Defaults to 30 seconds. + public let timeout: TimeInterval + + /// Number of consecutive failed probes that transition the container + /// from ``HealthStatus/healthy`` (or ``HealthStatus/starting``) to + /// ``HealthStatus/unhealthy``. Defaults to 3. + public let retries: Int + + /// Optional grace window, in seconds, during which failed probes do not + /// count toward ``retries``. The first successful probe during this + /// window transitions the container immediately to + /// ``HealthStatus/healthy``. When `nil`, no grace is applied. 
+ public let startPeriod: TimeInterval? + + /// Optional probe interval used while the container is still within + /// ``startPeriod``. When `nil`, ``interval`` is used during the grace + /// window as well. + public let startInterval: TimeInterval? + + /// Bypass the observer entirely. Equivalent to ``test`` = `["NONE"]`. + public let disable: Bool? + + /// Default probe interval applied when the configuration omits one. + public static let defaultInterval: TimeInterval = 30 + /// Default per-probe deadline applied when the configuration omits one. + public static let defaultTimeout: TimeInterval = 30 + /// Default consecutive-failure threshold applied when the configuration + /// omits one. + public static let defaultRetries: Int = 3 + + public init( + test: [String], + interval: TimeInterval = Healthcheck.defaultInterval, + timeout: TimeInterval = Healthcheck.defaultTimeout, + retries: Int = Healthcheck.defaultRetries, + startPeriod: TimeInterval? = nil, + startInterval: TimeInterval? = nil, + disable: Bool? = nil + ) throws { + self.test = test + self.interval = interval + self.timeout = timeout + self.retries = retries + self.startPeriod = startPeriod + self.startInterval = startInterval + self.disable = disable + try validate() + } + + enum CodingKeys: String, CodingKey { + case test + case interval + case timeout + case retries + case startPeriod + case startInterval + case disable + } + + public init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + test = try container.decode([String].self, forKey: .test) + interval = try container.decodeIfPresent(TimeInterval.self, forKey: .interval) ?? Healthcheck.defaultInterval + timeout = try container.decodeIfPresent(TimeInterval.self, forKey: .timeout) ?? Healthcheck.defaultTimeout + retries = try container.decodeIfPresent(Int.self, forKey: .retries) ?? 
Healthcheck.defaultRetries + startPeriod = try container.decodeIfPresent(TimeInterval.self, forKey: .startPeriod) + startInterval = try container.decodeIfPresent(TimeInterval.self, forKey: .startInterval) + disable = try container.decodeIfPresent(Bool.self, forKey: .disable) + try validate() + } + + public func encode(to encoder: Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + try container.encode(test, forKey: .test) + try container.encode(interval, forKey: .interval) + try container.encode(timeout, forKey: .timeout) + try container.encode(retries, forKey: .retries) + try container.encodeIfPresent(startPeriod, forKey: .startPeriod) + try container.encodeIfPresent(startInterval, forKey: .startInterval) + try container.encodeIfPresent(disable, forKey: .disable) + } + + /// Whether the healthcheck is effectively disabled (no probe loop is + /// started and ``ContainerSnapshot/health`` reports ``HealthStatus/none``). + public var isEffectivelyDisabled: Bool { + if disable == true { return true } + if test.count == 1 && test[0] == "NONE" { return true } + return false + } + + /// The probe interval that should be used at the supplied wall-clock age + /// of the container. Returns ``startInterval`` (when set) while the container + /// is still within ``startPeriod``, otherwise ``interval``. 
+ public func probeInterval(forContainerAge age: TimeInterval) -> TimeInterval { + if let startPeriod, age < startPeriod, let startInterval { + return startInterval + } + return interval + } + + /// Verifies that every field holds a value the healthcheck observer can act on. + /// + /// ``interval``, ``timeout`` and ``startInterval`` are later multiplied by + /// 1_000_000_000 and converted with `UInt64(_:)` to obtain a nanosecond sleep + /// duration. That conversion traps on NaN, infinite, or out-of-range input, + /// so such values are rejected here. + /// + /// - Throws: `ContainerizationError` with code `.invalidArgument` describing + /// the first constraint that is violated. + private func validate() throws { + guard !test.isEmpty else { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck test must not be empty" + ) + } + if !isEffectivelyDisabled { + switch test[0] { + case "CMD", "CMD-SHELL": + guard test.count >= 2 else { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck test '\(test[0])' requires at least one argument" + ) + } + default: + throw ContainerizationError( + .invalidArgument, + message: "healthcheck test must start with 'NONE', 'CMD', or 'CMD-SHELL' (got '\(test[0])')" + ) + } + } + guard interval > 0 else { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck interval must be positive (got \(interval))" + ) + } + try Healthcheck.requireNanosecondRange(interval, field: "interval") + guard timeout > 0 else { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck timeout must be positive (got \(timeout))" + ) + } + try Healthcheck.requireNanosecondRange(timeout, field: "timeout") + guard retries >= 0 else { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck retries must be non-negative (got \(retries))" + ) + } + // Written as `!(x >= 0)` so that NaN, which fails every comparison, is rejected too. + if let startPeriod, !(startPeriod >= 0) { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck start_period must be non-negative (got \(startPeriod))" + ) + } + if let startInterval { + // `guard x > 0` also rejects NaN, which an `x <= 0` test would let through. + guard startInterval > 0 else { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck start_interval must be positive (got \(startInterval))" + ) + } + try Healthcheck.requireNanosecondRange(startInterval, field: "start_interval") + } + } + + /// Rejects a duration in seconds whose nanosecond count cannot be stored in a `UInt64`. + private static func requireNanosecondRange(_ seconds: TimeInterval, field: String) throws { + // `Double(UInt64.max)` rounds up to 2^64, so any strictly smaller product converts without trapping. + guard seconds * 1_000_000_000 < Double(UInt64.max) else { + throw ContainerizationError( + .invalidArgument, + message: "healthcheck \(field) is too large (got \(seconds))" + ) + } + } +} diff --git a/Sources/Services/ContainerAPIService/Client/Flags.swift b/Sources/Services/ContainerAPIService/Client/Flags.swift index f8361ef68..65cc5926f 100644 --- a/Sources/Services/ContainerAPIService/Client/Flags.swift +++ b/Sources/Services/ContainerAPIService/Client/Flags.swift @@ -345,6 +345,48 @@ public struct Flags { @Option(name: [.customLong("volume"), .short], help: "Bind 
mount a volume into the container") public var volumes: [String] = [] + @Option( + name: .customLong("health-cmd"), + help: "Healthcheck command to run inside the container (executed via /bin/sh -c)." + ) + public var healthCmd: String? + + @Option( + name: .customLong("health-interval"), + help: "Time between healthcheck probes, in seconds (default 30)." + ) + public var healthInterval: Double? + + @Option( + name: .customLong("health-timeout"), + help: "Per-probe deadline for the healthcheck, in seconds (default 30)." + ) + public var healthTimeout: Double? + + @Option( + name: .customLong("health-retries"), + help: "Number of consecutive failed probes before the container is reported unhealthy (default 3)." + ) + public var healthRetries: Int? + + @Option( + name: .customLong("health-start-period"), + help: "Grace window after start during which failed probes do not count, in seconds." + ) + public var healthStartPeriod: Double? + + @Option( + name: .customLong("health-start-interval"), + help: "Probe interval used while still within the grace window, in seconds." + ) + public var healthStartInterval: Double? + + @Flag( + name: .customLong("no-healthcheck"), + help: "Disable any image-baked healthcheck for this container." + ) + public var noHealthcheck: Bool = false + public func validate() throws { if dnsDisabled { let hasDNSConfig = diff --git a/Sources/Services/ContainerAPIService/Client/Utility.swift b/Sources/Services/ContainerAPIService/Client/Utility.swift index cf5b8d6df..cf6e95c66 100644 --- a/Sources/Services/ContainerAPIService/Client/Utility.swift +++ b/Sources/Services/ContainerAPIService/Client/Utility.swift @@ -268,9 +268,38 @@ public struct Utility { config.runtimeHandler = runtime } + config.healthcheck = try Self.makeHealthcheck(management: management) + return (config, kernel, management.initImage) } + private static func makeHealthcheck(management: Flags.Management) throws -> Healthcheck? 
{ + if management.noHealthcheck { + return try Healthcheck(test: ["NONE"]) + } + guard let cmd = management.healthCmd else { + // Reject orphan health-* flags without a command — catch typos early. + if management.healthInterval != nil || management.healthTimeout != nil + || management.healthRetries != nil || management.healthStartPeriod != nil + || management.healthStartInterval != nil + { + throw ContainerizationError( + .invalidArgument, + message: "--health-* flags require --health-cmd to be specified" + ) + } + return nil + } + return try Healthcheck( + test: ["CMD-SHELL", cmd], + interval: management.healthInterval ?? Healthcheck.defaultInterval, + timeout: management.healthTimeout ?? Healthcheck.defaultTimeout, + retries: management.healthRetries ?? Healthcheck.defaultRetries, + startPeriod: management.healthStartPeriod, + startInterval: management.healthStartInterval + ) + } + static func getAttachmentConfigurations( containerId: String, builtinNetworkId: String?, diff --git a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift index b18bf55d5..0b82dc170 100644 --- a/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift +++ b/Sources/Services/ContainerAPIService/Server/Containers/ContainersService.swift @@ -35,6 +35,7 @@ public actor ContainersService { struct ContainerState { var snapshot: ContainerSnapshot var client: RuntimeClient? 
= nil + var healthGeneration: UInt64 = 0 func getClient() throws -> RuntimeClient { guard let client else { @@ -58,6 +59,7 @@ public actor ContainersService { private let runtimePlugins: [Plugin] private let exitMonitor: ExitMonitor private let containerSystemConfig: ContainerSystemConfig + private let healthMonitor: HealthMonitor private let lock: AsyncLock private var containers: [String: ContainerState] @@ -75,6 +77,7 @@ public actor ContainersService { let containerRoot = appRoot.appendingPathComponent("containers") try FileManager.default.createDirectory(at: containerRoot, withIntermediateDirectories: true) self.exitMonitor = ExitMonitor(log: log) + self.healthMonitor = HealthMonitor(log: log) self.lock = AsyncLock(log: log) self.containerRoot = containerRoot self.pluginLoader = pluginLoader @@ -570,7 +573,25 @@ public actor ContainersService { state.snapshot.status = .running state.snapshot.networks = sandboxSnapshot.networks state.snapshot.startedDate = Date() + state.healthGeneration &+= 1 + let healthGeneration = state.healthGeneration + let healthcheck = state.snapshot.configuration.healthcheck + let healthClient = client + let startedAt = state.snapshot.startedDate ?? Date() await self.setContainerState(id, state, context: context) + + if let healthcheck { + let prober = SandboxClientHealthProber(sandboxClient: healthClient, log: self.log) + await self.healthMonitor.register( + id: id, + generation: healthGeneration, + startedAt: startedAt, + healthcheck: healthcheck, + prober: prober + ) { [weak self] containerID, gen, status in + await self?.applyHealthUpdate(id: containerID, generation: gen, status: status) + } + } } catch { await self.exitMonitor.stopTracking(id: id) try? 
await client.stop(options: ContainerStopOptions.default) @@ -945,6 +966,7 @@ public actor ContainersService { } await self.exitMonitor.stopTracking(id: id) + await self.healthMonitor.unregister(id: id) // Shutdown and deregister the runtime service self.log.info("shutting down runtime service", metadata: ["id": "\(id)"]) @@ -1117,6 +1139,18 @@ public actor ContainersService { self.containers[id] = state } + /// Apply a health-status update from the ``HealthMonitor`` observer. + /// Generation-gated: drops updates whose generation does not match the + /// current container instance, the container has been removed, or its + /// status is no longer ``RuntimeStatus/running``. + private func applyHealthUpdate(id: String, generation: UInt64, status: HealthStatus) async { + guard var state = self.containers[id] else { return } + guard state.healthGeneration == generation else { return } + guard state.snapshot.status == .running else { return } + state.snapshot.health = status + self.containers[id] = state + } + private func getContainerState(id: String, context: AsyncLock.Context) throws -> ContainerState { try self._getContainerState(id: id) } diff --git a/Sources/Services/ContainerAPIService/Server/Containers/HealthMonitor.swift b/Sources/Services/ContainerAPIService/Server/Containers/HealthMonitor.swift new file mode 100644 index 000000000..0e3d6ae5c --- /dev/null +++ b/Sources/Services/ContainerAPIService/Server/Containers/HealthMonitor.swift @@ -0,0 +1,127 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the container project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import ContainerResource +import Foundation +import Logging + +/// Per-container healthcheck observer manager. Mirrors the lifecycle pattern +/// of ``ExitMonitor``: callers register a container at the moment it reaches +/// ``RuntimeStatus/running`` and unregister when it transitions away from +/// running. The actor owns the per-container observer ``Task`` and is the +/// single point that may cancel them. +/// +/// Updates flow back to the caller through the supplied ``onUpdate`` callback +/// together with the generation token that was passed to ``register``. The +/// receiver is expected to drop updates whose generation no longer matches +/// the live container instance (see CHAOS-1381 design notes). +public actor HealthMonitor { + /// Callback signature: `(containerID, generation, status)`. + public typealias HealthUpdateCallback = @Sendable (String, UInt64, HealthStatus) async -> Void + + private var tasks: [String: Task] = [:] + private let log: Logger? + + public init(log: Logger? = nil) { + self.log = log + } + + /// Start observing the addressed container. Cancels any prior observer + /// for the same id. When ``Healthcheck/isEffectivelyDisabled`` is `true` + /// the callback is invoked once with ``HealthStatus/none`` and no probe + /// loop is started. 
+ public func register( + id: String, + generation: UInt64, + startedAt: Date, + healthcheck: Healthcheck, + prober: any HealthProber, + onUpdate: @escaping HealthUpdateCallback + ) async { + await cancelExistingTask(id: id) + + if healthcheck.isEffectivelyDisabled { + await onUpdate(id, generation, .none) + return + } + + await onUpdate(id, generation, .starting) + + let log = self.log + let task = Task { [prober] in + var stateMachine = HealthStateMachine(configuration: healthcheck) + var lastReportedStatus = stateMachine.currentStatus + + while !Task.isCancelled { + let now = Date() + let age = now.timeIntervalSince(startedAt) + let interval = healthcheck.probeInterval(forContainerAge: age) + + do { + try await Task.sleep(nanoseconds: UInt64(max(0, interval) * 1_000_000_000)) + } catch { + return + } + + let probeResult = await prober.runProbe( + containerID: id, + test: healthcheck.test, + timeout: healthcheck.timeout + ) + + let probeAge = Date().timeIntervalSince(startedAt) + switch probeResult { + case .success: + stateMachine.recordSuccess() + case .failure, .timedOut: + stateMachine.recordFailure(containerAge: probeAge) + } + + if stateMachine.currentStatus != lastReportedStatus { + lastReportedStatus = stateMachine.currentStatus + log?.info( + "health status transition", + metadata: [ + "id": "\(id)", + "status": "\(stateMachine.currentStatus)", + "result": "\(probeResult)", + ]) + await onUpdate(id, generation, stateMachine.currentStatus) + } + } + } + tasks[id] = task + } + + /// Stop observing the addressed container if a task is registered. Idempotent. + public func unregister(id: String) async { + await cancelExistingTask(id: id) + } + + /// Cancel every registered observer. Used during daemon shutdown. 
+ public func unregisterAll() async { + for id in tasks.keys { + tasks[id]?.cancel() + } + tasks.removeAll() + } + + private func cancelExistingTask(id: String) async { + if let existing = tasks.removeValue(forKey: id) { + existing.cancel() + } + } +} diff --git a/Sources/Services/ContainerAPIService/Server/Containers/HealthProber.swift b/Sources/Services/ContainerAPIService/Server/Containers/HealthProber.swift new file mode 100644 index 000000000..ac7666b42 --- /dev/null +++ b/Sources/Services/ContainerAPIService/Server/Containers/HealthProber.swift @@ -0,0 +1,132 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the container project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +//===----------------------------------------------------------------------===// + +import ContainerResource +import ContainerSandboxServiceClient +import Foundation +import Logging + +/// The outcome of a single healthcheck probe attempt. +public enum HealthProbeResult: Sendable, Equatable { + case success + case failure(exitCode: Int32?) + case timedOut +} + +/// Abstracts the execution of a single probe so that the observer logic can +/// be unit-tested without a running sandbox. +public protocol HealthProber: Sendable { + /// Run a single probe inside the addressed container and return the + /// outcome. 
The implementation is responsible for enforcing the supplied + /// `timeout`; callers expect this method to return promptly. + func runProbe( + containerID: String, + test: [String], + timeout: TimeInterval + ) async -> HealthProbeResult +} + +/// Production ``HealthProber`` that drives an existing ``SandboxClient`` to +/// spawn a fresh process per probe. Stdio is intentionally not forwarded so +/// the probe leaves no log output behind; the exit code (or absence thereof +/// on timeout) is the only signal consumed. +public struct SandboxClientHealthProber: HealthProber { + private let sandboxClient: SandboxClient + private let log: Logger? + private static let probeIDPrefix = "__container_healthcheck_" + + public init(sandboxClient: SandboxClient, log: Logger? = nil) { + self.sandboxClient = sandboxClient + self.log = log + } + + public func runProbe( + containerID: String, + test: [String], + timeout: TimeInterval + ) async -> HealthProbeResult { + guard let processConfig = Self.makeProcessConfiguration(test: test) else { + return .failure(exitCode: nil) + } + let probeID = Self.probeIDPrefix + UUID().uuidString + + do { + try await sandboxClient.createProcess(probeID, config: processConfig, stdio: [nil, nil, nil]) + try await sandboxClient.startProcess(probeID) + } catch { + log?.warning( + "healthcheck probe failed to start", + metadata: [ + "id": "\(containerID)", + "probe": "\(probeID)", + "error": "\(error)", + ]) + return .failure(exitCode: nil) + } + + let outcome = await withTaskGroup(of: HealthProbeResult.self) { group in + group.addTask { [sandboxClient] in + do { + let status = try await sandboxClient.wait(probeID) + return status.exitCode == 0 + ? .success + : .failure(exitCode: status.exitCode) + } catch is CancellationError { + return .timedOut + } catch { + return .failure(exitCode: nil) + } + } + group.addTask { + try? await Task.sleep(nanoseconds: UInt64(timeout * 1_000_000_000)) + return .timedOut + } + + let first = await group.next() ?? 
.failure(exitCode: nil) + // Unblock any still-running wait() by killing the synthetic probe. + // Done before draining the group so the wait task can return. + if first == .timedOut { + try? await sandboxClient.kill(probeID, signal: 9) + } + group.cancelAll() + for await _ in group {} + return first + } + return outcome + } + + private static func makeProcessConfiguration(test: [String]) -> ProcessConfiguration? { + guard let kind = test.first else { return nil } + switch kind { + case "CMD": + guard test.count >= 2 else { return nil } + return ProcessConfiguration( + executable: test[1], + arguments: Array(test.dropFirst(2)), + environment: [] + ) + case "CMD-SHELL": + guard test.count >= 2 else { return nil } + return ProcessConfiguration( + executable: "/bin/sh", + arguments: ["-c", test[1]], + environment: [] + ) + default: + return nil + } + } +} diff --git a/Sources/Services/ContainerAPIService/Server/Containers/HealthStateMachine.swift b/Sources/Services/ContainerAPIService/Server/Containers/HealthStateMachine.swift new file mode 100644 index 000000000..cb4a4e7a7 --- /dev/null +++ b/Sources/Services/ContainerAPIService/Server/Containers/HealthStateMachine.swift @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// Copyright © 2026 Apple Inc. and the container project authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//===----------------------------------------------------------------------===//
+
+import ContainerResource
+import Foundation
+
+/// Pure state machine that maps a sequence of healthcheck probe outcomes to a
+/// ``HealthStatus`` for a single container. This type is intentionally
+/// dependency-free so the transition rules (Docker-compatible: grace window
+/// that ends at the first success, retries threshold, recovery without
+/// restart) can be exercised in isolation by the unit-test layer.
+public struct HealthStateMachine: Sendable {
+    public let configuration: Healthcheck
+    public private(set) var currentStatus: HealthStatus
+    public private(set) var consecutiveFailures: Int
+
+    public init(configuration: Healthcheck) {
+        self.configuration = configuration
+        self.consecutiveFailures = 0
+        self.currentStatus = configuration.isEffectivelyDisabled ? .none : .starting
+    }
+
+    /// Record a probe that completed successfully (exit code zero). Resets the
+    /// consecutive failure counter and transitions the status to ``.healthy``.
+    /// No-op when the healthcheck is disabled.
+    public mutating func recordSuccess() {
+        guard !configuration.isEffectivelyDisabled else { return }
+        consecutiveFailures = 0
+        currentStatus = .healthy
+    }
+
+    /// Record a probe that did not complete successfully. While the status is
+    /// still ``.starting``, failures inside ``Healthcheck/startPeriod`` are
+    /// ignored (grace window); after the first success the window is over, as
+    /// in Docker, and every failure counts. The status becomes ``.unhealthy``
+    /// once the consecutive failures reach ``Healthcheck/retries``.
+    public mutating func recordFailure(containerAge: TimeInterval) {
+        guard !configuration.isEffectivelyDisabled else { return }
+        if currentStatus == .starting, let startPeriod = configuration.startPeriod, containerAge < startPeriod {
+            return
+        }
+        consecutiveFailures += 1
+        if consecutiveFailures >= configuration.retries {
+            currentStatus = .unhealthy
+        }
+    }
+}
diff --git a/Tests/ContainerAPIServiceTests/HealthMonitorTest.swift b/Tests/ContainerAPIServiceTests/HealthMonitorTest.swift
new file mode 100644
index 000000000..1865f2d36
--- /dev/null
+++ b/Tests/ContainerAPIServiceTests/HealthMonitorTest.swift
@@ -0,0 +1,194 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2026 Apple Inc. and the container project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===//
+
+import ContainerResource
+import Foundation
+import Testing
+
+@testable import ContainerAPIService
+
+/// Mock prober that returns scripted results in order. Each call consumes one
+/// entry; once exhausted it sleeps for up to 10 seconds and then reports a
+/// failure, which keeps it parked while the observer is cancelled mid-loop.
+private actor ScriptedProber: HealthProber {
+    private var script: [HealthProbeResult]
+    private var calls: [(containerID: String, test: [String], timeout: TimeInterval)] = []
+
+    init(_ script: [HealthProbeResult]) {
+        self.script = script
+    }
+
+    func runProbe(
+        containerID: String,
+        test: [String],
+        timeout: TimeInterval
+    ) async -> HealthProbeResult {
+        calls.append((containerID, test, timeout))
+        if script.isEmpty {
+            try? await Task.sleep(nanoseconds: 10_000_000_000)  // script exhausted: park up to 10 s (less if cancelled)
+            return .failure(exitCode: nil)
+        }
+        return script.removeFirst()
+    }
+
+    func recordedCalls() -> [(containerID: String, test: [String], timeout: TimeInterval)] {
+        calls
+    }
+}
+
+/// Collects the status updates emitted by the monitor and lets a test suspend until a given number have arrived.
+private actor StatusRecorder {
+    private var updates: [(id: String, generation: UInt64, status: HealthStatus)] = []
+    private var continuations: [(Int, CheckedContinuation<Void, Never>)] = []  // (update count awaited, parked waiter)
+
+    func record(id: String, generation: UInt64, status: HealthStatus) {
+        updates.append((id, generation, status))
+        // Wake any waiters whose threshold has been reached.
+        continuations = continuations.filter { (threshold, cont) in
+            if updates.count >= threshold {
+                cont.resume()
+                return false
+            }
+            return true
+        }
+    }
+
+    func waitForUpdates(count: Int) async {
+        if updates.count >= count { return }
+        await withCheckedContinuation { cont in
+            continuations.append((count, cont))
+        }
+    }
+
+    func snapshot() -> [(id: String, generation: UInt64, status: HealthStatus)] {
+        updates
+    }
+}
+
+struct HealthMonitorTest {
+    private func makeQuickHealthcheck(retries: Int = 1) throws -> Healthcheck {
+        try Healthcheck(
+            test: ["CMD-SHELL", "true"],
+            interval: 0.005,
+            timeout: 1,
+            retries: retries
+        )
+    }
+
+    @Test func disabledHealthcheckEmitsSingleNoneUpdate() async throws {
+        let monitor = HealthMonitor()
+        let prober = ScriptedProber([])
+        let recorder = StatusRecorder()
+
+        let h = try Healthcheck(test: ["NONE"])
+        await monitor.register(
+            id: "c1",
+            generation: 1,
+            startedAt: Date(),
+            healthcheck: h,
+            prober: prober
+        ) { id, gen, status in
+            await recorder.record(id: id, generation: gen, status: status)
+        }
+        await recorder.waitForUpdates(count: 1)
+        let updates = await recorder.snapshot()
+        #expect(updates.count == 1)
+        #expect(updates[0].id == "c1")
+        #expect(updates[0].generation == 1)
+        #expect(updates[0].status == .none)
+        await monitor.unregisterAll()
+    }
+
+    @Test func enabledHealthcheckEmitsStartingThenHealthy() async throws {
+        let monitor = HealthMonitor()
+        let prober = ScriptedProber([.success])
+        let recorder = StatusRecorder()
+
+        let h = try makeQuickHealthcheck()
+        await monitor.register(
+            id: "c1",
+            generation: 7,
+            startedAt: Date(),
+            healthcheck: h,
+            prober: prober
+        ) { id, gen, status in
+            await recorder.record(id: id, generation: gen, status: status)
+        }
+        await recorder.waitForUpdates(count: 2)
+        let updates = await recorder.snapshot()
+        #expect(updates.count >= 2)
+        #expect(updates[0].status == .starting)
+        #expect(updates[1].status == .healthy)
+        #expect(updates.allSatisfy { $0.id == "c1" && $0.generation == 7 })
+        await monitor.unregisterAll()
+    }
+
+    @Test func consecutiveFailuresEventuallyTransitionToUnhealthy() async throws {
+        let monitor = HealthMonitor()
+        let prober = ScriptedProber([
+            .failure(exitCode: 1),
+            .failure(exitCode: 1),
+            .failure(exitCode: 1),
+        ])
+        let recorder = StatusRecorder()
+
+        let h = try makeQuickHealthcheck(retries: 3)
+        await monitor.register(
+            id: "c1",
+            generation: 1,
+            startedAt: Date(),
+            healthcheck: h,
+            prober: prober
+        ) { id, gen, status in
+            await recorder.record(id: id, generation: gen, status: status)
+        }
+        await recorder.waitForUpdates(count: 2)  // NOTE(review): sufficient only if the monitor reports transitions, not every probe — confirm
+        let updates = await recorder.snapshot()
+        let unhealthyUpdates = updates.filter { $0.status == .unhealthy }
+        #expect(!unhealthyUpdates.isEmpty)
+        await monitor.unregisterAll()
+    }
+
+    @Test func unregisterCancelsObserverLoop() async throws {
+        let monitor = HealthMonitor()
+        let prober = ScriptedProber([.success, .success, .success])
+        let recorder = StatusRecorder()
+
+        let h = try makeQuickHealthcheck()
+        await monitor.register(
+            id: "c1",
+            generation: 1,
+            startedAt: Date(),
+            healthcheck: h,
+            prober: prober
+        ) { id, gen, status in
+            await recorder.record(id: id, generation: gen, status: status)
+        }
+        // Allow at least one probe to land before cancelling.
+        await recorder.waitForUpdates(count: 2)
+        await monitor.unregister(id: "c1")
+
+        // Sleep briefly to let any in-flight probes finish, then capture.
+        try await Task.sleep(nanoseconds: 50_000_000)
+        let after = await recorder.snapshot().count
+
+        // Verify that no significant additional updates accrue beyond what
+        // arrived during the brief settle window after cancellation.
+        try await Task.sleep(nanoseconds: 100_000_000)
+        let later = await recorder.snapshot().count
+        #expect(later <= after + 1)
+    }
+}
diff --git a/Tests/ContainerAPIServiceTests/HealthStateMachineTest.swift b/Tests/ContainerAPIServiceTests/HealthStateMachineTest.swift
new file mode 100644
index 000000000..6865b00c3
--- /dev/null
+++ b/Tests/ContainerAPIServiceTests/HealthStateMachineTest.swift
@@ -0,0 +1,113 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2026 Apple Inc. and the container project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===//
+
+import ContainerResource
+import Foundation
+import Testing
+
+@testable import ContainerAPIService
+
+struct HealthStateMachineTest {
+    private func makeHealthcheck(
+        retries: Int = 3,
+        startPeriod: TimeInterval? = nil
+    ) throws -> Healthcheck {
+        try Healthcheck(
+            test: ["CMD-SHELL", "true"],
+            retries: retries,
+            startPeriod: startPeriod
+        )
+    }
+
+    @Test func initialStateIsStartingWhenEnabled() throws {
+        let sm = HealthStateMachine(configuration: try makeHealthcheck())
+        #expect(sm.currentStatus == .starting)
+    }
+
+    @Test func initialStateIsNoneWhenDisabled() throws {
+        let h = try Healthcheck(test: ["NONE"])
+        let sm = HealthStateMachine(configuration: h)
+        #expect(sm.currentStatus == .none)
+    }
+
+    @Test func successDuringGraceTransitionsImmediatelyToHealthy() throws {
+        let h = try makeHealthcheck(startPeriod: 60)
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordSuccess()
+        #expect(sm.currentStatus == .healthy)
+    }
+
+    @Test func failuresDuringGraceDoNotCount() throws {
+        let h = try makeHealthcheck(retries: 2, startPeriod: 60)
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordFailure(containerAge: 5)
+        sm.recordFailure(containerAge: 10)
+        sm.recordFailure(containerAge: 15)
+        #expect(sm.currentStatus == .starting)
+        #expect(sm.consecutiveFailures == 0)
+    }
+
+    @Test func failuresAfterGraceCountTowardRetries() throws {
+        let h = try makeHealthcheck(retries: 3, startPeriod: 30)
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordFailure(containerAge: 60)
+        #expect(sm.currentStatus == .starting)
+        #expect(sm.consecutiveFailures == 1)
+        sm.recordFailure(containerAge: 90)
+        #expect(sm.currentStatus == .starting)
+        #expect(sm.consecutiveFailures == 2)
+        sm.recordFailure(containerAge: 120)
+        #expect(sm.currentStatus == .unhealthy)
+        #expect(sm.consecutiveFailures == 3)
+    }
+
+    @Test func successResetsFailureCounter() throws {
+        let h = try makeHealthcheck(retries: 3)
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordFailure(containerAge: 100)
+        sm.recordFailure(containerAge: 130)
+        #expect(sm.consecutiveFailures == 2)
+        sm.recordSuccess()
+        #expect(sm.currentStatus == .healthy)
+        #expect(sm.consecutiveFailures == 0)
+    }
+
+    @Test func unhealthyRecoversToHealthyOnSuccess() throws {
+        let h = try makeHealthcheck(retries: 1)
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordFailure(containerAge: 100)
+        #expect(sm.currentStatus == .unhealthy)
+        sm.recordSuccess()
+        #expect(sm.currentStatus == .healthy)
+        #expect(sm.consecutiveFailures == 0)
+    }
+
+    @Test func disabledMachineIgnoresAllInputs() throws {
+        let h = try Healthcheck(test: ["NONE"])
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordSuccess()
+        sm.recordFailure(containerAge: 100)
+        #expect(sm.currentStatus == .none)
+        #expect(sm.consecutiveFailures == 0)
+    }
+
+    @Test func retriesEqualsZeroFailsImmediatelyPostGrace() throws {
+        let h = try makeHealthcheck(retries: 0)
+        var sm = HealthStateMachine(configuration: h)
+        sm.recordFailure(containerAge: 100)
+        #expect(sm.currentStatus == .unhealthy)
+    }
+}
diff --git a/Tests/ContainerResourceTests/HealthcheckTest.swift b/Tests/ContainerResourceTests/HealthcheckTest.swift
new file mode 100644
index 000000000..d0e10b285
--- /dev/null
+++ b/Tests/ContainerResourceTests/HealthcheckTest.swift
@@ -0,0 +1,173 @@
+//===----------------------------------------------------------------------===//
+// Copyright © 2026 Apple Inc. and the container project authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//===----------------------------------------------------------------------===//
+
+import ContainerizationError
+import Foundation
+import Testing
+
+@testable import ContainerResource
+
+struct HealthcheckTest {  // validation, probe-interval selection and Codable behaviour of Healthcheck
+    @Test func cmdFormParsesAndValidates() throws {
+        let h = try Healthcheck(test: ["CMD", "curl", "-f", "http://localhost"])
+        #expect(h.test == ["CMD", "curl", "-f", "http://localhost"])
+        #expect(h.interval == Healthcheck.defaultInterval)
+        #expect(h.timeout == Healthcheck.defaultTimeout)
+        #expect(h.retries == Healthcheck.defaultRetries)
+        #expect(!h.isEffectivelyDisabled)
+    }
+
+    @Test func cmdShellFormParsesAndValidates() throws {
+        let h = try Healthcheck(test: ["CMD-SHELL", "test -f /tmp/ready"])
+        #expect(h.test == ["CMD-SHELL", "test -f /tmp/ready"])
+        #expect(!h.isEffectivelyDisabled)
+    }
+
+    @Test func noneFormIsEffectivelyDisabled() throws {
+        let h = try Healthcheck(test: ["NONE"])
+        #expect(h.isEffectivelyDisabled)
+    }
+
+    @Test func disableFlagBypassesObserver() throws {
+        let h = try Healthcheck(test: ["CMD-SHELL", "true"], disable: true)
+        #expect(h.isEffectivelyDisabled)
+    }
+
+    @Test func emptyTestArrayRejected() {
+        #expect {
+            _ = try Healthcheck(test: [])
+        } throws: { error in
+            guard let err = error as? ContainerizationError else { return false }
+            #expect(err.code == .invalidArgument)
+            #expect(err.message.contains("must not be empty"))
+            return true
+        }
+    }
+
+    @Test func unknownTestKindRejected() {
+        #expect {
+            _ = try Healthcheck(test: ["BADKIND", "..."])
+        } throws: { error in
+            guard let err = error as? ContainerizationError else { return false }
+            #expect(err.code == .invalidArgument)
+            #expect(err.message.contains("must start with"))
+            return true
+        }
+    }
+
+    @Test func cmdWithoutArgumentsRejected() {
+        #expect {
+            _ = try Healthcheck(test: ["CMD"])
+        } throws: { error in
+            guard let err = error as? ContainerizationError else { return false }
+            #expect(err.code == .invalidArgument)
+            return true
+        }
+    }
+
+    @Test func nonPositiveIntervalRejected() {
+        #expect {
+            _ = try Healthcheck(test: ["CMD-SHELL", "true"], interval: 0)
+        } throws: { error in
+            guard let err = error as? ContainerizationError else { return false }
+            #expect(err.message.contains("interval"))
+            return true
+        }
+    }
+
+    @Test func negativeRetriesRejected() {
+        #expect {
+            _ = try Healthcheck(test: ["CMD-SHELL", "true"], retries: -1)
+        } throws: { error in
+            guard let err = error as? ContainerizationError else { return false }
+            #expect(err.message.contains("retries"))
+            return true
+        }
+    }
+
+    @Test func probeIntervalUsesStartIntervalDuringGrace() throws {
+        let h = try Healthcheck(
+            test: ["CMD-SHELL", "true"],
+            interval: 30,
+            startPeriod: 60,
+            startInterval: 5
+        )
+        #expect(h.probeInterval(forContainerAge: 0) == 5)
+        #expect(h.probeInterval(forContainerAge: 30) == 5)
+        #expect(h.probeInterval(forContainerAge: 60) == 30)
+        #expect(h.probeInterval(forContainerAge: 600) == 30)
+    }
+
+    @Test func probeIntervalFallsBackToIntervalWithoutStartInterval() throws {
+        let h = try Healthcheck(
+            test: ["CMD-SHELL", "true"],
+            interval: 30,
+            startPeriod: 60
+        )
+        #expect(h.probeInterval(forContainerAge: 0) == 30)
+        #expect(h.probeInterval(forContainerAge: 600) == 30)
+    }
+
+    @Test func roundTripThroughCodable() throws {
+        let original = try Healthcheck(
+            test: ["CMD-SHELL", "test -f /tmp/ready"],
+            interval: 15,
+            timeout: 5,
+            retries: 5,
+            startPeriod: 30,
+            startInterval: 2,
+            disable: false
+        )
+        let data = try JSONEncoder().encode(original)
+        let decoded = try JSONDecoder().decode(Healthcheck.self, from: data)
+        #expect(decoded.test == original.test)
+        #expect(decoded.interval == original.interval)
+        #expect(decoded.timeout == original.timeout)
+        #expect(decoded.retries == original.retries)
+        #expect(decoded.startPeriod == original.startPeriod)
+        #expect(decoded.startInterval == original.startInterval)
+        #expect(decoded.disable == original.disable)
+    }
+
+    @Test func legacyContainerConfigurationDecodesWithoutHealthcheck() throws {
+        let json = """
+            {
+              "id": "legacy",
+              "image": {
+                "reference": "redis:latest",
+                "descriptor": {
+                  "mediaType": "application/vnd.oci.image.manifest.v1+json",
+                  "digest": "sha256:0000000000000000000000000000000000000000000000000000000000000000",
+                  "size": 0
+                }
+              },
+              "initProcess": {
+                "executable": "/usr/local/bin/redis-server",
+                "arguments": [],
+                "environment": [],
+                "workingDirectory": "/",
+                "terminal": false,
+                "user": {"id": {"uid": 0, "gid": 0}},
+                "supplementalGroups": [],
+                "rlimits": []
+              }
+            }
+            """
+        let data = Data(json.utf8)  // UTF-8 view of a String cannot fail, so no force-unwrap is needed
+        let decoded = try JSONDecoder().decode(ContainerConfiguration.self, from: data)
+        #expect(decoded.healthcheck == nil)
+    }
+}