From 55ddc5b5f8f931d124c02c0f591b765ebbad3f56 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Thu, 26 Feb 2026 14:03:50 -0500 Subject: [PATCH 1/4] Unify graceful stop timeout --- lib/instances/delete.go | 21 +++++++--- lib/instances/stop.go | 93 +++++++++++++++++++++++++---------------- lib/instances/types.go | 2 +- lib/system/README.md | 2 +- 4 files changed, 74 insertions(+), 44 deletions(-) diff --git a/lib/instances/delete.go b/lib/instances/delete.go index a3fd4387..5291f748 100644 --- a/lib/instances/delete.go +++ b/lib/instances/delete.go @@ -30,6 +30,7 @@ func (m *manager) deleteInstance( } inst := m.toInstance(ctx, meta) + stored := &meta.StoredMetadata log.DebugContext(ctx, "loaded instance", "instance_id", id, "state", inst.State) // 2. Get network allocation BEFORE killing VMM (while we can still query it) @@ -47,7 +48,15 @@ func (m *manager) deleteInstance( guest.CloseConn(dialer.Key()) } - // 4. If hypervisor might be running, force kill it + // 4. If running, try graceful guest shutdown before force kill. + if inst.State == StateRunning { + stopTimeout := resolveStopTimeout(stored) + if !m.tryGracefulGuestShutdown(ctx, &inst, stopTimeout) { + log.DebugContext(ctx, "graceful shutdown before delete did not complete", "instance_id", id) + } + } + + // 5. If hypervisor might be running, force kill it // Also attempt kill for StateUnknown since we can't be sure if hypervisor is running if inst.State.RequiresVMM() || inst.State == StateUnknown { log.DebugContext(ctx, "stopping hypervisor", "instance_id", id, "state", inst.State) @@ -58,7 +67,7 @@ func (m *manager) deleteInstance( } } - // 5. Release network allocation + // 6. Release network allocation if inst.NetworkEnabled { log.DebugContext(ctx, "releasing network", "instance_id", id, "network", "default") if err := m.networkManager.ReleaseAllocation(ctx, networkAlloc); err != nil { @@ -67,7 +76,7 @@ func (m *manager) deleteInstance( } } - // 6. Detach and auto-unbind devices from VFIO + // 7. Detach and auto-unbind devices from VFIO if len(inst.Devices) > 0 && m.deviceManager != nil { for _, deviceID := range inst.Devices { log.DebugContext(ctx, "detaching device", "id", id, "device", deviceID) @@ -84,7 +93,7 @@ func (m *manager) deleteInstance( } } - // 6b. Detach volumes + // 7b. Detach volumes if len(inst.Volumes) > 0 { log.DebugContext(ctx, "detaching volumes", "instance_id", id, "count", len(inst.Volumes)) for _, volAttach := range inst.Volumes { @@ -95,7 +104,7 @@ func (m *manager) deleteInstance( } } - // 6c. Destroy vGPU mdev device if present + // 7c. Destroy vGPU mdev device if present if inst.GPUMdevUUID != "" { log.InfoContext(ctx, "destroying vGPU mdev", "instance_id", id, "uuid", inst.GPUMdevUUID) if err := devices.DestroyMdev(ctx, inst.GPUMdevUUID); err != nil { @@ -104,7 +113,7 @@ func (m *manager) deleteInstance( } } - // 7. Delete all instance data + // 8. Delete all instance data log.DebugContext(ctx, "deleting instance data", "instance_id", id) if err := m.deleteInstanceData(id); err != nil { log.ErrorContext(ctx, "failed to delete instance data", "instance_id", id, "error", err) diff --git a/lib/instances/stop.go b/lib/instances/stop.go index fafc65c0..f604cd57 100644 --- a/lib/instances/stop.go +++ b/lib/instances/stop.go @@ -14,11 +14,60 @@ import ( ) // DefaultStopTimeout is the default grace period for graceful shutdown (seconds). -// Similar to Docker's default of 10s. -const DefaultStopTimeout = 10 +const DefaultStopTimeout = 5 + +// resolveStopTimeout returns the configured stop timeout in seconds, +// falling back to the package default when unset/invalid. +func resolveStopTimeout(stored *StoredMetadata) int { + stopTimeout := stored.StopTimeout + if stopTimeout <= 0 { + stopTimeout = DefaultStopTimeout + } + return stopTimeout +} + +// tryGracefulGuestShutdown asks guest init to shut down and waits for the +// hypervisor process to exit. Returns true if the process exited in time. +func (m *manager) tryGracefulGuestShutdown(ctx context.Context, inst *Instance, stopTimeout int) bool { + log := logger.FromContext(ctx) + + if inst.SkipGuestAgent { + log.DebugContext(ctx, "guest-agent disabled, skipping graceful guest shutdown", "instance_id", inst.Id) + return false + } + + log.DebugContext(ctx, "sending graceful shutdown signal to guest", "instance_id", inst.Id, "timeout_seconds", stopTimeout) + dialer, dialerErr := hypervisor.NewVsockDialer(inst.HypervisorType, inst.VsockSocket, inst.VsockCID) + if dialerErr != nil { + log.WarnContext(ctx, "could not create vsock dialer for graceful shutdown", "instance_id", inst.Id, "error", dialerErr) + return false + } + + // Send shutdown signal (best-effort, fire and forget) + shutdownCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + if err := guest.ShutdownInstance(shutdownCtx, dialer, 0); err != nil { + log.WarnContext(ctx, "shutdown RPC failed (will fall back to hypervisor shutdown)", "instance_id", inst.Id, "error", err) + cancel() + return false + } + cancel() + + // Wait for the hypervisor process to exit (init calls reboot(POWER_OFF)) + if inst.HypervisorPID != nil { + if WaitForProcessExit(*inst.HypervisorPID, time.Duration(stopTimeout)*time.Second) { + log.DebugContext(ctx, "VM shut down gracefully", "instance_id", inst.Id) + return true + } + + log.WarnContext(ctx, "graceful shutdown timed out, falling back to hypervisor shutdown", "instance_id", inst.Id) + return false + } + + return false +} // stopInstance gracefully stops a running instance. -// Flow: send Shutdown RPC -> wait for VM to power off -> fall back to hard kill. +// Flow: send Shutdown RPC -> wait for VM to power off -> fall back to hypervisor shutdown. // Multi-hop orchestration: Running → Shutdown → Stopped func (m *manager) stopInstance( ctx context.Context, @@ -63,41 +112,13 @@ func (m *manager) stopInstance( } // 4. Graceful shutdown: send signal to guest init via Shutdown RPC, - // then wait for VM to power off cleanly. Fall back to hard kill on timeout. - stopTimeout := stored.StopTimeout - if stopTimeout <= 0 { - stopTimeout = DefaultStopTimeout - } - - gracefulShutdown := false - if !stored.SkipGuestAgent { - log.DebugContext(ctx, "sending graceful shutdown signal to guest", "instance_id", id, "timeout_seconds", stopTimeout) - dialer, dialerErr := hypervisor.NewVsockDialer(stored.HypervisorType, stored.VsockSocket, stored.VsockCID) - if dialerErr != nil { - log.WarnContext(ctx, "could not create vsock dialer for graceful shutdown", "instance_id", id, "error", dialerErr) - } else { - // Send shutdown signal (best-effort, fire and forget) - shutdownCtx, cancel := context.WithTimeout(ctx, 5*time.Second) - if err := guest.ShutdownInstance(shutdownCtx, dialer, 0); err != nil { - log.WarnContext(ctx, "shutdown RPC failed (will hard kill)", "instance_id", id, "error", err) - } - cancel() - - // Wait for the hypervisor process to exit (init calls reboot(POWER_OFF)) - if inst.HypervisorPID != nil { - if WaitForProcessExit(*inst.HypervisorPID, time.Duration(stopTimeout)*time.Second) { - log.DebugContext(ctx, "VM shut down gracefully", "instance_id", id) - gracefulShutdown = true - } else { - log.WarnContext(ctx, "graceful shutdown timed out, falling back to hard kill", "instance_id", id) - } - } - } - } + // then wait for VM to power off cleanly. Fall back to hypervisor shutdown on timeout. + stopTimeout := resolveStopTimeout(stored) + gracefulShutdown := m.tryGracefulGuestShutdown(ctx, &inst, stopTimeout) - // 5. Hard kill if graceful shutdown didn't work + // 5. Fallback hypervisor shutdown if guest graceful shutdown didn't work if !gracefulShutdown { - log.DebugContext(ctx, "shutting down hypervisor (hard kill)", "instance_id", id) + log.DebugContext(ctx, "shutting down hypervisor (fallback)", "instance_id", id) if err := m.shutdownHypervisor(ctx, &inst); err != nil { // Log but continue - try to clean up anyway log.WarnContext(ctx, "failed to shutdown hypervisor", "instance_id", id, "error", err) diff --git a/lib/instances/types.go b/lib/instances/types.go index 9afbccc2..ce688cf5 100644 --- a/lib/instances/types.go +++ b/lib/instances/types.go @@ -91,7 +91,7 @@ type StoredMetadata struct { SkipGuestAgent bool // Skip guest-agent installation (disables exec/stat API) // Shutdown configuration - StopTimeout int // Grace period in seconds for graceful stop (0 = use default 10s) + StopTimeout int // Grace period in seconds for graceful stop (0 = use default 5s) // Exit information (populated from serial console sentinel when VM stops) ExitCode *int // App exit code, nil if VM hasn't exited diff --git a/lib/system/README.md b/lib/system/README.md index 25a5e19f..6c646139 100644 --- a/lib/system/README.md +++ b/lib/system/README.md @@ -73,7 +73,7 @@ It replaces the previous shell-based init script with cleaner logic and structur - **Exec mode** (default): Init chroots to container rootfs, runs entrypoint as child process. When the app exits, init logs exit info and cleanly shuts down the VM via `reboot(POWER_OFF)`. - **Systemd mode** (auto-detected on host): Init chroots to container rootfs, then execs /sbin/init so systemd becomes PID 1 -**Graceful shutdown:** The host sends a `Shutdown` gRPC RPC to the guest-agent, which signals PID 1 (init). Init forwards the signal to the entrypoint child process. If the app doesn't exit within the stop timeout, the host falls back to a hard hypervisor kill. +**Graceful shutdown:** The host sends a `Shutdown` gRPC RPC to the guest-agent, which signals PID 1 (init). Init forwards the signal to the entrypoint child process. If the app doesn't exit within the stop timeout, the host falls back to hypervisor shutdown. **Exit info propagation:** When the entrypoint exits, init writes a machine-parseable `HYPEMAN-EXIT` sentinel to the serial console with the exit code and a human-readable description (signal names, OOM detection via `/dev/kmsg`, shell conventions for 126/127). The host lazily parses this from the serial log when it discovers the VM has stopped, and persists `exit_code`/`exit_message` to instance metadata and the API. From a96badac6793d63ed2fa49b4e0b69630bf864447 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Thu, 26 Feb 2026 14:39:53 -0500 Subject: [PATCH 2/4] Fallback to sigkill if stop doesn't happen --- cmd/api/api/devices.go | 2 - cmd/api/api/health.go | 1 - cmd/api/api/swagger.go | 1 - cmd/api/config/config.go | 47 +++---- cmd/api/main.go | 1 - lib/builds/builder_agent/main.go | 13 +- lib/builds/file_secret_provider.go | 2 - lib/builds/file_secret_provider_test.go | 2 - lib/builds/metrics.go | 1 - lib/builds/queue.go | 1 - lib/builds/queue_test.go | 1 - lib/builds/registry_token_test.go | 4 +- lib/devices/errors.go | 2 - lib/devices/manager.go | 2 +- lib/devices/manager_test.go | 2 - lib/devices/reconcile_test.go | 179 ++++++++++++------------ lib/devices/vfio_linux.go | 25 ++-- lib/guest/metrics.go | 1 - lib/images/disk.go | 1 - lib/images/oci_test.go | 13 +- lib/images/storage.go | 10 +- lib/images/systemd.go | 1 - lib/images/systemd_test.go | 1 - lib/images/types.go | 5 +- lib/ingress/manager.go | 2 +- lib/instances/cpu.go | 11 +- lib/instances/cpu_test.go | 17 ++- lib/instances/delete.go | 6 +- lib/instances/liveness.go | 1 - lib/instances/metrics.go | 4 +- lib/instances/query_test.go | 6 +- lib/instances/stop.go | 44 +++++- lib/middleware/oapi_auth.go | 23 ++- lib/network/errors.go | 1 - lib/network/manager_test.go | 13 +- lib/resources/resource_test.go | 4 +- lib/system/README.md | 2 +- lib/system/errors.go | 1 - lib/system/guest_agent/cp.go | 1 - lib/system/guest_agent/main.go | 2 +- lib/system/guest_agent/stat.go | 1 - lib/system/guest_agent_binary.go | 1 + lib/system/init/mount.go | 1 - lib/system/init/volumes.go | 1 - lib/system/kernel.go | 3 +- lib/system/manager.go | 5 +- lib/vm_metrics/manager_test.go | 14 +- lib/volumes/archive.go | 1 - lib/volumes/archive_test.go | 9 +- lib/volumes/errors.go | 1 - lib/volumes/manager.go | 2 +- lib/volumes/storage.go | 1 - lib/volumes/types.go | 1 - openapi.go | 1 - 54 files changed, 245 insertions(+), 253 deletions(-) diff --git a/cmd/api/api/devices.go b/cmd/api/api/devices.go index 65fe095c..c40c281f 100644 --- a/cmd/api/api/devices.go +++ b/cmd/api/api/devices.go @@ -163,5 +163,3 @@ func availableDeviceToOAPI(d devices.AvailableDevice) oapi.AvailableDevice { CurrentDriver: d.CurrentDriver, } } - - diff --git a/cmd/api/api/health.go b/cmd/api/api/health.go index 1c847f26..fccc65b0 100644 --- a/cmd/api/api/health.go +++ b/cmd/api/api/health.go @@ -12,4 +12,3 @@ func (s *ApiService) GetHealth(ctx context.Context, request oapi.GetHealthReques Status: oapi.Ok, }, nil } - diff --git a/cmd/api/api/swagger.go b/cmd/api/api/swagger.go index 81c50bdf..57cfa045 100644 --- a/cmd/api/api/swagger.go +++ b/cmd/api/api/swagger.go @@ -39,4 +39,3 @@ func SwaggerUIHandler(w http.ResponseWriter, r *http.Request) { ` w.Write([]byte(html)) } - diff --git a/cmd/api/config/config.go b/cmd/api/config/config.go index 4a53434c..b758dd30 100644 --- a/cmd/api/config/config.go +++ b/cmd/api/config/config.go @@ -68,11 +68,11 @@ type NetworkConfig struct { // CaddyConfig holds Caddy reverse-proxy / ingress settings. type CaddyConfig struct { - ListenAddress string `koanf:"listen_address"` - AdminAddress string `koanf:"admin_address"` - AdminPort int `koanf:"admin_port"` - InternalDNSPort int `koanf:"internal_dns_port"` - StopOnShutdown bool `koanf:"stop_on_shutdown"` + ListenAddress string `koanf:"listen_address"` + AdminAddress string `koanf:"admin_address"` + AdminPort int `koanf:"admin_port"` + InternalDNSPort int `koanf:"internal_dns_port"` + StopOnShutdown bool `koanf:"stop_on_shutdown"` } // ACMEConfig holds ACME / TLS certificate settings. @@ -128,12 +128,12 @@ type RegistryConfig struct { // LimitsConfig holds per-instance and aggregate resource limits. type LimitsConfig struct { - MaxVcpusPerInstance int `koanf:"max_vcpus_per_instance"` - MaxMemoryPerInstance string `koanf:"max_memory_per_instance"` - MaxTotalVolumeStorage string `koanf:"max_total_volume_storage"` - MaxConcurrentBuilds int `koanf:"max_concurrent_builds"` - MaxOverlaySize string `koanf:"max_overlay_size"` - MaxImageStorage float64 `koanf:"max_image_storage"` + MaxVcpusPerInstance int `koanf:"max_vcpus_per_instance"` + MaxMemoryPerInstance string `koanf:"max_memory_per_instance"` + MaxTotalVolumeStorage string `koanf:"max_total_volume_storage"` + MaxConcurrentBuilds int `koanf:"max_concurrent_builds"` + MaxOverlaySize string `koanf:"max_overlay_size"` + MaxImageStorage float64 `koanf:"max_image_storage"` } // OversubscriptionConfig holds oversubscription ratios (1.0 = no oversubscription). @@ -170,19 +170,19 @@ type Config struct { Env string `koanf:"env"` Version string `koanf:"version"` - Network NetworkConfig `koanf:"network"` - Caddy CaddyConfig `koanf:"caddy"` - ACME ACMEConfig `koanf:"acme"` - API APIConfig `koanf:"api"` - Otel OtelConfig `koanf:"otel"` - Logging LoggingConfig `koanf:"logging"` - Build BuildConfig `koanf:"build"` - Registry RegistryConfig `koanf:"registry"` - Limits LimitsConfig `koanf:"limits"` + Network NetworkConfig `koanf:"network"` + Caddy CaddyConfig `koanf:"caddy"` + ACME ACMEConfig `koanf:"acme"` + API APIConfig `koanf:"api"` + Otel OtelConfig `koanf:"otel"` + Logging LoggingConfig `koanf:"logging"` + Build BuildConfig `koanf:"build"` + Registry RegistryConfig `koanf:"registry"` + Limits LimitsConfig `koanf:"limits"` Oversubscription OversubscriptionConfig `koanf:"oversubscription"` - Capacity CapacityConfig `koanf:"capacity"` - Hypervisor HypervisorConfig `koanf:"hypervisor"` - GPU GPUConfig `koanf:"gpu"` + Capacity CapacityConfig `koanf:"capacity"` + Hypervisor HypervisorConfig `koanf:"hypervisor"` + GPU GPUConfig `koanf:"gpu"` } // GetDefaultConfigPaths returns the default config file paths to search. @@ -201,7 +201,6 @@ func GetDefaultConfigPaths() []string { } } - // defaultConfig returns a Config struct with all default values set. func defaultConfig() *Config { return &Config{ diff --git a/cmd/api/main.go b/cmd/api/main.go index f7651475..f91e211d 100644 --- a/cmd/api/main.go +++ b/cmd/api/main.go @@ -469,4 +469,3 @@ func run() error { slog.Info("all goroutines finished") return err } - diff --git a/lib/builds/builder_agent/main.go b/lib/builds/builder_agent/main.go index 55bf202f..2540d7e0 100644 --- a/lib/builds/builder_agent/main.go +++ b/lib/builds/builder_agent/main.go @@ -50,8 +50,8 @@ type BuildConfig struct { Secrets []SecretRef `json:"secrets,omitempty"` TimeoutSeconds int `json:"timeout_seconds"` NetworkMode string `json:"network_mode"` - IsAdminBuild bool `json:"is_admin_build,omitempty"` - GlobalCacheKey string `json:"global_cache_key,omitempty"` + IsAdminBuild bool `json:"is_admin_build,omitempty"` + GlobalCacheKey string `json:"global_cache_key,omitempty"` } // SecretRef references a secret to inject during build @@ -599,8 +599,8 @@ func setupRegistryAuth(config *BuildConfig) error { dockerConfig := map[string]interface{}{ "auths": map[string]interface{}{ registryHost: map[string]string{ - "auth": authValue, // Basic auth: base64(jwt:) - "identitytoken": token, // JWT directly for OAuth2-style auth + "auth": authValue, // Basic auth: base64(jwt:) + "identitytoken": token, // JWT directly for OAuth2-style auth }, }, "credsStore": "", @@ -888,8 +888,8 @@ func runBuild(ctx context.Context, config *BuildConfig, logWriter io.Writer) (st env := make([]string, 0, len(os.Environ())+3) for _, e := range os.Environ() { if !strings.HasPrefix(e, "DOCKER_CONFIG=") && - !strings.HasPrefix(e, "BUILDKITD_FLAGS=") && - !strings.HasPrefix(e, "HOME=") { + !strings.HasPrefix(e, "BUILDKITD_FLAGS=") && + !strings.HasPrefix(e, "HOME=") { env = append(env, e) } } @@ -999,4 +999,3 @@ func getBuildkitVersion() string { out, _ := cmd.Output() return strings.TrimSpace(string(out)) } - diff --git a/lib/builds/file_secret_provider.go b/lib/builds/file_secret_provider.go index 382ba7cf..61150347 100644 --- a/lib/builds/file_secret_provider.go +++ b/lib/builds/file_secret_provider.go @@ -62,5 +62,3 @@ func (p *FileSecretProvider) GetSecrets(ctx context.Context, secretIDs []string) // Ensure FileSecretProvider implements SecretProvider var _ SecretProvider = (*FileSecretProvider)(nil) - - diff --git a/lib/builds/file_secret_provider_test.go b/lib/builds/file_secret_provider_test.go index 784ed96d..59e8741f 100644 --- a/lib/builds/file_secret_provider_test.go +++ b/lib/builds/file_secret_provider_test.go @@ -99,5 +99,3 @@ func TestNoOpSecretProvider(t *testing.T) { require.NoError(t, err) assert.Empty(t, secrets) } - - diff --git a/lib/builds/metrics.go b/lib/builds/metrics.go index 92d3c029..0adee036 100644 --- a/lib/builds/metrics.go +++ b/lib/builds/metrics.go @@ -82,4 +82,3 @@ func (m *Metrics) RegisterQueueCallbacks(queue *BuildQueue, meter metric.Meter) ) return err } - diff --git a/lib/builds/queue.go b/lib/builds/queue.go index 2fee288b..62514176 100644 --- a/lib/builds/queue.go +++ b/lib/builds/queue.go @@ -168,4 +168,3 @@ func (q *BuildQueue) QueueLength() int { defer q.mu.Unlock() return len(q.active) + len(q.pending) } - diff --git a/lib/builds/queue_test.go b/lib/builds/queue_test.go index 5f5dd9af..18aa1335 100644 --- a/lib/builds/queue_test.go +++ b/lib/builds/queue_test.go @@ -227,4 +227,3 @@ func TestBuildQueue_Counts(t *testing.T) { close(done) } - diff --git a/lib/builds/registry_token_test.go b/lib/builds/registry_token_test.go index dcb36bb2..e8eaf103 100644 --- a/lib/builds/registry_token_test.go +++ b/lib/builds/registry_token_test.go @@ -178,9 +178,9 @@ func TestRegistryTokenClaims_RepoAccess(t *testing.T) { }) t.Run("IsPullAllowedForRepo", func(t *testing.T) { - assert.True(t, claims.IsPullAllowedForRepo("builds/abc123")) // push implies pull + assert.True(t, claims.IsPullAllowedForRepo("builds/abc123")) // push implies pull assert.True(t, claims.IsPullAllowedForRepo("cache/global/node")) - assert.True(t, claims.IsPullAllowedForRepo("cache/tenant-x")) // push implies pull + assert.True(t, claims.IsPullAllowedForRepo("cache/tenant-x")) // push implies pull assert.False(t, claims.IsPullAllowedForRepo("builds/other")) }) diff --git a/lib/devices/errors.go b/lib/devices/errors.go index afacaf2e..f9f811c6 100644 --- a/lib/devices/errors.go +++ b/lib/devices/errors.go @@ -36,5 +36,3 @@ var ( // ErrIOMMUGroupConflict is returned when not all devices in IOMMU group can be passed through ErrIOMMUGroupConflict = errors.New("IOMMU group contains other devices that must also be passed through") ) - - diff --git a/lib/devices/manager.go b/lib/devices/manager.go index 6c0d84b6..74f34bb4 100644 --- a/lib/devices/manager.go +++ b/lib/devices/manager.go @@ -10,9 +10,9 @@ import ( "sync" "time" - "github.com/nrednav/cuid2" "github.com/kernel/hypeman/lib/logger" "github.com/kernel/hypeman/lib/paths" + "github.com/nrednav/cuid2" ) // InstanceLivenessChecker provides a way to check if an instance is running. diff --git a/lib/devices/manager_test.go b/lib/devices/manager_test.go index bb6a167f..bc512f77 100644 --- a/lib/devices/manager_test.go +++ b/lib/devices/manager_test.go @@ -161,5 +161,3 @@ func TestErrors(t *testing.T) { assert.Contains(t, ErrInvalidName.Error(), "pattern") }) } - - diff --git a/lib/devices/reconcile_test.go b/lib/devices/reconcile_test.go index d28dcf94..98db9470 100644 --- a/lib/devices/reconcile_test.go +++ b/lib/devices/reconcile_test.go @@ -15,8 +15,8 @@ import ( // mockLivenessChecker implements InstanceLivenessChecker for testing type mockLivenessChecker struct { - runningInstances map[string]bool // instanceID -> isRunning - instanceDevices map[string][]string // instanceID -> deviceIDs + runningInstances map[string]bool // instanceID -> isRunning + instanceDevices map[string][]string // instanceID -> deviceIDs } func newMockLivenessChecker() *mockLivenessChecker { @@ -55,15 +55,15 @@ func setupTestManager(t *testing.T) (*manager, *paths.Paths, string) { t.Helper() tmpDir := t.TempDir() p := paths.New(tmpDir) - + // Create devices directory require.NoError(t, os.MkdirAll(p.DevicesDir(), 0755)) - + mgr := &manager{ paths: p, vfioBinder: NewVFIOBinder(), } - + return mgr, p, tmpDir } @@ -72,10 +72,10 @@ func createTestDevice(t *testing.T, p *paths.Paths, device *Device) { t.Helper() deviceDir := p.DeviceDir(device.Id) require.NoError(t, os.MkdirAll(deviceDir, 0755)) - + data, err := json.MarshalIndent(device, "", " ") require.NoError(t, err) - + require.NoError(t, os.WriteFile(p.DeviceMetadata(device.Id), data, 0644)) } @@ -89,7 +89,7 @@ func createTestInstanceDir(t *testing.T, p *paths.Paths, instanceID string) { func TestReconcileDevices_NoDevices(t *testing.T) { mgr, _, _ := setupTestManager(t) ctx := context.Background() - + err := mgr.ReconcileDevices(ctx) require.NoError(t, err) } @@ -97,10 +97,10 @@ func TestReconcileDevices_NoDevices(t *testing.T) { func TestReconcileDevices_OrphanedAttachment_NoLivenessChecker(t *testing.T) { mgr, p, _ := setupTestManager(t) ctx := context.Background() - + instanceID := "orphaned-instance-123" deviceID := "device-abc" - + // Create device with AttachedTo pointing to non-existent instance device := &Device{ Id: deviceID, @@ -113,13 +113,13 @@ func TestReconcileDevices_OrphanedAttachment_NoLivenessChecker(t *testing.T) { CreatedAt: time.Now(), } createTestDevice(t, p, device) - + // Don't create the instance directory - it's orphaned - + // Run reconciliation err := mgr.ReconcileDevices(ctx) require.NoError(t, err) - + // Verify attachment was cleared updatedDevice, err := mgr.loadDevice(deviceID) require.NoError(t, err) @@ -129,10 +129,10 @@ func TestReconcileDevices_OrphanedAttachment_NoLivenessChecker(t *testing.T) { func TestReconcileDevices_ValidAttachment_NoLivenessChecker(t *testing.T) { mgr, p, _ := setupTestManager(t) ctx := context.Background() - + instanceID := "valid-instance-123" deviceID := "device-abc" - + // Create device with AttachedTo pointing to existing instance device := &Device{ Id: deviceID, @@ -145,14 +145,14 @@ func TestReconcileDevices_ValidAttachment_NoLivenessChecker(t *testing.T) { CreatedAt: time.Now(), } createTestDevice(t, p, device) - + // Create the instance directory - it exists createTestInstanceDir(t, p, instanceID) - + // Run reconciliation err := mgr.ReconcileDevices(ctx) require.NoError(t, err) - + // Verify attachment was NOT cleared (instance exists) updatedDevice, err := mgr.loadDevice(deviceID) require.NoError(t, err) @@ -163,14 +163,14 @@ func TestReconcileDevices_ValidAttachment_NoLivenessChecker(t *testing.T) { func TestReconcileDevices_OrphanedAttachment_WithLivenessChecker(t *testing.T) { mgr, p, _ := setupTestManager(t) ctx := context.Background() - + // Set up liveness checker liveness := newMockLivenessChecker() mgr.livenessChecker = liveness - + instanceID := "stopped-instance-123" deviceID := "device-abc" - + // Create device with AttachedTo device := &Device{ Id: deviceID, @@ -183,15 +183,15 @@ func TestReconcileDevices_OrphanedAttachment_WithLivenessChecker(t *testing.T) { CreatedAt: time.Now(), } createTestDevice(t, p, device) - + // Create instance directory but mark as NOT running createTestInstanceDir(t, p, instanceID) liveness.setRunning(instanceID, false) // Stopped/standby - + // Run reconciliation err := mgr.ReconcileDevices(ctx) require.NoError(t, err) - + // Verify attachment was cleared (instance not running) updatedDevice, err := mgr.loadDevice(deviceID) require.NoError(t, err) @@ -201,14 +201,14 @@ func TestReconcileDevices_OrphanedAttachment_WithLivenessChecker(t *testing.T) { func TestReconcileDevices_ValidAttachment_WithLivenessChecker(t *testing.T) { mgr, p, _ := setupTestManager(t) ctx := context.Background() - + // Set up liveness checker liveness := newMockLivenessChecker() mgr.livenessChecker = liveness - + instanceID := "running-instance-123" deviceID := "device-abc" - + // Create device with AttachedTo device := &Device{ Id: deviceID, @@ -221,15 +221,15 @@ func TestReconcileDevices_ValidAttachment_WithLivenessChecker(t *testing.T) { CreatedAt: time.Now(), } createTestDevice(t, p, device) - + // Create instance and mark as running createTestInstanceDir(t, p, instanceID) liveness.setRunning(instanceID, true) // Running - + // Run reconciliation err := mgr.ReconcileDevices(ctx) require.NoError(t, err) - + // Verify attachment was NOT cleared (instance is running) updatedDevice, err := mgr.loadDevice(deviceID) require.NoError(t, err) @@ -240,18 +240,18 @@ func TestReconcileDevices_ValidAttachment_WithLivenessChecker(t *testing.T) { func TestReconcileDevices_TwoWayMismatch_InstanceRefsUnknownDevice(t *testing.T) { mgr, _, _ := setupTestManager(t) ctx := context.Background() - + // Set up liveness checker with instance that references unknown device liveness := newMockLivenessChecker() mgr.livenessChecker = liveness - + instanceID := "instance-with-ghost-device" unknownDeviceID := "device-that-doesnt-exist" - + // Instance references a device that doesn't exist liveness.setInstanceDevices(instanceID, []string{unknownDeviceID}) liveness.setRunning(instanceID, true) - + // Run reconciliation - should not error, just log the mismatch err := mgr.ReconcileDevices(ctx) require.NoError(t, err) @@ -261,14 +261,14 @@ func TestReconcileDevices_TwoWayMismatch_InstanceRefsUnknownDevice(t *testing.T) func TestReconcileDevices_TwoWayMismatch_DeviceAttachedToNil(t *testing.T) { mgr, p, _ := setupTestManager(t) ctx := context.Background() - + // Set up liveness checker liveness := newMockLivenessChecker() mgr.livenessChecker = liveness - + instanceID := "instance-123" deviceID := "device-abc" - + // Create device with NO AttachedTo device := &Device{ Id: deviceID, @@ -281,16 +281,16 @@ func TestReconcileDevices_TwoWayMismatch_DeviceAttachedToNil(t *testing.T) { CreatedAt: time.Now(), } createTestDevice(t, p, device) - + // Instance claims to have this device liveness.setInstanceDevices(instanceID, []string{deviceID}) liveness.setRunning(instanceID, true) - + // Run reconciliation - should log mismatch but not error err := mgr.ReconcileDevices(ctx) require.NoError(t, err) // Note: This is a log-only mismatch, device state should remain unchanged - + updatedDevice, err := mgr.loadDevice(deviceID) require.NoError(t, err) assert.Nil(t, updatedDevice.AttachedTo, "Device should remain unattached (log-only mismatch)") @@ -299,15 +299,15 @@ func TestReconcileDevices_TwoWayMismatch_DeviceAttachedToNil(t *testing.T) { func TestReconcileDevices_TwoWayMismatch_DeviceAttachedToWrongInstance(t *testing.T) { mgr, p, _ := setupTestManager(t) ctx := context.Background() - + // Set up liveness checker liveness := newMockLivenessChecker() mgr.livenessChecker = liveness - + instanceID1 := "instance-1" instanceID2 := "instance-2" deviceID := "device-abc" - + // Create device attached to instance-1 device := &Device{ Id: deviceID, @@ -320,21 +320,21 @@ func TestReconcileDevices_TwoWayMismatch_DeviceAttachedToWrongInstance(t *testin CreatedAt: time.Now(), } createTestDevice(t, p, device) - + // Both instances exist and are running createTestInstanceDir(t, p, instanceID1) createTestInstanceDir(t, p, instanceID2) liveness.setRunning(instanceID1, true) liveness.setRunning(instanceID2, true) - + // instance-2 claims to have this device (mismatch!) liveness.setInstanceDevices(instanceID2, []string{deviceID}) - + // Run reconciliation - should log mismatch but not error err := mgr.ReconcileDevices(ctx) require.NoError(t, err) // Note: This is a log-only mismatch, device state should remain unchanged - + updatedDevice, err := mgr.loadDevice(deviceID) require.NoError(t, err) require.NotNil(t, updatedDevice.AttachedTo) @@ -344,15 +344,15 @@ func TestReconcileDevices_TwoWayMismatch_DeviceAttachedToWrongInstance(t *testin func TestReconcileDevices_MultipleDevices(t *testing.T) { mgr, p, _ := setupTestManager(t) ctx := context.Background() - + // Set up liveness checker liveness := newMockLivenessChecker() mgr.livenessChecker = liveness - + runningInstanceID := "running-instance" stoppedInstanceID := "stopped-instance" orphanedInstanceID := "orphaned-instance" - + // Device 1: Attached to running instance - should stay attached device1 := &Device{ Id: "device-1", @@ -364,7 +364,7 @@ func TestReconcileDevices_MultipleDevices(t *testing.T) { AttachedTo: &runningInstanceID, CreatedAt: time.Now(), } - + // Device 2: Attached to stopped instance - should be cleared device2 := &Device{ Id: "device-2", @@ -376,7 +376,7 @@ func TestReconcileDevices_MultipleDevices(t *testing.T) { AttachedTo: &stoppedInstanceID, CreatedAt: time.Now(), } - + // Device 3: Attached to non-existent instance - should be cleared device3 := &Device{ Id: "device-3", @@ -388,7 +388,7 @@ func TestReconcileDevices_MultipleDevices(t *testing.T) { AttachedTo: &orphanedInstanceID, CreatedAt: time.Now(), } - + // Device 4: Not attached - should stay unattached device4 := &Device{ Id: "device-4", @@ -400,41 +400,41 @@ func TestReconcileDevices_MultipleDevices(t *testing.T) { AttachedTo: nil, CreatedAt: time.Now(), } - + createTestDevice(t, p, device1) createTestDevice(t, p, device2) createTestDevice(t, p, device3) createTestDevice(t, p, device4) - + // Set up instance states createTestInstanceDir(t, p, runningInstanceID) createTestInstanceDir(t, p, stoppedInstanceID) // Don't create orphanedInstanceID directory - + liveness.setRunning(runningInstanceID, true) liveness.setRunning(stoppedInstanceID, false) // orphanedInstanceID doesn't exist in liveness checker - + // Run reconciliation err := mgr.ReconcileDevices(ctx) require.NoError(t, err) - + // Verify device 1 stays attached (running instance) d1, err := mgr.loadDevice("device-1") require.NoError(t, err) require.NotNil(t, d1.AttachedTo) assert.Equal(t, runningInstanceID, *d1.AttachedTo) - + // Verify device 2 is cleared (stopped instance) d2, err := mgr.loadDevice("device-2") require.NoError(t, err) assert.Nil(t, d2.AttachedTo) - + // Verify device 3 is cleared (orphaned instance) d3, err := mgr.loadDevice("device-3") require.NoError(t, err) assert.Nil(t, d3.AttachedTo) - + // Verify device 4 stays unattached d4, err := mgr.loadDevice("device-4") require.NoError(t, err) @@ -443,14 +443,14 @@ func TestReconcileDevices_MultipleDevices(t *testing.T) { func TestSetLivenessChecker(t *testing.T) { mgr, _, _ := setupTestManager(t) - + // Initially nil assert.Nil(t, mgr.livenessChecker) - + // Set liveness checker liveness := newMockLivenessChecker() mgr.SetLivenessChecker(liveness) - + // Verify it was set assert.Equal(t, liveness, mgr.livenessChecker) } @@ -458,16 +458,16 @@ func TestSetLivenessChecker(t *testing.T) { func TestIsInstanceOrphaned_NoLivenessChecker(t *testing.T) { mgr, p, _ := setupTestManager(t) ctx := context.Background() - + existingInstanceID := "existing-instance" missingInstanceID := "missing-instance" - + // Create one instance directory createTestInstanceDir(t, p, existingInstanceID) - + // Existing instance is NOT orphaned assert.False(t, mgr.isInstanceOrphaned(ctx, existingInstanceID)) - + // Missing instance IS orphaned assert.True(t, mgr.isInstanceOrphaned(ctx, missingInstanceID)) } @@ -475,24 +475,24 @@ func TestIsInstanceOrphaned_NoLivenessChecker(t *testing.T) { func TestIsInstanceOrphaned_WithLivenessChecker(t *testing.T) { mgr, p, _ := setupTestManager(t) ctx := context.Background() - + // Set up liveness checker liveness := newMockLivenessChecker() mgr.livenessChecker = liveness - + runningInstanceID := "running-instance" stoppedInstanceID := "stopped-instance" - + // Both instances have directories createTestInstanceDir(t, p, runningInstanceID) createTestInstanceDir(t, p, stoppedInstanceID) - + liveness.setRunning(runningInstanceID, true) liveness.setRunning(stoppedInstanceID, false) - + // Running instance is NOT orphaned assert.False(t, mgr.isInstanceOrphaned(ctx, runningInstanceID)) - + // Stopped instance IS orphaned (even though directory exists) assert.True(t, mgr.isInstanceOrphaned(ctx, stoppedInstanceID)) } @@ -500,16 +500,16 @@ func TestIsInstanceOrphaned_WithLivenessChecker(t *testing.T) { func TestReconcileDevices_NoDevicesDirectory(t *testing.T) { tmpDir := t.TempDir() p := paths.New(tmpDir) - + // Don't create devices directory - + mgr := &manager{ paths: p, vfioBinder: NewVFIOBinder(), } - + ctx := context.Background() - + // Should not error when directory doesn't exist err := mgr.ReconcileDevices(ctx) require.NoError(t, err) @@ -518,7 +518,7 @@ func TestReconcileDevices_NoDevicesDirectory(t *testing.T) { func TestReconcileStats(t *testing.T) { // Verify stats struct has expected fields stats := reconcileStats{} - + stats.orphanedCleared = 1 stats.resetAttempted = 2 stats.resetSucceeded = 3 @@ -526,7 +526,7 @@ func TestReconcileStats(t *testing.T) { stats.mismatches = 5 stats.suspiciousVMM = 6 stats.errors = 7 - + assert.Equal(t, 1, stats.orphanedCleared) assert.Equal(t, 2, stats.resetAttempted) assert.Equal(t, 3, stats.resetSucceeded) @@ -541,7 +541,7 @@ func TestReconcileStats(t *testing.T) { func TestResetOrphanedDevice_NonExistentPCIAddress(t *testing.T) { mgr, p, _ := setupTestManager(t) ctx := context.Background() - + // Create device with fake PCI address that doesn't exist device := &Device{ Id: "test-device", @@ -554,15 +554,15 @@ func TestResetOrphanedDevice_NonExistentPCIAddress(t *testing.T) { CreatedAt: time.Now(), } createTestDevice(t, p, device) - + stats := &reconcileStats{} - + // Should not panic, should handle errors gracefully mgr.resetOrphanedDevice(ctx, device, stats) - + // Reset was attempted assert.Equal(t, 1, stats.resetAttempted) - + // May fail due to non-existent device, that's expected // The key is it doesn't panic } @@ -580,7 +580,7 @@ func verifyDeviceDir(t *testing.T, p *paths.Paths, deviceID string) bool { func TestReconcileDevices_CorruptedDeviceMetadata(t *testing.T) { mgr, p, _ := setupTestManager(t) ctx := context.Background() - + // Create a valid device validDevice := &Device{ Id: "valid-device", @@ -592,21 +592,20 @@ func TestReconcileDevices_CorruptedDeviceMetadata(t *testing.T) { CreatedAt: time.Now(), } createTestDevice(t, p, validDevice) - + // Create a corrupted device directory with invalid JSON corruptedID := "corrupted-device" corruptedDir := p.DeviceDir(corruptedID) require.NoError(t, os.MkdirAll(corruptedDir, 0755)) corruptedPath := filepath.Join(corruptedDir, "metadata.json") require.NoError(t, os.WriteFile(corruptedPath, []byte("not valid json{{{"), 0644)) - + // Should not error - should skip corrupted device and continue err := mgr.ReconcileDevices(ctx) require.NoError(t, err) - + // Valid device should still be loadable d, err := mgr.loadDevice("valid-device") require.NoError(t, err) assert.Equal(t, "valid-gpu", d.Name) } - diff --git a/lib/devices/vfio_linux.go b/lib/devices/vfio_linux.go index 65be8104..fdd0c682 100644 --- a/lib/devices/vfio_linux.go +++ b/lib/devices/vfio_linux.go @@ -13,9 +13,9 @@ import ( ) const ( - vfioDriverPath = "/sys/bus/pci/drivers/vfio-pci" - pciDriversPath = "/sys/bus/pci/drivers" - vfioDevicePath = "/dev/vfio" + vfioDriverPath = "/sys/bus/pci/drivers/vfio-pci" + pciDriversPath = "/sys/bus/pci/drivers" + vfioDevicePath = "/dev/vfio" ) // VFIOBinder handles binding and unbinding devices to/from VFIO @@ -149,17 +149,16 @@ func (v *VFIOBinder) unbindFromDriver(pciAddress, driver string) error { // setDriverOverride sets the driver_override for a device func (v *VFIOBinder) setDriverOverride(pciAddress, driver string) error { overridePath := filepath.Join(sysfsDevicesPath, pciAddress, "driver_override") - + // Empty string clears the override content := driver if driver == "" { content = "\n" // Writing newline clears the override } - + return os.WriteFile(overridePath, []byte(content), 0200) } - // bindDeviceToVFIO binds a specific device to vfio-pci using bind func (v *VFIOBinder) bindDeviceToVFIO(pciAddress string) error { bindPath := filepath.Join(vfioDriverPath, "bind") @@ -176,13 +175,13 @@ func (v *VFIOBinder) triggerDriverProbe(pciAddress string) error { // This service keeps /dev/nvidia* open and blocks driver unbind func (v *VFIOBinder) stopNvidiaPersistenced() error { slog.Debug("stopping nvidia-persistenced service") - + // Try systemctl first (works as root) cmd := exec.Command("systemctl", "stop", "nvidia-persistenced") if err := cmd.Run(); err == nil { return nil } - + // Fall back to killing the process directly (works with CAP_KILL or as root) // This is less clean but allows running with capabilities instead of full root cmd = exec.Command("pkill", "-TERM", "nvidia-persistenced") @@ -195,7 +194,7 @@ func (v *VFIOBinder) stopNvidiaPersistenced() error { } return fmt.Errorf("failed to stop nvidia-persistenced (try: sudo systemctl stop nvidia-persistenced)") } - + // Wait for process to exit with polling instead of arbitrary sleep return v.waitForProcessExit("nvidia-persistenced", 2*time.Second) } @@ -204,7 +203,7 @@ func (v *VFIOBinder) stopNvidiaPersistenced() error { func (v *VFIOBinder) waitForProcessExit(processName string, timeout time.Duration) error { deadline := time.Now().Add(timeout) pollInterval := 100 * time.Millisecond - + for time.Now().Before(deadline) { checkCmd := exec.Command("pgrep", processName) if checkCmd.Run() != nil { @@ -213,7 +212,7 @@ func (v *VFIOBinder) waitForProcessExit(processName string, timeout time.Duratio } time.Sleep(pollInterval) } - + // Timeout - process still running slog.Warn("timeout waiting for process to exit", "process", processName, "timeout", timeout) return nil // Continue anyway, the bind might still work @@ -222,7 +221,7 @@ func (v *VFIOBinder) waitForProcessExit(processName string, timeout time.Duratio // startNvidiaPersistenced starts the nvidia-persistenced service func (v *VFIOBinder) startNvidiaPersistenced() error { slog.Debug("starting nvidia-persistenced service") - + // Try systemctl first (works as root) cmd := exec.Command("systemctl", "start", "nvidia-persistenced") if err := cmd.Run(); err != nil { @@ -308,5 +307,3 @@ func (v *VFIOBinder) isPCIBridge(pciAddress string) bool { func GetDeviceSysfsPath(pciAddress string) string { return filepath.Join(sysfsDevicesPath, pciAddress) + "/" } - - diff --git a/lib/guest/metrics.go b/lib/guest/metrics.go index ce17e56a..f85726b1 100644 --- a/lib/guest/metrics.go +++ b/lib/guest/metrics.go @@ -169,4 +169,3 @@ func (m *Metrics) RecordCpSession(ctx context.Context, start time.Time, directio )) } } - diff --git a/lib/images/disk.go b/lib/images/disk.go index 438497ab..5cd24066 100644 --- a/lib/images/disk.go +++ b/lib/images/disk.go @@ -273,4 +273,3 @@ func CreateEmptyExt4Disk(diskPath string, sizeBytes int64) error { return nil } - diff --git a/lib/images/oci_test.go b/lib/images/oci_test.go index 51005bf6..4887d106 100644 --- a/lib/images/oci_test.go +++ b/lib/images/oci_test.go @@ -86,12 +86,13 @@ func TestExtractMetadataSucceedsOnBuildKitCache(t *testing.T) { // // Layout structure: // cacheDir/ -// ├── oci-layout (OCI layout version marker) -// ├── index.json (points to manifest) -// └── blobs/sha256/ -// ├── (image manifest with buildkit config mediatype) -// ├── (buildkit cache config blob) -// └── (dummy layer) +// +// ├── oci-layout (OCI layout version marker) +// ├── index.json (points to manifest) +// └── blobs/sha256/ +// ├── (image manifest with buildkit config mediatype) +// ├── (buildkit cache config blob) +// └── (dummy layer) func createBuildKitCacheLayout(cacheDir, layoutTag string) error { // Create directory structure blobsDir := filepath.Join(cacheDir, "blobs", "sha256") diff --git a/lib/images/storage.go b/lib/images/storage.go index eaeeb2a6..63d730ae 100644 --- a/lib/images/storage.go +++ b/lib/images/storage.go @@ -12,8 +12,8 @@ import ( ) type imageMetadata struct { - Name string `json:"name"` // Normalized ref (tag or digest) - Digest string `json:"digest"` // Always present: sha256:... + Name string `json:"name"` // Normalized ref (tag or digest) + Digest string `json:"digest"` // Always present: sha256:... Status string `json:"status"` Error *string `json:"error,omitempty"` Request *CreateImageRequest `json:"request,omitempty"` @@ -193,7 +193,7 @@ func resolveTag(p *paths.Paths, repository, tag string) (string, error) { // listTags returns all tags for a repository func listTags(p *paths.Paths, repository string) ([]string, error) { repoDir := p.ImageRepositoryDir(repository) - + entries, err := os.ReadDir(repoDir) if err != nil { if os.IsNotExist(err) { @@ -209,7 +209,7 @@ func listTags(p *paths.Paths, repository string) ([]string, error) { if err != nil { continue } - + if info.Mode()&os.ModeSymlink != 0 { tags = append(tags, entry.Name()) } @@ -272,7 +272,7 @@ func digestExists(p *paths.Paths, repository, digestHex string) bool { // deleteTag removes a tag symlink (does not delete the digest directory) func deleteTag(p *paths.Paths, repository, tag string) error { linkPath := tagSymlinkPath(p, repository, tag) - + // Check if symlink exists if _, err := os.Lstat(linkPath); err != nil { if os.IsNotExist(err) { diff --git a/lib/images/systemd.go b/lib/images/systemd.go index b03c6191..1d68ce26 100644 --- a/lib/images/systemd.go +++ b/lib/images/systemd.go @@ -36,4 +36,3 @@ func IsSystemdImage(entrypoint, cmd []string) bool { return false } - diff --git a/lib/images/systemd_test.go b/lib/images/systemd_test.go index 3428d639..46b9f4a6 100644 --- a/lib/images/systemd_test.go +++ b/lib/images/systemd_test.go @@ -88,4 +88,3 @@ func TestIsSystemdImage(t *testing.T) { }) } } - diff --git a/lib/images/types.go b/lib/images/types.go index 6b8a99a7..7431e90a 100644 --- a/lib/images/types.go +++ b/lib/images/types.go @@ -4,8 +4,8 @@ import "time" // Image represents a container image converted to bootable disk type Image struct { - Name string // Normalized ref (e.g., docker.io/library/alpine:latest) - Digest string // Resolved manifest digest (sha256:...) + Name string // Normalized ref (e.g., docker.io/library/alpine:latest) + Digest string // Resolved manifest digest (sha256:...) Status string QueuePosition *int Error *string @@ -21,4 +21,3 @@ type Image struct { type CreateImageRequest struct { Name string } - diff --git a/lib/ingress/manager.go b/lib/ingress/manager.go index ac05b205..d0e06b16 100644 --- a/lib/ingress/manager.go +++ b/lib/ingress/manager.go @@ -10,10 +10,10 @@ import ( "sync" "time" - "github.com/nrednav/cuid2" "github.com/kernel/hypeman/lib/dns" "github.com/kernel/hypeman/lib/logger" "github.com/kernel/hypeman/lib/paths" + "github.com/nrednav/cuid2" ) // InstanceResolver provides instance resolution capabilities. diff --git a/lib/instances/cpu.go b/lib/instances/cpu.go index a67f41bb..2353b573 100644 --- a/lib/instances/cpu.go +++ b/lib/instances/cpu.go @@ -36,16 +36,16 @@ func detectHostTopology() *HostTopology { scanner := bufio.NewScanner(file) for scanner.Scan() { line := scanner.Text() - + // Parse key: value pairs parts := strings.SplitN(line, ":", 2) if len(parts) != 2 { continue } - + key := strings.TrimSpace(parts[0]) value := strings.TrimSpace(parts[1]) - + switch key { case "siblings": if !hasSiblings { @@ -109,7 +109,7 @@ func calculateGuestTopology(vcpus int, host *HostTopology) *vmm.CpuTopology { if host.ThreadsPerCore > 1 && vcpus%host.ThreadsPerCore == 0 { threadsPerCore = host.ThreadsPerCore remainingCores := vcpus / threadsPerCore - + // Distribute cores across sockets if needed if remainingCores <= host.CoresPerSocket { coresPerDie = remainingCores @@ -128,7 +128,7 @@ func calculateGuestTopology(vcpus int, host *HostTopology) *vmm.CpuTopology { } else { // Use 1 thread per core for simpler layout threadsPerCore = 1 - + if vcpus <= host.CoresPerSocket { coresPerDie = vcpus diesPerPackage = 1 @@ -162,4 +162,3 @@ func calculateGuestTopology(vcpus int, host *HostTopology) *vmm.CpuTopology { Packages: &packages, } } - diff --git a/lib/instances/cpu_test.go b/lib/instances/cpu_test.go index 69992b65..0acc1b25 100644 --- a/lib/instances/cpu_test.go +++ b/lib/instances/cpu_test.go @@ -15,14 +15,14 @@ func TestCalculateGuestTopology(t *testing.T) { } tests := []struct { - name string - vcpus int - host *HostTopology - expectNil bool - expectedThreads *int - expectedCores *int - expectedDies *int - expectedPackages *int + name string + vcpus int + host *HostTopology + expectNil bool + expectedThreads *int + expectedCores *int + expectedDies *int + expectedPackages *int }{ { name: "1 vCPU - use CH defaults", @@ -178,4 +178,3 @@ func TestCalculateGuestTopologyNoSMT(t *testing.T) { func intPtr(i int) *int { return &i } - diff --git a/lib/instances/delete.go b/lib/instances/delete.go index 5291f748..5231973c 100644 --- a/lib/instances/delete.go +++ b/lib/instances/delete.go @@ -49,16 +49,18 @@ func (m *manager) deleteInstance( } // 4. If running, try graceful guest shutdown before force kill. + gracefulShutdown := false if inst.State == StateRunning { stopTimeout := resolveStopTimeout(stored) - if !m.tryGracefulGuestShutdown(ctx, &inst, stopTimeout) { + gracefulShutdown = m.tryGracefulGuestShutdown(ctx, &inst, stopTimeout) + if !gracefulShutdown { log.DebugContext(ctx, "graceful shutdown before delete did not complete", "instance_id", id) } } // 5. If hypervisor might be running, force kill it // Also attempt kill for StateUnknown since we can't be sure if hypervisor is running - if inst.State.RequiresVMM() || inst.State == StateUnknown { + if !gracefulShutdown && (inst.State.RequiresVMM() || inst.State == StateUnknown) { log.DebugContext(ctx, "stopping hypervisor", "instance_id", id, "state", inst.State) if err := m.killHypervisor(ctx, &inst); err != nil { // Log error but continue with cleanup diff --git a/lib/instances/liveness.go b/lib/instances/liveness.go index 73dbb09e..96f13a89 100644 --- a/lib/instances/liveness.go +++ b/lib/instances/liveness.go @@ -152,4 +152,3 @@ func (a *instanceLivenessAdapter) DetectSuspiciousVMMProcesses(ctx context.Conte return suspiciousCount } - diff --git a/lib/instances/metrics.go b/lib/instances/metrics.go index 3c982532..b9252255 100644 --- a/lib/instances/metrics.go +++ b/lib/instances/metrics.go @@ -161,8 +161,8 @@ func (m *manager) recordStateTransition(ctx context.Context, fromState, toState return } attrs := []attribute.KeyValue{ - attribute.String("from", fromState), - attribute.String("to", toState), + attribute.String("from", fromState), + attribute.String("to", toState), } if hvType != "" { attrs = append(attrs, attribute.String("hypervisor", string(hvType))) diff --git a/lib/instances/query_test.go b/lib/instances/query_test.go index b3cd0873..fd965e18 100644 --- a/lib/instances/query_test.go +++ b/lib/instances/query_test.go @@ -9,9 +9,9 @@ import ( func TestParseExitSentinelLine(t *testing.T) { tests := []struct { - name string - line string - wantOK bool + name string + line string + wantOK bool wantCode int wantMsg string }{ diff --git a/lib/instances/stop.go b/lib/instances/stop.go index f604cd57..668d2d06 100644 --- a/lib/instances/stop.go +++ b/lib/instances/stop.go @@ -3,6 +3,7 @@ package instances import ( "context" "fmt" + "syscall" "time" "github.com/kernel/hypeman/lib/devices" @@ -46,9 +47,7 @@ func (m *manager) tryGracefulGuestShutdown(ctx context.Context, inst *Instance, // Send shutdown signal (best-effort, fire and forget) shutdownCtx, cancel := context.WithTimeout(ctx, 5*time.Second) if err := guest.ShutdownInstance(shutdownCtx, dialer, 0); err != nil { - log.WarnContext(ctx, "shutdown RPC failed (will fall back to hypervisor shutdown)", "instance_id", inst.Id, "error", err) - cancel() - return false + log.WarnContext(ctx, "shutdown RPC failed; still waiting for process exit before fallback", "instance_id", inst.Id, "error", err) } cancel() @@ -66,8 +65,37 @@ func (m *manager) tryGracefulGuestShutdown(ctx context.Context, inst *Instance, return false } +// forceKillHypervisorProcess sends SIGKILL to the hypervisor process if it's still running +// and waits briefly for it to exit. +func (m *manager) forceKillHypervisorProcess(ctx context.Context, inst *Instance) error { + log := logger.FromContext(ctx) + + if inst.HypervisorPID == nil { + return nil + } + + pid := *inst.HypervisorPID + if err := syscall.Kill(pid, 0); err != nil { + // Process is already gone (likely ESRCH). + return nil + } + + log.WarnContext(ctx, "hypervisor still running after shutdown fallback, sending SIGKILL", "instance_id", inst.Id, "pid", pid) + if err := syscall.Kill(pid, syscall.SIGKILL); err != nil { + return fmt.Errorf("sigkill hypervisor pid %d: %w", pid, err) + } + + if !WaitForProcessExit(pid, 5*time.Second) { + return fmt.Errorf("hypervisor pid %d still alive after SIGKILL", pid) + } + + log.DebugContext(ctx, "hypervisor process force-killed", "instance_id", inst.Id, "pid", pid) + return nil +} + // stopInstance gracefully stops a running instance. -// Flow: send Shutdown RPC -> wait for VM to power off -> fall back to hypervisor shutdown. +// Flow: send Shutdown RPC -> wait for VM to power off -> +// fall back to hypervisor shutdown -> final SIGKILL if still alive. // Multi-hop orchestration: Running → Shutdown → Stopped func (m *manager) stopInstance( ctx context.Context, @@ -120,9 +148,15 @@ func (m *manager) stopInstance( if !gracefulShutdown { log.DebugContext(ctx, "shutting down hypervisor (fallback)", "instance_id", id) if err := m.shutdownHypervisor(ctx, &inst); err != nil { - // Log but continue - try to clean up anyway + // Continue to final SIGKILL fallback if graceful shutdown API fails. log.WarnContext(ctx, "failed to shutdown hypervisor", "instance_id", id, "error", err) } + + // Final fallback: force-kill the process if it's still alive. + if err := m.forceKillHypervisorProcess(ctx, &inst); err != nil { + log.ErrorContext(ctx, "failed to force-kill hypervisor process", "instance_id", id, "error", err) + return nil, err + } } // 6. Release network allocation (delete TAP device) diff --git a/lib/middleware/oapi_auth.go b/lib/middleware/oapi_auth.go index a2f25dd1..25bcd63a 100644 --- a/lib/middleware/oapi_auth.go +++ b/lib/middleware/oapi_auth.go @@ -212,7 +212,6 @@ func isTokenEndpoint(path string) bool { return path == "/v2/token" || path == "/v2/token/" } - // extractRepoFromPath extracts the repository name from a registry path. // Uses the docker/distribution router which properly handles repository names // that can contain slashes (e.g., "builds/abc123" from "/v2/builds/abc123/manifests/latest"). @@ -252,7 +251,7 @@ func writeRegistryUnauthorized(w http.ResponseWriter, r *http.Request) { } host := r.Host tokenURL := fmt.Sprintf("%s://%s/v2/token", scheme, host) - + // Use Bearer challenge pointing to our token endpoint challenge := fmt.Sprintf(`Bearer realm="%s",service="hypeman"`, tokenURL) w.Header().Set("WWW-Authenticate", challenge) @@ -355,16 +354,16 @@ func JwtAuth(jwtSecret string) func(http.Handler) http.Handler { return } - if authHeader != "" { - // Try to extract token (supports both Bearer and Basic auth) - log.InfoContext(r.Context(), "registry request with auth header", - "path", r.URL.Path, - "method", r.Method, - "auth_type", strings.Split(authHeader, " ")[0], - "remote_addr", r.RemoteAddr) - token, authType, err := extractTokenFromAuth(authHeader) - if err == nil { - log.DebugContext(r.Context(), "extracted token for registry request", "auth_type", authType) + if authHeader != "" { + // Try to extract token (supports both Bearer and Basic auth) + log.InfoContext(r.Context(), "registry request with auth header", + "path", r.URL.Path, + "method", r.Method, + "auth_type", strings.Split(authHeader, " ")[0], + "remote_addr", r.RemoteAddr) + token, authType, err := extractTokenFromAuth(authHeader) + if err == nil { + log.DebugContext(r.Context(), "extracted token for registry request", "auth_type", authType) // Try to validate as a registry-scoped token registryClaims, err := validateRegistryToken(token, jwtSecret, r.URL.Path, r.Method) diff --git a/lib/network/errors.go b/lib/network/errors.go index 94c0fefc..7400e9dc 100644 --- a/lib/network/errors.go +++ b/lib/network/errors.go @@ -9,4 +9,3 @@ var ( // ErrNameExists is returned when an instance name already exists ErrNameExists = errors.New("instance name already exists") ) - diff --git a/lib/network/manager_test.go b/lib/network/manager_test.go index 9081528c..8b4b708a 100644 --- a/lib/network/manager_test.go +++ b/lib/network/manager_test.go @@ -11,17 +11,17 @@ import ( func TestGenerateMAC(t *testing.T) { // Generate 100 MACs to test uniqueness and format seen := make(map[string]bool) - + for i := 0; i < 100; i++ { mac, err := generateMAC() require.NoError(t, err) - + // Check format (XX:XX:XX:XX:XX:XX) require.Len(t, mac, 17, "MAC should be 17 chars") - + // Check starts with 02:00:00 (locally administered) require.True(t, mac[:8] == "02:00:00", "MAC should start with 02:00:00") - + // Check uniqueness require.False(t, seen[mac], "MAC should be unique") seen[mac] = true @@ -50,7 +50,7 @@ func TestGenerateTAPName(t *testing.T) { want: "hype-abcd1234", }, } - + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { got := GenerateTAPName(tt.instanceID) @@ -87,7 +87,7 @@ func TestIncrementIP(t *testing.T) { want: "192.168.2.0", }, } - + for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { ip := parseIP(tt.ip) @@ -232,4 +232,3 @@ func TestFormatTcRate(t *testing.T) { }) } } - diff --git a/lib/resources/resource_test.go b/lib/resources/resource_test.go index 0480561f..c6b355c1 100644 --- a/lib/resources/resource_test.go +++ b/lib/resources/resource_test.go @@ -351,8 +351,8 @@ func TestNetworkResource_Allocated(t *testing.T) { t.Skip("network rate limiting not supported on this platform") } cfg := &config.Config{ - DataDir: t.TempDir(), - Capacity: config.CapacityConfig{Network: "1Gbps"}, // 125MB/s + DataDir: t.TempDir(), + Capacity: config.CapacityConfig{Network: "1Gbps"}, // 125MB/s Oversubscription: config.OversubscriptionConfig{Network: 1.0}, } diff --git a/lib/system/README.md b/lib/system/README.md index 6c646139..6455d0a0 100644 --- a/lib/system/README.md +++ b/lib/system/README.md @@ -73,7 +73,7 @@ It replaces the previous shell-based init script with cleaner logic and structur - **Exec mode** (default): Init chroots to container rootfs, runs entrypoint as child process. When the app exits, init logs exit info and cleanly shuts down the VM via `reboot(POWER_OFF)`. - **Systemd mode** (auto-detected on host): Init chroots to container rootfs, then execs /sbin/init so systemd becomes PID 1 -**Graceful shutdown:** The host sends a `Shutdown` gRPC RPC to the guest-agent, which signals PID 1 (init). Init forwards the signal to the entrypoint child process. If the app doesn't exit within the stop timeout, the host falls back to hypervisor shutdown. +**Graceful shutdown:** The host sends a `Shutdown` gRPC RPC to the guest-agent, which signals PID 1 (init). Init forwards the signal to the entrypoint child process. If the app doesn't exit within the stop timeout, the host falls back to hypervisor shutdown and then force-kills the hypervisor process if still needed. **Exit info propagation:** When the entrypoint exits, init writes a machine-parseable `HYPEMAN-EXIT` sentinel to the serial console with the exit code and a human-readable description (signal names, OOM detection via `/dev/kmsg`, shell conventions for 126/127). The host lazily parses this from the serial log when it discovers the VM has stopped, and persists `exit_code`/`exit_message` to instance metadata and the API. diff --git a/lib/system/errors.go b/lib/system/errors.go index 414a1ccb..811f5819 100644 --- a/lib/system/errors.go +++ b/lib/system/errors.go @@ -12,4 +12,3 @@ var ( // ErrBuildFailed is returned when building initrd fails ErrBuildFailed = errors.New("build failed") ) - diff --git a/lib/system/guest_agent/cp.go b/lib/system/guest_agent/cp.go index 62a7eec2..c2ad5b02 100644 --- a/lib/system/guest_agent/cp.go +++ b/lib/system/guest_agent/cp.go @@ -399,4 +399,3 @@ func (s *guestServer) copyFromGuestDir(rootPath string, followLinks bool, stream log.Printf("[guest-agent] copy-from-guest complete: %d entries from %s", len(entries), rootPath) return nil } - diff --git a/lib/system/guest_agent/main.go b/lib/system/guest_agent/main.go index 95a9d5cc..b0fe125c 100644 --- a/lib/system/guest_agent/main.go +++ b/lib/system/guest_agent/main.go @@ -4,8 +4,8 @@ import ( "log" "time" - "github.com/mdlayher/vsock" pb "github.com/kernel/hypeman/lib/guest" + "github.com/mdlayher/vsock" "google.golang.org/grpc" ) diff --git a/lib/system/guest_agent/stat.go b/lib/system/guest_agent/stat.go index a36ee04e..b49e17d1 100644 --- a/lib/system/guest_agent/stat.go +++ b/lib/system/guest_agent/stat.go @@ -51,4 +51,3 @@ func (s *guestServer) StatPath(ctx context.Context, req *pb.StatPathRequest) (*p return resp, nil } - diff --git a/lib/system/guest_agent_binary.go b/lib/system/guest_agent_binary.go index 57d69722..cc3f7edf 100644 --- a/lib/system/guest_agent_binary.go +++ b/lib/system/guest_agent_binary.go @@ -4,5 +4,6 @@ import _ "embed" // GuestAgentBinary contains the embedded guest-agent binary // This is built by the Makefile before the main binary is compiled +// //go:embed guest_agent/guest-agent var GuestAgentBinary []byte diff --git a/lib/system/init/mount.go b/lib/system/init/mount.go index f39462bc..0336041e 100644 --- a/lib/system/init/mount.go +++ b/lib/system/init/mount.go @@ -263,4 +263,3 @@ func copyGuestAgent(log *Logger, skipGuestAgent bool) error { log.Info("hypeman-init:agent", "copied guest-agent to /opt/hypeman/") return nil } - diff --git a/lib/system/init/volumes.go b/lib/system/init/volumes.go index 5f24572b..d39b9dbe 100644 --- a/lib/system/init/volumes.go +++ b/lib/system/init/volumes.go @@ -110,4 +110,3 @@ func mountVolumeReadWrite(log *Logger, vol vmconfig.VolumeMount, mountPath strin log.Info("hypeman-init:volumes", fmt.Sprintf("mounted %s at %s (rw)", vol.Device, vol.Path)) return nil } - diff --git a/lib/system/kernel.go b/lib/system/kernel.go index 1318eb5d..dd2c2cc1 100644 --- a/lib/system/kernel.go +++ b/lib/system/kernel.go @@ -28,7 +28,7 @@ func (m *manager) downloadKernel(version KernelVersion, arch string) error { return nil // Follow redirects }, } - + resp, err := client.Get(url) if err != nil { return fmt.Errorf("http get: %w", err) @@ -78,4 +78,3 @@ func (m *manager) ensureKernel(version KernelVersion) (string, error) { return kernelPath, nil } - diff --git a/lib/system/manager.go b/lib/system/manager.go index 40c172b3..ac28f540 100644 --- a/lib/system/manager.go +++ b/lib/system/manager.go @@ -62,13 +62,13 @@ func (m *manager) GetKernelPath(version KernelVersion) (string, error) { func (m *manager) GetInitrdPath() (string, error) { arch := GetArch() latestLink := m.paths.SystemInitrdLatest(arch) - + // Read the symlink to get the timestamp target, err := os.Readlink(latestLink) if err != nil { return "", fmt.Errorf("read latest symlink: %w", err) } - + return m.paths.SystemInitrdTimestamp(target, arch), nil } @@ -76,4 +76,3 @@ func (m *manager) GetInitrdPath() (string, error) { func (m *manager) GetDefaultKernelVersion() KernelVersion { return DefaultKernelVersion } - diff --git a/lib/vm_metrics/manager_test.go b/lib/vm_metrics/manager_test.go index 67a94c45..d3e009f3 100644 --- a/lib/vm_metrics/manager_test.go +++ b/lib/vm_metrics/manager_test.go @@ -127,11 +127,11 @@ func TestVMStats_CPUSeconds(t *testing.T) { func TestVMStats_MemoryUtilizationRatio(t *testing.T) { tests := []struct { - name string - rss uint64 - allocated int64 - expectRatio *float64 - expectNil bool + name string + rss uint64 + allocated int64 + expectRatio *float64 + expectNil bool }{ { name: "normal ratio", @@ -184,7 +184,7 @@ func TestVMStats_MemoryUtilizationRatio(t *testing.T) { func TestBuildInstanceInfo(t *testing.T) { pid := 1234 - + // With network enabled info := BuildInstanceInfo("abc123", "my-vm", &pid, true, 4, 4*1024*1024*1024) assert.Equal(t, "abc123", info.ID) @@ -193,7 +193,7 @@ func TestBuildInstanceInfo(t *testing.T) { assert.Equal(t, 4, info.AllocatedVcpus) assert.Equal(t, int64(4*1024*1024*1024), info.AllocatedMemoryBytes) assert.NotEmpty(t, info.TAPDevice, "should have TAP device when network enabled") - + // Without network enabled info = BuildInstanceInfo("abc123", "my-vm", &pid, false, 4, 4*1024*1024*1024) assert.Empty(t, info.TAPDevice, "should not have TAP device when network disabled") diff --git a/lib/volumes/archive.go b/lib/volumes/archive.go index c51a4201..6e2dd5da 100644 --- a/lib/volumes/archive.go +++ b/lib/volumes/archive.go @@ -194,4 +194,3 @@ func ExtractTarGz(r io.Reader, destDir string, maxBytes int64) (int64, error) { return extractedBytes, nil } - diff --git a/lib/volumes/archive_test.go b/lib/volumes/archive_test.go index 810b8a85..90dea144 100644 --- a/lib/volumes/archive_test.go +++ b/lib/volumes/archive_test.go @@ -41,7 +41,7 @@ func createTestTarGz(t *testing.T, files map[string][]byte) *bytes.Buffer { func TestExtractTarGz_Basic(t *testing.T) { // Create a simple archive files := map[string][]byte{ - "hello.txt": []byte("Hello, World!"), + "hello.txt": []byte("Hello, World!"), "dir/nested.txt": []byte("Nested content"), } archive := createTestTarGz(t, files) @@ -237,9 +237,9 @@ func TestExtractTarGz_PreventsTarBomb(t *testing.T) { func TestExtractTarGz_Attack_DotDotSlashVariants(t *testing.T) { // Test various path traversal patterns that attackers commonly try testCases := []struct { - name string - path string - wantErr bool + name string + path string + wantErr bool }{ {"double dot basic", "../etc/passwd", true}, {"double dot nested", "foo/../../etc/passwd", true}, @@ -411,4 +411,3 @@ func TestExtractTarGz_Attack_ZeroSizeClaimLargeContent(t *testing.T) { require.Error(t, err) assert.ErrorIs(t, err, ErrArchiveTooLarge) } - diff --git a/lib/volumes/errors.go b/lib/volumes/errors.go index bab7c431..522ceda9 100644 --- a/lib/volumes/errors.go +++ b/lib/volumes/errors.go @@ -8,4 +8,3 @@ var ( ErrAlreadyExists = errors.New("volume already exists") ErrAmbiguousName = errors.New("multiple volumes with the same name") ) - diff --git a/lib/volumes/manager.go b/lib/volumes/manager.go index 57fca9c2..6f3f16d8 100644 --- a/lib/volumes/manager.go +++ b/lib/volumes/manager.go @@ -8,9 +8,9 @@ import ( "sync" "time" - "github.com/nrednav/cuid2" "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/paths" + "github.com/nrednav/cuid2" "go.opentelemetry.io/otel/metric" ) diff --git a/lib/volumes/storage.go b/lib/volumes/storage.go index 597b0dd3..ccca55ed 100644 --- a/lib/volumes/storage.go +++ b/lib/volumes/storage.go @@ -123,4 +123,3 @@ func listVolumeIDs(p *paths.Paths) ([]string, error) { return ids, nil } - diff --git a/lib/volumes/types.go b/lib/volumes/types.go index 55a035bc..4235b169 100644 --- a/lib/volumes/types.go +++ b/lib/volumes/types.go @@ -39,4 +39,3 @@ type CreateVolumeFromArchiveRequest struct { SizeGb int // Maximum size in GB (extraction fails if content exceeds this) Id *string // Optional custom ID } - diff --git a/openapi.go b/openapi.go index 2c9e0674..09dadb5f 100644 --- a/openapi.go +++ b/openapi.go @@ -4,4 +4,3 @@ import _ "embed" //go:embed openapi.yaml var OpenAPIYAML []byte - From ecfee09c90f1101897f22fe11b82c5781d08abdc Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Thu, 26 Feb 2026 14:42:46 -0500 Subject: [PATCH 3/4] check gofmt in CI --- .github/workflows/test.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2c04026b..1b418ad1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -44,6 +44,20 @@ jobs: - name: Build run: make build + + - name: Check gofmt + run: | + set -euo pipefail + go_files="$(git ls-files '*.go')" + if [ -z "$go_files" ]; then + exit 0 + fi + unformatted="$(echo "$go_files" | xargs gofmt -l)" + if [ -n "$unformatted" ]; then + echo "The following files are not gofmt-formatted:" + echo "$unformatted" + exit 1 + fi - name: Run tests env: @@ -87,6 +101,21 @@ jobs: run: make oapi-generate - name: Build run: make build + + - name: Check gofmt + run: | + set -euo pipefail + go_files="$(git ls-files '*.go')" + if [ -z "$go_files" ]; then + exit 0 + fi + unformatted="$(echo "$go_files" | xargs gofmt -l)" + if [ -n "$unformatted" ]; then + echo "The following files are not gofmt-formatted:" + echo "$unformatted" + exit 1 + fi + - name: Run tests env: DEFAULT_HYPERVISOR: vz From edf016111277f4ca0a477c341adda10ff49a9973 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Thu, 26 Feb 2026 15:06:06 -0500 Subject: [PATCH 4/4] Fix SIGKILL fallback to reap process in stop path --- lib/instances/stop.go | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/lib/instances/stop.go b/lib/instances/stop.go index 668d2d06..82fb7b9b 100644 --- a/lib/instances/stop.go +++ b/lib/instances/stop.go @@ -85,8 +85,25 @@ func (m *manager) forceKillHypervisorProcess(ctx context.Context, inst *Instance return fmt.Errorf("sigkill hypervisor pid %d: %w", pid, err) } - if !WaitForProcessExit(pid, 5*time.Second) { - return fmt.Errorf("hypervisor pid %d still alive after SIGKILL", pid) + // Wait for process to die and reap it to avoid zombie false positives. + reaped := false + for i := 0; i < 50; i++ { // 50 * 100ms = 5s + var wstatus syscall.WaitStatus + wpid, err := syscall.Wait4(pid, &wstatus, syscall.WNOHANG, nil) + if err != nil || wpid == pid { + // Process reaped, or not our child (ECHILD) and no longer trackable here. + reaped = true + break + } + time.Sleep(100 * time.Millisecond) + } + + if !reaped { + // Timed out waiting for reap; if process still exists, treat as failure. + if err := syscall.Kill(pid, 0); err == nil { + return fmt.Errorf("hypervisor pid %d still alive after SIGKILL", pid) + } + log.WarnContext(ctx, "timeout waiting to reap hypervisor process after SIGKILL", "instance_id", inst.Id, "pid", pid) } log.DebugContext(ctx, "hypervisor process force-killed", "instance_id", inst.Id, "pid", pid)