diff --git a/NOTICE b/NOTICE index 2607ce1a..e1b54f8c 100644 --- a/NOTICE +++ b/NOTICE @@ -929,9 +929,9 @@ License URL: https://github.com/helm/helm/blob/v4.1.3/LICENSE ---------- Module: k8s.io/api -Version: v0.35.2 +Version: v0.35.3 License: Apache-2.0 -License URL: https://github.com/kubernetes/api/blob/v0.35.2/LICENSE +License URL: https://github.com/kubernetes/api/blob/v0.35.3/LICENSE ---------- Module: k8s.io/apiextensions-apiserver/pkg/apis/apiextensions @@ -941,15 +941,15 @@ License URL: https://github.com/kubernetes/apiextensions-apiserver/blob/v0.35.1/ ---------- Module: k8s.io/apimachinery/pkg -Version: v0.35.2 +Version: v0.35.3 License: Apache-2.0 -License URL: https://github.com/kubernetes/apimachinery/blob/v0.35.2/LICENSE +License URL: https://github.com/kubernetes/apimachinery/blob/v0.35.3/LICENSE ---------- Module: k8s.io/apimachinery/third_party/forked/golang -Version: v0.35.2 +Version: v0.35.3 License: BSD-3-Clause -License URL: https://github.com/kubernetes/apimachinery/blob/v0.35.2/third_party/forked/golang/LICENSE +License URL: https://github.com/kubernetes/apimachinery/blob/v0.35.3/third_party/forked/golang/LICENSE ---------- Module: k8s.io/apiserver/pkg/endpoints/deprecation @@ -965,15 +965,15 @@ License URL: https://github.com/kubernetes/cli-runtime/blob/v0.35.2/LICENSE ---------- Module: k8s.io/client-go -Version: v0.35.2 +Version: v0.35.3 License: Apache-2.0 -License URL: https://github.com/kubernetes/client-go/blob/v0.35.2/LICENSE +License URL: https://github.com/kubernetes/client-go/blob/v0.35.3/LICENSE ---------- Module: k8s.io/client-go/third_party/forked/golang/template -Version: v0.35.2 +Version: v0.35.3 License: BSD-3-Clause -License URL: https://github.com/kubernetes/client-go/blob/v0.35.2/third_party/forked/golang/LICENSE +License URL: https://github.com/kubernetes/client-go/blob/v0.35.3/third_party/forked/golang/LICENSE ---------- Module: k8s.io/component-base/version diff --git a/cli/cmd/bootstrap_gcp.go b/cli/cmd/bootstrap_gcp.go index aaab5476..f9a9c9d4 100644 --- a/cli/cmd/bootstrap_gcp.go +++ b/cli/cmd/bootstrap_gcp.go @@ -72,7 +72,8 @@ func AddBootstrapGcpCmd(parent *cobra.Command, opts *GlobalOptions) { flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.FolderID, "folder-id", "", "GCP Folder ID (optional)") flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.SSHPublicKeyPath, "ssh-public-key-path", "~/.ssh/id_rsa.pub", "SSH Public Key Path (default: ~/.ssh/id_rsa.pub)") flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.SSHPrivateKeyPath, "ssh-private-key-path", "~/.ssh/id_rsa", "SSH Private Key Path (default: ~/.ssh/id_rsa)") - flags.BoolVar(&bootstrapGcpCmd.CodesphereEnv.Preemptible, "preemptible", false, "Use preemptible VMs for Codesphere infrastructure (default: false)") + flags.BoolVar(&bootstrapGcpCmd.CodesphereEnv.Preemptible, "preemptible", false, "Use preemptible VMs for Codesphere infrastructure. Mutually exclusive with --spot (default: false)") + flags.BoolVar(&bootstrapGcpCmd.CodesphereEnv.Spot, "spot", false, "Use Spot VMs for Codesphere infrastructure. Falls back to standard VMs if spot capacity unavailable. Mutually exclusive with --preemptible (default: false)") flags.IntVar(&bootstrapGcpCmd.CodesphereEnv.DatacenterID, "datacenter-id", 1, "Datacenter ID (default: 1)") flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.CustomPgIP, "custom-pg-ip", "", "Custom PostgreSQL IP (optional)") flags.StringVar(&bootstrapGcpCmd.CodesphereEnv.InstallConfigPath, "install-config", "config.yaml", "Path to install config file (optional)") diff --git a/docs/oms_beta_bootstrap-gcp.md b/docs/oms_beta_bootstrap-gcp.md index d9e12441..e0e3c10a 100644 --- a/docs/oms_beta_bootstrap-gcp.md +++ b/docs/oms_beta_bootstrap-gcp.md @@ -42,7 +42,7 @@ oms beta bootstrap-gcp [flags] --openbao-password string OpenBao password (optional) --openbao-uri string URI for OpenBao (optional) --openbao-user string OpenBao username (optional) (default "admin") - --preemptible Use preemptible VMs for Codesphere infrastructure (default: false) + --preemptible Use preemptible VMs for Codesphere infrastructure. Mutually exclusive with --spot (default: false) --project-name string Unique GCP Project Name (required) --project-ttl string Time to live for the GCP project. Cleanup workflows will remove it afterwards. (default: 2 hours) (default "2h") --region string GCP Region (default: europe-west4) (default "europe-west4") @@ -50,6 +50,7 @@ oms beta bootstrap-gcp [flags] --registry-user string Custom Registry username (only for GitHub registry type) (optional) --secrets-dir string Directory for secrets (default: /etc/codesphere/secrets) (default "/etc/codesphere/secrets") --secrets-file string Path to secrets files (optional) (default "prod.vault.yaml") + --spot Use Spot VMs for Codesphere infrastructure. Falls back to standard VMs if spot capacity unavailable. Mutually exclusive with --preemptible (default: false) --ssh-private-key-path string SSH Private Key Path (default: ~/.ssh/id_rsa) (default "~/.ssh/id_rsa") --ssh-public-key-path string SSH Public Key Path (default: ~/.ssh/id_rsa.pub) (default "~/.ssh/id_rsa.pub") --ssh-quiet Suppress SSH command output (default: false) diff --git a/internal/bootstrap/gcp/gcp.go b/internal/bootstrap/gcp/gcp.go index 108425d3..25e76ccb 100644 --- a/internal/bootstrap/gcp/gcp.go +++ b/internal/bootstrap/gcp/gcp.go @@ -68,7 +68,7 @@ func GetDNSRecordNames(baseDomain string) []struct { } } -// IsNotFoundError checks if the error is a Google API "not found" error (HTTP 404). +// IsNotFoundError checks if the error indicates a "not found" condition. func IsNotFoundError(err error) bool { if err == nil { return false @@ -78,6 +78,11 @@ func IsNotFoundError(err error) bool { if errors.As(err, &googleErr) { return googleErr.Code == 404 } + + if status.Code(err) == codes.NotFound { + return true + } + return false } @@ -142,6 +147,7 @@ type CodesphereEnvironment struct { InstallHash string `json:"install_hash"` InstallSkipSteps []string `json:"install_skip_steps"` Preemptible bool `json:"preemptible"` + Spot bool `json:"spot"` WriteConfig bool `json:"-"` GatewayIP string `json:"gateway_ip"` PublicGatewayIP string `json:"public_gateway_ip"` @@ -381,9 +387,22 @@ func (b *GCPBootstrapper) ValidateInput() error { return err } + err = b.validateVMProvisioningOptions() + if err != nil { + return err + } + return b.validateGitHubParams() } +// validateVMProvisioningOptions checks that spot and preemptible options are not both set +func (b *GCPBootstrapper) validateVMProvisioningOptions() error { + if b.Env.Spot && b.Env.Preemptible { + return fmt.Errorf("cannot specify both --spot and --preemptible flags; use --spot for the newer spot VM model") + } + return nil +} + // validateInstallVersion checks if the specified install version exists and contains the required installer artifact func (b *GCPBootstrapper) validateInstallVersion() error { if b.Env.InstallLocal != "" { @@ -764,144 +783,37 @@ type vmResult struct { } func (b *GCPBootstrapper) EnsureComputeInstances() error { - projectID := b.Env.ProjectID - region := b.Env.Region - zone := b.Env.Zone - - network := fmt.Sprintf("projects/%s/global/networks/%s-vpc", projectID, projectID) - subnetwork := fmt.Sprintf("projects/%s/regions/%s/subnetworks/%s-%s-subnet", projectID, region, projectID, region) - diskType := fmt.Sprintf("projects/%s/zones/%s/diskTypes/pd-ssd", projectID, zone) - rootDiskSize := int64(200) if b.Env.RegistryType == RegistryTypeGitHub { rootDiskSize = 50 } - sshKeys := "" - var err error - if b.Env.GitHubPAT != "" && b.Env.GitHubTeamOrg != "" && b.Env.GitHubTeamSlug != "" { - sshKeys, err = github.GetSSHKeysFromGitHubTeam(b.GitHubClient, b.Env.GitHubTeamOrg, b.Env.GitHubTeamSlug) - if err != nil { - return fmt.Errorf("failed to get SSH keys from GitHub team: %w", err) - } - } - - pubKey, err := b.readSSHKey(b.Env.SSHPublicKeyPath) - if err != nil { - return err - } - - sshKeys += fmt.Sprintf("root:%s\nubuntu:%s", pubKey+"root", pubKey+"ubuntu") - // Create VMs in parallel wg := sync.WaitGroup{} errCh := make(chan error, len(vmDefs)) resultCh := make(chan vmResult, len(vmDefs)) + logCh := make(chan string, len(vmDefs)) + for _, vm := range vmDefs { wg.Add(1) go func(vm VMDef) { defer wg.Done() - disks := []*computepb.AttachedDisk{ - { - Boot: protoBool(true), - AutoDelete: protoBool(true), - Type: protoString("PERSISTENT"), - InitializeParams: &computepb.AttachedDiskInitializeParams{ - DiskType: &diskType, - DiskSizeGb: protoInt64(rootDiskSize), - SourceImage: protoString("projects/ubuntu-os-cloud/global/images/family/ubuntu-2204-lts"), - }, - }, - } - for _, diskSize := range vm.AdditionalDisks { - disks = append(disks, &computepb.AttachedDisk{ - Boot: protoBool(false), - AutoDelete: protoBool(true), - Type: protoString("PERSISTENT"), - InitializeParams: &computepb.AttachedDiskInitializeParams{ - DiskSizeGb: protoInt64(diskSize), - DiskType: &diskType, - }, - }) - } - - serviceAccount := fmt.Sprintf("cloud-controller@%s.iam.gserviceaccount.com", projectID) - instance := &computepb.Instance{ - Name: protoString(vm.Name), - ServiceAccounts: []*computepb.ServiceAccount{ - { - Email: protoString(serviceAccount), - Scopes: []string{"https://www.googleapis.com/auth/cloud-platform"}, - }, - }, - MachineType: protoString(fmt.Sprintf("zones/%s/machineTypes/%s", zone, vm.MachineType)), - Tags: &computepb.Tags{ - Items: vm.Tags, - }, - Scheduling: &computepb.Scheduling{ - Preemptible: &b.Env.Preemptible, - }, - NetworkInterfaces: []*computepb.NetworkInterface{ - { - Network: protoString(network), - Subnetwork: protoString(subnetwork), - }, - }, - Disks: disks, - Metadata: &computepb.Metadata{ - Items: []*computepb.Items{ - { - Key: protoString("ssh-keys"), - Value: protoString(sshKeys), - }, - }, - }, - } - - // Configure external IP if needed - if vm.ExternalIP { - instance.NetworkInterfaces[0].AccessConfigs = []*computepb.AccessConfig{ - { - Name: protoString("External NAT"), - Type: protoString("ONE_TO_ONE_NAT"), - }, - } - } - - err = b.GCPClient.CreateInstance(projectID, zone, instance) - if err != nil && !isAlreadyExistsError(err) { - errCh <- fmt.Errorf("failed to create instance %s: %w", vm.Name, err) - return - } - - // Find out the IP addresses of the created instance - resp, err := b.GCPClient.GetInstance(projectID, zone, vm.Name) + result, err := b.ensureVM(vm, rootDiskSize, logCh) if err != nil { - errCh <- fmt.Errorf("failed to get instance %s: %w", vm.Name, err) + errCh <- err return } - - externalIP := "" - internalIP := "" - if len(resp.GetNetworkInterfaces()) > 0 { - internalIP = resp.GetNetworkInterfaces()[0].GetNetworkIP() - if len(resp.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 { - externalIP = resp.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP() - } - } - - // Send result through channel instead of creating nodes in goroutine - resultCh <- vmResult{ - vmType: vm.Tags[0], - name: vm.Name, - externalIP: externalIP, - internalIP: internalIP, - } + resultCh <- result }(vm) } wg.Wait() close(errCh) close(resultCh) + close(logCh) + + for msg := range logCh { + b.stlog.Logf("%s", msg) + } var errs []error for err := range errCh { @@ -943,6 +855,261 @@ func (b *GCPBootstrapper) EnsureComputeInstances() error { return nil } +// ensureVM handles the full lifecycle of a single VM: check existence, restart if stopped, +// or create a new instance with spot fallback. Returns the VM result with assigned IPs. +func (b *GCPBootstrapper) ensureVM(vm VMDef, rootDiskSize int64, logCh chan<- string) (vmResult, error) { + projectID := b.Env.ProjectID + zone := b.Env.Zone + + existingInstance, err := b.GCPClient.GetInstance(projectID, zone, vm.Name) + if err != nil && !IsNotFoundError(err) { + return vmResult{}, fmt.Errorf("failed to get instance %s: %w", vm.Name, err) + } + + if existingInstance != nil { + switch s := existingInstance.GetStatus(); s { + case "TERMINATED", "STOPPED": + if err := b.GCPClient.StartInstance(projectID, zone, vm.Name); err != nil { + return vmResult{}, fmt.Errorf("failed to start stopped instance %s: %w", vm.Name, err) + } + case "SUSPENDED": + return vmResult{}, fmt.Errorf("instance %s is SUSPENDED; manual resume is required", vm.Name) + } + } else { + instance, err := b.buildInstanceSpec(vm, rootDiskSize) + if err != nil { + return vmResult{}, err + } + if err := b.CreateInstanceWithFallback(projectID, zone, instance, vm.Name, logCh); err != nil { + return vmResult{}, err + } + } + + readyInstance, err := b.waitForInstanceRunning(projectID, zone, vm.Name, vm.ExternalIP) + if err != nil { + return vmResult{}, fmt.Errorf("instance %s did not become ready: %w", vm.Name, err) + } + + internalIP, externalIP := extractInstanceIPs(readyInstance) + return vmResult{ + vmType: vm.Tags[0], + name: vm.Name, + externalIP: externalIP, + internalIP: internalIP, + }, nil +} + +// buildInstanceSpec constructs the full compute instance specification for a VM. +func (b *GCPBootstrapper) buildInstanceSpec(vm VMDef, rootDiskSize int64) (*computepb.Instance, error) { + projectID := b.Env.ProjectID + region := b.Env.Region + zone := b.Env.Zone + + network := fmt.Sprintf("projects/%s/global/networks/%s-vpc", projectID, projectID) + subnetwork := fmt.Sprintf("projects/%s/regions/%s/subnetworks/%s-%s-subnet", projectID, region, projectID, region) + diskType := fmt.Sprintf("projects/%s/zones/%s/diskTypes/pd-ssd", projectID, zone) + + disks := []*computepb.AttachedDisk{ + { + Boot: protoBool(true), + AutoDelete: protoBool(true), + Type: protoString("PERSISTENT"), + InitializeParams: &computepb.AttachedDiskInitializeParams{ + DiskType: &diskType, + DiskSizeGb: protoInt64(rootDiskSize), + SourceImage: protoString("projects/ubuntu-os-cloud/global/images/family/ubuntu-2204-lts"), + }, + }, + } + for _, diskSize := range vm.AdditionalDisks { + disks = append(disks, &computepb.AttachedDisk{ + Boot: protoBool(false), + AutoDelete: protoBool(true), + Type: protoString("PERSISTENT"), + InitializeParams: &computepb.AttachedDiskInitializeParams{ + DiskSizeGb: protoInt64(diskSize), + DiskType: &diskType, + }, + }) + } + + sshKeys := "" + if b.Env.GitHubPAT != "" && b.Env.GitHubTeamOrg != "" && b.Env.GitHubTeamSlug != "" { + var err error + sshKeys, err = github.GetSSHKeysFromGitHubTeam(b.GitHubClient, b.Env.GitHubTeamOrg, b.Env.GitHubTeamSlug) + if err != nil { + return nil, fmt.Errorf("failed to get SSH keys from GitHub team: %w", err) + } + } + + pubKey, err := b.readSSHKey(b.Env.SSHPublicKeyPath) + if err != nil { + return nil, fmt.Errorf("failed to read SSH public key: %w", err) + } + + sshKeys += fmt.Sprintf("root:%s\nubuntu:%s", pubKey+"root", pubKey+"ubuntu") + + serviceAccount := fmt.Sprintf("cloud-controller@%s.iam.gserviceaccount.com", projectID) + instance := &computepb.Instance{ + Name: protoString(vm.Name), + ServiceAccounts: []*computepb.ServiceAccount{ + { + Email: protoString(serviceAccount), + Scopes: []string{"https://www.googleapis.com/auth/cloud-platform"}, + }, + }, + MachineType: protoString(fmt.Sprintf("zones/%s/machineTypes/%s", zone, vm.MachineType)), + Tags: &computepb.Tags{ + Items: vm.Tags, + }, + Scheduling: b.BuildSchedulingConfig(), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + Network: protoString(network), + Subnetwork: protoString(subnetwork), + }, + }, + Disks: disks, + Metadata: &computepb.Metadata{ + Items: []*computepb.Items{ + { + Key: protoString("ssh-keys"), + Value: protoString(sshKeys), + }, + }, + }, + } + + if vm.ExternalIP { + instance.NetworkInterfaces[0].AccessConfigs = []*computepb.AccessConfig{ + { + Name: protoString("External NAT"), + Type: protoString("ONE_TO_ONE_NAT"), + }, + } + } + + return instance, nil +} + +// extractInstanceIPs returns the internal and external IPs from a compute instance. +func extractInstanceIPs(inst *computepb.Instance) (internalIP, externalIP string) { + if len(inst.GetNetworkInterfaces()) > 0 { + internalIP = inst.GetNetworkInterfaces()[0].GetNetworkIP() + if len(inst.GetNetworkInterfaces()[0].GetAccessConfigs()) > 0 { + externalIP = inst.GetNetworkInterfaces()[0].GetAccessConfigs()[0].GetNatIP() + } + } + return +} + +// isInstanceReady checks if an instance is RUNNING with its internal IP assigned, +// and optionally its external IP as well. +func isInstanceReady(inst *computepb.Instance, needsExternalIP bool) bool { + if inst.GetStatus() != "RUNNING" || len(inst.GetNetworkInterfaces()) == 0 { + return false + } + ni := inst.GetNetworkInterfaces()[0] + if ni.GetNetworkIP() == "" { + return false + } + if needsExternalIP && (len(ni.GetAccessConfigs()) == 0 || ni.GetAccessConfigs()[0].GetNatIP() == "") { + return false + } + return true +} + +// BuildSchedulingConfig creates the scheduling configuration based on spot/preemptible settings +func (b *GCPBootstrapper) BuildSchedulingConfig() *computepb.Scheduling { + if b.Env.Spot { + return &computepb.Scheduling{ + ProvisioningModel: protoString("SPOT"), + OnHostMaintenance: protoString("TERMINATE"), + AutomaticRestart: protoBool(false), + InstanceTerminationAction: protoString("STOP"), + } + } + if b.Env.Preemptible { + return &computepb.Scheduling{ + Preemptible: protoBool(true), + } + } + + return &computepb.Scheduling{} +} + +// CreateInstanceWithFallback attempts to create an instance with the configured settings. +// If spot VMs are enabled and creation fails due to capacity issues, it falls back to standard VMs. +func (b *GCPBootstrapper) CreateInstanceWithFallback(projectID, zone string, instance *computepb.Instance, vmName string, logCh chan<- string) error { + err := b.GCPClient.CreateInstance(projectID, zone, instance) + if err == nil { + return nil + } + + if isAlreadyExistsError(err) { + return nil + } + + if b.Env.Spot && IsSpotCapacityError(err) { + logCh <- fmt.Sprintf("Spot capacity unavailable for %s, falling back to standard VM", vmName) + instance.Scheduling = &computepb.Scheduling{} + err = b.GCPClient.CreateInstance(projectID, zone, instance) + if err != nil && !isAlreadyExistsError(err) { + return fmt.Errorf("failed to create instance %s (fallback to standard VM): %w", vmName, err) + } + return nil + } + + return fmt.Errorf("failed to create instance %s: %w", vmName, err) +} + +// waitForInstanceRunning polls GetInstance until the instance status is RUNNING +// and its internal IP (and external IP, when needsExternalIP is true) are populated. +// It returns the ready instance or an error if the deadline is exceeded. +func (b *GCPBootstrapper) waitForInstanceRunning(projectID, zone, name string, needsExternalIP bool) (*computepb.Instance, error) { + const ( + maxAttempts = 60 + pollInterval = 5 * time.Second + ) + for attempt := 0; attempt < maxAttempts; attempt++ { + inst, err := b.GCPClient.GetInstance(projectID, zone, name) + if err != nil { + if IsNotFoundError(err) { + if attempt < maxAttempts-1 { + b.Time.Sleep(pollInterval) + } + continue + } + return nil, fmt.Errorf("failed to poll instance %s: %w", name, err) + } + + if isInstanceReady(inst, needsExternalIP) { + return inst, nil + } + + if attempt < maxAttempts-1 { + b.Time.Sleep(pollInterval) + } + } + return nil, fmt.Errorf("timed out waiting for instance %s to be RUNNING with IPs assigned after %s", + name, pollInterval*time.Duration(maxAttempts)) +} + +// IsSpotCapacityError checks if the error is related to spot VM capacity issues. +func IsSpotCapacityError(err error) bool { + if err == nil { + return false + } + if status.Code(err) == codes.ResourceExhausted { + return true + } + errStr := strings.ToLower(err.Error()) + return strings.Contains(errStr, "zone_resource_pool_exhausted") || + strings.Contains(errStr, "unsupported_operation") || + strings.Contains(errStr, "stockout") || + strings.Contains(errStr, "does not have enough resources") +} + // EnsureGatewayIPAddresses reserves 2 static external IP addresses for the ingress // controllers of the cluster. func (b *GCPBootstrapper) EnsureGatewayIPAddresses() error { @@ -1690,6 +1857,9 @@ func (b *GCPBootstrapper) RunK0sConfigScript() error { // Helper functions func isAlreadyExistsError(err error) bool { + if err == nil { + return false + } return status.Code(err) == codes.AlreadyExists || strings.Contains(err.Error(), "already exists") } diff --git a/internal/bootstrap/gcp/gcp_client.go b/internal/bootstrap/gcp/gcp_client.go index 3c049f57..ade21565 100644 --- a/internal/bootstrap/gcp/gcp_client.go +++ b/internal/bootstrap/gcp/gcp_client.go @@ -54,6 +54,7 @@ type GCPClientManager interface { CreateFirewallRule(projectID string, rule *computepb.Firewall) error CreateInstance(projectID, zone string, instance *computepb.Instance) error GetInstance(projectID, zone, instanceName string) (*computepb.Instance, error) + StartInstance(projectID, zone, instanceName string) error CreateAddress(projectID, region string, address *computepb.Address) (string, error) GetAddress(projectID, region, addressName string) (*computepb.Address, error) EnsureDNSManagedZone(projectID, zoneName, dnsName, description string) error @@ -668,6 +669,26 @@ func (c *GCPClient) GetInstance(projectID, zone, instanceName string) (*computep }) } +// StartInstance starts a stopped Compute Engine instance in the specified project and zone. +func (c *GCPClient) StartInstance(projectID, zone, instanceName string) error { + client, err := compute.NewInstancesRESTClient(c.ctx) + if err != nil { + return err + } + defer util.IgnoreError(client.Close) + + op, err := client.Start(c.ctx, &computepb.StartInstanceRequest{ + Project: projectID, + Zone: zone, + Instance: instanceName, + }) + if err != nil { + return err + } + + return op.Wait(c.ctx) +} + // CreateAddress creates a new static IP address in the specified project and region. func (c *GCPClient) CreateAddress(projectID, region string, address *computepb.Address) (string, error) { client, err := compute.NewAddressesRESTClient(c.ctx) diff --git a/internal/bootstrap/gcp/gcp_test.go b/internal/bootstrap/gcp/gcp_test.go index 34bfa2d4..03771180 100644 --- a/internal/bootstrap/gcp/gcp_test.go +++ b/internal/bootstrap/gcp/gcp_test.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "strings" + "sync" "time" "os" @@ -29,10 +30,9 @@ import ( "github.com/stretchr/testify/mock" "google.golang.org/api/cloudbilling/v1" "google.golang.org/api/dns/v1" + "google.golang.org/api/googleapi" ) -func protoString(s string) *string { return &s } - func jumpbboxMatcher(node *node.Node) bool { return node.GetName() == "jumpbox" } @@ -138,7 +138,7 @@ var _ = Describe("GCP Bootstrapper", func() { bs.Env.RegistryType = gcp.RegistryTypeArtifactRegistry bs.Env.WriteConfig = true - // 1. EnsureInstallConfig + // EnsureInstallConfig fw.EXPECT().Exists("fake-config-file").Return(false) icg.EXPECT().ApplyProfile("minimal").Return(nil) // Returning a real install config to avoid nil pointer dereferences later @@ -150,60 +150,49 @@ var _ = Describe("GCP Bootstrapper", func() { projectId := "test-project-12345" - // 2. EnsureSecrets + // EnsureSecrets fw.EXPECT().Exists("fake-secret").Return(false) icg.EXPECT().GetVault().Return(&files.InstallVault{}) - // 3. EnsureProject + // EnsureProject gc.EXPECT().GetProjectByName(mock.Anything, "test-project").Return(nil, fmt.Errorf("project not found: test-project")) gc.EXPECT().CreateProjectID("test-project").Return(projectId) gc.EXPECT().CreateProject(mock.Anything, mock.Anything, "test-project", time.Hour).Return(mock.Anything, nil) - // 4. EnsureBilling + // EnsureBilling gc.EXPECT().GetBillingInfo(projectId).Return(&cloudbilling.ProjectBillingInfo{BillingEnabled: false}, nil) gc.EXPECT().EnableBilling(projectId, "test-billing-account").Return(nil) - // 5. EnsureAPIsEnabled + // EnsureAPIsEnabled gc.EXPECT().EnableAPIs(projectId, mock.Anything).Return(nil) - // 6. EnsureArtifactRegistry + // EnsureArtifactRegistry gc.EXPECT().GetArtifactRegistry(projectId, "us-central1", "codesphere-registry").Return(nil, fmt.Errorf("not found")) gc.EXPECT().CreateArtifactRegistry(projectId, "us-central1", "codesphere-registry").Return(&artifactregistrypb.Repository{Name: "codesphere-registry"}, nil) - // 7. EnsureServiceAccounts + // EnsureServiceAccounts gc.EXPECT().CreateServiceAccount(projectId, "cloud-controller", "cloud-controller").Return("cloud-controller@p.iam.gserviceaccount.com", false, nil) gc.EXPECT().CreateServiceAccount(projectId, "artifact-registry-writer", "artifact-registry-writer").Return("writer@p.iam.gserviceaccount.com", true, nil) gc.EXPECT().CreateServiceAccountKey(projectId, "writer@p.iam.gserviceaccount.com").Return("fake-key", nil) - // 8. EnsureIAMRoles + // EnsureIAMRoles gc.EXPECT().AssignIAMRole(projectId, "artifact-registry-writer", projectId, []string{"roles/artifactregistry.writer"}).Return(nil) gc.EXPECT().AssignIAMRole(projectId, "cloud-controller", projectId, []string{"roles/compute.admin"}).Return(nil) gc.EXPECT().AssignIAMRole(csEnv.DNSProjectID, "cloud-controller", projectId, []string{"roles/dns.admin"}).Return(nil) - // 9. EnsureVPC + // EnsureVPC gc.EXPECT().CreateVPC(projectId, "us-central1", projectId+"-vpc", projectId+"-us-central1-subnet", projectId+"-router", projectId+"-nat-gateway").Return(nil) - // 10. EnsureFirewallRules (5 times) + // EnsureFirewallRules (5 times) gc.EXPECT().CreateFirewallRule(projectId, mock.Anything).Return(nil).Times(5) - // 11. EnsureComputeInstances + // EnsureComputeInstances + ipResp := makeRunningInstance("10.0.0.1", "1.2.3.4") + mockGetInstanceNotFoundThenRunning(gc, projectId, "us-central1-a", ipResp, 9) + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("fake-key"), nil).Times(9) gc.EXPECT().CreateInstance(projectId, "us-central1-a", mock.Anything).Return(nil).Times(9) - // GetInstance calls to retrieve IPs - ipResp := &computepb.Instance{ - NetworkInterfaces: []*computepb.NetworkInterface{ - { - NetworkIP: protoString("10.0.0.1"), - AccessConfigs: []*computepb.AccessConfig{ - {NatIP: protoString("1.2.3.4")}, - }, - }, - }, - } - - gc.EXPECT().GetInstance(projectId, "us-central1-a", mock.Anything).Return(ipResp, nil).Times(9) - fw.EXPECT().ReadFile(mock.Anything).Return([]byte("fake-key"), nil).Times(1) - // 12. EnsureGatewayIPAddresses + // EnsureGatewayIPAddresses gc.EXPECT().GetAddress(projectId, "us-central1", "gateway").Return(nil, fmt.Errorf("not found")) gc.EXPECT().CreateAddress(projectId, "us-central1", mock.MatchedBy(func(addr *computepb.Address) bool { return *addr.Name == "gateway" })).Return("1.1.1.1", nil) gc.EXPECT().GetAddress(projectId, "us-central1", "gateway").Return(nil, fmt.Errorf("not found")) @@ -211,7 +200,7 @@ var _ = Describe("GCP Bootstrapper", func() { gc.EXPECT().CreateAddress(projectId, "us-central1", mock.MatchedBy(func(addr *computepb.Address) bool { return *addr.Name == "public-gateway" })).Return("2.2.2.2", nil) gc.EXPECT().GetAddress(projectId, "us-central1", "public-gateway").Return(&computepb.Address{Address: protoString("2.2.2.2")}, nil) - // 16. UpdateInstallConfig + // UpdateInstallConfig icg.EXPECT().GenerateSecrets().Return(nil) icg.EXPECT().WriteInstallConfig("fake-config-file", true).Return(nil) nodeClient.EXPECT().CopyFile(mock.Anything, "fake-config-file", "/etc/codesphere/config.yaml").Return(nil) @@ -222,23 +211,23 @@ var _ = Describe("GCP Bootstrapper", func() { nodeClient.EXPECT().WaitReady(mock.Anything, mock.Anything).Return(nil).Return(nil) nodeClient.EXPECT().RunCommand(mock.Anything, mock.Anything, mock.Anything).Return(nil) - // 17. EnsureAgeKey + // EnsureAgeKey nodeClient.EXPECT().RunCommand(mock.MatchedBy(jumpbboxMatcher), "root", "mkdir -p /etc/codesphere/secrets; age-keygen -o /etc/codesphere/secrets/age_key.txt").Return(nil) nodeClient.EXPECT().HasFile(mock.MatchedBy(jumpbboxMatcher), "/etc/codesphere/secrets/age_key.txt").Return(false) - // 18. EncryptVault + // EncryptVault nodeClient.EXPECT().RunCommand(mock.MatchedBy(jumpbboxMatcher), "root", "cp /etc/codesphere/secrets/prod.vault.yaml{,.bak}").Return(nil) nodeClient.EXPECT().RunCommand(mock.MatchedBy(jumpbboxMatcher), "root", mock.MatchedBy(func(cmd string) bool { return strings.Contains(cmd, "sops --encrypt") })).Return(nil) - // 19. EnsureDNSRecords + // EnsureDNSRecords gc.EXPECT().EnsureDNSManagedZone(csEnv.DNSProjectID, "test-zone", "example.com.", mock.Anything).Return(nil) gc.EXPECT().EnsureDNSRecordSets(csEnv.DNSProjectID, "test-zone", mock.MatchedBy(func(records []*dns.ResourceRecordSet) bool { return len(records) == 4 })).Return(nil) - // 20. GenerateK0sConfigScript + // GenerateK0sConfigScript fw.EXPECT().WriteFile("configure-k0s.sh", mock.Anything, os.FileMode(0755)).Return(nil) nodeClient.EXPECT().CopyFile(mock.Anything, "configure-k0s.sh", "/root/configure-k0s.sh").Return(nil) nodeClient.EXPECT().RunCommand(mock.Anything, "root", "chmod +x /root/configure-k0s.sh").Return(nil) @@ -1000,25 +989,12 @@ var _ = Describe("GCP Bootstrapper", func() { }) Describe("Valid EnsureComputeInstances", func() { It("creates all instances", func() { - // Mock ReadFile for SSH key - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return([]byte("ssh-rsa AAA..."), nil).Times(1) + ipResp := makeRunningInstance("10.0.0.x", "1.2.3.x") + mockGetInstanceNotFoundThenRunning(gc, csEnv.ProjectID, csEnv.Zone, ipResp, 9) - // Mock CreateInstance (9 times) + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Times(9) gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Times(9) - // Mock GetInstance (9 times) - ipResp := &computepb.Instance{ - NetworkInterfaces: []*computepb.NetworkInterface{ - { - NetworkIP: protoString("10.0.0.x"), - AccessConfigs: []*computepb.AccessConfig{ - {NatIP: protoString("1.2.3.x")}, - }, - }, - }, - } - gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(ipResp, nil).Times(9) - err := bs.EnsureComputeInstances() Expect(err).NotTo(HaveOccurred()) Expect(len(bs.Env.ControlPlaneNodes)).To(Equal(3)) @@ -1034,15 +1010,10 @@ var _ = Describe("GCP Bootstrapper", func() { csEnv.GitHubPAT = "pat" }) It("does not fetch GitHub org keys when GitHub team org is set without slug", func() { - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return([]byte("ssh-rsa AAA..."), nil).Times(1) + ipResp := makeRunningInstance("10.0.0.x", "1.2.3.x") + mockGetInstanceNotFoundThenRunning(gc, csEnv.ProjectID, csEnv.Zone, ipResp, 9) + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Times(9) gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Times(9) - ipResp := &computepb.Instance{ - NetworkInterfaces: []*computepb.NetworkInterface{{ - NetworkIP: protoString("10.0.0.x"), - AccessConfigs: []*computepb.AccessConfig{{NatIP: protoString("1.2.3.x")}}, - }}, - } - gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(ipResp, nil).Times(9) err := bs.EnsureComputeInstances() Expect(err).NotTo(HaveOccurred()) @@ -1052,10 +1023,12 @@ var _ = Describe("GCP Bootstrapper", func() { csEnv.GitHubTeamSlug = "dev" }) It("fetches GitHub team keys", func() { - mockGitHubClient.EXPECT().ListTeamMembersBySlug(mock.Anything, csEnv.GitHubTeamOrg, csEnv.GitHubTeamSlug, mock.Anything).Return([]*gh.User{{Login: gh.Ptr("alice")}}, nil).Once() - mockGitHubClient.EXPECT().ListUserKeys(mock.Anything, "alice").Return([]*gh.Key{{Key: gh.Ptr("ssh-rsa AAALICE...")}}, nil).Once() + mockGitHubClient.EXPECT().ListTeamMembersBySlug(mock.Anything, csEnv.GitHubTeamOrg, csEnv.GitHubTeamSlug, mock.Anything).Return([]*gh.User{{Login: gh.Ptr("alice")}}, nil).Maybe() + mockGitHubClient.EXPECT().ListUserKeys(mock.Anything, "alice").Return([]*gh.Key{{Key: gh.Ptr("ssh-rsa AAALICE...")}}, nil).Maybe() - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return([]byte("ssh-rsa AAA..."), nil).Times(1) + ipResp := makeRunningInstance("10.0.0.x", "1.2.3.x") + mockGetInstanceNotFoundThenRunning(gc, csEnv.ProjectID, csEnv.Zone, ipResp, 9) + fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return([]byte("ssh-rsa AAA..."), nil).Times(9) gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone string, instance *computepb.Instance) error { sshMetadata := "" for _, item := range instance.GetMetadata().GetItems() { @@ -1069,20 +1042,13 @@ var _ = Describe("GCP Bootstrapper", func() { return nil }).Times(9) - ipResp := &computepb.Instance{ - NetworkInterfaces: []*computepb.NetworkInterface{{ - NetworkIP: protoString("10.0.0.x"), - AccessConfigs: []*computepb.AccessConfig{{NatIP: protoString("1.2.3.x")}}, - }}, - } - gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(ipResp, nil).Times(9) - err := bs.EnsureComputeInstances() Expect(err).NotTo(HaveOccurred()) }) It("fails when GitHub client fails to list team members", func() { - mockGitHubClient.EXPECT().ListTeamMembersBySlug(mock.Anything, csEnv.GitHubTeamOrg, csEnv.GitHubTeamSlug, mock.Anything).Return(nil, fmt.Errorf("list members error")).Once() + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, &googleapi.Error{Code: 404, Message: "not found"}).Maybe() + mockGitHubClient.EXPECT().ListTeamMembersBySlug(mock.Anything, csEnv.GitHubTeamOrg, csEnv.GitHubTeamSlug, mock.Anything).Return(nil, fmt.Errorf("list members error")).Maybe() err := bs.EnsureComputeInstances() Expect(err).To(HaveOccurred()) @@ -1093,8 +1059,11 @@ var _ = Describe("GCP Bootstrapper", func() { }) Describe("Invalid cases", func() { + notFoundErr := &googleapi.Error{Code: 404, Message: "not found"} + It("fails when SSH key read fails", func() { - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return(nil, fmt.Errorf("read error")).Maybe() + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, notFoundErr).Maybe() + fw.EXPECT().ReadFile(mock.Anything).Return(nil, fmt.Errorf("read error")).Maybe() err := bs.EnsureComputeInstances() Expect(err).To(HaveOccurred()) @@ -1102,7 +1071,8 @@ var _ = Describe("GCP Bootstrapper", func() { }) It("fails when CreateInstance fails", func() { - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return([]byte("ssh-rsa AAA..."), nil).Maybe() + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, notFoundErr).Maybe() + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Maybe() gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(fmt.Errorf("create error")).Maybe() err := bs.EnsureComputeInstances() @@ -1110,16 +1080,145 @@ var _ = Describe("GCP Bootstrapper", func() { Expect(err.Error()).To(ContainSubstring("error ensuring compute instances")) }) - It("fails when GetInstance fails", func() { - fw.EXPECT().ReadFile(csEnv.SSHPublicKeyPath).Return([]byte("ssh-rsa AAA..."), nil).Maybe() + It("fails when GetInstance fails after creation", func() { + instanceCalls := make(map[string]int) + var mu sync.Mutex + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn( + func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + return nil, notFoundErr + } + return nil, fmt.Errorf("get error") + }, + ).Maybe() + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Maybe() gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Maybe() - gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("get error")).Maybe() err := bs.EnsureComputeInstances() Expect(err).To(HaveOccurred()) Expect(err.Error()).To(ContainSubstring("error ensuring compute instances")) }) }) + + Describe("Spot VM functionality", func() { + It("creates spot VMs when spot flag is enabled", func() { + csEnv.Spot = true + + ipResp := makeRunningInstance("10.0.0.x", "1.2.3.x") + mockGetInstanceNotFoundThenRunning(gc, csEnv.ProjectID, csEnv.Zone, ipResp, 9) + + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Times(9) + + // Verify CreateInstance is called with SPOT provisioning model + gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.MatchedBy(func(instance *computepb.Instance) bool { + return instance.Scheduling != nil && + instance.Scheduling.ProvisioningModel != nil && + *instance.Scheduling.ProvisioningModel == "SPOT" + })).Return(nil).Times(9) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("falls back to standard VM when spot capacity is exhausted", func() { + csEnv.Spot = true + + ipResp := makeRunningInstance("10.0.0.x", "1.2.3.x") + mockGetInstanceNotFoundThenRunning(gc, csEnv.ProjectID, csEnv.Zone, ipResp, 9) + + fw.EXPECT().ReadFile(mock.Anything).Return([]byte("ssh-rsa AAA..."), nil).Times(9) + + createCalls := make(map[string]int) + var mu sync.Mutex + gc.EXPECT().CreateInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone string, instance *computepb.Instance) error { + mu.Lock() + defer mu.Unlock() + name := *instance.Name + createCalls[name]++ + if createCalls[name] == 1 { + return fmt.Errorf("ZONE_RESOURCE_POOL_EXHAUSTED") + } + return nil + }).Times(18) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("restarts stopped VMs instead of creating new ones", func() { + instanceCalls := make(map[string]int) + var mu sync.Mutex + stoppedResp := makeStoppedInstance("10.0.0.x", "1.2.3.x") + runningResp := makeRunningInstance("10.0.0.x", "1.2.3.x") + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + // First call, VM exists but is stopped + return stoppedResp, nil + } + // After StartInstance, VM is running + return runningResp, nil + }).Times(18) + + gc.EXPECT().StartInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil).Times(9) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("uses existing running VMs without starting them", func() { + runningResp := makeRunningInstance("10.0.0.x", "1.2.3.x") + // 9 VMs × 2 GetInstance calls each (initial check + waitForInstanceRunning poll) + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(runningResp, nil).Times(18) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("handles VMs in intermediate states (STAGING/PROVISIONING)", func() { + instanceCalls := make(map[string]int) + var mu sync.Mutex + stagingResp := makeInstance("STAGING", "10.0.0.x", "1.2.3.x") + runningResp := makeRunningInstance("10.0.0.x", "1.2.3.x") + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + // First call: instance exists but is still staging + return stagingResp, nil + } + // Second call via waitForInstanceRunning: now running + return runningResp, nil + }).Times(18) + + err := bs.EnsureComputeInstances() + Expect(err).NotTo(HaveOccurred()) + }) + + It("fails when StartInstance returns an error", func() { + stoppedResp := makeStoppedInstance("10.0.0.x", "1.2.3.x") + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(stoppedResp, nil).Maybe() + gc.EXPECT().StartInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(fmt.Errorf("start error")).Maybe() + + err := bs.EnsureComputeInstances() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to start stopped instance")) + }) + + It("fails when initial GetInstance returns a non-NotFound error", func() { + gc.EXPECT().GetInstance(csEnv.ProjectID, csEnv.Zone, mock.Anything).Return(nil, fmt.Errorf("permission denied")).Maybe() + + err := bs.EnsureComputeInstances() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to get instance")) + }) + }) }) Describe("EnsureGatewayIPAddresses", func() { diff --git a/internal/bootstrap/gcp/mocks.go b/internal/bootstrap/gcp/mocks.go index 4fd00cb9..ea8c1aa0 100644 --- a/internal/bootstrap/gcp/mocks.go +++ b/internal/bootstrap/gcp/mocks.go @@ -1570,3 +1570,66 @@ func (_c *MockGCPClientManager_RemoveIAMRoleBinding_Call) RunAndReturn(run func( _c.Call.Return(run) return _c } + +// StartInstance provides a mock function for the type MockGCPClientManager +func (_mock *MockGCPClientManager) StartInstance(projectID string, zone string, instanceName string) error { + ret := _mock.Called(projectID, zone, instanceName) + + if len(ret) == 0 { + panic("no return value specified for StartInstance") + } + + var r0 error + if returnFunc, ok := ret.Get(0).(func(string, string, string) error); ok { + r0 = returnFunc(projectID, zone, instanceName) + } else { + r0 = ret.Error(0) + } + return r0 +} + +// MockGCPClientManager_StartInstance_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'StartInstance' +type MockGCPClientManager_StartInstance_Call struct { + *mock.Call +} + +// StartInstance is a helper method to define mock.On call +// - projectID string +// - zone string +// - instanceName string +func (_e *MockGCPClientManager_Expecter) StartInstance(projectID interface{}, zone interface{}, instanceName interface{}) *MockGCPClientManager_StartInstance_Call { + return &MockGCPClientManager_StartInstance_Call{Call: _e.mock.On("StartInstance", projectID, zone, instanceName)} +} + +func (_c *MockGCPClientManager_StartInstance_Call) Run(run func(projectID string, zone string, instanceName string)) *MockGCPClientManager_StartInstance_Call { + _c.Call.Run(func(args mock.Arguments) { + var arg0 string + if args[0] != nil { + arg0 = args[0].(string) + } + var arg1 string + if args[1] != nil { + arg1 = args[1].(string) + } + var arg2 string + if args[2] != nil { + arg2 = args[2].(string) + } + run( + arg0, + arg1, + arg2, + ) + }) + return _c +} + +func (_c *MockGCPClientManager_StartInstance_Call) Return(err error) *MockGCPClientManager_StartInstance_Call { + _c.Call.Return(err) + return _c +} + +func (_c *MockGCPClientManager_StartInstance_Call) RunAndReturn(run func(projectID string, zone string, instanceName string) error) *MockGCPClientManager_StartInstance_Call { + _c.Call.Return(run) + return _c +} diff --git a/internal/bootstrap/gcp/spot_vm_test.go b/internal/bootstrap/gcp/spot_vm_test.go new file mode 100644 index 00000000..c49ae253 --- /dev/null +++ b/internal/bootstrap/gcp/spot_vm_test.go @@ -0,0 +1,286 @@ +// Copyright (c) Codesphere Inc. +// SPDX-License-Identifier: Apache-2.0 + +package gcp_test + +import ( + "fmt" + + "cloud.google.com/go/compute/apiv1/computepb" + "github.com/codesphere-cloud/oms/internal/bootstrap/gcp" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/stretchr/testify/mock" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +var _ = Describe("Spot VM", func() { + + Describe("IsSpotCapacityError", func() { + It("returns false for nil error", func() { + Expect(gcp.IsSpotCapacityError(nil)).To(BeFalse()) + }) + + DescribeTable("detects known capacity errors", + func(err error) { Expect(gcp.IsSpotCapacityError(err)).To(BeTrue()) }, + Entry("gRPC ResourceExhausted", status.Errorf(codes.ResourceExhausted, "resource exhausted")), + Entry("gRPC ResourceExhausted with detail", status.Errorf(codes.ResourceExhausted, "spot VM pool exhausted in us-central1-a")), + Entry("ZONE_RESOURCE_POOL_EXHAUSTED", fmt.Errorf("googleapi: Error 403: ZONE_RESOURCE_POOL_EXHAUSTED - the zone does not have enough resources")), + Entry("UNSUPPORTED_OPERATION", fmt.Errorf("UNSUPPORTED_OPERATION: spot VMs not available in this zone")), + Entry("stockout", fmt.Errorf("stockout in zone us-central1-a")), + Entry("does not have enough resources", fmt.Errorf("the zone 'us-central1-a' does not have enough resources available to fulfill the request")), + ) + + DescribeTable("rejects non-capacity errors", + func(err error) { Expect(gcp.IsSpotCapacityError(err)).To(BeFalse()) }, + Entry("NotFound", status.Errorf(codes.NotFound, "not found")), + Entry("PermissionDenied", status.Errorf(codes.PermissionDenied, "denied")), + Entry("Internal", status.Errorf(codes.Internal, "internal")), + Entry("Unavailable", status.Errorf(codes.Unavailable, "service unavailable")), + Entry("permission denied string", fmt.Errorf("permission denied")), + Entry("invalid argument string", fmt.Errorf("invalid argument")), + Entry("quota exceeded string", fmt.Errorf("quota exceeded")), + Entry("network error string", fmt.Errorf("network error")), + ) + }) + + Describe("BuildSchedulingConfig", func() { + var ( + bs *gcp.GCPBootstrapper + csEnv *gcp.CodesphereEnvironment + ) + + BeforeEach(func() { + csEnv = &gcp.CodesphereEnvironment{ + ProjectName: "test", + Region: "us-central1", + Zone: "us-central1-a", + BaseDomain: "example.com", + DNSProjectID: "dns-project", + DNSZoneName: "test-zone", + SecretsDir: "/etc/codesphere/secrets", + DatacenterID: 1, + Experiments: gcp.DefaultExperiments, + } + gc := gcp.NewMockGCPClientManager(GinkgoT()) + bs = newTestBootstrapper(csEnv, gc) + }) + + It("returns spot scheduling when spot is enabled", func() { + csEnv.Spot = true + sched := bs.BuildSchedulingConfig() + Expect(*sched.ProvisioningModel).To(Equal("SPOT")) + Expect(*sched.OnHostMaintenance).To(Equal("TERMINATE")) + Expect(*sched.AutomaticRestart).To(BeFalse()) + Expect(*sched.InstanceTerminationAction).To(Equal("STOP")) + Expect(sched.Preemptible).To(BeNil()) + }) + + It("returns preemptible scheduling when preemptible is enabled", func() { + csEnv.Preemptible = true + sched := bs.BuildSchedulingConfig() + Expect(*sched.Preemptible).To(BeTrue()) + Expect(sched.ProvisioningModel).To(BeNil()) + Expect(sched.OnHostMaintenance).To(BeNil()) + Expect(sched.AutomaticRestart).To(BeNil()) + Expect(sched.InstanceTerminationAction).To(BeNil()) + }) + + It("returns empty scheduling when neither is enabled", func() { + sched := bs.BuildSchedulingConfig() + Expect(sched.ProvisioningModel).To(BeNil()) + Expect(sched.Preemptible).To(BeNil()) + Expect(sched.OnHostMaintenance).To(BeNil()) + Expect(sched.AutomaticRestart).To(BeNil()) + Expect(sched.InstanceTerminationAction).To(BeNil()) + }) + }) + + Describe("validateVMProvisioningOptions", func() { + var csEnv *gcp.CodesphereEnvironment + + BeforeEach(func() { + csEnv = &gcp.CodesphereEnvironment{ + ProjectName: "test", + Region: "us-central1", + Zone: "us-central1-a", + BaseDomain: "example.com", + DNSProjectID: "dns-project", + DNSZoneName: "test-zone", + SecretsDir: "/etc/codesphere/secrets", + DatacenterID: 1, + Experiments: gcp.DefaultExperiments, + InstallConfigPath: "fake-config", + SecretsFilePath: "fake-secrets", + GitHubAppName: "fake-app", + GitHubAppClientID: "fake-id", + GitHubAppClientSecret: "fake-secret", + } + }) + + DescribeTable("succeeds for valid combinations", + func(spot, preemptible bool) { + csEnv.Spot = spot + csEnv.Preemptible = preemptible + gc := gcp.NewMockGCPClientManager(GinkgoT()) + bs := newTestBootstrapper(csEnv, gc) + Expect(bs.ValidateInput()).NotTo(HaveOccurred()) + }, + Entry("only spot", true, false), + Entry("only preemptible", false, true), + Entry("neither", false, false), + ) + + It("fails when both spot and preemptible are set", func() { + csEnv.Spot = true + csEnv.Preemptible = true + gc := gcp.NewMockGCPClientManager(GinkgoT()) + bs := newTestBootstrapper(csEnv, gc) + err := bs.ValidateInput() + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("cannot specify both --spot and --preemptible")) + Expect(err.Error()).To(ContainSubstring("use --spot for the newer spot VM model")) + }) + }) + + Describe("CreateInstanceWithFallback", func() { + var ( + bs *gcp.GCPBootstrapper + csEnv *gcp.CodesphereEnvironment + gc *gcp.MockGCPClientManager + logCh chan string + ) + + BeforeEach(func() { + gc = gcp.NewMockGCPClientManager(GinkgoT()) + csEnv = &gcp.CodesphereEnvironment{ + ProjectName: "test", + ProjectID: "test-pid", + Region: "us-central1", + Zone: "us-central1-a", + BaseDomain: "example.com", + DNSProjectID: "dns-project", + DNSZoneName: "test-zone", + SecretsDir: "/etc/codesphere/secrets", + DatacenterID: 1, + Experiments: gcp.DefaultExperiments, + } + logCh = make(chan string, 10) + bs = newTestBootstrapper(csEnv, gc) + }) + + It("succeeds on first attempt", func() { + instance := &computepb.Instance{Name: protoString("test-vm")} + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", instance).Return(nil) + + Expect(bs.CreateInstanceWithFallback("test-pid", "us-central1-a", instance, "test-vm", logCh)).To(Succeed()) + }) + + It("treats AlreadyExists as success", func() { + instance := &computepb.Instance{Name: protoString("test-vm")} + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", instance). + Return(status.Errorf(codes.AlreadyExists, "already exists")) + + Expect(bs.CreateInstanceWithFallback("test-pid", "us-central1-a", instance, "test-vm", logCh)).To(Succeed()) + }) + + Context("when spot is enabled", func() { + BeforeEach(func() { + csEnv.Spot = true + }) + + spotInstance := func(name string) *computepb.Instance { + return &computepb.Instance{ + Name: protoString(name), + Scheduling: &computepb.Scheduling{ProvisioningModel: protoString("SPOT")}, + } + } + + DescribeTable("falls back to standard VM on capacity errors", + func(capacityErr error) { + instance := spotInstance("test-vm") + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", mock.Anything).Return(capacityErr).Once() + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", mock.Anything).Return(nil).Once() + + Expect(bs.CreateInstanceWithFallback("test-pid", "us-central1-a", instance, "test-vm", logCh)).To(Succeed()) + Expect(logCh).To(Receive(ContainSubstring("falling back to standard VM"))) + }, + Entry("gRPC ResourceExhausted", status.Errorf(codes.ResourceExhausted, "exhausted")), + Entry("ZONE_RESOURCE_POOL_EXHAUSTED", fmt.Errorf("ZONE_RESOURCE_POOL_EXHAUSTED")), + Entry("UNSUPPORTED_OPERATION", fmt.Errorf("UNSUPPORTED_OPERATION")), + Entry("stockout", fmt.Errorf("stockout in zone")), + Entry("does not have enough resources", fmt.Errorf("zone does not have enough resources")), + ) + + It("clears scheduling config on fallback", func() { + instance := spotInstance("test-vm") + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", mock.Anything). + Return(fmt.Errorf("ZONE_RESOURCE_POOL_EXHAUSTED")).Once() + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", mock.MatchedBy(func(inst *computepb.Instance) bool { + return inst.Scheduling != nil && + inst.Scheduling.ProvisioningModel == nil && + inst.Scheduling.Preemptible == nil + })).Return(nil).Once() + + Expect(bs.CreateInstanceWithFallback("test-pid", "us-central1-a", instance, "test-vm", logCh)).To(Succeed()) + }) + + It("returns error with context when fallback also fails", func() { + instance := spotInstance("test-vm") + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", mock.Anything). + Return(fmt.Errorf("ZONE_RESOURCE_POOL_EXHAUSTED")).Once() + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", mock.Anything). + Return(fmt.Errorf("insufficient quota")).Once() + + err := bs.CreateInstanceWithFallback("test-pid", "us-central1-a", instance, "test-vm", logCh) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("fallback to standard VM")) + Expect(err.Error()).To(ContainSubstring("insufficient quota")) + }) + + It("does NOT fall back on non-capacity errors", func() { + instance := spotInstance("test-vm") + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", mock.Anything). + Return(fmt.Errorf("permission denied")).Once() + + err := bs.CreateInstanceWithFallback("test-pid", "us-central1-a", instance, "test-vm", logCh) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to create instance test-vm")) + Expect(err.Error()).To(ContainSubstring("permission denied")) + Expect(logCh).To(BeEmpty()) + }) + + It("succeeds when fallback retry returns AlreadyExists", func() { + instance := spotInstance("test-vm") + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", mock.Anything). + Return(status.Errorf(codes.ResourceExhausted, "exhausted")).Once() + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", mock.Anything). + Return(status.Errorf(codes.AlreadyExists, "already exists")).Once() + + Expect(bs.CreateInstanceWithFallback("test-pid", "us-central1-a", instance, "test-vm", logCh)).To(Succeed()) + Expect(logCh).To(Receive(ContainSubstring("falling back to standard VM"))) + }) + }) + + Context("when spot is disabled", func() { + It("does not fall back on capacity errors", func() { + instance := &computepb.Instance{Name: protoString("test-vm")} + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", instance). + Return(fmt.Errorf("ZONE_RESOURCE_POOL_EXHAUSTED")) + + err := bs.CreateInstanceWithFallback("test-pid", "us-central1-a", instance, "test-vm", logCh) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("failed to create instance test-vm")) + }) + + It("returns nil for string-based already exists error", func() { + instance := &computepb.Instance{Name: protoString("test-vm")} + gc.EXPECT().CreateInstance("test-pid", "us-central1-a", instance). + Return(fmt.Errorf("The resource 'test-vm' already exists")) + + Expect(bs.CreateInstanceWithFallback("test-pid", "us-central1-a", instance, "test-vm", logCh)).To(Succeed()) + }) + }) + }) +}) diff --git a/internal/bootstrap/gcp/test_helpers_test.go b/internal/bootstrap/gcp/test_helpers_test.go new file mode 100644 index 00000000..7d9c3b40 --- /dev/null +++ b/internal/bootstrap/gcp/test_helpers_test.go @@ -0,0 +1,91 @@ +// Copyright (c) Codesphere Inc. +// SPDX-License-Identifier: Apache-2.0 + +package gcp_test + +import ( + "context" + "sync" + + "cloud.google.com/go/compute/apiv1/computepb" + "github.com/codesphere-cloud/oms/internal/bootstrap" + "github.com/codesphere-cloud/oms/internal/bootstrap/gcp" + "github.com/codesphere-cloud/oms/internal/env" + "github.com/codesphere-cloud/oms/internal/github" + "github.com/codesphere-cloud/oms/internal/installer" + "github.com/codesphere-cloud/oms/internal/installer/node" + "github.com/codesphere-cloud/oms/internal/portal" + "github.com/codesphere-cloud/oms/internal/util" + . "github.com/onsi/ginkgo/v2" + "github.com/stretchr/testify/mock" + "google.golang.org/api/googleapi" +) + +func protoString(s string) *string { return &s } + +// makeInstance creates a computepb.Instance with the given status and IPs. +func makeInstance(status, internalIP, externalIP string) *computepb.Instance { + inst := &computepb.Instance{ + Status: protoString(status), + NetworkInterfaces: []*computepb.NetworkInterface{ + { + NetworkIP: protoString(internalIP), + }, + }, + } + if externalIP != "" { + inst.NetworkInterfaces[0].AccessConfigs = []*computepb.AccessConfig{ + {NatIP: protoString(externalIP)}, + } + } + return inst +} + +// makeRunningInstance creates a RUNNING instance with both IPs assigned. +func makeRunningInstance(internalIP, externalIP string) *computepb.Instance { + return makeInstance("RUNNING", internalIP, externalIP) +} + +// makeStoppedInstance creates a TERMINATED instance with IPs from its last run. +func makeStoppedInstance(internalIP, externalIP string) *computepb.Instance { + return makeInstance("TERMINATED", internalIP, externalIP) +} + +// mockGetInstanceNotFoundThenRunning sets up a GetInstance mock where the first call per VM +// returns a 404 "not found" error and subsequent calls return the given running instance. +// The expected total call count is 2 × numVMs. +func mockGetInstanceNotFoundThenRunning(gc *gcp.MockGCPClientManager, projectID, zone string, runningResp *computepb.Instance, numVMs int) { + instanceCalls := make(map[string]int) + var mu sync.Mutex + gc.EXPECT().GetInstance(projectID, zone, mock.Anything).RunAndReturn(func(projectID, zone, name string) (*computepb.Instance, error) { + mu.Lock() + defer mu.Unlock() + instanceCalls[name]++ + if instanceCalls[name] == 1 { + return nil, &googleapi.Error{Code: 404, Message: "not found"} + } + return runningResp, nil + }).Times(numVMs * 2) +} + +// newTestBootstrapper creates a GCPBootstrapper with the given environment and GCP client mock. +// All other dependencies use fresh mocks. +func newTestBootstrapper(csEnv *gcp.CodesphereEnvironment, gc gcp.GCPClientManager) *gcp.GCPBootstrapper { + bs, err := gcp.NewGCPBootstrapper( + context.Background(), + env.NewEnv(), + bootstrap.NewStepLogger(false), + csEnv, + installer.NewMockInstallConfigManager(GinkgoT()), + gc, + util.NewMockFileIO(GinkgoT()), + node.NewMockNodeClient(GinkgoT()), + portal.NewMockPortal(GinkgoT()), + util.NewFakeTime(), + github.NewMockGitHubClient(GinkgoT()), + ) + if err != nil { + panic("newTestBootstrapper: " + err.Error()) + } + return bs +} diff --git a/internal/tmpl/NOTICE b/internal/tmpl/NOTICE index 2607ce1a..e1b54f8c 100644 --- a/internal/tmpl/NOTICE +++ b/internal/tmpl/NOTICE @@ -929,9 +929,9 @@ License URL: https://github.com/helm/helm/blob/v4.1.3/LICENSE ---------- Module: k8s.io/api -Version: v0.35.2 +Version: v0.35.3 License: Apache-2.0 -License URL: https://github.com/kubernetes/api/blob/v0.35.2/LICENSE +License URL: https://github.com/kubernetes/api/blob/v0.35.3/LICENSE ---------- Module: k8s.io/apiextensions-apiserver/pkg/apis/apiextensions @@ -941,15 +941,15 @@ License URL: https://github.com/kubernetes/apiextensions-apiserver/blob/v0.35.1/ ---------- Module: k8s.io/apimachinery/pkg -Version: v0.35.2 +Version: v0.35.3 License: Apache-2.0 -License URL: https://github.com/kubernetes/apimachinery/blob/v0.35.2/LICENSE +License URL: https://github.com/kubernetes/apimachinery/blob/v0.35.3/LICENSE ---------- Module: k8s.io/apimachinery/third_party/forked/golang -Version: v0.35.2 +Version: v0.35.3 License: BSD-3-Clause -License URL: https://github.com/kubernetes/apimachinery/blob/v0.35.2/third_party/forked/golang/LICENSE +License URL: https://github.com/kubernetes/apimachinery/blob/v0.35.3/third_party/forked/golang/LICENSE ---------- Module: k8s.io/apiserver/pkg/endpoints/deprecation @@ -965,15 +965,15 @@ License URL: https://github.com/kubernetes/cli-runtime/blob/v0.35.2/LICENSE ---------- Module: k8s.io/client-go -Version: v0.35.2 +Version: v0.35.3 License: Apache-2.0 -License URL: https://github.com/kubernetes/client-go/blob/v0.35.2/LICENSE +License URL: https://github.com/kubernetes/client-go/blob/v0.35.3/LICENSE ---------- Module: k8s.io/client-go/third_party/forked/golang/template -Version: v0.35.2 +Version: v0.35.3 License: BSD-3-Clause -License URL: https://github.com/kubernetes/client-go/blob/v0.35.2/third_party/forked/golang/LICENSE +License URL: https://github.com/kubernetes/client-go/blob/v0.35.3/third_party/forked/golang/LICENSE ---------- Module: k8s.io/component-base/version