diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 000000000..ecf0920c3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: "[Feature]" +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d045d76c0..dd9027ca2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,7 @@ on: paths-ignore: [docs/**, "**.md", "**.mdx", "**.png", "**.jpg"] env: - GO_VERSION: '1.24.13' + GO_VERSION: '1.25.8' CERT_MANAGER_VERSION: 'v1.16.2' jobs: diff --git a/.github/workflows/code-lint.yml b/.github/workflows/code-lint.yml index deeced9ec..17602a79c 100644 --- a/.github/workflows/code-lint.yml +++ b/.github/workflows/code-lint.yml @@ -14,7 +14,7 @@ on: env: # Common versions - GO_VERSION: "1.24.13" + GO_VERSION: "1.25.8" jobs: detect-noop: @@ -79,7 +79,7 @@ jobs: uses: actions/checkout@v6.0.2 - name: Set up Helm - uses: azure/setup-helm@v4 + uses: azure/setup-helm@v5 with: version: v3.17.0 diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index ee9d7ae79..eab1c9697 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: 
step-security/harden-runner@58077d3c7e43986b6b15fba718e8ea69e387dfcc # v2.15.1 + uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594 # v2.16.0 with: egress-policy: audit diff --git a/.github/workflows/pr-title-lint.yml b/.github/workflows/pr-title-lint.yml index c03a0a099..b7fb5ee63 100644 --- a/.github/workflows/pr-title-lint.yml +++ b/.github/workflows/pr-title-lint.yml @@ -1,4 +1,9 @@ name: PR Title Checker + +permissions: + contents: read + pull-requests: write + on: pull_request: types: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 000000000..e69de29bb diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 11aeafe4b..b5cf2b8a6 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -18,7 +18,7 @@ env: MEMBER_AGENT_IMAGE_NAME: member-agent REFRESH_TOKEN_IMAGE_NAME: refresh-token - GO_VERSION: '1.24.13' + GO_VERSION: '1.25.8' jobs: export-registry: diff --git a/.github/workflows/upgrade.yml b/.github/workflows/upgrade.yml index 11a4f8c92..b3e3150d4 100644 --- a/.github/workflows/upgrade.yml +++ b/.github/workflows/upgrade.yml @@ -17,7 +17,7 @@ on: paths-ignore: [docs/**, "**.md", "**.mdx", "**.png", "**.jpg"] env: - GO_VERSION: '1.24.13' + GO_VERSION: '1.25.8' jobs: detect-noop: diff --git a/.golangci.yml b/.golangci.yml index 556731073..f6d620d1d 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,6 +1,6 @@ run: timeout: 15m - go: '1.24.13' + go: '1.25.8' linters-settings: stylecheck: diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 8772074cf..7afe7c346 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -1,7 +1,10 @@ # The KubeFleet Maintainers -| Maintainer | Organization | GitHub Username | -|-------------|--------------|-------------------------------------------------------| -| Ryan Zhang | Microsoft | [@ryanzhang-oss](https://github.com/ryanzhang-oss) | -| Zhiying Lin | Microsoft | [@zhiying-lin](https://github.com/zhiying-lin) | 
-| Chen Yu | Microsoft | [@michaelawyu](https://github.com/michaelawyu) | +| Maintainer | Organization | GitHub Username | +|----------------|--------------|----------------------------------------------------| +| Ryan Zhang | Microsoft | [@ryanzhang-oss](https://github.com/ryanzhang-oss) | +| Zhiying Lin | Microsoft | [@zhiying-lin](https://github.com/zhiying-lin) | +| Chen Yu | Microsoft | [@michaelawyu](https://github.com/michaelawyu) | +| Wei Weng | Microsoft | [@weng271190436](https://github.com/weng271190436) | +| Yetkin Timocin | Microsoft | [@ytimocin](https://github.com/ytimocin) | +| Simon Waight | Microsoft | [@sjwaight](https://github.com/sjwaight) | diff --git a/Makefile b/Makefile index 65adfb0a1..ff95693da 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ TOOLS_BIN_DIR := $(abspath $(TOOLS_DIR)/bin) # Binaries # Note: Need to use abspath so we can invoke these from subdirectories -CONTROLLER_GEN_VER := v0.16.0 +CONTROLLER_GEN_VER := v0.20.0 CONTROLLER_GEN_BIN := controller-gen CONTROLLER_GEN := $(abspath $(TOOLS_BIN_DIR)/$(CONTROLLER_GEN_BIN)-$(CONTROLLER_GEN_VER)) diff --git a/README.md b/README.md index a43d46e21..73e5ffaa6 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,11 @@ You can reach the KubeFleet community and developers via the following channels: ## Community Meetings +March 2026: we're currently revamping our community call schedule and will have more to share soon. + +Future plans will land on our [community repository](https://github.com/kubefleet-dev/community). + + + ## Code of Conduct Participation in KubeFleet is governed by the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/master/code-of-conduct.md). See the [Code of Conduct](CODE_OF_CONDUCT.md) for more information. 
diff --git a/ROADMAP.md b/ROADMAP.md index 051b014a7..6ce061174 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -1,7 +1,6 @@ # KubeFleet Roadmap -## Project Website -- Setup the project website +For our up-to-date roadmap, please see: [KubeFleet Roadmap](https://github.com/orgs/kubefleet-dev/projects/4) ## Support more cluster properties so that user can pick the right cluster for their workload - Support node level SKU as properties, e.g. CPU, GPU, Memory, etc diff --git a/SUPPORT.md b/SUPPORT.md index e5a8966d0..de0bbb146 100644 --- a/SUPPORT.md +++ b/SUPPORT.md @@ -2,9 +2,7 @@ ## How to file issues and get help -This project uses GitHub Issues to track bugs and feature requests. Please search the existing -issues before filing new issues to avoid duplicates. For new issues, file your bug or -feature request as a new Issue. +This project uses GitHub Issues to track bugs and feature requests. Please search the existing issues before filing new issues to avoid duplicates. For new issues, file your bug or feature request as a new Issue. For help and questions about using this project, please diff --git a/charts/member-agent/README.md b/charts/member-agent/README.md index 027f95357..ccbcd1bc4 100644 --- a/charts/member-agent/README.md +++ b/charts/member-agent/README.md @@ -69,6 +69,7 @@ helm upgrade member-agent kubefleet/member-agent --namespace fleet-system | logVerbosity | Log level. 
Uses V logs (klog) | `3` | | propertyProvider | The property provider to use with the member agent; if none is specified, the Fleet member agent will start with no property provider (i.e., the agent will expose no cluster properties, and collect only limited resource usage information) | `` | | region | The region where the member cluster resides | `` | +| enableNamespaceCollectionInPropertyProvider | Enable namespace collection in the property provider; when enabled, the member agent will collect and report the list of namespaces present in the member cluster to the hub cluster for use in scheduling decisions | `false` | | workApplierRequeueRateLimiterAttemptsWithFixedDelay | This parameter is a set of values to control how frequent KubeFleet should reconcile (processed) manifests; it specifies then number of attempts to requeue with fixed delay before switching to exponential backoff | `1` | | workApplierRequeueRateLimiterFixedDelaySeconds | This parameter is a set of values to control how frequent KubeFleet should reconcile (process) manifests; it specifies the fixed delay in seconds for initial requeue attempts | `5` | | workApplierRequeueRateLimiterExponentialBaseForSlowBackoff | This parameter is a set of values to control how frequent KubeFleet should reconcile (process) manifests; it specifies the exponential base for the slow backoff stage | `1.2` | diff --git a/charts/member-agent/templates/deployment.yaml b/charts/member-agent/templates/deployment.yaml index 6aad28fdd..be57774c6 100644 --- a/charts/member-agent/templates/deployment.yaml +++ b/charts/member-agent/templates/deployment.yaml @@ -82,6 +82,9 @@ spec: - --work-applier-priority-linear-equation-coeff-a={{ .Values.priorityQueue.priorityLinearEquationCoeffA }} - --work-applier-priority-linear-equation-coeff-b={{ .Values.priorityQueue.priorityLinearEquationCoeffB }} {{- end }} + {{- if .Values.enableNamespaceCollectionInPropertyProvider }} + - --enable-namespace-collection-in-property-provider={{ 
.Values.enableNamespaceCollectionInPropertyProvider }} + {{- end }} env: - name: HUB_SERVER_URL value: "{{ .Values.config.hubURL }}" diff --git a/charts/member-agent/values.yaml b/charts/member-agent/values.yaml index eab5ca2f2..31ca2383c 100644 --- a/charts/member-agent/values.yaml +++ b/charts/member-agent/values.yaml @@ -76,3 +76,5 @@ priorityQueue: enabled: false priorityLinearEquationCoeffA: -3 priorityLinearEquationCoeffB: 100 + +enableNamespaceCollectionInPropertyProvider: false diff --git a/cmd/memberagent/main.go b/cmd/memberagent/main.go index d96b33004..c5ca386d0 100644 --- a/cmd/memberagent/main.go +++ b/cmd/memberagent/main.go @@ -50,6 +50,7 @@ import ( clusterv1beta1 "go.goms.io/fleet/apis/cluster/v1beta1" placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" + "go.goms.io/fleet/cmd/memberagent/options" imcv1beta1 "go.goms.io/fleet/pkg/controllers/internalmembercluster/v1beta1" "go.goms.io/fleet/pkg/controllers/workapplier" "go.goms.io/fleet/pkg/propertyprovider" @@ -66,52 +67,7 @@ const ( ) var ( - scheme = runtime.NewScheme() - useCertificateAuth = flag.Bool("use-ca-auth", false, "Use key and certificate to authenticate the member agent.") - tlsClientInsecure = flag.Bool("tls-insecure", false, "Enable TLSClientConfig.Insecure property. Enabling this will make the connection inSecure (should be 'true' for testing purpose only.)") - hubProbeAddr = flag.String("hub-health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") - hubMetricsAddr = flag.String("hub-metrics-bind-address", ":8080", "The address the metric endpoint binds to.") - probeAddr = flag.String("health-probe-bind-address", ":8091", "The address the probe endpoint binds to.") - metricsAddr = flag.String("metrics-bind-address", ":8090", "The address the metric endpoint binds to.") - enableLeaderElection = flag.Bool("leader-elect", false, - "Enable leader election for controller manager. 
Enabling this will ensure there is only one active controller manager.") - leaderElectionNamespace = flag.String("leader-election-namespace", "kube-system", "The namespace in which the leader election resource will be created.") - // TODO(weiweng): only keep enableV1Alpha1APIs for backward compatibility with helm charts. Remove soon. - enableV1Alpha1APIs = flag.Bool("enable-v1alpha1-apis", false, "If set, the agents will watch for the v1alpha1 APIs. This is deprecated and will be removed soon.") - enableV1Beta1APIs = flag.Bool("enable-v1beta1-apis", true, "If set, the agents will watch for the v1beta1 APIs.") - propertyProvider = flag.String("property-provider", "none", "The property provider to use for the agent.") - region = flag.String("region", "", "The region where the member cluster resides.") - cloudConfigFile = flag.String("cloud-config", "/etc/kubernetes/provider/config.json", "The path to the cloud cloudconfig file.") - deletionWaitTime = flag.Int("deletion-wait-time", 5, "The time the work-applier will wait for work object to be deleted before updating the applied work owner reference") - enablePprof = flag.Bool("enable-pprof", false, "enable pprof profiling") - pprofPort = flag.Int("pprof-port", 6065, "port for pprof profiling") - hubPprofPort = flag.Int("hub-pprof-port", 6066, "port for hub pprof profiling") - hubQPS = flag.Float64("hub-api-qps", 50, "QPS to use while talking with fleet-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.") - hubBurst = flag.Int("hub-api-burst", 500, "Burst to use while talking with fleet-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.") - memberQPS = flag.Float64("member-api-qps", 250, "QPS to use while talking with fleet-apiserver. 
Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.") - memberBurst = flag.Int("member-api-burst", 1000, "Burst to use while talking with fleet-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.") - - // Work applier requeue rate limiter settings. - workApplierRequeueRateLimiterAttemptsWithFixedDelay = flag.Int("work-applier-requeue-rate-limiter-attempts-with-fixed-delay", 1, "If set, the work applier will requeue work objects with a fixed delay for the specified number of attempts before switching to exponential backoff.") - workApplierRequeueRateLimiterFixedDelaySeconds = flag.Float64("work-applier-requeue-rate-limiter-fixed-delay-seconds", 5.0, "If set, the work applier will requeue work objects with this fixed delay in seconds for the specified number of attempts before switching to exponential backoff.") - workApplierRequeueRateLimiterExponentialBaseForSlowBackoff = flag.Float64("work-applier-requeue-rate-limiter-exponential-base-for-slow-backoff", 1.2, "If set, the work applier will start to back off slowly at this factor after it finished requeueing with fixed delays, until it reaches the slow backoff delay cap. 
Its value should be larger than 1.0 and no larger than 100.0") - workApplierRequeueRateLimiterInitialSlowBackoffDelaySeconds = flag.Float64("work-applier-requeue-rate-limiter-initial-slow-backoff-delay-seconds", 2, "If set, the work applier will start to back off slowly at this delay in seconds.") - workApplierRequeueRateLimiterMaxSlowBackoffDelaySeconds = flag.Float64("work-applier-requeue-rate-limiter-max-slow-backoff-delay-seconds", 15, "If set, the work applier will not back off longer than this value in seconds when it is in the slow backoff stage.") - workApplierRequeueRateLimiterExponentialBaseForFastBackoff = flag.Float64("work-applier-requeue-rate-limiter-exponential-base-for-fast-backoff", 1.5, "If set, the work applier will start to back off fast at this factor after it completes the slow backoff stage, until it reaches the fast backoff delay cap. Its value should be larger than the base value for the slow backoff stage.") - workApplierRequeueRateLimiterMaxFastBackoffDelaySeconds = flag.Float64("work-applier-requeue-rate-limiter-max-fast-backoff-delay-seconds", 900, "If set, the work applier will not back off longer than this value in seconds when it is in the fast backoff stage.") - workApplierRequeueRateLimiterSkipToFastBackoffForAvailableOrDiffReportedWorkObjs = flag.Bool("work-applier-requeue-rate-limiter-skip-to-fast-backoff-for-available-or-diff-reported-work-objs", true, "If set, the rate limiter will skip the slow backoff stage and start fast backoff immediately for work objects that are available or have diff reported.") - - // Property feature gates when the property provider is not none. - enableNamespaceCollectionInPropertyProvider = flag.Bool("enable-namespace-collection-in-property-provider", false, "If set, the property provider will collect the namespaces in the member cluster.") - - // Work applier priority queue settings. 
- enableWorkApplierPriorityQueue = flag.Bool("enable-work-applier-priority-queue", false, "If set, the work applier will use a priority queue to process work objects.") - workApplierPriorityLinearEquationCoeffA = flag.Int("work-applier-priority-linear-equation-coeff-a", -3, "The work applier sets the priority for a Work object processing attempt using the linear equation: priority = A * (work object age in minutes) + B. This flag sets the coefficient A in the equation.") - workApplierPriorityLinearEquationCoeffB = flag.Int("work-applier-priority-linear-equation-coeff-b", 100, "The work applier sets the priority for a Work object processing attempt using the linear equation: priority = A * (work object age in minutes) + B. This flag sets the coefficient B in the equation.") - - // Azure property provider feature gates. - isAzProviderCostPropertiesEnabled = flag.Bool("use-cost-properties-in-azure-provider", true, "If set, the Azure property provider will expose cost properties in the member cluster.") - isAzProviderAvailableResPropertiesEnabled = flag.Bool("use-available-res-properties-in-azure-provider", true, "If set, the Azure property provider will expose available resources properties in the member cluster.") + scheme = runtime.NewScheme() ) func init() { @@ -124,6 +80,9 @@ func init() { } func main() { + opts := options.NewOptions() + opts.AddFlags(flag.CommandLine) + flag.Parse() utilrand.Seed(time.Now().UnixNano()) defer klog.Flush() @@ -135,32 +94,19 @@ func main() { // Set up controller-runtime logger ctrl.SetLogger(zap.New(zap.UseDevMode(true))) - // Validate flags - if !*enableV1Alpha1APIs && !*enableV1Beta1APIs { - klog.ErrorS(errors.New("either enable-v1alpha1-apis or enable-v1beta1-apis is required"), "Invalid APIs flags") - klog.FlushAndExit(klog.ExitFlushTimeout, 1) - } - // TO-DO (chenyu1): refactor the validation logic. 
- if workApplierPriorityLinearEquationCoeffA == nil || *workApplierPriorityLinearEquationCoeffA >= 0 { - klog.ErrorS(errors.New("parameter workApplierPriorityLinearEquationCoeffA is set incorrectly; must use a value less than 0"), "InvalidFlag", "workApplierPriorityLinearEquationCoeffA") - } - if workApplierPriorityLinearEquationCoeffB == nil || *workApplierPriorityLinearEquationCoeffB <= 0 { - klog.ErrorS(errors.New("parameter workApplierPriorityLinearEquationCoeffB is set incorrectly; must use a value greater than 0"), "InvalidFlag", "workApplierPriorityLinearEquationCoeffB") - } - hubURL := os.Getenv("HUB_SERVER_URL") if hubURL == "" { klog.ErrorS(errors.New("hub server api cannot be empty"), "Failed to read URL for the hub cluster") klog.FlushAndExit(klog.ExitFlushTimeout, 1) } - hubConfig, err := buildHubConfig(hubURL, *useCertificateAuth, *tlsClientInsecure) - hubConfig.QPS = float32(*hubQPS) - hubConfig.Burst = *hubBurst + hubConfig, err := buildHubConfig(hubURL, opts.HubConnectivityOpts.UseCertificateAuth, opts.HubConnectivityOpts.UseInsecureTLSClient) if err != nil { klog.ErrorS(err, "Failed to build Kubernetes client configuration for the hub cluster") klog.FlushAndExit(klog.ExitFlushTimeout, 1) } + hubConfig.QPS = float32(opts.CtrlManagerOptions.HubManagerOpts.QPS) + hubConfig.Burst = opts.CtrlManagerOptions.HubManagerOpts.Burst mcName := os.Getenv("MEMBER_CLUSTER_NAME") if mcName == "" { @@ -171,20 +117,23 @@ func main() { mcNamespace := fmt.Sprintf(utils.NamespaceNameFormat, mcName) memberConfig := ctrl.GetConfigOrDie() - memberConfig.QPS = float32(*memberQPS) - memberConfig.Burst = *memberBurst + memberConfig.QPS = float32(opts.CtrlManagerOptions.MemberManagerOpts.QPS) + memberConfig.Burst = opts.CtrlManagerOptions.MemberManagerOpts.Burst // we place the leader election lease on the member cluster to avoid adding load to the hub hubOpts := ctrl.Options{ Scheme: scheme, Metrics: metricsserver.Options{ - BindAddress: *hubMetricsAddr, + BindAddress: 
opts.CtrlManagerOptions.HubManagerOpts.MetricsBindAddress, }, WebhookServer: webhook.NewServer(webhook.Options{ Port: 8443, }), - HealthProbeBindAddress: *hubProbeAddr, - LeaderElection: *enableLeaderElection, - LeaderElectionNamespace: *leaderElectionNamespace, + HealthProbeBindAddress: opts.CtrlManagerOptions.HubManagerOpts.HealthProbeBindAddress, + LeaderElection: opts.CtrlManagerOptions.LeaderElectionOpts.LeaderElect, + LeaderElectionNamespace: opts.CtrlManagerOptions.LeaderElectionOpts.ResourceNamespace, + LeaseDuration: &opts.CtrlManagerOptions.LeaderElectionOpts.LeaseDuration.Duration, + RenewDeadline: &opts.CtrlManagerOptions.LeaderElectionOpts.RenewDeadline.Duration, + RetryPeriod: &opts.CtrlManagerOptions.LeaderElectionOpts.RetryPeriod.Duration, LeaderElectionConfig: memberConfig, LeaderElectionID: "136224848560.hub.fleet.azure.com", Cache: cache.Options{ @@ -203,23 +152,26 @@ func main() { memberOpts := ctrl.Options{ Scheme: scheme, Metrics: metricsserver.Options{ - BindAddress: *metricsAddr, + BindAddress: opts.CtrlManagerOptions.MemberManagerOpts.MetricsBindAddress, }, WebhookServer: webhook.NewServer(webhook.Options{ Port: 9443, }), - HealthProbeBindAddress: *probeAddr, + HealthProbeBindAddress: opts.CtrlManagerOptions.MemberManagerOpts.HealthProbeBindAddress, LeaderElection: hubOpts.LeaderElection, - LeaderElectionNamespace: *leaderElectionNamespace, + LeaderElectionNamespace: opts.CtrlManagerOptions.LeaderElectionOpts.ResourceNamespace, + LeaseDuration: &opts.CtrlManagerOptions.LeaderElectionOpts.LeaseDuration.Duration, + RenewDeadline: &opts.CtrlManagerOptions.LeaderElectionOpts.RenewDeadline.Duration, + RetryPeriod: &opts.CtrlManagerOptions.LeaderElectionOpts.RetryPeriod.Duration, LeaderElectionID: "136224848560.member.fleet.azure.com", } //+kubebuilder:scaffold:builder - if *enablePprof { - memberOpts.PprofBindAddress = fmt.Sprintf(":%d", *pprofPort) - hubOpts.PprofBindAddress = fmt.Sprintf(":%d", *hubPprofPort) + if 
opts.CtrlManagerOptions.EnablePprof { + memberOpts.PprofBindAddress = fmt.Sprintf(":%d", opts.CtrlManagerOptions.MemberManagerOpts.PprofPort) + hubOpts.PprofBindAddress = fmt.Sprintf(":%d", opts.CtrlManagerOptions.HubManagerOpts.PprofPort) } - if err := Start(ctrl.SetupSignalHandler(), hubConfig, memberConfig, hubOpts, memberOpts); err != nil { + if err := Start(ctrl.SetupSignalHandler(), hubConfig, memberConfig, hubOpts, memberOpts, *opts); err != nil { klog.ErrorS(err, "Failed to start the controllers for the member agent") klog.FlushAndExit(klog.ExitFlushTimeout, 1) } @@ -316,7 +268,7 @@ func buildHubConfig(hubURL string, useCertificateAuth bool, tlsClientInsecure bo } // Start the member controllers with the supplied config -func Start(ctx context.Context, hubCfg, memberConfig *rest.Config, hubOpts, memberOpts ctrl.Options) error { +func Start(ctx context.Context, hubCfg, memberConfig *rest.Config, hubOpts, memberOpts ctrl.Options, globalOpts options.Options) error { hubMgr, err := ctrl.NewManager(hubCfg, hubOpts) if err != nil { return fmt.Errorf("unable to start hub manager: %w", err) @@ -375,116 +327,112 @@ func Start(ctx context.Context, hubCfg, memberConfig *rest.Config, hubOpts, memb } discoverClient := discovery.NewDiscoveryClientForConfigOrDie(memberConfig) - if *enableV1Alpha1APIs { - // TODO(weiweng): keeping v1alpha1 APIs for backward compatibility with helm charts. Remove soon. - klog.Error("v1alpha1 APIs are no longer supported. Please switch to v1beta1 APIs") - return errors.New("v1alpha1 APIs are no longer supported. Please switch to v1beta1 APIs") + gvk := placementv1beta1.GroupVersion.WithKind(placementv1beta1.AppliedWorkKind) + if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { + klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk) + return err + } + // Set up the work applier. Note that it is referenced by the InternalMemberCluster controller. + + // Set up the requeue rate limiter for the work applier. 
+ // + // With default settings, the rate limiter will: + // * allow 1 attempt of fixed delay; this helps give objects a bit of headroom to get available (or have + // diffs reported). + // * use a fixed delay of 5 seconds for the first attempt. + // + // Important (chenyu1): before the introduction of the requeue rate limiter, the work + // applier uses static requeue intervals, specifically 5 seconds (if the work object is unavailable), + // and 15 seconds (if the work object is available). There are a number of test cases that + // implicitly assume this behavior (e.g., a test case might expect that the availability check completes + // w/in 10 seconds), which is why the rate limiter uses the 5 seconds fixed requeue delay by default. + // If you need to change this value and see that some test cases begin to fail, update the test + // cases accordingly. + // * after completing all attempts with fixed delay, switch to slow exponential backoff with a base of + // 1.2 with an initial delay of 2 seconds and a cap of 15 seconds (12 requeues in total, ~90 seconds in total); + // this is to allow fast checkups in cases where objects are not yet available or have not yet reported diffs. + // * after completing the slow backoff stage, switch to a fast exponential backoff with a base of 1.5 + // with an initial delay of 15 seconds and a cap of 15 minutes (10 requeues in total, ~42 minutes in total). + // * for Work objects that are available or have diffs reported, skip the slow backoff stage and + // start fast backoff immediately. + // + // The requeue pattern is essentially: + // * 1 attempts of requeue with fixed delay (5 seconds); then + // * 12 attempts of requeues with slow exponential backoff (factor of 1.2, ~90 seconds in total); then + // * 10 attempts of requeues with fast exponential backoff (factor of 1.5, ~42 minutes in total); + // * afterwards, requeue with a delay of 15 minutes indefinitely. 
+ requeueRateLimiter := workapplier.NewRequeueMultiStageWithExponentialBackoffRateLimiter( + globalOpts.ApplierOpts.RequeueRateLimiterAttemptsWithFixedDelay, + float64(globalOpts.ApplierOpts.RequeueRateLimiterFixedDelaySeconds), + globalOpts.ApplierOpts.RequeueRateLimiterExponentialBaseForSlowBackoff, + float64(globalOpts.ApplierOpts.RequeueRateLimiterInitialSlowBackoffDelaySeconds), + float64(globalOpts.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds), + globalOpts.ApplierOpts.RequeueRateLimiterExponentialBaseForFastBackoff, + float64(globalOpts.ApplierOpts.RequeueRateLimiterMaxFastBackoffDelaySeconds), + globalOpts.ApplierOpts.RequeueRateLimiterSkipToFastBackoffForAvailableOrDiffReportedWorkObjs, + ) + + workApplier := workapplier.NewReconciler( + "work-applier", + hubMgr.GetClient(), + targetNS, + spokeDynamicClient, + memberMgr.GetClient(), + restMapper, + hubMgr.GetEventRecorderFor("work_applier"), + // The number of concurrent reconcilations. This is set to 5 to boost performance in + // resource processing. + 5, + // Use the default worker count (4) for parallelized manifest processing. + parallelizer.NewParallelizer(parallelizer.DefaultNumOfWorkers), + time.Minute*time.Duration(globalOpts.ApplierOpts.ResourceForceDeletionWaitTimeMinutes), + requeueRateLimiter, + globalOpts.ApplierOpts.EnablePriorityQueue, + &globalOpts.ApplierOpts.PriorityLinearEquationCoEffA, + &globalOpts.ApplierOpts.PriorityLinearEquationCoEffB, + ) + + if err = workApplier.SetupWithManager(hubMgr); err != nil { + klog.ErrorS(err, "Failed to create v1beta1 controller", "controller", "work") + return err } - if *enableV1Beta1APIs { - gvk := placementv1beta1.GroupVersion.WithKind(placementv1beta1.AppliedWorkKind) - if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { - klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk) - return err - } - // Set up the work applier. Note that it is referenced by the InternalMemberCluster controller. 
- - // Set up the requeue rate limiter for the work applier. - // - // With default settings, the rate limiter will: - // * allow 1 attempt of fixed delay; this helps give objects a bit of headroom to get available (or have - // diffs reported). - // * use a fixed delay of 5 seconds for the first attempt. - // - // Important (chenyu1): before the introduction of the requeue rate limiter, the work - // applier uses static requeue intervals, specifically 5 seconds (if the work object is unavailable), - // and 15 seconds (if the work object is available). There are a number of test cases that - // implicitly assume this behavior (e.g., a test case might expect that the availability check completes - // w/in 10 seconds), which is why the rate limiter uses the 5 seconds fixed requeue delay by default. - // If you need to change this value and see that some test cases begin to fail, update the test - // cases accordingly. - // * after completing all attempts with fixed delay, switch to slow exponential backoff with a base of - // 1.2 with an initial delay of 2 seconds and a cap of 15 seconds (12 requeues in total, ~90 seconds in total); - // this is to allow fast checkups in cases where objects are not yet available or have not yet reported diffs. - // * after completing the slow backoff stage, switch to a fast exponential backoff with a base of 1.5 - // with an initial delay of 15 seconds and a cap of 15 minutes (10 requeues in total, ~42 minutes in total). - // * for Work objects that are available or have diffs reported, skip the slow backoff stage and - // start fast backoff immediately. - // - // The requeue pattern is essentially: - // * 1 attempts of requeue with fixed delay (5 seconds); then - // * 12 attempts of requeues with slow exponential backoff (factor of 1.2, ~90 seconds in total); then - // * 10 attempts of requeues with fast exponential backoff (factor of 1.5, ~42 minutes in total); - // * afterwards, requeue with a delay of 15 minutes indefinitely. 
- requeueRateLimiter := workapplier.NewRequeueMultiStageWithExponentialBackoffRateLimiter( - *workApplierRequeueRateLimiterAttemptsWithFixedDelay, - *workApplierRequeueRateLimiterFixedDelaySeconds, - *workApplierRequeueRateLimiterExponentialBaseForSlowBackoff, - *workApplierRequeueRateLimiterInitialSlowBackoffDelaySeconds, - *workApplierRequeueRateLimiterMaxSlowBackoffDelaySeconds, - *workApplierRequeueRateLimiterExponentialBaseForFastBackoff, - *workApplierRequeueRateLimiterMaxFastBackoffDelaySeconds, - *workApplierRequeueRateLimiterSkipToFastBackoffForAvailableOrDiffReportedWorkObjs, - ) - - workApplier := workapplier.NewReconciler( - "work-applier", - hubMgr.GetClient(), - targetNS, - spokeDynamicClient, - memberMgr.GetClient(), - restMapper, - hubMgr.GetEventRecorderFor("work_applier"), - // The number of concurrent reconcilations. This is set to 5 to boost performance in - // resource processing. - 5, - // Use the default worker count (4) for parallelized manifest processing. - parallelizer.NewParallelizer(parallelizer.DefaultNumOfWorkers), - time.Minute*time.Duration(*deletionWaitTime), - requeueRateLimiter, - *enableWorkApplierPriorityQueue, - workApplierPriorityLinearEquationCoeffA, - workApplierPriorityLinearEquationCoeffB, - ) - - if err = workApplier.SetupWithManager(hubMgr); err != nil { - klog.ErrorS(err, "Failed to create v1beta1 controller", "controller", "work") - return err - } - - klog.Info("Setting up the internalMemberCluster v1beta1 controller") - // Set up a provider provider (if applicable). - var pp propertyprovider.PropertyProvider - switch { - case propertyProvider != nil && *propertyProvider == azurePropertyProvider: - klog.V(2).Info("setting up the Azure property provider") - // Note that the property provider, though initialized here, is not started until - // the specific instance wins the leader election. 
- klog.V(1).InfoS("Property Provider is azure, loading cloud config", "cloudConfigFile", *cloudConfigFile) - // TODO (britaniar): load cloud config for Azure property provider. - pp = azure.New(region, *isAzProviderCostPropertiesEnabled, *isAzProviderAvailableResPropertiesEnabled, *enableNamespaceCollectionInPropertyProvider) - default: - // Fall back to not using any property provider if the provided type is none or - // not recognizable. - klog.V(2).Info("no property provider is specified, or the given type is not recognizable; start with no property provider") - pp = nil - } + klog.Info("Setting up the internalMemberCluster v1beta1 controller") + // Set up a provider provider (if applicable). + var pp propertyprovider.PropertyProvider + switch { + case globalOpts.PropertyProviderOpts.Name == azurePropertyProvider: + klog.V(2).Info("setting up the Azure property provider") + // Note that the property provider, though initialized here, is not started until + // the specific instance wins the leader election. + klog.V(1).InfoS("Property Provider is azure, loading cloud config", "cloudConfigFile", globalOpts.PropertyProviderOpts.CloudConfigFilePath) + // TODO (britaniar): load cloud config for Azure property provider. + pp = azure.New( + &globalOpts.PropertyProviderOpts.Region, + globalOpts.PropertyProviderOpts.EnableAzProviderCostProperties, + globalOpts.PropertyProviderOpts.EnableAzProviderAvailableResourceProperties, + globalOpts.PropertyProviderOpts.EnableAzProviderNamespaceCollection) + default: + // Fall back to not using any property provider if the provided type is none or + // not recognizable. + klog.V(2).Info("no property provider is specified, or the given type is not recognizable; start with no property provider") + pp = nil + } - // Set up the IMC controller. 
- imcReconciler, err := imcv1beta1.NewReconciler( - ctx, - hubMgr.GetClient(), - memberMgr.GetConfig(), memberMgr.GetClient(), - workApplier, - pp) - if err != nil { - klog.ErrorS(err, "Failed to create InternalMemberCluster v1beta1 reconciler") - return fmt.Errorf("failed to create InternalMemberCluster v1beta1 reconciler: %w", err) - } - if err := imcReconciler.SetupWithManager(hubMgr, "internalmembercluster-controller"); err != nil { - klog.ErrorS(err, "Failed to set up InternalMemberCluster v1beta1 controller with the controller manager") - return fmt.Errorf("failed to set up InternalMemberCluster v1beta1 controller with the controller manager: %w", err) - } + // Set up the IMC controller. + imcReconciler, err := imcv1beta1.NewReconciler( + ctx, + hubMgr.GetClient(), + memberMgr.GetConfig(), memberMgr.GetClient(), + workApplier, + pp) + if err != nil { + klog.ErrorS(err, "Failed to create InternalMemberCluster v1beta1 reconciler") + return fmt.Errorf("failed to create InternalMemberCluster v1beta1 reconciler: %w", err) + } + if err := imcReconciler.SetupWithManager(hubMgr, "internalmembercluster-controller"); err != nil { + klog.ErrorS(err, "Failed to set up InternalMemberCluster v1beta1 controller with the controller manager") + return fmt.Errorf("failed to set up InternalMemberCluster v1beta1 controller with the controller manager: %w", err) } klog.InfoS("starting hub manager") diff --git a/cmd/memberagent/options/applier.go b/cmd/memberagent/options/applier.go new file mode 100644 index 000000000..cb416c5d8 --- /dev/null +++ b/cmd/memberagent/options/applier.go @@ -0,0 +1,415 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "fmt" + "strconv" +) + +type ApplierOptions struct { + // The time in minutes where the KubeFleet member agent will wait before force deleting all the + // resources from a member cluster if the placement that owns the resources has been removed + // from the hub cluster. + // + // KubeFleet member agent leverages owner references and foreground deletion to ensure that + // when a placement itself is gone, all of its selected resources will be removed as well. However, + // there are situations where Kubernetes might not be able to complete the cascade deletion timely; + // if the agent detects that a deletion has been stuck for a period of time longer than the value + // specified by this option, it will issue direct DELETE calls at the left resources. + ResourceForceDeletionWaitTimeMinutes int + + // The KubeFleet member agent periodically re-processes all resources propagated via a placement. + // This helps ensure that KubeFleet can detect inadvertent changes on processed resources and + // take actions accordingly. 
+ // + // The system uses a rate limiter to regulate the frequency of the periodic re-processing; the + // flow is as follows: + // + // * when a placement is first processed, the system re-queues the placement for re-processing + // at a fixed delay for a few attempts; + // * if the processing outcomes remain the same across attempts, the system will start to + // re-process the placement with a slowly but exponentially increasing delay, until the delay + // reaches a user-configurable maximum value; + // * if the processing outcomes continue to stay the same across attempts, the system will + // start to re-process the placement with a quickly but exponentially increasing delay, until the + // delay reaches a user-configurable maximum value; + // + // In addition, the rate limiter provides an optional shortcut that allows placements that + // have been processed successfully (i.e., all the resources are applied and are found + // to be available, or all the resources have their diffs reported) to skip the slow backoff + // stage and jump to the fast backoff stage directly. + // + // See the options below for further details: + + // The number of attempts that the KubeFleet member agent will re-process a placement with a fixed delay. + RequeueRateLimiterAttemptsWithFixedDelay int + + // The fixed delay in seconds that the KubeFleet member agent will use. + RequeueRateLimiterFixedDelaySeconds int + + // The exponential delay growth base that the KubeFleet member agent will use for the slow backoff stage. + RequeueRateLimiterExponentialBaseForSlowBackoff float64 + + // The initial delay in seconds that the KubeFleet member agent will use for the slow backoff stage. + RequeueRateLimiterInitialSlowBackoffDelaySeconds int + + // The maximum delay in seconds that the KubeFleet member agent will use for the slow backoff stage. 
+ RequeueRateLimiterMaxSlowBackoffDelaySeconds int + + // The exponential delay growth base that the KubeFleet member agent will use for the fast backoff stage. + RequeueRateLimiterExponentialBaseForFastBackoff float64 + + // The max delay in seconds that the KubeFleet member agent will use for the fast backoff stage. + RequeueRateLimiterMaxFastBackoffDelaySeconds int + + // Allow placements that have been processed successfully to skip the slow backoff stage and jump to + // the fast backoff stage directly or not. + RequeueRateLimiterSkipToFastBackoffForAvailableOrDiffReportedWorkObjs bool + + // By default, the KubeFleet member agent processes all placements in a FIFO order. Alternatively, + // one can set up the agent to prioritize placements that are just created or updated; this is most + // helpful in cases where the number of placements per member cluster gets large and the processing + // latencies for newly created/updated placements become a concern. + // + // KubeFleet calculates the priority score for a placement using the formula below: + // + // Pri(placement) = A * (placement age in minutes) + B, + // + // where A is a negative integer and B a positive one, so that the younger the placement is, the + // higher priority it gets. The values of A and B are user-configurable. + // + // See the options below for further details: + + // Enable prioritized processing of newly created/updated placements or not. + EnablePriorityQueue bool + + // The coefficient A in the linear equation for calculating the priority score of a placement. + PriorityLinearEquationCoEffA int + + // The coefficient B in the linear equation for calculating the priority score of a placement. 
+ PriorityLinearEquationCoEffB int +} + +func (o *ApplierOptions) AddFlags(flags *flag.FlagSet) { + flags.Var( + newResForceDeletionWaitTimeMinutesValue(5, &o.ResourceForceDeletionWaitTimeMinutes), + "deletion-wait-time", + "The time in minutes where the KubeFleet member agent will wait before force deleting all the resources from a member cluster if the placement that owns the resources has been removed from the hub cluster. Default is 5 minutes. The value must be in the range [1, 60].") + + flags.BoolVar( + &o.EnablePriorityQueue, + "enable-work-applier-priority-queue", + false, + "Enable prioritized processing of newly created/updated placements or not. Default is false, which means placements will be processed in a FIFO order.") + + flags.Var( + newRequeueAttemptsWithFixedDelayValue(1, &o.RequeueRateLimiterAttemptsWithFixedDelay), + "work-applier-requeue-rate-limiter-attempts-with-fixed-delay", + "The number of attempts with a fixed delay before the KubeFleet member agent switches to exponential backoff when re-processing a placement. Default is 1. The value must be in the range [1, 40].") + + flags.Var( + newRequeueFixedDelaySecondsValue(5, &o.RequeueRateLimiterFixedDelaySeconds), + "work-applier-requeue-rate-limiter-fixed-delay-seconds", + "The fixed delay in seconds for the KubeFleet member agent when re-processing a placement before switching to exponential backoff. Default is 5 seconds. The value must be in the range [2, 300].") + + flags.Var( + newRequeueExpBaseForSlowBackoffValue(1.2, &o.RequeueRateLimiterExponentialBaseForSlowBackoff), + "work-applier-requeue-rate-limiter-exponential-base-for-slow-backoff", + "The exponential delay growth base for the slow backoff stage when the KubeFleet member agent re-processes a placement. Default is 1.2. 
The value must be in the range [1.05, 2].") + + flags.Var( + newRequeueInitSlowBackoffDelaySecondsValue(2, &o.RequeueRateLimiterInitialSlowBackoffDelaySeconds), + "work-applier-requeue-rate-limiter-initial-slow-backoff-delay-seconds", + "The initial delay in seconds for the slow backoff stage when the KubeFleet member agent re-processes a placement. Default is 2 seconds. The value must be no less than 2.") + + flags.Var( + newRequeueMaxSlowBackoffDelaySecondsValue(15, &o.RequeueRateLimiterMaxSlowBackoffDelaySeconds), + "work-applier-requeue-rate-limiter-max-slow-backoff-delay-seconds", + "The maximum delay in seconds for the slow backoff stage when the KubeFleet member agent re-processes a placement. Default is 15 seconds. The value must be no less than 2.") + + flags.Var( + newRequeueExpBaseForFastBackoffValue(1.5, &o.RequeueRateLimiterExponentialBaseForFastBackoff), + "work-applier-requeue-rate-limiter-exponential-base-for-fast-backoff", + "The exponential delay growth base for the fast backoff stage when the KubeFleet member agent re-processes a placement. Default is 1.5. The value must be in the range (1, 2].") + + flags.Var( + newRequeueMaxFastBackoffDelaySecondsValue(900, &o.RequeueRateLimiterMaxFastBackoffDelaySeconds), + "work-applier-requeue-rate-limiter-max-fast-backoff-delay-seconds", + "The maximum delay in seconds for the fast backoff stage when the KubeFleet member agent re-processes a placement. Default is 900 seconds. The value must be in the range (0, 3600].") + + flags.BoolVar( + &o.RequeueRateLimiterSkipToFastBackoffForAvailableOrDiffReportedWorkObjs, + "work-applier-requeue-rate-limiter-skip-to-fast-backoff-for-available-or-diff-reported-work-objs", + true, + "Allow placements that have been processed successfully to skip the slow backoff stage and jump to the fast backoff stage directly or not when periodically re-processing a placement. 
Default is true.") + + flags.Var( + newPriCoEffAValue(-3, &o.PriorityLinearEquationCoEffA), + "work-applier-priority-linear-equation-coeff-a", + "The coefficient A in the linear equation for calculating the priority score of a placement. The value must be a negative integer no less than -100. Default is -3.") + + flags.Var( + newPriCoEffBValue(100, &o.PriorityLinearEquationCoEffB), + "work-applier-priority-linear-equation-coeff-b", + "The coefficient B in the linear equation for calculating the priority score of a placement. The value must be a positive integer no greater than 1000. Default is 100.") +} + +type ResForceDeletionWaitTimeMinutes int + +func (v *ResForceDeletionWaitTimeMinutes) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *ResForceDeletionWaitTimeMinutes) Set(s string) error { + t, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if t < 1 || t > 60 { + return fmt.Errorf("resource force deletion wait time in minutes is set to an invalid value (%d), must be a value in the range [1, 60]", t) + } + *v = ResForceDeletionWaitTimeMinutes(t) + return nil +} + +func newResForceDeletionWaitTimeMinutesValue(defaultValue int, p *int) *ResForceDeletionWaitTimeMinutes { + *p = defaultValue + return (*ResForceDeletionWaitTimeMinutes)(p) +} + +type RequeueAttemptsWithFixedDelay int + +func (v *RequeueAttemptsWithFixedDelay) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *RequeueAttemptsWithFixedDelay) Set(s string) error { + t, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if t < 1 || t > 40 { + return fmt.Errorf("requeue rate limiter attempts with fixed delay is set to an invalid value (%d), must be a value in the range [1, 40]", t) + } + *v = RequeueAttemptsWithFixedDelay(t) + return nil +} + +func newRequeueAttemptsWithFixedDelayValue(defaultValue int, p *int) *RequeueAttemptsWithFixedDelay { + *p = 
defaultValue + return (*RequeueAttemptsWithFixedDelay)(p) +} + +type RequeueFixedDelaySeconds int + +func (v *RequeueFixedDelaySeconds) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *RequeueFixedDelaySeconds) Set(s string) error { + t, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if t < 2 || t > 300 { + return fmt.Errorf("requeue rate limiter fixed delay seconds is set to an invalid value (%d), must be a value in the range [2, 300]", t) + } + *v = RequeueFixedDelaySeconds(t) + return nil +} + +func newRequeueFixedDelaySecondsValue(defaultValue int, p *int) *RequeueFixedDelaySeconds { + *p = defaultValue + return (*RequeueFixedDelaySeconds)(p) +} + +// RequeueExpBaseForSlowBackoff is a custom flag value type for the +// RequeueRateLimiterExponentialBaseForSlowBackoff option. +type RequeueExpBaseForSlowBackoff float64 + +func (v *RequeueExpBaseForSlowBackoff) String() string { + return fmt.Sprintf("%g", *v) +} + +func (v *RequeueExpBaseForSlowBackoff) Set(s string) error { + exp, err := strconv.ParseFloat(s, 64) + if err != nil { + return fmt.Errorf("failed to parse float value: %w", err) + } + + if exp < 1.05 || exp > 2 { + return fmt.Errorf("requeue rate limiter exponential base for slow backoff is set to an invalid value (%g), must be a value in the range [1.05, 2]", exp) + } + *v = RequeueExpBaseForSlowBackoff(exp) + return nil +} + +func newRequeueExpBaseForSlowBackoffValue(defaultValue float64, p *float64) *RequeueExpBaseForSlowBackoff { + *p = defaultValue + return (*RequeueExpBaseForSlowBackoff)(p) +} + +type RequeueInitSlowBackoffDelaySeconds int + +func (v *RequeueInitSlowBackoffDelaySeconds) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *RequeueInitSlowBackoffDelaySeconds) Set(s string) error { + t, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if t < 2 { + return fmt.Errorf("requeue rate 
limiter initial slow backoff delay seconds is set to an invalid value (%d), must be a value no less than 2", t) + } + *v = RequeueInitSlowBackoffDelaySeconds(t) + return nil +} + +func newRequeueInitSlowBackoffDelaySecondsValue(defaultValue int, p *int) *RequeueInitSlowBackoffDelaySeconds { + *p = defaultValue + return (*RequeueInitSlowBackoffDelaySeconds)(p) +} + +type RequeueMaxSlowBackoffDelaySeconds int + +func (v *RequeueMaxSlowBackoffDelaySeconds) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *RequeueMaxSlowBackoffDelaySeconds) Set(s string) error { + t, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if t < 2 { + return fmt.Errorf("requeue rate limiter max slow backoff delay seconds is set to an invalid value (%d), must be a value no less than 2", t) + } + *v = RequeueMaxSlowBackoffDelaySeconds(t) + return nil +} + +func newRequeueMaxSlowBackoffDelaySecondsValue(defaultValue int, p *int) *RequeueMaxSlowBackoffDelaySeconds { + *p = defaultValue + return (*RequeueMaxSlowBackoffDelaySeconds)(p) +} + +type RequeueExpBaseForFastBackoff float64 + +func (v *RequeueExpBaseForFastBackoff) String() string { + return fmt.Sprintf("%g", *v) +} + +func (v *RequeueExpBaseForFastBackoff) Set(s string) error { + exp, err := strconv.ParseFloat(s, 64) + if err != nil { + return fmt.Errorf("failed to parse float value: %w", err) + } + + if exp <= 1 || exp > 2 { + return fmt.Errorf("requeue rate limiter exponential base for fast backoff is set to an invalid value (%g), must be a value in the range (1, 2]", exp) + } + *v = RequeueExpBaseForFastBackoff(exp) + return nil +} + +func newRequeueExpBaseForFastBackoffValue(defaultValue float64, p *float64) *RequeueExpBaseForFastBackoff { + *p = defaultValue + return (*RequeueExpBaseForFastBackoff)(p) +} + +type RequeueMaxFastBackoffDelaySeconds int + +func (v *RequeueMaxFastBackoffDelaySeconds) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v 
*RequeueMaxFastBackoffDelaySeconds) Set(s string) error { + t, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if t <= 0 || t > 3600 { + return fmt.Errorf("requeue rate limiter max fast backoff delay seconds is set to an invalid value (%d), must be a value in the range (0, 3600]", t) + } + *v = RequeueMaxFastBackoffDelaySeconds(t) + return nil +} + +func newRequeueMaxFastBackoffDelaySecondsValue(defaultValue int, p *int) *RequeueMaxFastBackoffDelaySeconds { + *p = defaultValue + return (*RequeueMaxFastBackoffDelaySeconds)(p) +} + +type PriCoEffA int + +func (v *PriCoEffA) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *PriCoEffA) Set(s string) error { + coeff, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if coeff >= 0 || coeff < -100 { + return fmt.Errorf("priority linear equation coefficient A is set to an invalid value (%d), must be a negative integer no less than -100", coeff) + } + + *v = PriCoEffA(coeff) + return nil +} + +func newPriCoEffAValue(defaultValue int, p *int) *PriCoEffA { + *p = defaultValue + return (*PriCoEffA)(p) +} + +type PriCoEffB int + +func (v *PriCoEffB) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *PriCoEffB) Set(s string) error { + coeff, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if coeff <= 0 || coeff > 1000 { + return fmt.Errorf("priority linear equation coefficient B is set to an invalid value (%d), must be a positive integer no greater than 1000", coeff) + } + + *v = PriCoEffB(coeff) + return nil +} + +func newPriCoEffBValue(defaultValue int, p *int) *PriCoEffB { + *p = defaultValue + return (*PriCoEffB)(p) +} diff --git a/cmd/memberagent/options/ctrlmanager.go b/cmd/memberagent/options/ctrlmanager.go new file mode 100644 index 000000000..0f3385f0f --- /dev/null +++ b/cmd/memberagent/options/ctrlmanager.go 
@@ -0,0 +1,301 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "fmt" + "strconv" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +type CtrlManagerOptions struct { + // This set of options apply to the controller manager instance that connects to the hub + // cluster in the KubeFleet member agent. + HubManagerOpts PerClusterCtrlManagerOptions + // This set of options apply to the controller manager instance that connects to the member + // cluster in the KubeFleet member agent. + MemberManagerOpts PerClusterCtrlManagerOptions + + // Enable the pprof server for each controller manager instance or not. + EnablePprof bool + + // The leader election related options that apply to both controller manager instances + // in the KubeFleet member agent. + LeaderElectionOpts LeaderElectionOptions +} + +type PerClusterCtrlManagerOptions struct { + // The TCP address that the controller manager would bind to + // serve health probes. It can be set to "0" or the empty value to disable the health probe server. + HealthProbeBindAddress string + + // The TCP address that the controller manager should bind to serve prometheus metrics. + // It can be set to "0" or the empty value to disable the metrics server. + MetricsBindAddress string + + // The port in use by the pprof server for profiling the controller manager. This option + // only applies if pprof is enabled. 
+ PprofPort int + + // The QPS limit set to the rate limiter of the Kubernetes client in use by the controller manager + // and all of its managed controllers, for client-side throttling purposes. + QPS float64 + + // The burst limit set to the rate limiter of the Kubernetes client in use by the controller manager + // and all of its managed controllers, for client-side throttling purposes. + Burst int +} + +type LeaderElectionOptions struct { + // Enable leader election or not. + LeaderElect bool + + // The duration of a leader election lease. This is the period where a non-leader candidate + // will wait after observing a leadership renewal before attempting to acquire leadership of the + // current leader. And it is also effectively the maximum duration that a leader can be stopped + // before it is replaced by another candidate. The option only applies if leader election is enabled. + LeaseDuration metav1.Duration + + // The interval between attempts by the acting master to renew a leadership slot + // before it stops leading. This must be less than or equal to the lease duration. + // The option only applies if leader election is enabled. + RenewDeadline metav1.Duration + + // The duration the clients should wait between attempting acquisition and renewal of a + // leadership. The option only applies if leader election is enabled. + RetryPeriod metav1.Duration + + // The namespace of the resource object that will be used to lock during leader election cycles. + // This option only applies if leader election is enabled. + ResourceNamespace string +} + +func (o *CtrlManagerOptions) AddFlags(flags *flag.FlagSet) { + flags.StringVar( + &o.HubManagerOpts.HealthProbeBindAddress, + "hub-health-probe-bind-address", + ":8081", + "The TCP address that the controller manager would bind to serve health probes. 
It can be set to '0' or the empty value to disable the health probe server.") + + flags.StringVar( + &o.HubManagerOpts.MetricsBindAddress, + "hub-metrics-bind-address", + ":8080", + "The TCP address that the controller manager should bind to serve prometheus metrics. It can be set to '0' or the empty value to disable the metrics server.") + + flags.IntVar( + &o.HubManagerOpts.PprofPort, + "hub-pprof-port", + 6066, + "The port in use by the pprof server for profiling the controller manager that connects to the hub cluster.") + + flags.Var( + newHubQPSValueWithValidation(50.0, &o.HubManagerOpts.QPS), + "hub-api-qps", + "The QPS limit set to the rate limiter of the Kubernetes client in use by the controller manager (and all of its managed controllers) that connects to the hub cluster, for client-side throttling purposes.") + + flags.StringVar( + &o.MemberManagerOpts.HealthProbeBindAddress, + "health-probe-bind-address", + ":8091", + "The TCP address that the controller manager would bind to serve health probes. It can be set to '0' or the empty value to disable the health probe server.") + + flags.StringVar( + &o.MemberManagerOpts.MetricsBindAddress, + "metrics-bind-address", + ":8090", + "The TCP address that the controller manager should bind to serve prometheus metrics. 
It can be set to '0' or the empty value to disable the metrics server.") + + flags.IntVar( + &o.MemberManagerOpts.PprofPort, + "pprof-port", + 6065, + "The port in use by the pprof server for profiling the controller manager that connects to the member cluster.") + + flags.Var( + newHubBurstValueWithValidation(500, &o.HubManagerOpts.Burst), + "hub-api-burst", + "The burst limit set to the rate limiter of the Kubernetes client in use by the controller manager (and all of its managed controllers) that connects to the hub cluster, for client-side throttling purposes.") + + flags.Var( + newMemberQPSValueWithValidation(250.0, &o.MemberManagerOpts.QPS), + "member-api-qps", + "The QPS limit set to the rate limiter of the Kubernetes client in use by the controller manager (and all of its managed controllers) that connects to the member cluster, for client-side throttling purposes.") + + flags.Var( + newMemberBurstValueWithValidation(1000, &o.MemberManagerOpts.Burst), + "member-api-burst", + "The burst limit set to the rate limiter of the Kubernetes client in use by the controller manager (and all of its managed controllers) that connects to the member cluster, for client-side throttling purposes.") + + flags.BoolVar( + &o.EnablePprof, + "enable-pprof", + false, + "Enable the pprof server for profiling the member agent controller managers or not.") + + flags.BoolVar( + &o.LeaderElectionOpts.LeaderElect, + "leader-elect", + false, + "Enable leader election on the controller managers in use by the KubeFleet member agent.") + + // This input is sent to the controller manager for validation; no further check here. + flags.DurationVar( + &o.LeaderElectionOpts.LeaseDuration.Duration, + "leader-lease-duration", + 15*time.Second, + "The duration of a leader election lease. This is the period where a non-leader candidate will wait after observing a leadership renewal before attempting to acquire leadership of the current leader. 
And it is also effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. The option only applies if leader election is enabled.") + + // This input is sent to the controller manager for validation; no further check here. + flags.DurationVar( + &o.LeaderElectionOpts.RenewDeadline.Duration, + "leader-renew-deadline", + 10*time.Second, + "The interval between attempts by the acting master to renew a leadership slot before it stops leading. This must be less than or equal to the lease duration. The option only applies if leader election is enabled.") + + // This input is sent to the controller manager for validation; no further check here. + flags.DurationVar( + &o.LeaderElectionOpts.RetryPeriod.Duration, + "leader-retry-period", + 2*time.Second, + "The duration the clients should wait between attempting acquisition and renewal of a leadership. The option only applies if leader election is enabled.") + + flags.StringVar( + &o.LeaderElectionOpts.ResourceNamespace, + "leader-election-namespace", + "kube-system", + "The namespace in which the leader election resource will be created. The option only applies if leader election is enabled.") +} + +type HubQPSValueWithValidation float64 + +func (v *HubQPSValueWithValidation) String() string { + return fmt.Sprintf("%f", *v) +} + +func (v *HubQPSValueWithValidation) Set(s string) error { + // Some validation is also performed on the controller manager side and the client-go side. Just + // to be on the safer side we also impose some limits here. + qps, err := strconv.ParseFloat(s, 64) + if err != nil { + return fmt.Errorf("failed to parse float64 value: %w", err) + } + + if qps < 0.0 { + // Disable client-side throttling. 
+ *v = -1.0 + return nil + } + + if qps < 10.0 || qps > 1000.0 { + return fmt.Errorf("QPS limit is set to an invalid value (%f), must be a value in the range [10.0, 1000.0]", qps) + } + *v = HubQPSValueWithValidation(qps) + return nil +} + +func newHubQPSValueWithValidation(defaultVal float64, p *float64) *HubQPSValueWithValidation { + *p = defaultVal + return (*HubQPSValueWithValidation)(p) +} + +type HubBurstValueWithValidation int + +func (v *HubBurstValueWithValidation) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *HubBurstValueWithValidation) Set(s string) error { + // Some validation is also performed on the controller manager side and the client-go side. Just + // to be on the safer side we also impose some limits here. + burst, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse int value: %w", err) + } + + if burst < 10 || burst > 2000 { + return fmt.Errorf("burst limit is set to an invalid value (%d), must be a value in the range [10, 2000]", burst) + } + *v = HubBurstValueWithValidation(burst) + return nil +} + +func newHubBurstValueWithValidation(defaultVal int, p *int) *HubBurstValueWithValidation { + *p = defaultVal + return (*HubBurstValueWithValidation)(p) +} + +type MemberQPSValueWithValidation float64 + +func (v *MemberQPSValueWithValidation) String() string { + return fmt.Sprintf("%f", *v) +} + +func (v *MemberQPSValueWithValidation) Set(s string) error { + // Some validation is also performed on the controller manager side and the client-go side. Just + // to be on the safer side we also impose some limits here. + qps, err := strconv.ParseFloat(s, 64) + if err != nil { + return fmt.Errorf("failed to parse float64 value: %w", err) + } + + if qps < 0.0 { + // Disable client-side throttling. 
+ *v = -1.0 + return nil + } + + if qps < 10.0 || qps > 10000.0 { + return fmt.Errorf("QPS limit is set to an invalid value (%f), must be a value in the range [10.0, 10000.0]", qps) + } + *v = MemberQPSValueWithValidation(qps) + return nil +} + +func newMemberQPSValueWithValidation(defaultVal float64, p *float64) *MemberQPSValueWithValidation { + *p = defaultVal + return (*MemberQPSValueWithValidation)(p) +} + +type MemberBurstValueWithValidation int + +func (v *MemberBurstValueWithValidation) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *MemberBurstValueWithValidation) Set(s string) error { + // Some validation is also performed on the controller manager side and the client-go side. Just + // to be on the safer side we also impose some limits here. + burst, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse int value: %w", err) + } + + if burst < 10 || burst > 20000 { + return fmt.Errorf("burst limit is set to an invalid value (%d), must be a value in the range [10, 20000]", burst) + } + *v = MemberBurstValueWithValidation(burst) + return nil +} + +func newMemberBurstValueWithValidation(defaultVal int, p *int) *MemberBurstValueWithValidation { + *p = defaultVal + return (*MemberBurstValueWithValidation)(p) +} diff --git a/cmd/memberagent/options/hub.go b/cmd/memberagent/options/hub.go new file mode 100644 index 000000000..aacd89091 --- /dev/null +++ b/cmd/memberagent/options/hub.go @@ -0,0 +1,59 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" +) + +// HubConnectivityOptions is a set of options that control how the KubeFleet +// member agent connects to the hub cluster. +type HubConnectivityOptions struct { + // Enable certificate-based authentication or not when connecting to the hub cluster. + // + // If this is set to true, provide with the member agent the file paths to the key + // and certificate to use for authentication via the `IDENTITY_KEY` and `IDENTITY_CERT` + // environment variables respectively. + // + // Otherwise, the member agent will use token-based authentication when connecting + // to the hub cluster. The agent will read the token from the file path specified + // by the `CONFIG_PATH` environment variable. + UseCertificateAuth bool + + // Use an insecure client or not when connecting to the hub cluster. + // + // If this is set to false, provide with the member agent a file path to the CA + // bundle to use for verifying the hub cluster's identity via the `CA_BUNDLE` environment + // variable; you can also give the member agent a file path to the CA data instead + // via the `HUB_CERTIFICATE_AUTHORITY` environment variable. + UseInsecureTLSClient bool +} + +// AddFlags adds flags for HubConnectivityOptions to the specified FlagSet. +func (o *HubConnectivityOptions) AddFlags(flags *flag.FlagSet) { + flags.BoolVar( + &o.UseCertificateAuth, + "use-ca-auth", + false, + "Enable certificate-based authentication or not when connecting to the hub cluster.") + + flags.BoolVar( + &o.UseInsecureTLSClient, + "tls-insecure", + false, + "Use an insecure client or not when connecting to the hub cluster.") +} diff --git a/cmd/memberagent/options/options.go b/cmd/memberagent/options/options.go new file mode 100644 index 000000000..29376f9fe --- /dev/null +++ b/cmd/memberagent/options/options.go @@ -0,0 +1,61 @@ +/* +Copyright 2026 The KubeFleet Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" +) + +// Options is the options to use for running the KubeFleet member agent. +type Options struct { + // Options that control how the KubeFleet member agent connects to the hub cluster. + HubConnectivityOpts HubConnectivityOptions + + // Options that concern the setup of the controller manager instance in use by the + // KubeFleet member agent. + CtrlManagerOptions CtrlManagerOptions + + // Options that control how the KubeFleet member agent applies resources from the hub + // cluster to the member cluster. + ApplierOpts ApplierOptions + + // KubeFleet cluster property provider related options. + PropertyProviderOpts PropertyProviderOptions + + // The fields below are added only for backwards compatibility reasons. + // Their values are never read. + UseV1Beta1APIs bool +} + +func NewOptions() *Options { + return &Options{} +} + +func (o *Options) AddFlags(flags *flag.FlagSet) { + o.HubConnectivityOpts.AddFlags(flags) + o.CtrlManagerOptions.AddFlags(flags) + o.ApplierOpts.AddFlags(flags) + o.PropertyProviderOpts.AddFlags(flags) + + // The flags set up below are added only for backwards compatibility reasons. + // They are no-op flags and their values are never read. + flags.BoolVar( + &o.UseV1Beta1APIs, + "enable-v1beta1-apis", + true, + "Use KubeFleet v1beta1 APIs or not. 
This flag is obsolete; its value has no effect.") +} diff --git a/cmd/memberagent/options/propertyprovider.go b/cmd/memberagent/options/propertyprovider.go new file mode 100644 index 000000000..348d041fe --- /dev/null +++ b/cmd/memberagent/options/propertyprovider.go @@ -0,0 +1,82 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" +) + +type PropertyProviderOptions struct { + // The region where the member cluster resides. + Region string + + // The property provider to use in the KubeFleet member agent. + Name string + + // The path to a configuration file that enables the KubeFleet member agent to + // connect to a specific cloud platform to retrieve platform-specific cluster properties. + CloudConfigFilePath string + + // Enable support for cost properties in the Azure property provider or not. This option + // applies only when the Azure property provider is in use. + EnableAzProviderCostProperties bool + + // Enable support for available resource properties in the Azure property provider or not. + // This option applies only when the Azure property provider is in use. + EnableAzProviderAvailableResourceProperties bool + + // Enable support for namespace collection in the Azure property provider or not. This option applies only when the Azure property provider is in use. 
+ EnableAzProviderNamespaceCollection bool +} + +func (o *PropertyProviderOptions) AddFlags(flags *flag.FlagSet) { + flags.StringVar( + &o.Region, + "region", + "", + "The region where the member cluster resides.") + + flags.StringVar( + &o.Name, + "property-provider", + "none", + "The property provider to use in the KubeFleet member agent.") + + flags.StringVar( + &o.CloudConfigFilePath, + "cloud-config", + "/etc/kubernetes/provider/config.json", + "The path to a configuration file that enables the KubeFleet member agent to connect to a specific cloud platform to retrieve platform-specific cluster properties.") + + flags.BoolVar( + &o.EnableAzProviderCostProperties, + "use-cost-properties-in-azure-provider", + true, + "Enable support for cost properties in the Azure property provider or not. This option applies only when the Azure property provider is in use.") + + flags.BoolVar( + &o.EnableAzProviderAvailableResourceProperties, + "use-available-res-properties-in-azure-provider", + true, + "Enable support for available resource properties in the Azure property provider or not. This option applies only when the Azure property provider is in use.") + + flags.BoolVar( + &o.EnableAzProviderNamespaceCollection, + "enable-namespace-collection-in-property-provider", + false, + "Enable support for namespace collection in the Azure property provider or not. This option applies only when the Azure property provider is in use.") +} diff --git a/cmd/memberagent/options/validation.go b/cmd/memberagent/options/validation.go new file mode 100644 index 000000000..5016205d8 --- /dev/null +++ b/cmd/memberagent/options/validation.go @@ -0,0 +1,54 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "k8s.io/apimachinery/pkg/util/validation/field" +) + +// Validate checks Options and return a slice of found errs. +// +// Note: the logic here concerns primarily cross-option validation; for single-option validation, +// consider adding the logic directly as part of the flag parsing function, for clarity reasons. +func (o *Options) Validate() field.ErrorList { + errs := field.ErrorList{} + newPath := field.NewPath("Options") + + // Cross-field validation for controller manager options. + if float64(o.CtrlManagerOptions.HubManagerOpts.Burst) < o.CtrlManagerOptions.HubManagerOpts.QPS { + errs = append(errs, field.Invalid(newPath.Child("HubManagerOpts").Child("Burst"), o.CtrlManagerOptions.HubManagerOpts.Burst, "The burst limit for hub cluster client-side throttling must be greater than or equal to its QPS limit")) + } + + if float64(o.CtrlManagerOptions.MemberManagerOpts.Burst) < o.CtrlManagerOptions.MemberManagerOpts.QPS { + errs = append(errs, field.Invalid(newPath.Child("MemberManagerOpts").Child("Burst"), o.CtrlManagerOptions.MemberManagerOpts.Burst, "The burst limit for member cluster client-side throttling must be greater than or equal to its QPS limit")) + } + + // Cross-field validation for applier options. 
+ if o.ApplierOpts.RequeueRateLimiterInitialSlowBackoffDelaySeconds > o.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds { + errs = append(errs, field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterInitialSlowBackoffDelaySeconds"), o.ApplierOpts.RequeueRateLimiterInitialSlowBackoffDelaySeconds, "The initial delay for the slow backoff stage must not exceed the maximum delay for the slow backoff stage")) + } + + if o.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds > o.ApplierOpts.RequeueRateLimiterMaxFastBackoffDelaySeconds { + errs = append(errs, field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterMaxSlowBackoffDelaySeconds"), o.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds, "The maximum delay for the slow backoff stage must not exceed the maximum delay for the fast backoff stage")) + } + + if o.ApplierOpts.RequeueRateLimiterExponentialBaseForFastBackoff < o.ApplierOpts.RequeueRateLimiterExponentialBaseForSlowBackoff { + errs = append(errs, field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterExponentialBaseForFastBackoff"), o.ApplierOpts.RequeueRateLimiterExponentialBaseForFastBackoff, "The exponential base for the fast backoff stage must be greater than or equal to the exponential base for the slow backoff stage")) + } + + return errs +} diff --git a/cmd/memberagent/options/validation_test.go b/cmd/memberagent/options/validation_test.go new file mode 100644 index 000000000..a77ef9ad9 --- /dev/null +++ b/cmd/memberagent/options/validation_test.go @@ -0,0 +1,137 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "k8s.io/apimachinery/pkg/util/validation/field" +) + +// a callback function to modify options +type ModifyOptions func(option *Options) + +// newTestOptions creates an Options with default parameters. +func newTestOptions(modifyOptions ModifyOptions) Options { + option := Options{ + CtrlManagerOptions: CtrlManagerOptions{ + HubManagerOpts: PerClusterCtrlManagerOptions{ + QPS: 50, + Burst: 500, + }, + MemberManagerOpts: PerClusterCtrlManagerOptions{ + QPS: 250, + Burst: 1000, + }, + }, + ApplierOpts: ApplierOptions{ + RequeueRateLimiterInitialSlowBackoffDelaySeconds: 2, + RequeueRateLimiterMaxSlowBackoffDelaySeconds: 15, + RequeueRateLimiterMaxFastBackoffDelaySeconds: 900, + RequeueRateLimiterExponentialBaseForSlowBackoff: 1.2, + RequeueRateLimiterExponentialBaseForFastBackoff: 1.5, + }, + } + + if modifyOptions != nil { + modifyOptions(&option) + } + return option +} + +func TestValidate(t *testing.T) { + newPath := field.NewPath("Options") + testCases := map[string]struct { + opt Options + want field.ErrorList + }{ + "valid options": { + opt: newTestOptions(nil), + want: field.ErrorList{}, + }, + "hub burst less than hub QPS": { + opt: newTestOptions(func(option *Options) { + option.CtrlManagerOptions.HubManagerOpts.QPS = 200 + option.CtrlManagerOptions.HubManagerOpts.Burst = 100 + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("HubManagerOpts").Child("Burst"), 100, "The burst limit for hub cluster client-side throttling must be greater than or equal to its QPS limit"), + }, 
+ }, + "member burst less than member QPS": { + opt: newTestOptions(func(option *Options) { + option.CtrlManagerOptions.MemberManagerOpts.QPS = 500 + option.CtrlManagerOptions.MemberManagerOpts.Burst = 200 + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("MemberManagerOpts").Child("Burst"), 200, "The burst limit for member cluster client-side throttling must be greater than or equal to its QPS limit"), + }, + }, + "slow backoff initial delay exceeds slow backoff max delay": { + opt: newTestOptions(func(option *Options) { + option.ApplierOpts.RequeueRateLimiterInitialSlowBackoffDelaySeconds = 30 + option.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds = 15 + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterInitialSlowBackoffDelaySeconds"), 30, "The initial delay for the slow backoff stage must not exceed the maximum delay for the slow backoff stage"), + }, + }, + "slow backoff max delay exceeds fast backoff max delay": { + opt: newTestOptions(func(option *Options) { + option.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds = 1000 + option.ApplierOpts.RequeueRateLimiterMaxFastBackoffDelaySeconds = 900 + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterMaxSlowBackoffDelaySeconds"), 1000, "The maximum delay for the slow backoff stage must not exceed the maximum delay for the fast backoff stage"), + }, + }, + "fast backoff exponential base less than slow backoff exponential base": { + opt: newTestOptions(func(option *Options) { + option.ApplierOpts.RequeueRateLimiterExponentialBaseForSlowBackoff = 2.0 + option.ApplierOpts.RequeueRateLimiterExponentialBaseForFastBackoff = 1.5 + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterExponentialBaseForFastBackoff"), 1.5, "The exponential base for the fast backoff stage must be greater than or equal to the exponential base for the slow backoff stage"), + 
}, + }, + "multiple simultaneous violations": { + opt: newTestOptions(func(option *Options) { + option.CtrlManagerOptions.HubManagerOpts.QPS = 200 + option.CtrlManagerOptions.HubManagerOpts.Burst = 100 + option.CtrlManagerOptions.MemberManagerOpts.QPS = 500 + option.CtrlManagerOptions.MemberManagerOpts.Burst = 200 + option.ApplierOpts.RequeueRateLimiterInitialSlowBackoffDelaySeconds = 30 + option.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds = 15 + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("HubManagerOpts").Child("Burst"), 100, "The burst limit for hub cluster client-side throttling must be greater than or equal to its QPS limit"), + field.Invalid(newPath.Child("MemberManagerOpts").Child("Burst"), 200, "The burst limit for member cluster client-side throttling must be greater than or equal to its QPS limit"), + field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterInitialSlowBackoffDelaySeconds"), 30, "The initial delay for the slow backoff stage must not exceed the maximum delay for the slow backoff stage"), + }, + }, + } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + got := tc.opt.Validate() + if diff := cmp.Diff(got, tc.want); diff != "" { + t.Errorf("Validate() errs mismatch (-got, +want):\n%s", diff) + } + }) + } +} diff --git a/config/crd/bases/cluster.kubernetes-fleet.io_internalmemberclusters.yaml b/config/crd/bases/cluster.kubernetes-fleet.io_internalmemberclusters.yaml index 8742d05eb..7199ca697 100644 --- a/config/crd/bases/cluster.kubernetes-fleet.io_internalmemberclusters.yaml +++ b/config/crd/bases/cluster.kubernetes-fleet.io_internalmemberclusters.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: internalmemberclusters.cluster.kubernetes-fleet.io spec: group: cluster.kubernetes-fleet.io diff --git 
a/config/crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml b/config/crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml index cc28cb137..7e883d441 100644 --- a/config/crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml +++ b/config/crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: memberclusters.cluster.kubernetes-fleet.io spec: group: cluster.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_appliedworks.yaml b/config/crd/bases/placement.kubernetes-fleet.io_appliedworks.yaml index 21ae6b352..aceb00761 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_appliedworks.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_appliedworks.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: appliedworks.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_approvalrequests.yaml b/config/crd/bases/placement.kubernetes-fleet.io_approvalrequests.yaml index b56bbdae9..fadbc2e4b 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_approvalrequests.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_approvalrequests.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: approvalrequests.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterapprovalrequests.yaml 
b/config/crd/bases/placement.kubernetes-fleet.io_clusterapprovalrequests.yaml index 02ddd96fe..f02ba1461 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterapprovalrequests.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterapprovalrequests.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterapprovalrequests.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcebindings.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcebindings.yaml index 0e12d6032..ddb481a31 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcebindings.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcebindings.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourcebindings.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceenvelopes.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceenvelopes.yaml index 9b00cfba7..bee507d23 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceenvelopes.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceenvelopes.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceenvelopes.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git 
a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml index 71474ff9b..7d8e978d4 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceoverrides.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml index 336d3a861..f84768f50 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceoverridesnapshots.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementdisruptionbudgets.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementdisruptionbudgets.yaml index fd1f78819..a3af13ea4 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementdisruptionbudgets.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementdisruptionbudgets.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + 
controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceplacementdisruptionbudgets.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementevictions.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementevictions.yaml index ae5fea701..ae40aae6f 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementevictions.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementevictions.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceplacementevictions.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml index 9299ee0af..630ad2a07 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceplacements.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml index 2c42689db..00b893dae 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml @@ -3,7 +3,7 @@ 
apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceplacementstatuses.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcesnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcesnapshots.yaml index 72a3ef8a6..325f5b255 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcesnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcesnapshots.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourcesnapshots.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterschedulingpolicysnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterschedulingpolicysnapshots.yaml index ffa988280..954a72bab 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterschedulingpolicysnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterschedulingpolicysnapshots.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterschedulingpolicysnapshots.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml index d4955b6ce..2b34db4d3 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml +++ 
b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterstagedupdateruns.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml index 1556c8946..9be8428dc 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterstagedupdatestrategies.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourcebindings.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourcebindings.yaml index 046ce7ece..403e37e13 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourcebindings.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_resourcebindings.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: resourcebindings.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourceenvelopes.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourceenvelopes.yaml index 963e66c40..09e0f8a4d 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourceenvelopes.yaml +++ 
b/config/crd/bases/placement.kubernetes-fleet.io_resourceenvelopes.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: resourceenvelopes.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourceoverrides.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourceoverrides.yaml index dc538218c..1296c68a5 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourceoverrides.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_resourceoverrides.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: resourceoverrides.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourceoverridesnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourceoverridesnapshots.yaml index 0d5279618..dc634b84e 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourceoverridesnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_resourceoverridesnapshots.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: resourceoverridesnapshots.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml index 303f9e9b8..e233dc3bd 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml +++ 
b/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: resourceplacements.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourcesnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourcesnapshots.yaml index 1d6265e1c..933e26276 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourcesnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_resourcesnapshots.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: resourcesnapshots.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_schedulingpolicysnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_schedulingpolicysnapshots.yaml index b0b6c5ba1..0485e0696 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_schedulingpolicysnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_schedulingpolicysnapshots.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: schedulingpolicysnapshots.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml index a0a5c1a4c..ff9152c9d 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml +++ 
b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: stagedupdateruns.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml index 487040869..5315f7322 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: stagedupdatestrategies.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_works.yaml b/config/crd/bases/placement.kubernetes-fleet.io_works.yaml index 9d6bcfb3b..4ae5e0adc 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_works.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_works.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: works.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/docker/crd-installer.Dockerfile b/docker/crd-installer.Dockerfile index cf0bfd154..302c6e0a1 100644 --- a/docker/crd-installer.Dockerfile +++ b/docker/crd-installer.Dockerfile @@ -1,5 +1,5 @@ # Build the crdinstaller binary -FROM mcr.microsoft.com/oss/go/microsoft/golang:1.24.13 AS builder +FROM mcr.microsoft.com/oss/go/microsoft/golang:1.25.8 AS builder ARG 
GOOS=linux ARG GOARCH=amd64 diff --git a/docker/hub-agent.Dockerfile b/docker/hub-agent.Dockerfile index 5d4c964a6..5c751257d 100644 --- a/docker/hub-agent.Dockerfile +++ b/docker/hub-agent.Dockerfile @@ -1,5 +1,5 @@ # Build the hubagent binary -FROM mcr.microsoft.com/oss/go/microsoft/golang:1.24.13 AS builder +FROM mcr.microsoft.com/oss/go/microsoft/golang:1.25.8 AS builder ARG GOOS=linux ARG GOARCH=amd64 diff --git a/docker/member-agent.Dockerfile b/docker/member-agent.Dockerfile index ef3fb0a14..33e122e1b 100644 --- a/docker/member-agent.Dockerfile +++ b/docker/member-agent.Dockerfile @@ -1,5 +1,5 @@ # Build the memberagent binary -FROM mcr.microsoft.com/oss/go/microsoft/golang:1.24.13 AS builder +FROM mcr.microsoft.com/oss/go/microsoft/golang:1.25.8 AS builder ARG GOOS=linux ARG GOARCH=amd64 @@ -13,13 +13,13 @@ COPY go.sum go.sum RUN go mod download # Copy the go source -COPY cmd/memberagent/main.go main.go +COPY cmd/memberagent cmd/memberagent/ COPY apis/ apis/ COPY pkg/ pkg/ -# Build with CGO enabled and GOEXPERIMENT=systemcrypto for internal usage -RUN echo "Building for GOOS=$GOOS GOARCH=$GOARCH" -RUN CGO_ENABLED=1 GOOS=$GOOS GOARCH=$GOARCH GOEXPERIMENT=systemcrypto GO111MODULE=on go build -o memberagent main.go +# Build +RUN echo "Building images with GOOS=$GOOS GOARCH=$GOARCH" +RUN CGO_ENABLED=1 GOOS=$GOOS GOARCH=$GOARCH GOEXPERIMENT=systemcrypto GO111MODULE=on go build -o memberagent cmd/memberagent/main.go # Use Azure Linux distroless base image to package the memberagent binary # Refer to https://mcr.microsoft.com/en-us/artifact/mar/azurelinux/distroless/base/about for more details diff --git a/docker/refresh-token.Dockerfile b/docker/refresh-token.Dockerfile index 19ac47f7a..f688c9e68 100644 --- a/docker/refresh-token.Dockerfile +++ b/docker/refresh-token.Dockerfile @@ -1,5 +1,5 @@ # Build the refreshtoken binary -FROM mcr.microsoft.com/oss/go/microsoft/golang:1.24.13 AS builder +FROM mcr.microsoft.com/oss/go/microsoft/golang:1.25.8 AS builder ARG 
GOOS="linux" ARG GOARCH="amd64" diff --git a/go.mod b/go.mod index 48588c34c..68d96d644 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module go.goms.io/fleet -go 1.24.13 +go 1.25.8 require ( github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0 diff --git a/hack/perftest/1000stagedupdateruns/README.md b/hack/perftest/1000stagedupdateruns/README.md new file mode 100644 index 000000000..cb1d1b0cf --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/README.md @@ -0,0 +1,33 @@ +# KubeFleet Performance/Scalability Test Utility: Creating 1K Staged Update Runs Concurrently + +This directory contains a utility program that creates 1K staged update runs that run concurrently and polls +until their full completion. + +The program is added for the purpose of testing the performance and scalability of KubeFleet. + +## Before you begin + +* Set up 1K placements using the utility program in `../1000placements` before running this utility program. +* Make sure that all placements have been updated to use the `External` update strategy (via the staged update run APIs). +* Make sure that all member clusters are labelled with `env=[canary|staging|prod]` as appropriate so that KubeFleet +can assign them to their respective stages. + +> If you have followed the instructions in `../../README.md` and have used the given scripts and utility programs +> to run the performance/scalability test, all the steps above should have been done for you already. + +## Running the utility program + +Run the commands below to run the utility program: + +```bash +go run main.go +``` + +With the default setup, it might take 1-2 hours before all 1K staged update runs are fully completed. The program +reports the progress in the output.
+ +After the program completes, run the commands below to clean up the created 1K staged update runs: + +```bash +CLEANUP=set go run main.go +``` diff --git a/hack/perftest/1000stagedupdateruns/example-run.yaml b/hack/perftest/1000stagedupdateruns/example-run.yaml new file mode 100644 index 000000000..94da747ba --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/example-run.yaml @@ -0,0 +1,8 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterStagedUpdateRun +metadata: + name: run-0 +spec: + placementName: crp-0 + stagedRolloutStrategyName: staging-canary-prod + state: Run diff --git a/hack/perftest/1000stagedupdateruns/example-strategy.yaml b/hack/perftest/1000stagedupdateruns/example-strategy.yaml new file mode 100644 index 000000000..af500455a --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/example-strategy.yaml @@ -0,0 +1,21 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterStagedUpdateStrategy +metadata: + name: "staging-canary-prod" +spec: + stages: + - name: staging + labelSelector: + matchLabels: + env: staging + maxConcurrency: 25% + - name: canary + labelSelector: + matchLabels: + env: canary + maxConcurrency: 25% + - name: prod + labelSelector: + matchLabels: + env: prod + maxConcurrency: 25% diff --git a/hack/perftest/1000stagedupdateruns/main.go b/hack/perftest/1000stagedupdateruns/main.go new file mode 100644 index 000000000..161f8d3c7 --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/main.go @@ -0,0 +1,96 @@ +package main + +import ( + "context" + "os" + "time" + + "k8s.io/apimachinery/pkg/util/wait" + + "go.goms.io/fleet/hack/perftest/1000stagedupdateruns/utils" +) + +const ( + // The number of workers to create/delete resources concurrently. + resourceSetupWorkerCount = 15 + // The number of workers to long poll staged update runs concurrently. + longPollingWorkerCount = 15 + // The cool down period between two stages (e.g., creating resources and long polling them). 
+ betweenStageCoolDownPeriod = time.Second * 30 + // The cool down period between two polls. + longPollingCoolDownPeriod = time.Second * 45 + + // The max concurrency per stage setting for each staged update run. + maxConcurrencyPerStage = "50%" + // The number of placements (and thus the number of staged update runs) to create. + maxCRPToUpdateCount = 1000 +) + +var ( + retryOpsBackoff = wait.Backoff{ + Steps: 4, + Duration: 4 * time.Second, + Factor: 2.0, + Jitter: 0.1, + } +) + +func main() { + ctx := context.Background() + + // Read the arguments. + doCleanUp := false + cleanUpFlag := os.Getenv("CLEANUP") + if len(cleanUpFlag) != 0 { + doCleanUp = true + } + + runner := utils.New( + resourceSetupWorkerCount, + longPollingWorkerCount, + betweenStageCoolDownPeriod, + longPollingCoolDownPeriod, + maxConcurrencyPerStage, + maxCRPToUpdateCount, + retryOpsBackoff, + ) + + if doCleanUp { + runner.CleanUp(ctx) + return + } + + // Prepare the staged update run strategy. + println("Preparing the staged update run strategy...") + runner.CreateStagedUpdateRunStrategy(ctx) + + // Patch existing resources. + println("Updating existing resources...") + runner.UpdateResources(ctx) + + // Cool down. + println("Cooling down...") + runner.CoolDown() + + // Create the staged update runs. + println("Creating staged update runs...") + runner.CreateStagedUpdateRuns(ctx) + + // Cool down. + println("Cooling down...") + runner.CoolDown() + + // Long poll the staged update runs. + println("Long polling staged update runs...") + runner.LongPollStagedUpdateRuns(ctx) + + // Track the latency. + println("Tracking latency...") + runner.TrackLatency(ctx) + + // Tally the latency quantiles. 
+ println("Tallying latency quantiles...") + runner.TallyLatencyQuantiles() + + println("All staged update runs have been completed, exiting.") +} diff --git a/hack/perftest/1000stagedupdateruns/utils/cleanup.go b/hack/perftest/1000stagedupdateruns/utils/cleanup.go new file mode 100644 index 000000000..37567a480 --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/cleanup.go @@ -0,0 +1,114 @@ +package utils + +import ( + "context" + "fmt" + "sync" + + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" + + placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" +) + +func (r *Runner) CleanUp(ctx context.Context) { + r.cleanUpStrategy(ctx) + r.cleanUpRuns(ctx) +} + +func (r *Runner) cleanUpStrategy(ctx context.Context) { + stagedUpdateRunStrategy := &placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: commonStagedUpdateRunStrategyName, + }, + } + + errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil && !errors.IsNotFound(err) + }, func() error { + return r.hubClient.Delete(ctx, stagedUpdateRunStrategy) + }) + if errAfterRetries != nil && !errors.IsNotFound(errAfterRetries) { + fmt.Printf("failed to delete staged update run strategy %s after retries: %v\n", commonStagedUpdateRunStrategyName, errAfterRetries) + } +} + +func (r *Runner) cleanUpRuns(ctx context.Context) { + wg := sync.WaitGroup{} + + // Run the producer. + wg.Add(1) + go func() { + defer wg.Done() + + for i := 0; i < r.maxCRPToUpdateCount; i++ { + select { + case r.toDeleteChan <- i: + case <-ctx.Done(): + close(r.toDeleteChan) + return + } + } + + close(r.toDeleteChan) + }() + + // Run the workers. + for i := 0; i < r.resourceSetupWorkerCount; i++ { + wg.Add(1) + go func(workerIdx int) { + defer wg.Done() + + for { + // Read from the channel. 
+ var resIdx int + var readOk bool + select { + case resIdx, readOk = <-r.toDeleteChan: + if !readOk { + fmt.Printf("worker %d exits\n", workerIdx) + return + } + case <-ctx.Done(): + return + } + + // Delete the staged update run. + stagedUpdateRun := &placementv1beta1.ClusterStagedUpdateRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(stagedUpdateRunNameFmt, resIdx), + }, + } + errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil && !errors.IsNotFound(err) + }, func() error { + return r.hubClient.Delete(ctx, stagedUpdateRun) + }) + if errAfterRetries != nil && !errors.IsNotFound(errAfterRetries) { + fmt.Printf("worker %d: failed to delete staged update run %s after retries: %v\n", workerIdx, stagedUpdateRun.Name, errAfterRetries) + continue + } + + // Wait until the staged update run is deleted. + errAfterRetries = retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil && !errors.IsNotFound(err) + }, func() error { + stagedUpdateRun := &placementv1beta1.ClusterStagedUpdateRun{} + err := r.hubClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(stagedUpdateRunNameFmt, resIdx)}, stagedUpdateRun) + if err == nil { + return fmt.Errorf("staged update run %s still exists", stagedUpdateRun.Name) + } + return err + }) + if errAfterRetries == nil || !errors.IsNotFound(errAfterRetries) { + fmt.Printf("worker %d: failed to wait for staged update run %s to be deleted after retries: %v\n", workerIdx, stagedUpdateRun.Name, errAfterRetries) + } else { + fmt.Printf("worker %d: deleted staged update run %s\n", workerIdx, stagedUpdateRun.Name) + } + } + }(i) + } + wg.Wait() +} diff --git a/hack/perftest/1000stagedupdateruns/utils/latency.go b/hack/perftest/1000stagedupdateruns/utils/latency.go new file mode 100644 index 000000000..0d1885c88 --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/latency.go @@ -0,0 +1,53 @@ +package utils + +import ( + "context" + "fmt" + "math" + "sort" + "sync" +) + 
+func (r *Runner) TrackLatency(ctx context.Context) { + wg := sync.WaitGroup{} + + // Run the latency tracker. + wg.Add(1) + go func() { + defer wg.Done() + + for { + var attempt latencyTrackAttempt + var readOK bool + select { + case attempt, readOK = <-r.toTrackLatencyChan: + if !readOK { + return + } + fmt.Printf("latency tracker: staged update run run-%d has latency %v seconds\n", attempt.resIdx, attempt.latency.Seconds()) + r.stagedUpdateRunCompletionLatencyByRunName[fmt.Sprintf("run-%d", attempt.resIdx)] = attempt.latency + case <-ctx.Done(): + return + } + } + }() + wg.Wait() +} + +func (r *Runner) TallyLatencyQuantiles() { + latencies := make([]float64, 0, len(r.stagedUpdateRunCompletionLatencyByRunName)) + for _, latency := range r.stagedUpdateRunCompletionLatencyByRunName { + latencies = append(latencies, latency.Seconds()) + } + sort.Slice(latencies, func(i, j int) bool { + return latencies[i] < latencies[j] + }) + + q25 := int(math.Floor(float64(len(latencies)) * 0.25)) + q50 := int(math.Floor(float64(len(latencies)) * 0.50)) + q75 := int(math.Floor(float64(len(latencies)) * 0.75)) + q90 := int(math.Floor(float64(len(latencies)) * 0.90)) + q99 := int(math.Floor(float64(len(latencies)) * 0.99)) + fmt.Printf("latencies (seconds): 25th=%v, 50th=%v, 75th=%v, 90th=%v, 99th=%v\n", + latencies[q25], latencies[q50], latencies[q75], latencies[q90], latencies[q99]) +} diff --git a/hack/perftest/1000stagedupdateruns/utils/poll.go b/hack/perftest/1000stagedupdateruns/utils/poll.go new file mode 100644 index 000000000..61d14969d --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/poll.go @@ -0,0 +1,86 @@ +package utils + +import ( + "context" + "fmt" + "sync" + "time" + + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" +) + +func (r *Runner) LongPollStagedUpdateRuns(ctx context.Context) { + wg := sync.WaitGroup{} + 
+ // Run the polling workers. + for i := 0; i < r.longPollingWorkerCount; i++ { + wg.Add(1) + + go func(workerIdx int) { + defer wg.Done() + + for { + // Read from the channel. + var resIdx int + var readOk bool + select { + case resIdx, readOk = <-r.toLongPollStagedUpdateRunsChan: + if !readOk { + fmt.Printf("long polling worker %d exits\n", workerIdx) + return + } + case <-ctx.Done(): + return + default: + if r.longPollingStagedUpdateRunsCount.Load() == r.completedStagedUpdateRunCount.Load() { + return + } + continue + } + + // Read the staged update run to check if it's completed. + var stagedUpdateRun placementv1beta1.ClusterStagedUpdateRun + if err := r.hubClient.Get(ctx, client.ObjectKey{Name: fmt.Sprintf(stagedUpdateRunNameFmt, resIdx)}, &stagedUpdateRun); err != nil { + fmt.Printf("long polling worker %d: failed to get staged update run run-%d: %v\n", workerIdx, resIdx, err) + // Requeue the run; no need to retry here. + r.toLongPollStagedUpdateRunsChan <- resIdx + time.Sleep(r.longPollingCoolDownPeriod) + continue + } + + // Check if the staged update run is completed. + runSucceededCond := meta.FindStatusCondition(stagedUpdateRun.Status.Conditions, string(placementv1beta1.StagedUpdateRunConditionSucceeded)) + if runSucceededCond == nil || runSucceededCond.Status != metav1.ConditionTrue || runSucceededCond.ObservedGeneration != stagedUpdateRun.Generation { + fmt.Printf("long polling worker %d: staged update run run-%d is not completed yet, requeue it\n", workerIdx, resIdx) + // Requeue the run. + r.toLongPollStagedUpdateRunsChan <- resIdx + time.Sleep(r.longPollingCoolDownPeriod) + continue + } else { + // The staged update run has been completed. + newCount := r.completedStagedUpdateRunCount.Add(1) + fmt.Printf("long polling worker %d: staged update run run-%d is completed, total completed count: %d\n", workerIdx, resIdx, newCount) + + // Track its latency. 
+ creationTimestamp := stagedUpdateRun.CreationTimestamp.Time + completionLatency := runSucceededCond.LastTransitionTime.Sub(creationTimestamp) + r.toTrackLatencyChan <- latencyTrackAttempt{ + latency: completionLatency, + resIdx: resIdx, + } + } + } + }(i) + } + + wg.Wait() + + // Do a sanity check report. + fmt.Printf("long polling %d staged update runs, with %d completed\n", r.longPollingStagedUpdateRunsCount.Load(), r.completedStagedUpdateRunCount.Load()) + + close(r.toTrackLatencyChan) +} diff --git a/hack/perftest/1000stagedupdateruns/utils/run.go b/hack/perftest/1000stagedupdateruns/utils/run.go new file mode 100644 index 000000000..be5a67dd6 --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/run.go @@ -0,0 +1,86 @@ +package utils + +import ( + "context" + "fmt" + "sync" + + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/retry" + + placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" +) + +func (r *Runner) CreateStagedUpdateRuns(ctx context.Context) { + wg := sync.WaitGroup{} + + // Run the producer. + wg.Add(1) + go func() { + defer wg.Done() + + for i := 0; i < r.maxCRPToUpdateCount; i++ { + select { + case r.toCreateStagedUpdateRunsChan <- i: + case <-ctx.Done(): + close(r.toCreateStagedUpdateRunsChan) + return + } + } + + close(r.toCreateStagedUpdateRunsChan) + }() + + // Run the workers to create staged update runs. + for i := 0; i < r.resourceSetupWorkerCount; i++ { + wg.Add(1) + + go func(workerIdx int) { + defer wg.Done() + + for { + // Read from the channel. + var resIdx int + var readOk bool + select { + case resIdx, readOk = <-r.toCreateStagedUpdateRunsChan: + if !readOk { + fmt.Printf("worker %d exits\n", workerIdx) + return + } + case <-ctx.Done(): + return + } + + // Create the staged update run. 
+ stagedUpdateRun := placementv1beta1.ClusterStagedUpdateRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(stagedUpdateRunNameFmt, resIdx), + }, + Spec: placementv1beta1.UpdateRunSpec{ + PlacementName: fmt.Sprintf(placementNameFmt, resIdx), + StagedUpdateStrategyName: commonStagedUpdateRunStrategyName, + State: placementv1beta1.StateRun, + }, + } + errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil && !errors.IsAlreadyExists(err) + }, func() error { + return r.hubClient.Create(ctx, &stagedUpdateRun) + }) + if errAfterRetries != nil && !errors.IsAlreadyExists(errAfterRetries) { + fmt.Printf("worker %d: failed to create staged update run run-%d after retries: %v\n", workerIdx, resIdx, errAfterRetries) + continue + } + fmt.Printf("worker %d: successfully created staged update run run-%d\n", workerIdx, resIdx) + + // Submit the run for long polling. + r.toLongPollStagedUpdateRunsChan <- resIdx + r.longPollingStagedUpdateRunsCount.Add(1) + } + }(i) + } + + wg.Wait() +} diff --git a/hack/perftest/1000stagedupdateruns/utils/setup.go b/hack/perftest/1000stagedupdateruns/utils/setup.go new file mode 100644 index 000000000..f7d84ae04 --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/setup.go @@ -0,0 +1,121 @@ +package utils + +import ( + "fmt" + "sync/atomic" + "time" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" +) + +const ( + nsNameFmt = "work-%d" + configMapNameFmt = "data-%d" + deployNameFmt = "app-%d" + + placementNameFmt = "crp-%d" + + stagedUpdateRunNameFmt = "run-%d" +) + +const ( + commonStagedUpdateRunStrategyName = "staging-canary-prod" + + envLabelKey = "env" + stagingEnvLabelValue = "staging" + canaryEnvLabelValue = "canary" + prodEnvLabelValue = "prod" + + 
randomUUIDLabelKey = "random-uuid" +) + +type latencyTrackAttempt struct { + latency time.Duration + resIdx int +} + +type Runner struct { + hubClient client.Client + + resourceSetupWorkerCount int + longPollingWorkerCount int + betweenStageCoolDownPeriod time.Duration + longPollingCoolDownPeriod time.Duration + + maxConcurrencyPerStage string + maxCRPToUpdateCount int + + retryOpsBackoff wait.Backoff + + toDeleteChan chan int + toPatchResourcesChan chan int + toCreateStagedUpdateRunsChan chan int + toLongPollStagedUpdateRunsChan chan int + toTrackLatencyChan chan latencyTrackAttempt + + resourcesPatchedCount atomic.Int32 + longPollingStagedUpdateRunsCount atomic.Int32 + completedStagedUpdateRunCount atomic.Int32 + + stagedUpdateRunCompletionLatencyByRunName map[string]time.Duration +} + +func New( + concurrentWorkerCount, longPollingWorkerCount int, + betweenStageCoolDownPeriod, longPollingCoolDownPeriod time.Duration, + maxConcurrencyPerStage string, + maxCRPToUpdateCount int, + retryOpsBackoff wait.Backoff, +) *Runner { + // Set up the K8s client for the hub cluster. 
+ hubClusterConfig := ctrl.GetConfigOrDie() + hubClusterConfig.QPS = 200 + hubClusterConfig.Burst = 400 + hubClient, err := client.New(hubClusterConfig, client.Options{ + Scheme: scheme.Scheme, + }) + if err != nil { + panic(fmt.Sprintf("Failed to create hub client: %v", err)) + } + + return &Runner{ + hubClient: hubClient, + resourceSetupWorkerCount: concurrentWorkerCount, + longPollingWorkerCount: longPollingWorkerCount, + betweenStageCoolDownPeriod: betweenStageCoolDownPeriod, + longPollingCoolDownPeriod: longPollingCoolDownPeriod, + maxConcurrencyPerStage: maxConcurrencyPerStage, + maxCRPToUpdateCount: maxCRPToUpdateCount, + retryOpsBackoff: retryOpsBackoff, + toDeleteChan: make(chan int, 20), + toPatchResourcesChan: make(chan int, maxCRPToUpdateCount), + toCreateStagedUpdateRunsChan: make(chan int, 20), + toLongPollStagedUpdateRunsChan: make(chan int, maxCRPToUpdateCount), + toTrackLatencyChan: make(chan latencyTrackAttempt, maxCRPToUpdateCount+1), + stagedUpdateRunCompletionLatencyByRunName: make(map[string]time.Duration, maxCRPToUpdateCount), + } +} + +func init() { + // Set up the scheme. 
+ if err := placementv1beta1.AddToScheme(scheme.Scheme); err != nil { + panic(fmt.Sprintf("Failed to add placement v1beta1 APIs to the scheme: %v", err)) + } + if err := corev1.AddToScheme(scheme.Scheme); err != nil { + panic(fmt.Sprintf("Failed to add core v1 APIs to the scheme: %v", err)) + } + if err := appsv1.AddToScheme(scheme.Scheme); err != nil { + panic(fmt.Sprintf("Failed to add apps v1 APIs to the scheme: %v", err)) + } +} + +func (r *Runner) CoolDown() { + time.Sleep(r.betweenStageCoolDownPeriod) +} diff --git a/hack/perftest/1000stagedupdateruns/utils/strategy.go b/hack/perftest/1000stagedupdateruns/utils/strategy.go new file mode 100644 index 000000000..85b7ff82f --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/strategy.go @@ -0,0 +1,61 @@ +package utils + +import ( + "context" + "fmt" + + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/util/retry" + "k8s.io/utils/ptr" + + placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1" +) + +func (r *Runner) CreateStagedUpdateRunStrategy(ctx context.Context) { + stagedUpdateRunStrategy := &placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: commonStagedUpdateRunStrategyName, + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "staging", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + envLabelKey: stagingEnvLabelValue, + }, + }, + MaxConcurrency: ptr.To(intstr.FromString(r.maxConcurrencyPerStage)), + }, + { + Name: "canary", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + envLabelKey: canaryEnvLabelValue, + }, + }, + MaxConcurrency: ptr.To(intstr.FromString(r.maxConcurrencyPerStage)), + }, + { + Name: "prod", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + envLabelKey: prodEnvLabelValue, + }, + }, + MaxConcurrency: 
ptr.To(intstr.FromString(r.maxConcurrencyPerStage)), + }, + }, + }, + } + errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil && !errors.IsAlreadyExists(err) + }, func() error { + return r.hubClient.Create(ctx, stagedUpdateRunStrategy) + }) + if errAfterRetries != nil && !errors.IsAlreadyExists(errAfterRetries) { + panic(fmt.Sprintf("Failed to create staged update run strategy: %v", errAfterRetries)) + } +} diff --git a/hack/perftest/1000stagedupdateruns/utils/update.go b/hack/perftest/1000stagedupdateruns/utils/update.go new file mode 100644 index 000000000..0cfe5317c --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/update.go @@ -0,0 +1,164 @@ +package utils + +import ( + "context" + "fmt" + "sync" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/uuid" + "k8s.io/client-go/util/retry" +) + +func (r *Runner) UpdateResources(ctx context.Context) { + wg := sync.WaitGroup{} + + // Run the producer. + wg.Add(1) + go func() { + defer wg.Done() + + for i := 0; i < r.maxCRPToUpdateCount; i++ { + select { + case r.toPatchResourcesChan <- i: + case <-ctx.Done(): + close(r.toPatchResourcesChan) + return + } + } + + close(r.toPatchResourcesChan) + }() + + // Run the workers to add a label to each existing resource. + for i := 0; i < r.resourceSetupWorkerCount; i++ { + wg.Add(1) + go func(workerIdx int) { + defer wg.Done() + + for { + // Read from the channel. 
+ var resIdx int + var readOk bool + select { + case resIdx, readOk = <-r.toPatchResourcesChan: + if !readOk { + fmt.Printf("worker %d exits\n", workerIdx) + return + } + case <-ctx.Done(): + return + } + + if err := r.updateNS(ctx, workerIdx, resIdx); err != nil { + fmt.Printf("worker %d: failed to update namespace work-%d: %v\n", workerIdx, resIdx, err) + continue + } + + if err := r.updateConfigMap(ctx, workerIdx, resIdx); err != nil { + fmt.Printf("worker %d: failed to update configmap data-%d: %v\n", workerIdx, resIdx, err) + continue + } + + if err := r.updateDeploy(ctx, workerIdx, resIdx); err != nil { + fmt.Printf("worker %d: failed to update deployment deploy-%d: %v\n", workerIdx, resIdx, err) + continue + } + + fmt.Printf("worker %d: successfully updated resources group of idx %d\n", workerIdx, resIdx) + + r.resourcesPatchedCount.Add(1) + } + }(i) + } + + wg.Wait() + + // Do a sanity check report. + fmt.Printf("patched %d out of %d resources in total\n", r.resourcesPatchedCount.Load(), r.maxCRPToUpdateCount) +} + +func (r *Runner) updateNS(ctx context.Context, workerIdx, resIdx int) error { + nsName := fmt.Sprintf(nsNameFmt, resIdx) + + // Update the namespace with the new label. 
+ errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil + }, func() error { + ns := &corev1.Namespace{} + if err := r.hubClient.Get(ctx, types.NamespacedName{Name: nsName}, ns); err != nil { + return fmt.Errorf("failed to get namespace %s before update: %w", nsName, err) + } + + labels := ns.GetLabels() + if labels == nil { + labels = make(map[string]string) + } + labels[randomUUIDLabelKey] = string(uuid.NewUUID()) + ns.SetLabels(labels) + + if err := r.hubClient.Update(ctx, ns); err != nil { + return fmt.Errorf("failed to update namespace %s: %w", nsName, err) + } + return nil + }) + return errAfterRetries +} + +func (r *Runner) updateConfigMap(ctx context.Context, workerIdx, resIdx int) error { + cmName := fmt.Sprintf(configMapNameFmt, resIdx) + cmNamespace := fmt.Sprintf(nsNameFmt, resIdx) + + // Update the configmap with the new label. + errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil + }, func() error { + cm := &corev1.ConfigMap{} + if err := r.hubClient.Get(ctx, types.NamespacedName{Name: cmName, Namespace: cmNamespace}, cm); err != nil { + return fmt.Errorf("failed to get configmap %s in namespace %s before update: %w", cmName, cmNamespace, err) + } + + labels := cm.GetLabels() + if labels == nil { + labels = make(map[string]string) + } + labels[randomUUIDLabelKey] = string(uuid.NewUUID()) + cm.SetLabels(labels) + + if err := r.hubClient.Update(ctx, cm); err != nil { + return fmt.Errorf("failed to update configmap %s in namespace %s: %w", cmName, cmNamespace, err) + } + return nil + }) + return errAfterRetries +} + +func (r *Runner) updateDeploy(ctx context.Context, workerIdx, resIdx int) error { + deployName := fmt.Sprintf(deployNameFmt, resIdx) + deployNamespace := fmt.Sprintf(nsNameFmt, resIdx) + + // Update the deployment with the new label. 
+ errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil + }, func() error { + deploy := &appsv1.Deployment{} + if err := r.hubClient.Get(ctx, types.NamespacedName{Name: deployName, Namespace: deployNamespace}, deploy); err != nil { + return fmt.Errorf("failed to get deployment %s in namespace %s before update: %w", deployName, deployNamespace, err) + } + + labels := deploy.GetLabels() + if labels == nil { + labels = make(map[string]string) + } + labels[randomUUIDLabelKey] = string(uuid.NewUUID()) + deploy.SetLabels(labels) + + if err := r.hubClient.Update(ctx, deploy); err != nil { + return fmt.Errorf("failed to update deployment %s in namespace %s: %w", deployName, deployNamespace, err) + } + return nil + }) + return errAfterRetries +} diff --git a/hack/quickstart/join-member-clusters.ps1 b/hack/quickstart/join-member-clusters.ps1 new file mode 100644 index 000000000..2e229d926 --- /dev/null +++ b/hack/quickstart/join-member-clusters.ps1 @@ -0,0 +1,207 @@ +# Note: you must have at least one hub cluster and one member cluster. +# +# The reason we require the hub cluster API URL as an argument is that, in some environments (e.g. kind), the URL cannot be derived from kubeconfig and needs to be explicitly provided by users. +# +# For example, using Docker, you can get the right IP address for member clusters to use: +# docker inspect local-hub-01-control-plane --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' +# +# You can assume port 6443 unless you have explicitly changed the API server port for your hub cluster. Don't use the mapped Docker port (i.e. 
50063) +# +# Example usage: +# ./join-member-clusters.ps1 0.2.2 demo-hub-01 https://172.18.0.2:6443 member-cluster-1 member-cluster-2 + +[CmdletBinding()] +param( + [Parameter(Position = 0)] + [string]$KubefleetVersion, + + [Parameter(Position = 1)] + [string]$HubClusterName, + + [Parameter(Position = 2)] + [string]$HubControlPlaneURL, + + [Parameter(Position = 3, ValueFromRemainingArguments = $true)] + [string[]]$MemberClusterNames, + + [switch]$Help +) + +function Show-Usage { + @' +Usage: + ./join-member-clusters.ps1 [ ...] + +Example: + ./join-member-clusters.ps1 0.2.2 demo-hub-01 https://172.18.0.2:6443 member-cluster-1 member-cluster-2 + +Requirements: + - PowerShell 7, kubectl, and helm must be installed + - hub and member cluster names must exist in your kubeconfig +'@ +} + +function Fail-WithHelp { + param( + [Parameter(Mandatory = $true)] + [string]$Message + ) + + Write-Error $Message + Write-Host "" + Show-Usage + exit 1 +} + +function Test-CommandExists { + param( + [Parameter(Mandatory = $true)] + [string]$Name + ) + + return $null -ne (Get-Command $Name -ErrorAction SilentlyContinue) +} + +function Test-ClusterExists { + param( + [Parameter(Mandatory = $true)] + [string]$Name + ) + + $clusters = @(kubectl config get-clusters 2>$null) + if ($LASTEXITCODE -ne 0) { + return $false + } + + return $clusters | Select-Object -Skip 1 | Where-Object { $_.Trim() -eq $Name } | ForEach-Object { $true } | Select-Object -First 1 +} + +if ($Help) { + Show-Usage + exit 0 +} + +if ([string]::IsNullOrWhiteSpace($KubefleetVersion) -or [string]::IsNullOrWhiteSpace($HubClusterName) -or [string]::IsNullOrWhiteSpace($HubControlPlaneURL) -or $null -eq $MemberClusterNames -or $MemberClusterNames.Count -lt 1) { + $argumentCount = 0 + if (-not [string]::IsNullOrWhiteSpace($KubefleetVersion)) { + $argumentCount++ + } + if (-not [string]::IsNullOrWhiteSpace($HubClusterName)) { + $argumentCount++ + } + if (-not [string]::IsNullOrWhiteSpace($HubControlPlaneURL)) { + 
$argumentCount++ + } + if ($null -ne $MemberClusterNames) { + $argumentCount += $MemberClusterNames.Count + } + + Fail-WithHelp "expected at least 4 arguments, got $argumentCount" +} + +if (-not (Test-CommandExists -Name kubectl)) { + Fail-WithHelp "kubectl is not installed or not available in PATH" +} + +if (-not (Test-CommandExists -Name helm)) { + Fail-WithHelp "helm is not installed or not available in PATH" +} + +if (-not (Test-ClusterExists -Name $HubClusterName)) { + Fail-WithHelp "hub cluster '$HubClusterName' was not found in kubeconfig" +} + +[System.Uri]$parsedHubURL = $null +if (-not [System.Uri]::TryCreate($HubControlPlaneURL, [System.UriKind]::Absolute, [ref]$parsedHubURL)) { + Fail-WithHelp "hub control plane URL '$HubControlPlaneURL' is not a valid absolute URL" +} + +if ($parsedHubURL.Scheme -ne "https") { + Fail-WithHelp "hub control plane URL must use https" +} + +foreach ($memberClusterName in $MemberClusterNames) { + if ([string]::IsNullOrWhiteSpace($memberClusterName)) { + Fail-WithHelp "member cluster name cannot be empty" + } + + if ($memberClusterName -eq $HubClusterName) { + Fail-WithHelp "member cluster '$memberClusterName' cannot be the same as hub cluster" + } + + if (-not (Test-ClusterExists -Name $memberClusterName)) { + Fail-WithHelp "member cluster '$memberClusterName' was not found in kubeconfig" + } +} + +foreach ($memberClusterName in $MemberClusterNames) { + # Note that Fleet will recognize your cluster with this name once it joins. + $serviceAccount = "$memberClusterName-hub-cluster-access" + $serviceAccountSecret = "$memberClusterName-hub-cluster-access-token" + + Write-Host "Switching into hub cluster context..." + kubectl config use-context $HubClusterName + + # The service account can, in theory, be created in any namespace; for simplicity reasons, + # here you will use the namespace reserved by Fleet installation, `fleet-system`. 
+ # + # Note that if you choose a different value, commands in some steps below need to be + # modified accordingly. + Write-Host "Creating member service account..." + kubectl create serviceaccount $serviceAccount -n fleet-system + + Write-Host "Creating member service account secret..." + @" +apiVersion: v1 +kind: Secret +metadata: + name: $serviceAccountSecret + namespace: fleet-system + annotations: + kubernetes.io/service-account.name: $serviceAccount +type: kubernetes.io/service-account-token +"@ | kubectl apply -f - + + Write-Host "Creating member cluster custom resource on hub cluster..." + $encodedToken = kubectl get secret $serviceAccountSecret -n fleet-system -o "jsonpath={.data.token}" + $token = [System.Text.Encoding]::UTF8.GetString([System.Convert]::FromBase64String($encodedToken)) + + @" +apiVersion: cluster.kubernetes-fleet.io/v1 +kind: MemberCluster +metadata: + name: $memberClusterName +spec: + identity: + name: $memberClusterName-hub-cluster-access + kind: ServiceAccount + namespace: fleet-system + apiGroup: "" + heartbeatPeriodSeconds: 15 +"@ | kubectl apply -f - + + # Install the member agent helm chart on the member cluster. + Write-Host "Switching to member cluster context..." + kubectl config use-context $memberClusterName + + # Create the secret with the token extracted previously for member agent to use. + Write-Host "Creating secret..." + kubectl delete secret hub-kubeconfig-secret + kubectl create secret generic hub-kubeconfig-secret --from-literal="token=$token" + + Write-Host "Uninstalling any existing member-agent instances..." + helm uninstall member-agent -n fleet-system --wait + + Write-Host "Installing member-agent..." 
+    helm install member-agent oci://ghcr.io/kubefleet-dev/kubefleet/charts/member-agent `
+        --version $KubefleetVersion `
+        --set "config.hubURL=$HubControlPlaneURL" `
+        --set "config.memberClusterName=$memberClusterName" `
+        --set logFileMaxSize=100000 `
+        --namespace fleet-system `
+        --create-namespace
+
+    kubectl get pods -A
+    kubectl config use-context $HubClusterName
+    kubectl get membercluster $memberClusterName
+}
diff --git a/hack/quickstart/join-member-clusters.sh b/hack/quickstart/join-member-clusters.sh
new file mode 100755
index 000000000..e941dca56
--- /dev/null
+++ b/hack/quickstart/join-member-clusters.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+# Note: you must have at least one hub cluster and one member cluster.
+#
+# The reason we require the hub cluster API URL as an argument is that, in some environments (e.g. kind), the URL cannot be derived from kubeconfig and needs to be explicitly provided by users.
+#
+# For example, using Docker, you can get the right IP address for member clusters to use:
+# docker inspect local-hub-01-control-plane --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}'
+#
+# You can assume port 6443 unless you have explicitly changed the API server port for your hub cluster. Don't use the mapped Docker port (i.e. 50063)
+#
+# Example usage: ./join-member-clusters.sh 0.2.2 demo-hub-01 https://172.18.0.2:6443 member-cluster-1 [<member-cluster-name> ...]
+
+usage() {
+  cat <<'EOF'
+Usage:
+  ./join-member-clusters.sh <kubefleet-version> <hub-cluster-name> <hub-control-plane-url> <member-cluster-name> [<member-cluster-name> ...]
+
+Example:
+  ./join-member-clusters.sh 0.2.2 demo-hub-01 https://172.18.0.2:6443 member-cluster-1 member-cluster-2
+
+Requirements:
+  - kubectl and helm must be installed
+  - hub and member cluster names must exist in your kubeconfig
+EOF
+}
+
+fail_with_help() {
+  echo "Error: $1" >&2
+  echo >&2
+  usage >&2
+  exit 1
+}
+
+if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then
+  usage
+  exit 0
+fi
+
+if [ "$#" -lt 4 ]; then
+  fail_with_help "expected at least 4 arguments, got $#"
+fi
+
+if ! command -v kubectl >/dev/null 2>&1; then
+  fail_with_help "kubectl is not installed or not available in PATH"
+fi
+
+if ! command -v helm >/dev/null 2>&1; then
+  fail_with_help "helm is not installed or not available in PATH"
+fi
+
+export KUBEFLEET_VERSION="$1"
+export HUB_CLUSTER_NAME="$2"
+export HUB_CONTROL_PLANE_URL="$3"
+
+if [ -z "$KUBEFLEET_VERSION" ]; then
+  fail_with_help "kubefleet version cannot be empty"
+fi
+
+if [ -z "$HUB_CLUSTER_NAME" ]; then
+  fail_with_help "hub cluster name cannot be empty"
+fi
+
+if [ -z "$HUB_CONTROL_PLANE_URL" ]; then
+  fail_with_help "hub control plane URL cannot be empty"
+fi
+
+case "$HUB_CONTROL_PLANE_URL" in
+  https://*) ;;
+  *) fail_with_help "hub control plane URL must use https" ;;
+esac
+
+if ! kubectl config get-clusters 2>/dev/null | grep -Fxq "$HUB_CLUSTER_NAME"; then
+  fail_with_help "hub cluster '$HUB_CLUSTER_NAME' was not found in kubeconfig"
+fi
+
+for MC in "${@:4}"; do
+  if [ -z "$MC" ]; then
+    fail_with_help "member cluster name cannot be empty"
+  fi
+
+  if [ "$MC" = "$HUB_CLUSTER_NAME" ]; then
+    fail_with_help "member cluster '$MC' cannot be the same as hub cluster"
+  fi
+
+  if ! kubectl config get-clusters 2>/dev/null | grep -Fxq "$MC"; then
+    fail_with_help "member cluster '$MC' was not found in kubeconfig"
+  fi
+done
+
+for MC in "${@:4}"; do
+
+# Note that Fleet will recognize your cluster with this name once it joins.
+export MEMBER_CLUSTER_NAME=$MC
+export SERVICE_ACCOUNT="$MEMBER_CLUSTER_NAME-hub-cluster-access"
+
+echo "Switching into hub cluster context..."
+kubectl config use-context $HUB_CLUSTER_NAME
+# The service account can, in theory, be created in any namespace; for simplicity reasons,
+# here you will use the namespace reserved by Fleet installation, `fleet-system`.
+#
+# Note that if you choose a different value, commands in some steps below need to be
+# modified accordingly.
+echo "Creating member service account..."
+kubectl create serviceaccount $SERVICE_ACCOUNT -n fleet-system + +echo "Creating member service account secret..." +export SERVICE_ACCOUNT_SECRET="$MEMBER_CLUSTER_NAME-hub-cluster-access-token" +cat <