From aa37ea2249b42a8b0406f74216d078f7e42612cc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 19 Mar 2026 12:07:27 +1100 Subject: [PATCH 01/15] chore: bump step-security/harden-runner from 2.15.1 to 2.16.0 (#525) Bumps [step-security/harden-runner](https://github.com/step-security/harden-runner) from 2.15.1 to 2.16.0. - [Release notes](https://github.com/step-security/harden-runner/releases) - [Commits](https://github.com/step-security/harden-runner/compare/58077d3c7e43986b6b15fba718e8ea69e387dfcc...fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594) --- updated-dependencies: - dependency-name: step-security/harden-runner dependency-version: 2.16.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codespell.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index ee9d7ae79..eab1c9697 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@58077d3c7e43986b6b15fba718e8ea69e387dfcc # v2.15.1 + uses: step-security/harden-runner@fa2e9d605c4eeb9fcad4c99c224cee0c6c7f3594 # v2.16.0 with: egress-policy: audit From bcb3df62821ae111c23c14bc9834b633e1827a3a Mon Sep 17 00:00:00 2001 From: Simon Waight Date: Thu, 19 Mar 2026 14:30:11 +1100 Subject: [PATCH 02/15] fix: Invalid pr title workflow fix (#519) * Update PR template to add clear link requirement. 
Signed-off-by: Simon Waight * fix: add permissions for PR title checker workflow Signed-off-by: Simon Waight --------- Signed-off-by: Simon Waight --- .github/workflows/pr-title-lint.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/pr-title-lint.yml b/.github/workflows/pr-title-lint.yml index c03a0a099..b7fb5ee63 100644 --- a/.github/workflows/pr-title-lint.yml +++ b/.github/workflows/pr-title-lint.yml @@ -1,4 +1,9 @@ name: PR Title Checker + +permissions: + contents: read + pull-requests: write + on: pull_request: types: From 9bf9bdb4b5238194b18b6ab0569ce5dbd22f6ecc Mon Sep 17 00:00:00 2001 From: Simon Waight Date: Thu, 19 Mar 2026 15:02:59 +1100 Subject: [PATCH 03/15] feat: Add feature issue template (#517) * Update PR template to add clear link requirement. Signed-off-by: Simon Waight * test: add UTs to the hub agent CLI flag setup (#467) Signed-off-by: Simon Waight * feat: placement controller does not create resource snapshot when External rollout strategy (#465) Signed-off-by: Simon Waight * test: minor fix and add a test (#480) Signed-off-by: Simon Waight * chore: bump step-security/harden-runner from 2.14.2 to 2.15.0 (#468) Bumps [step-security/harden-runner](https://github.com/step-security/harden-runner) from 2.14.2 to 2.15.0. - [Release notes](https://github.com/step-security/harden-runner/releases) - [Commits](https://github.com/step-security/harden-runner/compare/5ef0c079ce82195b2a36a210272d6b661572d83e...a90bcbc6539c36a85cdfeb73f7e2f433735f215b) --- updated-dependencies: - dependency-name: step-security/harden-runner dependency-version: 2.15.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Ryan Zhang Signed-off-by: Simon Waight * feat: improve the default setup for hub agent leader election to allow better scalability/stability (#414) Signed-off-by: Simon Waight * feat: new selection mode so that CRP can select some namespace resources (#452) * add selection mode that places namespace and namespace scoped resources Signed-off-by: Wei Weng * remove namespace selector must be 1st restriction Signed-off-by: Wei Weng * use CEL validation Signed-off-by: Wei Weng * try double quote Signed-off-by: Wei Weng * simplify if else Signed-off-by: Wei Weng --------- Signed-off-by: Wei Weng Co-authored-by: Wei Weng Signed-off-by: Simon Waight * fix: correct pre-selected to preselected in scheduler framework test (#492) Signed-off-by: Simon Waight * chore: add gomod dependency updates to dependabot Signed-off-by: Yetkin Timocin Signed-off-by: Simon Waight * feat: simplify selection count check in NamespaceWithResourceSelectors selection scope (#488) Signed-off-by: Simon Waight * chore: bump actions/upload-artifact from 6 to 7 (#470) Signed-off-by: Simon Waight * chore: bump step-security/harden-runner from 2.15.0 to 2.15.1 (#497) Signed-off-by: Simon Waight * fix: use fixed version go import (#515) Signed-off-by: Simon Waight * chore: remove v1beta1 flag from chart (#502) Signed-off-by: Simon Waight * fix: fix the appVersion default value that doesn't start with v (#483) Signed-off-by: Simon Waight * Add feature request template. 
Signed-off-by: Simon Waight * Tweak label assigned Signed-off-by: Simon Waight --------- Signed-off-by: Simon Waight Signed-off-by: dependabot[bot] Signed-off-by: Wei Weng Signed-off-by: Yetkin Timocin Co-authored-by: michaelawyu Co-authored-by: Britania Rodriguez Reyes <145056127+britaniar@users.noreply.github.com> Co-authored-by: Ryan Zhang Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Wei Weng Co-authored-by: Wei Weng Co-authored-by: Yetkin Timocin --- .github/ISSUE_TEMPLATE/feature_request.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 000000000..ecf0920c3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: "[Feature]" +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. \ No newline at end of file From a271633b37d649ffcbe9a78aab31e1bd7803f3d9 Mon Sep 17 00:00:00 2001 From: Simon Waight Date: Fri, 20 Mar 2026 11:08:02 +1100 Subject: [PATCH 04/15] chore: updated maintainers and few other minor tweaks. (#533) Updated maintainers and few other minor tweaks. 
Signed-off-by: Simon Waight --- MAINTAINERS.md | 13 ++++++++----- README.md | 7 +++++++ ROADMAP.md | 3 +-- SUPPORT.md | 4 +--- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 8772074cf..7afe7c346 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -1,7 +1,10 @@ # The KubeFleet Maintainers -| Maintainer | Organization | GitHub Username | -|-------------|--------------|-------------------------------------------------------| -| Ryan Zhang | Microsoft | [@ryanzhang-oss](https://github.com/ryanzhang-oss) | -| Zhiying Lin | Microsoft | [@zhiying-lin](https://github.com/zhiying-lin) | -| Chen Yu | Microsoft | [@michaelawyu](https://github.com/michaelawyu) | +| Maintainer | Organization | GitHub Username | +|----------------|--------------|----------------------------------------------------| +| Ryan Zhang | Microsoft | [@ryanzhang-oss](https://github.com/ryanzhang-oss) | +| Zhiying Lin | Microsoft | [@zhiying-lin](https://github.com/zhiying-lin) | +| Chen Yu | Microsoft | [@michaelawyu](https://github.com/michaelawyu) | +| Wei Weng | Microsoft | [@weng271190436](https://github.com/weng271190436) | +| Yetkin Timocin | Microsoft | [@ytimocin](https://github.com/ytimocin) | +| Simon Waight | Microsoft | [@sjwaight](https://github.com/sjwaight) | diff --git a/README.md b/README.md index 2ee713fc3..d5782ce8e 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,11 @@ You can reach the KubeFleet community and developers via the following channels: ## Community Meetings +March 2026: we're currently revamping our community call schedule and will have more to share soon. + +Future plans will land on our [community repository](https://github.com/kubefleet-dev/community). + + + ## Code of Conduct Participation in KubeFleet is governed by the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/master/code-of-conduct.md). See the [Code of Conduct](CODE_OF_CONDUCT.md) for more information. 
diff --git a/ROADMAP.md b/ROADMAP.md index 051b014a7..6ce061174 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -1,7 +1,6 @@ # KubeFleet Roadmap -## Project Website -- Setup the project website +For our up to date roadmap please see: [KubeFleet Roadmap](https://github.com/orgs/kubefleet-dev/projects/4) ## Support more cluster properties so that user can pick the right cluster for their workload - Support node level SKU as properties, e.g. CPU, GPU, Memory, etc diff --git a/SUPPORT.md b/SUPPORT.md index 7954da112..97bc6626a 100644 --- a/SUPPORT.md +++ b/SUPPORT.md @@ -2,9 +2,7 @@ ## How to file issues and get help -This project uses GitHub Issues to track bugs and feature requests. Please search the existing -issues before filing new issues to avoid duplicates. For new issues, file your bug or -feature request as a new Issue. +This project uses GitHub Issues to track bugs and feature requests. Please search the existing issues before filing new issues to avoid duplicates. For new issues, file your bug or feature request as a new Issue. 
For help and questions about using this project, please From 119c51988434a595e5d16af330888d09cc7efb69 Mon Sep 17 00:00:00 2001 From: michaelawyu Date: Mon, 23 Mar 2026 14:07:17 +1100 Subject: [PATCH 05/15] fix: build ghcr images using the tagged version of the code (#541) Minor fixes Signed-off-by: michaelawyu --- .github/workflows/release.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 906d6f8af..dd3ae9619 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -42,6 +42,8 @@ jobs: - name: Checkout code uses: actions/checkout@v6.0.2 + with: + ref: ${{ needs.export-registry.outputs.tag }} - name: Login to ghcr.io uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 From 997ba1c37c7e97b1e8ca3281aa80d32a3eff0583 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Mon, 23 Mar 2026 14:11:34 +1100 Subject: [PATCH 06/15] fix: push release images with both `v{version}` and `{version}` tags (#535) * Initial plan * fix: tag release images with both v-prefix and non-v-prefix version tags Co-authored-by: sjwaight <4828246+sjwaight@users.noreply.github.com> Agent-Logs-Url: https://github.com/kubefleet-dev/kubefleet/sessions/012764d2-924d-4558-8da2-02671704cdff --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: sjwaight <4828246+sjwaight@users.noreply.github.com> Co-authored-by: Simon Waight --- .github/workflows/release.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index dd3ae9619..aa6c6e75b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -56,9 +56,24 @@ jobs: run: | make push + - name: Tag and push images without v prefix + env: + VERSION: ${{ needs.export-registry.outputs.version }} + run: | + for IMAGE in ${{ env.HUB_AGENT_IMAGE_NAME }} ${{ 
env.MEMBER_AGENT_IMAGE_NAME }} ${{ env.REFRESH_TOKEN_IMAGE_NAME }}; do + docker buildx imagetools create \ + --tag "${{ env.REGISTRY }}/${IMAGE}:${VERSION}" \ + "${{ env.REGISTRY }}/${IMAGE}:${{ env.TAG }}" + done + - name: Verify images + env: + VERSION: ${{ needs.export-registry.outputs.version }} run: | echo "✅ Published images:" echo " - ${{ env.REGISTRY }}/${{ env.HUB_AGENT_IMAGE_NAME }}:${{ env.TAG }}" + echo " - ${{ env.REGISTRY }}/${{ env.HUB_AGENT_IMAGE_NAME }}:${VERSION}" echo " - ${{ env.REGISTRY }}/${{ env.MEMBER_AGENT_IMAGE_NAME }}:${{ env.TAG }}" + echo " - ${{ env.REGISTRY }}/${{ env.MEMBER_AGENT_IMAGE_NAME }}:${VERSION}" echo " - ${{ env.REGISTRY }}/${{ env.REFRESH_TOKEN_IMAGE_NAME }}:${{ env.TAG }}" + echo " - ${{ env.REGISTRY }}/${{ env.REFRESH_TOKEN_IMAGE_NAME }}:${VERSION}" From d75f28a0deff235aee79d19567a2292e071fe60b Mon Sep 17 00:00:00 2001 From: michaelawyu Date: Wed, 25 Mar 2026 10:39:17 +1100 Subject: [PATCH 07/15] feat: refactor member agent flag handling logic + add per-flag validation logic (#489) --- cmd/memberagent/main.go | 314 ++++++--------- cmd/memberagent/options/applier.go | 415 ++++++++++++++++++++ cmd/memberagent/options/ctrlmanager.go | 301 ++++++++++++++ cmd/memberagent/options/hub.go | 59 +++ cmd/memberagent/options/options.go | 61 +++ cmd/memberagent/options/propertyprovider.go | 82 ++++ cmd/memberagent/options/validation.go | 54 +++ cmd/memberagent/options/validation_test.go | 137 +++++++ docker/member-agent.Dockerfile | 4 +- 9 files changed, 1242 insertions(+), 185 deletions(-) create mode 100644 cmd/memberagent/options/applier.go create mode 100644 cmd/memberagent/options/ctrlmanager.go create mode 100644 cmd/memberagent/options/hub.go create mode 100644 cmd/memberagent/options/options.go create mode 100644 cmd/memberagent/options/propertyprovider.go create mode 100644 cmd/memberagent/options/validation.go create mode 100644 cmd/memberagent/options/validation_test.go diff --git a/cmd/memberagent/main.go 
b/cmd/memberagent/main.go index 7350d0e03..a710ab2ca 100644 --- a/cmd/memberagent/main.go +++ b/cmd/memberagent/main.go @@ -50,6 +50,7 @@ import ( clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" + "github.com/kubefleet-dev/kubefleet/cmd/memberagent/options" imcv1beta1 "github.com/kubefleet-dev/kubefleet/pkg/controllers/internalmembercluster/v1beta1" "github.com/kubefleet-dev/kubefleet/pkg/controllers/workapplier" "github.com/kubefleet-dev/kubefleet/pkg/propertyprovider" @@ -66,52 +67,7 @@ const ( ) var ( - scheme = runtime.NewScheme() - useCertificateAuth = flag.Bool("use-ca-auth", false, "Use key and certificate to authenticate the member agent.") - tlsClientInsecure = flag.Bool("tls-insecure", false, "Enable TLSClientConfig.Insecure property. Enabling this will make the connection inSecure (should be 'true' for testing purpose only.)") - hubProbeAddr = flag.String("hub-health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") - hubMetricsAddr = flag.String("hub-metrics-bind-address", ":8080", "The address the metric endpoint binds to.") - probeAddr = flag.String("health-probe-bind-address", ":8091", "The address the probe endpoint binds to.") - metricsAddr = flag.String("metrics-bind-address", ":8090", "The address the metric endpoint binds to.") - enableLeaderElection = flag.Bool("leader-elect", false, - "Enable leader election for controller manager. Enabling this will ensure there is only one active controller manager.") - leaderElectionNamespace = flag.String("leader-election-namespace", "kube-system", "The namespace in which the leader election resource will be created.") - // TODO(weiweng): only keep enableV1Alpha1APIs for backward compatibility with helm charts. Remove soon. - enableV1Alpha1APIs = flag.Bool("enable-v1alpha1-apis", false, "If set, the agents will watch for the v1alpha1 APIs. 
This is deprecated and will be removed soon.") - enableV1Beta1APIs = flag.Bool("enable-v1beta1-apis", true, "If set, the agents will watch for the v1beta1 APIs.") - propertyProvider = flag.String("property-provider", "none", "The property provider to use for the agent.") - region = flag.String("region", "", "The region where the member cluster resides.") - cloudConfigFile = flag.String("cloud-config", "/etc/kubernetes/provider/config.json", "The path to the cloud cloudconfig file.") - deletionWaitTime = flag.Int("deletion-wait-time", 5, "The time the work-applier will wait for work object to be deleted before updating the applied work owner reference") - enablePprof = flag.Bool("enable-pprof", false, "enable pprof profiling") - pprofPort = flag.Int("pprof-port", 6065, "port for pprof profiling") - hubPprofPort = flag.Int("hub-pprof-port", 6066, "port for hub pprof profiling") - hubQPS = flag.Float64("hub-api-qps", 50, "QPS to use while talking with fleet-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.") - hubBurst = flag.Int("hub-api-burst", 500, "Burst to use while talking with fleet-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.") - memberQPS = flag.Float64("member-api-qps", 250, "QPS to use while talking with fleet-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.") - memberBurst = flag.Int("member-api-burst", 1000, "Burst to use while talking with fleet-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.") - - // Work applier requeue rate limiter settings. 
- workApplierRequeueRateLimiterAttemptsWithFixedDelay = flag.Int("work-applier-requeue-rate-limiter-attempts-with-fixed-delay", 1, "If set, the work applier will requeue work objects with a fixed delay for the specified number of attempts before switching to exponential backoff.") - workApplierRequeueRateLimiterFixedDelaySeconds = flag.Float64("work-applier-requeue-rate-limiter-fixed-delay-seconds", 5.0, "If set, the work applier will requeue work objects with this fixed delay in seconds for the specified number of attempts before switching to exponential backoff.") - workApplierRequeueRateLimiterExponentialBaseForSlowBackoff = flag.Float64("work-applier-requeue-rate-limiter-exponential-base-for-slow-backoff", 1.2, "If set, the work applier will start to back off slowly at this factor after it finished requeueing with fixed delays, until it reaches the slow backoff delay cap. Its value should be larger than 1.0 and no larger than 100.0") - workApplierRequeueRateLimiterInitialSlowBackoffDelaySeconds = flag.Float64("work-applier-requeue-rate-limiter-initial-slow-backoff-delay-seconds", 2, "If set, the work applier will start to back off slowly at this delay in seconds.") - workApplierRequeueRateLimiterMaxSlowBackoffDelaySeconds = flag.Float64("work-applier-requeue-rate-limiter-max-slow-backoff-delay-seconds", 15, "If set, the work applier will not back off longer than this value in seconds when it is in the slow backoff stage.") - workApplierRequeueRateLimiterExponentialBaseForFastBackoff = flag.Float64("work-applier-requeue-rate-limiter-exponential-base-for-fast-backoff", 1.5, "If set, the work applier will start to back off fast at this factor after it completes the slow backoff stage, until it reaches the fast backoff delay cap. 
Its value should be larger than the base value for the slow backoff stage.") - workApplierRequeueRateLimiterMaxFastBackoffDelaySeconds = flag.Float64("work-applier-requeue-rate-limiter-max-fast-backoff-delay-seconds", 900, "If set, the work applier will not back off longer than this value in seconds when it is in the fast backoff stage.") - workApplierRequeueRateLimiterSkipToFastBackoffForAvailableOrDiffReportedWorkObjs = flag.Bool("work-applier-requeue-rate-limiter-skip-to-fast-backoff-for-available-or-diff-reported-work-objs", true, "If set, the rate limiter will skip the slow backoff stage and start fast backoff immediately for work objects that are available or have diff reported.") - - // Property feature gates when the property provider is not none. - enableNamespaceCollectionInPropertyProvider = flag.Bool("enable-namespace-collection-in-property-provider", false, "If set, the property provider will collect the namespaces in the member cluster.") - - // Work applier priority queue settings. - enableWorkApplierPriorityQueue = flag.Bool("enable-work-applier-priority-queue", false, "If set, the work applier will use a priority queue to process work objects.") - workApplierPriorityLinearEquationCoeffA = flag.Int("work-applier-priority-linear-equation-coeff-a", -3, "The work applier sets the priority for a Work object processing attempt using the linear equation: priority = A * (work object age in minutes) + B. This flag sets the coefficient A in the equation.") - workApplierPriorityLinearEquationCoeffB = flag.Int("work-applier-priority-linear-equation-coeff-b", 100, "The work applier sets the priority for a Work object processing attempt using the linear equation: priority = A * (work object age in minutes) + B. This flag sets the coefficient B in the equation.") - - // Azure property provider feature gates. 
- isAzProviderCostPropertiesEnabled = flag.Bool("use-cost-properties-in-azure-provider", true, "If set, the Azure property provider will expose cost properties in the member cluster.") - isAzProviderAvailableResPropertiesEnabled = flag.Bool("use-available-res-properties-in-azure-provider", true, "If set, the Azure property provider will expose available resources properties in the member cluster.") + scheme = runtime.NewScheme() ) func init() { @@ -124,6 +80,9 @@ func init() { } func main() { + opts := options.NewOptions() + opts.AddFlags(flag.CommandLine) + flag.Parse() utilrand.Seed(time.Now().UnixNano()) defer klog.Flush() @@ -135,32 +94,19 @@ func main() { // Set up controller-runtime logger ctrl.SetLogger(zap.New(zap.UseDevMode(true))) - // Validate flags - if !*enableV1Alpha1APIs && !*enableV1Beta1APIs { - klog.ErrorS(errors.New("either enable-v1alpha1-apis or enable-v1beta1-apis is required"), "Invalid APIs flags") - klog.FlushAndExit(klog.ExitFlushTimeout, 1) - } - // TO-DO (chenyu1): refactor the validation logic. 
- if workApplierPriorityLinearEquationCoeffA == nil || *workApplierPriorityLinearEquationCoeffA >= 0 { - klog.ErrorS(errors.New("parameter workApplierPriorityLinearEquationCoeffA is set incorrectly; must use a value less than 0"), "InvalidFlag", "workApplierPriorityLinearEquationCoeffA") - } - if workApplierPriorityLinearEquationCoeffB == nil || *workApplierPriorityLinearEquationCoeffB <= 0 { - klog.ErrorS(errors.New("parameter workApplierPriorityLinearEquationCoeffB is set incorrectly; must use a value greater than 0"), "InvalidFlag", "workApplierPriorityLinearEquationCoeffB") - } - hubURL := os.Getenv("HUB_SERVER_URL") if hubURL == "" { klog.ErrorS(errors.New("hub server api cannot be empty"), "Failed to read URL for the hub cluster") klog.FlushAndExit(klog.ExitFlushTimeout, 1) } - hubConfig, err := buildHubConfig(hubURL, *useCertificateAuth, *tlsClientInsecure) - hubConfig.QPS = float32(*hubQPS) - hubConfig.Burst = *hubBurst + hubConfig, err := buildHubConfig(hubURL, opts.HubConnectivityOpts.UseCertificateAuth, opts.HubConnectivityOpts.UseInsecureTLSClient) if err != nil { klog.ErrorS(err, "Failed to build Kubernetes client configuration for the hub cluster") klog.FlushAndExit(klog.ExitFlushTimeout, 1) } + hubConfig.QPS = float32(opts.CtrlManagerOptions.HubManagerOpts.QPS) + hubConfig.Burst = opts.CtrlManagerOptions.HubManagerOpts.Burst mcName := os.Getenv("MEMBER_CLUSTER_NAME") if mcName == "" { @@ -171,20 +117,23 @@ func main() { mcNamespace := fmt.Sprintf(utils.NamespaceNameFormat, mcName) memberConfig := ctrl.GetConfigOrDie() - memberConfig.QPS = float32(*memberQPS) - memberConfig.Burst = *memberBurst + memberConfig.QPS = float32(opts.CtrlManagerOptions.MemberManagerOpts.QPS) + memberConfig.Burst = opts.CtrlManagerOptions.MemberManagerOpts.Burst // we place the leader election lease on the member cluster to avoid adding load to the hub hubOpts := ctrl.Options{ Scheme: scheme, Metrics: metricsserver.Options{ - BindAddress: *hubMetricsAddr, + BindAddress: 
opts.CtrlManagerOptions.HubManagerOpts.MetricsBindAddress, }, WebhookServer: webhook.NewServer(webhook.Options{ Port: 8443, }), - HealthProbeBindAddress: *hubProbeAddr, - LeaderElection: *enableLeaderElection, - LeaderElectionNamespace: *leaderElectionNamespace, + HealthProbeBindAddress: opts.CtrlManagerOptions.HubManagerOpts.HealthProbeBindAddress, + LeaderElection: opts.CtrlManagerOptions.LeaderElectionOpts.LeaderElect, + LeaderElectionNamespace: opts.CtrlManagerOptions.LeaderElectionOpts.ResourceNamespace, + LeaseDuration: &opts.CtrlManagerOptions.LeaderElectionOpts.LeaseDuration.Duration, + RenewDeadline: &opts.CtrlManagerOptions.LeaderElectionOpts.RenewDeadline.Duration, + RetryPeriod: &opts.CtrlManagerOptions.LeaderElectionOpts.RetryPeriod.Duration, LeaderElectionConfig: memberConfig, LeaderElectionID: "136224848560.hub.fleet.azure.com", Cache: cache.Options{ @@ -203,23 +152,26 @@ func main() { memberOpts := ctrl.Options{ Scheme: scheme, Metrics: metricsserver.Options{ - BindAddress: *metricsAddr, + BindAddress: opts.CtrlManagerOptions.MemberManagerOpts.MetricsBindAddress, }, WebhookServer: webhook.NewServer(webhook.Options{ Port: 9443, }), - HealthProbeBindAddress: *probeAddr, + HealthProbeBindAddress: opts.CtrlManagerOptions.MemberManagerOpts.HealthProbeBindAddress, LeaderElection: hubOpts.LeaderElection, - LeaderElectionNamespace: *leaderElectionNamespace, + LeaderElectionNamespace: opts.CtrlManagerOptions.LeaderElectionOpts.ResourceNamespace, + LeaseDuration: &opts.CtrlManagerOptions.LeaderElectionOpts.LeaseDuration.Duration, + RenewDeadline: &opts.CtrlManagerOptions.LeaderElectionOpts.RenewDeadline.Duration, + RetryPeriod: &opts.CtrlManagerOptions.LeaderElectionOpts.RetryPeriod.Duration, LeaderElectionID: "136224848560.member.fleet.azure.com", } //+kubebuilder:scaffold:builder - if *enablePprof { - memberOpts.PprofBindAddress = fmt.Sprintf(":%d", *pprofPort) - hubOpts.PprofBindAddress = fmt.Sprintf(":%d", *hubPprofPort) + if 
opts.CtrlManagerOptions.EnablePprof { + memberOpts.PprofBindAddress = fmt.Sprintf(":%d", opts.CtrlManagerOptions.MemberManagerOpts.PprofPort) + hubOpts.PprofBindAddress = fmt.Sprintf(":%d", opts.CtrlManagerOptions.HubManagerOpts.PprofPort) } - if err := Start(ctrl.SetupSignalHandler(), hubConfig, memberConfig, hubOpts, memberOpts); err != nil { + if err := Start(ctrl.SetupSignalHandler(), hubConfig, memberConfig, hubOpts, memberOpts, *opts); err != nil { klog.ErrorS(err, "Failed to start the controllers for the member agent") klog.FlushAndExit(klog.ExitFlushTimeout, 1) } @@ -316,7 +268,7 @@ func buildHubConfig(hubURL string, useCertificateAuth bool, tlsClientInsecure bo } // Start the member controllers with the supplied config -func Start(ctx context.Context, hubCfg, memberConfig *rest.Config, hubOpts, memberOpts ctrl.Options) error { +func Start(ctx context.Context, hubCfg, memberConfig *rest.Config, hubOpts, memberOpts ctrl.Options, globalOpts options.Options) error { hubMgr, err := ctrl.NewManager(hubCfg, hubOpts) if err != nil { return fmt.Errorf("unable to start hub manager: %w", err) @@ -375,116 +327,112 @@ func Start(ctx context.Context, hubCfg, memberConfig *rest.Config, hubOpts, memb } discoverClient := discovery.NewDiscoveryClientForConfigOrDie(memberConfig) - if *enableV1Alpha1APIs { - // TODO(weiweng): keeping v1alpha1 APIs for backward compatibility with helm charts. Remove soon. - klog.Error("v1alpha1 APIs are no longer supported. Please switch to v1beta1 APIs") - return errors.New("v1alpha1 APIs are no longer supported. Please switch to v1beta1 APIs") + gvk := placementv1beta1.GroupVersion.WithKind(placementv1beta1.AppliedWorkKind) + if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { + klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk) + return err + } + // Set up the work applier. Note that it is referenced by the InternalMemberCluster controller. + + // Set up the requeue rate limiter for the work applier. 
+ // + // With default settings, the rate limiter will: + // * allow 1 attempt of fixed delay; this helps give objects a bit of headroom to get available (or have + // diffs reported). + // * use a fixed delay of 5 seconds for the first attempt. + // + // Important (chenyu1): before the introduction of the requeue rate limiter, the work + // applier uses static requeue intervals, specifically 5 seconds (if the work object is unavailable), + // and 15 seconds (if the work object is available). There are a number of test cases that + // implicitly assume this behavior (e.g., a test case might expect that the availability check completes + // w/in 10 seconds), which is why the rate limiter uses the 5 seconds fixed requeue delay by default. + // If you need to change this value and see that some test cases begin to fail, update the test + // cases accordingly. + // * after completing all attempts with fixed delay, switch to slow exponential backoff with a base of + // 1.2 with an initial delay of 2 seconds and a cap of 15 seconds (12 requeues in total, ~90 seconds in total); + // this is to allow fast checkups in cases where objects are not yet available or have not yet reported diffs. + // * after completing the slow backoff stage, switch to a fast exponential backoff with a base of 1.5 + // with an initial delay of 15 seconds and a cap of 15 minutes (10 requeues in total, ~42 minutes in total). + // * for Work objects that are available or have diffs reported, skip the slow backoff stage and + // start fast backoff immediately. + // + // The requeue pattern is essentially: + // * 1 attempts of requeue with fixed delay (5 seconds); then + // * 12 attempts of requeues with slow exponential backoff (factor of 1.2, ~90 seconds in total); then + // * 10 attempts of requeues with fast exponential backoff (factor of 1.5, ~42 minutes in total); + // * afterwards, requeue with a delay of 15 minutes indefinitely. 
+ requeueRateLimiter := workapplier.NewRequeueMultiStageWithExponentialBackoffRateLimiter( + globalOpts.ApplierOpts.RequeueRateLimiterAttemptsWithFixedDelay, + float64(globalOpts.ApplierOpts.RequeueRateLimiterFixedDelaySeconds), + globalOpts.ApplierOpts.RequeueRateLimiterExponentialBaseForSlowBackoff, + float64(globalOpts.ApplierOpts.RequeueRateLimiterInitialSlowBackoffDelaySeconds), + float64(globalOpts.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds), + globalOpts.ApplierOpts.RequeueRateLimiterExponentialBaseForFastBackoff, + float64(globalOpts.ApplierOpts.RequeueRateLimiterMaxFastBackoffDelaySeconds), + globalOpts.ApplierOpts.RequeueRateLimiterSkipToFastBackoffForAvailableOrDiffReportedWorkObjs, + ) + + workApplier := workapplier.NewReconciler( + "work-applier", + hubMgr.GetClient(), + targetNS, + spokeDynamicClient, + memberMgr.GetClient(), + restMapper, + hubMgr.GetEventRecorderFor("work_applier"), + // The number of concurrent reconcilations. This is set to 5 to boost performance in + // resource processing. + 5, + // Use the default worker count (4) for parallelized manifest processing. + parallelizer.NewParallelizer(parallelizer.DefaultNumOfWorkers), + time.Minute*time.Duration(globalOpts.ApplierOpts.ResourceForceDeletionWaitTimeMinutes), + requeueRateLimiter, + globalOpts.ApplierOpts.EnablePriorityQueue, + &globalOpts.ApplierOpts.PriorityLinearEquationCoEffA, + &globalOpts.ApplierOpts.PriorityLinearEquationCoEffB, + ) + + if err = workApplier.SetupWithManager(hubMgr); err != nil { + klog.ErrorS(err, "Failed to create v1beta1 controller", "controller", "work") + return err } - if *enableV1Beta1APIs { - gvk := placementv1beta1.GroupVersion.WithKind(placementv1beta1.AppliedWorkKind) - if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { - klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk) - return err - } - // Set up the work applier. Note that it is referenced by the InternalMemberCluster controller. 
- - // Set up the requeue rate limiter for the work applier. - // - // With default settings, the rate limiter will: - // * allow 1 attempt of fixed delay; this helps give objects a bit of headroom to get available (or have - // diffs reported). - // * use a fixed delay of 5 seconds for the first attempt. - // - // Important (chenyu1): before the introduction of the requeue rate limiter, the work - // applier uses static requeue intervals, specifically 5 seconds (if the work object is unavailable), - // and 15 seconds (if the work object is available). There are a number of test cases that - // implicitly assume this behavior (e.g., a test case might expect that the availability check completes - // w/in 10 seconds), which is why the rate limiter uses the 5 seconds fixed requeue delay by default. - // If you need to change this value and see that some test cases begin to fail, update the test - // cases accordingly. - // * after completing all attempts with fixed delay, switch to slow exponential backoff with a base of - // 1.2 with an initial delay of 2 seconds and a cap of 15 seconds (12 requeues in total, ~90 seconds in total); - // this is to allow fast checkups in cases where objects are not yet available or have not yet reported diffs. - // * after completing the slow backoff stage, switch to a fast exponential backoff with a base of 1.5 - // with an initial delay of 15 seconds and a cap of 15 minutes (10 requeues in total, ~42 minutes in total). - // * for Work objects that are available or have diffs reported, skip the slow backoff stage and - // start fast backoff immediately. - // - // The requeue pattern is essentially: - // * 1 attempts of requeue with fixed delay (5 seconds); then - // * 12 attempts of requeues with slow exponential backoff (factor of 1.2, ~90 seconds in total); then - // * 10 attempts of requeues with fast exponential backoff (factor of 1.5, ~42 minutes in total); - // * afterwards, requeue with a delay of 15 minutes indefinitely. 
- requeueRateLimiter := workapplier.NewRequeueMultiStageWithExponentialBackoffRateLimiter( - *workApplierRequeueRateLimiterAttemptsWithFixedDelay, - *workApplierRequeueRateLimiterFixedDelaySeconds, - *workApplierRequeueRateLimiterExponentialBaseForSlowBackoff, - *workApplierRequeueRateLimiterInitialSlowBackoffDelaySeconds, - *workApplierRequeueRateLimiterMaxSlowBackoffDelaySeconds, - *workApplierRequeueRateLimiterExponentialBaseForFastBackoff, - *workApplierRequeueRateLimiterMaxFastBackoffDelaySeconds, - *workApplierRequeueRateLimiterSkipToFastBackoffForAvailableOrDiffReportedWorkObjs, - ) - - workApplier := workapplier.NewReconciler( - "work-applier", - hubMgr.GetClient(), - targetNS, - spokeDynamicClient, - memberMgr.GetClient(), - restMapper, - hubMgr.GetEventRecorderFor("work_applier"), - // The number of concurrent reconcilations. This is set to 5 to boost performance in - // resource processing. - 5, - // Use the default worker count (4) for parallelized manifest processing. - parallelizer.NewParallelizer(parallelizer.DefaultNumOfWorkers), - time.Minute*time.Duration(*deletionWaitTime), - requeueRateLimiter, - *enableWorkApplierPriorityQueue, - workApplierPriorityLinearEquationCoeffA, - workApplierPriorityLinearEquationCoeffB, - ) - - if err = workApplier.SetupWithManager(hubMgr); err != nil { - klog.ErrorS(err, "Failed to create v1beta1 controller", "controller", "work") - return err - } - - klog.Info("Setting up the internalMemberCluster v1beta1 controller") - // Set up a provider provider (if applicable). - var pp propertyprovider.PropertyProvider - switch { - case propertyProvider != nil && *propertyProvider == azurePropertyProvider: - klog.V(2).Info("setting up the Azure property provider") - // Note that the property provider, though initialized here, is not started until - // the specific instance wins the leader election. 
- klog.V(1).InfoS("Property Provider is azure, loading cloud config", "cloudConfigFile", *cloudConfigFile)
- // TODO (britaniar): load cloud config for Azure property provider.
- pp = azure.New(region, *isAzProviderCostPropertiesEnabled, *isAzProviderAvailableResPropertiesEnabled, *enableNamespaceCollectionInPropertyProvider)
- default:
- // Fall back to not using any property provider if the provided type is none or
- // not recognizable.
- klog.V(2).Info("no property provider is specified, or the given type is not recognizable; start with no property provider")
- pp = nil
- }
+ klog.Info("Setting up the internalMemberCluster v1beta1 controller")
+ // Set up a property provider (if applicable).
+ var pp propertyprovider.PropertyProvider
+ switch {
+ case globalOpts.PropertyProviderOpts.Name == azurePropertyProvider:
+ klog.V(2).Info("setting up the Azure property provider")
+ // Note that the property provider, though initialized here, is not started until
+ // the specific instance wins the leader election.
+ klog.V(1).InfoS("Property Provider is azure, loading cloud config", "cloudConfigFile", globalOpts.PropertyProviderOpts.CloudConfigFilePath)
+ // TODO (britaniar): load cloud config for Azure property provider.
+ pp = azure.New(
+ &globalOpts.PropertyProviderOpts.Region,
+ globalOpts.PropertyProviderOpts.EnableAzProviderCostProperties,
+ globalOpts.PropertyProviderOpts.EnableAzProviderAvailableResourceProperties,
+ globalOpts.PropertyProviderOpts.EnableAzProviderNamespaceCollection)
+ default:
+ // Fall back to not using any property provider if the provided type is none or
+ // not recognizable.
+ klog.V(2).Info("no property provider is specified, or the given type is not recognizable; start with no property provider")
+ pp = nil
+ }

- // Set up the IMC controller.
- imcReconciler, err := imcv1beta1.NewReconciler( - ctx, - hubMgr.GetClient(), - memberMgr.GetConfig(), memberMgr.GetClient(), - workApplier, - pp) - if err != nil { - klog.ErrorS(err, "Failed to create InternalMemberCluster v1beta1 reconciler") - return fmt.Errorf("failed to create InternalMemberCluster v1beta1 reconciler: %w", err) - } - if err := imcReconciler.SetupWithManager(hubMgr, "internalmembercluster-controller"); err != nil { - klog.ErrorS(err, "Failed to set up InternalMemberCluster v1beta1 controller with the controller manager") - return fmt.Errorf("failed to set up InternalMemberCluster v1beta1 controller with the controller manager: %w", err) - } + // Set up the IMC controller. + imcReconciler, err := imcv1beta1.NewReconciler( + ctx, + hubMgr.GetClient(), + memberMgr.GetConfig(), memberMgr.GetClient(), + workApplier, + pp) + if err != nil { + klog.ErrorS(err, "Failed to create InternalMemberCluster v1beta1 reconciler") + return fmt.Errorf("failed to create InternalMemberCluster v1beta1 reconciler: %w", err) + } + if err := imcReconciler.SetupWithManager(hubMgr, "internalmembercluster-controller"); err != nil { + klog.ErrorS(err, "Failed to set up InternalMemberCluster v1beta1 controller with the controller manager") + return fmt.Errorf("failed to set up InternalMemberCluster v1beta1 controller with the controller manager: %w", err) } klog.InfoS("starting hub manager") diff --git a/cmd/memberagent/options/applier.go b/cmd/memberagent/options/applier.go new file mode 100644 index 000000000..cb416c5d8 --- /dev/null +++ b/cmd/memberagent/options/applier.go @@ -0,0 +1,415 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "fmt" + "strconv" +) + +type ApplierOptions struct { + // The time in minutes where the KubeFleet member agent will wait before force deleting all the + // resources from a member cluster if the placement that owns the resources has been removed + // from the hub cluster. + // + // KubeFleet member agent leverages owner references and foreground deletion to ensure that + // when a placement itself is gone, all of its selected resources will be removed as well. However, + // there are situations where Kubernetes might not be able to complete the cascade deletion timely; + // if the agent detects that a deletion has been stuck for a period of time longer than the value + // specified by this option, it will issue direct DELETE calls at the left resources. + ResourceForceDeletionWaitTimeMinutes int + + // The KubeFleet member agent periodically re-processes all resources propagated via a placement. + // This helps ensure that KubeFleet can detect inadvertent changes on processed resources and + // take actions accordingly. 
+ //
+ // The system uses a rate limiter to regulate the frequency of the periodic re-processing; the
+ // flow is as follows:
+ //
+ // * when a placement is first processed, the system re-queues the placement for re-processing
+ // at a fixed delay for a few attempts;
+ // * if the processing outcomes remain the same across attempts, the system will start to
+ // re-process the placement with a slowly but exponentially increasing delay, until the delay
+ // reaches a user-configurable maximum value;
+ // * if the processing outcomes continue to stay the same across attempts, the system will
+ // start to re-process the placement with a fast, exponentially increasing delay, until the
+ // delay reaches a user-configurable maximum value;
+ //
+ // In addition, the rate limiter provides an optional shortcut that allows placements that
+ // have been processed successfully (i.e., all the resources are applied and are found
+ // to be available, or all the resources have their diffs reported) to skip the slow backoff
+ // stage and jump to the fast backoff stage directly.
+ //
+ // See the options below for further details:
+
+ // The number of attempts that the KubeFleet member agent will re-process a placement with a fixed delay.
+ RequeueRateLimiterAttemptsWithFixedDelay int
+
+ // The fixed delay in seconds that the KubeFleet member agent will use.
+ RequeueRateLimiterFixedDelaySeconds int
+
+ // The exponential delay growth base that the KubeFleet member agent will use for the slow backoff stage.
+ RequeueRateLimiterExponentialBaseForSlowBackoff float64
+
+ // The initial delay in seconds that the KubeFleet member agent will use for the slow backoff stage.
+ RequeueRateLimiterInitialSlowBackoffDelaySeconds int
+
+ // The maximum delay in seconds that the KubeFleet member agent will use for the slow backoff stage.
+ RequeueRateLimiterMaxSlowBackoffDelaySeconds int + + // The exponential delay growth base that the KubeFleet member agent will use for the fast backoff stage. + RequeueRateLimiterExponentialBaseForFastBackoff float64 + + // The max delay in seconds that the KubeFleet member agent will use for the fast backoff stage. + RequeueRateLimiterMaxFastBackoffDelaySeconds int + + // Allow placements that have been processed successfully to skip the slow backoff stage and jump to + // the fast backoff stage directly or not. + RequeueRateLimiterSkipToFastBackoffForAvailableOrDiffReportedWorkObjs bool + + // By default, the KubeFleet member agent processes all placements in a FIFO order. Alternatively, + // one can set up the agent to prioritize placements that are just created or updated; this is most + // helpful in cases where the number of placements per member cluster gets large and the processing + // latencies for newly created/updated placements become a concern. + // + // KubeFleet calculates the priority score for a placement using the formula below: + // + // Pri(placement) = A * (placement age in minutes) + B, + // + // where A is a negative integer and B a positive one, so that the younger the placement is, the + // higher priority it gets. The values of A and B are user-configurable. + // + // See the options below for further details: + + // Enable prioritized processing of newly created/updated placements or not. + EnablePriorityQueue bool + + // The coefficient A in the linear equation for calculating the priority score of a placement. + PriorityLinearEquationCoEffA int + + // The coefficient B in the linear equation for calculating the priority score of a placement. 
+ PriorityLinearEquationCoEffB int +} + +func (o *ApplierOptions) AddFlags(flags *flag.FlagSet) { + flags.Var( + newResForceDeletionWaitTimeMinutesValue(5, &o.ResourceForceDeletionWaitTimeMinutes), + "deletion-wait-time", + "The time in minutes where the KubeFleet member agent will wait before force deleting all the resources from a member cluster if the placement that owns the resources has been removed from the hub cluster. Default is 5 minutes. The value must be in the range [1, 60].") + + flags.BoolVar( + &o.EnablePriorityQueue, + "enable-work-applier-priority-queue", + false, + "Enable prioritized processing of newly created/updated placements or not. Default is false, which means placements will be processed in a FIFO order.") + + flags.Var( + newRequeueAttemptsWithFixedDelayValue(1, &o.RequeueRateLimiterAttemptsWithFixedDelay), + "work-applier-requeue-rate-limiter-attempts-with-fixed-delay", + "The number of attempts with a fixed delay before the KubeFleet member agent switches to exponential backoff when re-processing a placement. Default is 1. The value must be in the range [1, 40].") + + flags.Var( + newRequeueFixedDelaySecondsValue(5, &o.RequeueRateLimiterFixedDelaySeconds), + "work-applier-requeue-rate-limiter-fixed-delay-seconds", + "The fixed delay in seconds for the KubeFleet member agent when re-processing a placement before switching to exponential backoff. Default is 5 seconds. The value must be in the range [2, 300].") + + flags.Var( + newRequeueExpBaseForSlowBackoffValue(1.2, &o.RequeueRateLimiterExponentialBaseForSlowBackoff), + "work-applier-requeue-rate-limiter-exponential-base-for-slow-backoff", + "The exponential delay growth base for the slow backoff stage when the KubeFleet member agent re-processes a placement. Default is 1.2. 
The value must be in the range [1.05, 2].") + + flags.Var( + newRequeueInitSlowBackoffDelaySecondsValue(2, &o.RequeueRateLimiterInitialSlowBackoffDelaySeconds), + "work-applier-requeue-rate-limiter-initial-slow-backoff-delay-seconds", + "The initial delay in seconds for the slow backoff stage when the KubeFleet member agent re-processes a placement. Default is 2 seconds. The value must be no less than 2.") + + flags.Var( + newRequeueMaxSlowBackoffDelaySecondsValue(15, &o.RequeueRateLimiterMaxSlowBackoffDelaySeconds), + "work-applier-requeue-rate-limiter-max-slow-backoff-delay-seconds", + "The maximum delay in seconds for the slow backoff stage when the KubeFleet member agent re-processes a placement. Default is 15 seconds. The value must be no less than 2.") + + flags.Var( + newRequeueExpBaseForFastBackoffValue(1.5, &o.RequeueRateLimiterExponentialBaseForFastBackoff), + "work-applier-requeue-rate-limiter-exponential-base-for-fast-backoff", + "The exponential delay growth base for the fast backoff stage when the KubeFleet member agent re-processes a placement. Default is 1.5. The value must be in the range (1, 2].") + + flags.Var( + newRequeueMaxFastBackoffDelaySecondsValue(900, &o.RequeueRateLimiterMaxFastBackoffDelaySeconds), + "work-applier-requeue-rate-limiter-max-fast-backoff-delay-seconds", + "The maximum delay in seconds for the fast backoff stage when the KubeFleet member agent re-processes a placement. Default is 900 seconds. The value must be in the range (0, 3600].") + + flags.BoolVar( + &o.RequeueRateLimiterSkipToFastBackoffForAvailableOrDiffReportedWorkObjs, + "work-applier-requeue-rate-limiter-skip-to-fast-backoff-for-available-or-diff-reported-work-objs", + true, + "Allow placements that have been processed successfully to skip the slow backoff stage and jump to the fast backoff stage directly or not when periodically re-processing a placement. 
Default is true.") + + flags.Var( + newPriCoEffAValue(-3, &o.PriorityLinearEquationCoEffA), + "work-applier-priority-linear-equation-coeff-a", + "The coefficient A in the linear equation for calculating the priority score of a placement. The value must be a negative integer no less than -100. Default is -3.") + + flags.Var( + newPriCoEffBValue(100, &o.PriorityLinearEquationCoEffB), + "work-applier-priority-linear-equation-coeff-b", + "The coefficient B in the linear equation for calculating the priority score of a placement. The value must be a positive integer no greater than 1000. Default is 100.") +} + +type ResForceDeletionWaitTimeMinutes int + +func (v *ResForceDeletionWaitTimeMinutes) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *ResForceDeletionWaitTimeMinutes) Set(s string) error { + t, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if t < 1 || t > 60 { + return fmt.Errorf("resource force deletion wait time in minutes is set to an invalid value (%d), must be a value in the range [1, 60]", t) + } + *v = ResForceDeletionWaitTimeMinutes(t) + return nil +} + +func newResForceDeletionWaitTimeMinutesValue(defaultValue int, p *int) *ResForceDeletionWaitTimeMinutes { + *p = defaultValue + return (*ResForceDeletionWaitTimeMinutes)(p) +} + +type RequeueAttemptsWithFixedDelay int + +func (v *RequeueAttemptsWithFixedDelay) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *RequeueAttemptsWithFixedDelay) Set(s string) error { + t, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if t < 1 || t > 40 { + return fmt.Errorf("requeue rate limiter attempts with fixed delay is set to an invalid value (%d), must be a value in the range [1, 40]", t) + } + *v = RequeueAttemptsWithFixedDelay(t) + return nil +} + +func newRequeueAttemptsWithFixedDelayValue(defaultValue int, p *int) *RequeueAttemptsWithFixedDelay { + *p = 
defaultValue + return (*RequeueAttemptsWithFixedDelay)(p) +} + +type RequeueFixedDelaySeconds int + +func (v *RequeueFixedDelaySeconds) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *RequeueFixedDelaySeconds) Set(s string) error { + t, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if t < 2 || t > 300 { + return fmt.Errorf("requeue rate limiter fixed delay seconds is set to an invalid value (%d), must be a value in the range [2, 300]", t) + } + *v = RequeueFixedDelaySeconds(t) + return nil +} + +func newRequeueFixedDelaySecondsValue(defaultValue int, p *int) *RequeueFixedDelaySeconds { + *p = defaultValue + return (*RequeueFixedDelaySeconds)(p) +} + +// RequeueExpBaseForSlowBackoff is a custom flag value type for the +// RequeueRateLimiterExponentialBaseForSlowBackoff option. +type RequeueExpBaseForSlowBackoff float64 + +func (v *RequeueExpBaseForSlowBackoff) String() string { + return fmt.Sprintf("%g", *v) +} + +func (v *RequeueExpBaseForSlowBackoff) Set(s string) error { + exp, err := strconv.ParseFloat(s, 64) + if err != nil { + return fmt.Errorf("failed to parse float value: %w", err) + } + + if exp < 1.05 || exp > 2 { + return fmt.Errorf("requeue rate limiter exponential base for slow backoff is set to an invalid value (%g), must be a value in the range [1.05, 2]", exp) + } + *v = RequeueExpBaseForSlowBackoff(exp) + return nil +} + +func newRequeueExpBaseForSlowBackoffValue(defaultValue float64, p *float64) *RequeueExpBaseForSlowBackoff { + *p = defaultValue + return (*RequeueExpBaseForSlowBackoff)(p) +} + +type RequeueInitSlowBackoffDelaySeconds int + +func (v *RequeueInitSlowBackoffDelaySeconds) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *RequeueInitSlowBackoffDelaySeconds) Set(s string) error { + t, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if t < 2 { + return fmt.Errorf("requeue rate 
limiter initial slow backoff delay seconds is set to an invalid value (%d), must be a value no less than 2", t) + } + *v = RequeueInitSlowBackoffDelaySeconds(t) + return nil +} + +func newRequeueInitSlowBackoffDelaySecondsValue(defaultValue int, p *int) *RequeueInitSlowBackoffDelaySeconds { + *p = defaultValue + return (*RequeueInitSlowBackoffDelaySeconds)(p) +} + +type RequeueMaxSlowBackoffDelaySeconds int + +func (v *RequeueMaxSlowBackoffDelaySeconds) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *RequeueMaxSlowBackoffDelaySeconds) Set(s string) error { + t, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if t < 2 { + return fmt.Errorf("requeue rate limiter max slow backoff delay seconds is set to an invalid value (%d), must be a value no less than 2", t) + } + *v = RequeueMaxSlowBackoffDelaySeconds(t) + return nil +} + +func newRequeueMaxSlowBackoffDelaySecondsValue(defaultValue int, p *int) *RequeueMaxSlowBackoffDelaySeconds { + *p = defaultValue + return (*RequeueMaxSlowBackoffDelaySeconds)(p) +} + +type RequeueExpBaseForFastBackoff float64 + +func (v *RequeueExpBaseForFastBackoff) String() string { + return fmt.Sprintf("%g", *v) +} + +func (v *RequeueExpBaseForFastBackoff) Set(s string) error { + exp, err := strconv.ParseFloat(s, 64) + if err != nil { + return fmt.Errorf("failed to parse float value: %w", err) + } + + if exp <= 1 || exp > 2 { + return fmt.Errorf("requeue rate limiter exponential base for fast backoff is set to an invalid value (%g), must be a value in the range (1, 2]", exp) + } + *v = RequeueExpBaseForFastBackoff(exp) + return nil +} + +func newRequeueExpBaseForFastBackoffValue(defaultValue float64, p *float64) *RequeueExpBaseForFastBackoff { + *p = defaultValue + return (*RequeueExpBaseForFastBackoff)(p) +} + +type RequeueMaxFastBackoffDelaySeconds int + +func (v *RequeueMaxFastBackoffDelaySeconds) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v 
*RequeueMaxFastBackoffDelaySeconds) Set(s string) error { + t, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if t <= 0 || t > 3600 { + return fmt.Errorf("requeue rate limiter max fast backoff delay seconds is set to an invalid value (%d), must be a value in the range (0, 3600]", t) + } + *v = RequeueMaxFastBackoffDelaySeconds(t) + return nil +} + +func newRequeueMaxFastBackoffDelaySecondsValue(defaultValue int, p *int) *RequeueMaxFastBackoffDelaySeconds { + *p = defaultValue + return (*RequeueMaxFastBackoffDelaySeconds)(p) +} + +type PriCoEffA int + +func (v *PriCoEffA) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *PriCoEffA) Set(s string) error { + coeff, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if coeff >= 0 || coeff < -100 { + return fmt.Errorf("priority linear equation coefficient A is set to an invalid value (%d), must be a negative integer no less than -100", coeff) + } + + *v = PriCoEffA(coeff) + return nil +} + +func newPriCoEffAValue(defaultValue int, p *int) *PriCoEffA { + *p = defaultValue + return (*PriCoEffA)(p) +} + +type PriCoEffB int + +func (v *PriCoEffB) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *PriCoEffB) Set(s string) error { + coeff, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer value: %w", err) + } + + if coeff <= 0 || coeff > 1000 { + return fmt.Errorf("priority linear equation coefficient B is set to an invalid value (%d), must be a positive integer no greater than 1000", coeff) + } + + *v = PriCoEffB(coeff) + return nil +} + +func newPriCoEffBValue(defaultValue int, p *int) *PriCoEffB { + *p = defaultValue + return (*PriCoEffB)(p) +} diff --git a/cmd/memberagent/options/ctrlmanager.go b/cmd/memberagent/options/ctrlmanager.go new file mode 100644 index 000000000..0f3385f0f --- /dev/null +++ b/cmd/memberagent/options/ctrlmanager.go 
@@ -0,0 +1,301 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "fmt" + "strconv" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +type CtrlManagerOptions struct { + // This set of options apply to the controller manager instance that connects to the hub + // cluster in the KubeFleet member agent. + HubManagerOpts PerClusterCtrlManagerOptions + // This set of options apply to the controller manager instance that connects to the member + // cluster in the KubeFleet member agent. + MemberManagerOpts PerClusterCtrlManagerOptions + + // Enable the pprof server for each controller manager instance or not. + EnablePprof bool + + // The leader election related options that apply to both controller manager instances + // in the KubeFleet member agent. + LeaderElectionOpts LeaderElectionOptions +} + +type PerClusterCtrlManagerOptions struct { + // The TCP address that the controller manager would bind to + // serve health probes. It can be set to "0" or the empty value to disable the health probe server. + HealthProbeBindAddress string + + // The TCP address that the controller manager should bind to serve prometheus metrics. + // It can be set to "0" or the empty value to disable the metrics server. + MetricsBindAddress string + + // The port in use by the pprof server for profiling the controller manager. This option + // only applies if pprof is enabled. 
+ PprofPort int + + // The QPS limit set to the rate limiter of the Kubernetes client in use by the controller manager + // and all of its managed controller, for client-side throttling purposes. + QPS float64 + + // The burst limit set to the rate limiter of the Kubernetes client in use by the controller manager + // and all of its managed controller, for client-side throttling purposes. + Burst int +} + +type LeaderElectionOptions struct { + // Enable leader election or not. + LeaderElect bool + + // The duration of a leader election lease. This is the period where a non-leader candidate + // will wait after observing a leadership renewal before attempting to acquire leadership of the + // current leader. And it is also effectively the maximum duration that a leader can be stopped + // before it is replaced by another candidate. The option only applies if leader election is enabled. + LeaseDuration metav1.Duration + + // The interval between attempts by the acting master to renew a leadership slot + // before it stops leading. This must be less than or equal to the lease duration. + // The option only applies if leader election is enabled. + RenewDeadline metav1.Duration + + // The duration the clients should wait between attempting acquisition and renewal of a + // leadership. The option only applies if leader election is enabled. + RetryPeriod metav1.Duration + + // The namespace of the resource object that will be used to lock during leader election cycles. + // This option only applies if leader election is enabled. + ResourceNamespace string +} + +func (o *CtrlManagerOptions) AddFlags(flags *flag.FlagSet) { + flags.StringVar( + &o.HubManagerOpts.HealthProbeBindAddress, + "hub-health-probe-bind-address", + ":8081", + "The TCP address that the controller manager would bind to serve health probes. 
It can be set to '0' or the empty value to disable the health probe server.") + + flags.StringVar( + &o.HubManagerOpts.MetricsBindAddress, + "hub-metrics-bind-address", + ":8080", + "The TCP address that the controller manager should bind to serve prometheus metrics. It can be set to '0' or the empty value to disable the metrics server.") + + flags.IntVar( + &o.HubManagerOpts.PprofPort, + "hub-pprof-port", + 6066, + "The port in use by the pprof server for profiling the controller manager that connects to the hub cluster.") + + flags.Var( + newHubQPSValueWithValidation(50.0, &o.HubManagerOpts.QPS), + "hub-api-qps", + "The QPS limit set to the rate limiter of the Kubernetes client in use by the controller manager (and all of its managed controllers) that connects to the hub cluster, for client-side throttling purposes.") + + flags.StringVar( + &o.MemberManagerOpts.HealthProbeBindAddress, + "health-probe-bind-address", + ":8091", + "The TCP address that the controller manager would bind to serve health probes. It can be set to '0' or the empty value to disable the health probe server.") + + flags.StringVar( + &o.MemberManagerOpts.MetricsBindAddress, + "metrics-bind-address", + ":8090", + "The TCP address that the controller manager should bind to serve prometheus metrics. 
It can be set to '0' or the empty value to disable the metrics server.") + + flags.IntVar( + &o.MemberManagerOpts.PprofPort, + "pprof-port", + 6065, + "The port in use by the pprof server for profiling the controller manager that connects to the member cluster.") + + flags.Var( + newHubBurstValueWithValidation(500, &o.HubManagerOpts.Burst), + "hub-api-burst", + "The burst limit set to the rate limiter of the Kubernetes client in use by the controller manager (and all of its managed controllers) that connects to the hub cluster, for client-side throttling purposes.") + + flags.Var( + newMemberQPSValueWithValidation(250.0, &o.MemberManagerOpts.QPS), + "member-api-qps", + "The QPS limit set to the rate limiter of the Kubernetes client in use by the controller manager (and all of its managed controllers) that connects to the member cluster, for client-side throttling purposes.") + + flags.Var( + newMemberBurstValueWithValidation(1000, &o.MemberManagerOpts.Burst), + "member-api-burst", + "The burst limit set to the rate limiter of the Kubernetes client in use by the controller manager (and all of its managed controllers) that connects to the member cluster, for client-side throttling purposes.") + + flags.BoolVar( + &o.EnablePprof, + "enable-pprof", + false, + "Enable the pprof server for profiling the member agent controller managers or not.") + + flags.BoolVar( + &o.LeaderElectionOpts.LeaderElect, + "leader-elect", + false, + "Enable leader election on the controller managers in use by the KubeFleet member agent.") + + // This input is sent to the controller manager for validation; no further check here. + flags.DurationVar( + &o.LeaderElectionOpts.LeaseDuration.Duration, + "leader-lease-duration", + 15*time.Second, + "The duration of a leader election lease. This is the period where a non-leader candidate will wait after observing a leadership renewal before attempting to acquire leadership of the current leader. 
And it is also effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. The option only applies if leader election is enabled.") + + // This input is sent to the controller manager for validation; no further check here. + flags.DurationVar( + &o.LeaderElectionOpts.RenewDeadline.Duration, + "leader-renew-deadline", + 10*time.Second, + "The interval between attempts by the acting master to renew a leadership slot before it stops leading. This must be less than or equal to the lease duration. The option only applies if leader election is enabled.") + + // This input is sent to the controller manager for validation; no further check here. + flags.DurationVar( + &o.LeaderElectionOpts.RetryPeriod.Duration, + "leader-retry-period", + 2*time.Second, + "The duration the clients should wait between attempting acquisition and renewal of a leadership. The option only applies if leader election is enabled.") + + flags.StringVar( + &o.LeaderElectionOpts.ResourceNamespace, + "leader-election-namespace", + "kube-system", + "The namespace in which the leader election resource will be created. The option only applies if leader election is enabled.") +} + +type HubQPSValueWithValidation float64 + +func (v *HubQPSValueWithValidation) String() string { + return fmt.Sprintf("%f", *v) +} + +func (v *HubQPSValueWithValidation) Set(s string) error { + // Some validation is also performed on the controller manager side and the client-go side. Just + // to be on the safer side we also impose some limits here. + qps, err := strconv.ParseFloat(s, 64) + if err != nil { + return fmt.Errorf("failed to parse float64 value: %w", err) + } + + if qps < 0.0 { + // Disable client-side throttling. 
+ *v = -1.0 + return nil + } + + if qps < 10.0 || qps > 1000.0 { + return fmt.Errorf("QPS limit is set to an invalid value (%f), must be a value in the range [10.0, 1000.0]", qps) + } + *v = HubQPSValueWithValidation(qps) + return nil +} + +func newHubQPSValueWithValidation(defaultVal float64, p *float64) *HubQPSValueWithValidation { + *p = defaultVal + return (*HubQPSValueWithValidation)(p) +} + +type HubBurstValueWithValidation int + +func (v *HubBurstValueWithValidation) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *HubBurstValueWithValidation) Set(s string) error { + // Some validation is also performed on the controller manager side and the client-go side. Just + // to be on the safer side we also impose some limits here. + burst, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse int value: %w", err) + } + + if burst < 10 || burst > 2000 { + return fmt.Errorf("burst limit is set to an invalid value (%d), must be a value in the range [10, 2000]", burst) + } + *v = HubBurstValueWithValidation(burst) + return nil +} + +func newHubBurstValueWithValidation(defaultVal int, p *int) *HubBurstValueWithValidation { + *p = defaultVal + return (*HubBurstValueWithValidation)(p) +} + +type MemberQPSValueWithValidation float64 + +func (v *MemberQPSValueWithValidation) String() string { + return fmt.Sprintf("%f", *v) +} + +func (v *MemberQPSValueWithValidation) Set(s string) error { + // Some validation is also performed on the controller manager side and the client-go side. Just + // to be on the safer side we also impose some limits here. + qps, err := strconv.ParseFloat(s, 64) + if err != nil { + return fmt.Errorf("failed to parse float64 value: %w", err) + } + + if qps < 0.0 { + // Disable client-side throttling. 
+ *v = -1.0 + return nil + } + + if qps < 10.0 || qps > 10000.0 { + return fmt.Errorf("QPS limit is set to an invalid value (%f), must be a value in the range [10.0, 10000.0]", qps) + } + *v = MemberQPSValueWithValidation(qps) + return nil +} + +func newMemberQPSValueWithValidation(defaultVal float64, p *float64) *MemberQPSValueWithValidation { + *p = defaultVal + return (*MemberQPSValueWithValidation)(p) +} + +type MemberBurstValueWithValidation int + +func (v *MemberBurstValueWithValidation) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *MemberBurstValueWithValidation) Set(s string) error { + // Some validation is also performed on the controller manager side and the client-go side. Just + // to be on the safer side we also impose some limits here. + burst, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse int value: %w", err) + } + + if burst < 10 || burst > 20000 { + return fmt.Errorf("burst limit is set to an invalid value (%d), must be a value in the range [10, 20000]", burst) + } + *v = MemberBurstValueWithValidation(burst) + return nil +} + +func newMemberBurstValueWithValidation(defaultVal int, p *int) *MemberBurstValueWithValidation { + *p = defaultVal + return (*MemberBurstValueWithValidation)(p) +} diff --git a/cmd/memberagent/options/hub.go b/cmd/memberagent/options/hub.go new file mode 100644 index 000000000..aacd89091 --- /dev/null +++ b/cmd/memberagent/options/hub.go @@ -0,0 +1,59 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" +) + +// HubConnectivityOptions is a set of options that control how the KubeFleet +// member agent connects to the hub cluster. +type HubConnectivityOptions struct { + // Enable certificate-based authentication or not when connecting to the hub cluster. + // + // If this is set to true, provide with the member agent the file paths to the key + // and certificate to use for authentication via the `IDENTITY_KEY` and `IDENTITY_CERT` + // environment variables respectively. + // + // Otherwise, the member agent will use token-based authentication when connecting + // to the hub cluster. The agent will read the token from the file path specified + // by the `CONFIG_PATH` environment variable. + UseCertificateAuth bool + + // Use an insecure client or not when connecting to the hub cluster. + // + // If this is set to false, provide with the member agent a file path to the CA + // bundle to use for verifying the hub cluster's identity via the `CA_BUNDLE` environment + // variable; you can also give the member agent a file path to the CA data instead + // via the `HUB_CERTIFICATE_AUTHORITY` environment variable. + UseInsecureTLSClient bool +} + +// AddFlags adds flags for HubConnectivityOptions to the specified FlagSet. +func (o *HubConnectivityOptions) AddFlags(flags *flag.FlagSet) { + flags.BoolVar( + &o.UseCertificateAuth, + "use-ca-auth", + false, + "Enable certificate-based authentication or not when connecting to the hub cluster.") + + flags.BoolVar( + &o.UseInsecureTLSClient, + "tls-insecure", + false, + "Use an insecure client or not when connecting to the hub cluster.") +} diff --git a/cmd/memberagent/options/options.go b/cmd/memberagent/options/options.go new file mode 100644 index 000000000..29376f9fe --- /dev/null +++ b/cmd/memberagent/options/options.go @@ -0,0 +1,61 @@ +/* +Copyright 2026 The KubeFleet Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" +) + +// Options is the options to use for running the KubeFleet member agent. +type Options struct { + // Options that control how the KubeFleet member agent connects to the hub cluster. + HubConnectivityOpts HubConnectivityOptions + + // Options that concern the setup of the controller manager instance in use by the + // KubeFleet member agent. + CtrlManagerOptions CtrlManagerOptions + + // Options that control how the KubeFleet member agent applies resources from the hub + // cluster to the member cluster. + ApplierOpts ApplierOptions + + // KubeFleet cluster property provider related options. + PropertyProviderOpts PropertyProviderOptions + + // The fields below are added only for backwards compatibility reasons. + // Their values are never read. + UseV1Beta1APIs bool +} + +func NewOptions() *Options { + return &Options{} +} + +func (o *Options) AddFlags(flags *flag.FlagSet) { + o.HubConnectivityOpts.AddFlags(flags) + o.CtrlManagerOptions.AddFlags(flags) + o.ApplierOpts.AddFlags(flags) + o.PropertyProviderOpts.AddFlags(flags) + + // The flags set up below are added only for backwards compatibility reasons. + // They are no-op flags and their values are never read. + flags.BoolVar( + &o.UseV1Beta1APIs, + "enable-v1beta1-apis", + true, + "Use KubeFleet v1beta1 APIs or not. 
This flag is obsolete; its value has no effect.") +} diff --git a/cmd/memberagent/options/propertyprovider.go b/cmd/memberagent/options/propertyprovider.go new file mode 100644 index 000000000..348d041fe --- /dev/null +++ b/cmd/memberagent/options/propertyprovider.go @@ -0,0 +1,82 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" +) + +type PropertyProviderOptions struct { + // The region where the member cluster resides. + Region string + + // The property provider to use in the KubeFleet member agent. + Name string + + // The path to a configuration file that enables the KubeFleet member agent to + // connect to a specific cloud platform to retrieve platform-specific cluster properties. + CloudConfigFilePath string + + // Enable support for cost properties in the Azure property provider or not. This option + // applies only when the Azure property provider is in use. + EnableAzProviderCostProperties bool + + // Enable support for available resource properties in the Azure property provider or not. + // This option applies only when the Azure property provider is in use. + EnableAzProviderAvailableResourceProperties bool + + // Enable support for namespace collection in the Azure property provider or not. This option applies only when the Azure property provider is in use. 
+ EnableAzProviderNamespaceCollection bool +} + +func (o *PropertyProviderOptions) AddFlags(flags *flag.FlagSet) { + flags.StringVar( + &o.Region, + "region", + "", + "The region where the member cluster resides.") + + flags.StringVar( + &o.Name, + "property-provider", + "none", + "The property provider to use in the KubeFleet member agent.") + + flags.StringVar( + &o.CloudConfigFilePath, + "cloud-config", + "/etc/kubernetes/provider/config.json", + "The path to a configuration file that enables the KubeFleet member agent to connect to a specific cloud platform to retrieve platform-specific cluster properties.") + + flags.BoolVar( + &o.EnableAzProviderCostProperties, + "use-cost-properties-in-azure-provider", + true, + "Enable support for cost properties in the Azure property provider or not. This option applies only when the Azure property provider is in use.") + + flags.BoolVar( + &o.EnableAzProviderAvailableResourceProperties, + "use-available-res-properties-in-azure-provider", + true, + "Enable support for available resource properties in the Azure property provider or not. This option applies only when the Azure property provider is in use.") + + flags.BoolVar( + &o.EnableAzProviderNamespaceCollection, + "enable-namespace-collection-in-property-provider", + false, + "Enable support for namespace collection in the Azure property provider or not. This option applies only when the Azure property provider is in use.") +} diff --git a/cmd/memberagent/options/validation.go b/cmd/memberagent/options/validation.go new file mode 100644 index 000000000..5016205d8 --- /dev/null +++ b/cmd/memberagent/options/validation.go @@ -0,0 +1,54 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "k8s.io/apimachinery/pkg/util/validation/field" +) + +// Validate checks Options and return a slice of found errs. +// +// Note: the logic here concerns primarily cross-option validation; for single-option validation, +// consider adding the logic directly as part of the flag parsing function, for clarity reasons. +func (o *Options) Validate() field.ErrorList { + errs := field.ErrorList{} + newPath := field.NewPath("Options") + + // Cross-field validation for controller manager options. + if float64(o.CtrlManagerOptions.HubManagerOpts.Burst) < o.CtrlManagerOptions.HubManagerOpts.QPS { + errs = append(errs, field.Invalid(newPath.Child("HubManagerOpts").Child("Burst"), o.CtrlManagerOptions.HubManagerOpts.Burst, "The burst limit for hub cluster client-side throttling must be greater than or equal to its QPS limit")) + } + + if float64(o.CtrlManagerOptions.MemberManagerOpts.Burst) < o.CtrlManagerOptions.MemberManagerOpts.QPS { + errs = append(errs, field.Invalid(newPath.Child("MemberManagerOpts").Child("Burst"), o.CtrlManagerOptions.MemberManagerOpts.Burst, "The burst limit for member cluster client-side throttling must be greater than or equal to its QPS limit")) + } + + // Cross-field validation for applier options. 
+ if o.ApplierOpts.RequeueRateLimiterInitialSlowBackoffDelaySeconds > o.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds { + errs = append(errs, field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterInitialSlowBackoffDelaySeconds"), o.ApplierOpts.RequeueRateLimiterInitialSlowBackoffDelaySeconds, "The initial delay for the slow backoff stage must not exceed the maximum delay for the slow backoff stage")) + } + + if o.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds > o.ApplierOpts.RequeueRateLimiterMaxFastBackoffDelaySeconds { + errs = append(errs, field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterMaxSlowBackoffDelaySeconds"), o.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds, "The maximum delay for the slow backoff stage must not exceed the maximum delay for the fast backoff stage")) + } + + if o.ApplierOpts.RequeueRateLimiterExponentialBaseForFastBackoff < o.ApplierOpts.RequeueRateLimiterExponentialBaseForSlowBackoff { + errs = append(errs, field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterExponentialBaseForFastBackoff"), o.ApplierOpts.RequeueRateLimiterExponentialBaseForFastBackoff, "The exponential base for the fast backoff stage must be greater than or equal to the exponential base for the slow backoff stage")) + } + + return errs +} diff --git a/cmd/memberagent/options/validation_test.go b/cmd/memberagent/options/validation_test.go new file mode 100644 index 000000000..a77ef9ad9 --- /dev/null +++ b/cmd/memberagent/options/validation_test.go @@ -0,0 +1,137 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + "k8s.io/apimachinery/pkg/util/validation/field" +) + +// a callback function to modify options +type ModifyOptions func(option *Options) + +// newTestOptions creates an Options with default parameters. +func newTestOptions(modifyOptions ModifyOptions) Options { + option := Options{ + CtrlManagerOptions: CtrlManagerOptions{ + HubManagerOpts: PerClusterCtrlManagerOptions{ + QPS: 50, + Burst: 500, + }, + MemberManagerOpts: PerClusterCtrlManagerOptions{ + QPS: 250, + Burst: 1000, + }, + }, + ApplierOpts: ApplierOptions{ + RequeueRateLimiterInitialSlowBackoffDelaySeconds: 2, + RequeueRateLimiterMaxSlowBackoffDelaySeconds: 15, + RequeueRateLimiterMaxFastBackoffDelaySeconds: 900, + RequeueRateLimiterExponentialBaseForSlowBackoff: 1.2, + RequeueRateLimiterExponentialBaseForFastBackoff: 1.5, + }, + } + + if modifyOptions != nil { + modifyOptions(&option) + } + return option +} + +func TestValidate(t *testing.T) { + newPath := field.NewPath("Options") + testCases := map[string]struct { + opt Options + want field.ErrorList + }{ + "valid options": { + opt: newTestOptions(nil), + want: field.ErrorList{}, + }, + "hub burst less than hub QPS": { + opt: newTestOptions(func(option *Options) { + option.CtrlManagerOptions.HubManagerOpts.QPS = 200 + option.CtrlManagerOptions.HubManagerOpts.Burst = 100 + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("HubManagerOpts").Child("Burst"), 100, "The burst limit for hub cluster client-side throttling must be greater than or equal to its QPS limit"), + }, 
+ }, + "member burst less than member QPS": { + opt: newTestOptions(func(option *Options) { + option.CtrlManagerOptions.MemberManagerOpts.QPS = 500 + option.CtrlManagerOptions.MemberManagerOpts.Burst = 200 + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("MemberManagerOpts").Child("Burst"), 200, "The burst limit for member cluster client-side throttling must be greater than or equal to its QPS limit"), + }, + }, + "slow backoff initial delay exceeds slow backoff max delay": { + opt: newTestOptions(func(option *Options) { + option.ApplierOpts.RequeueRateLimiterInitialSlowBackoffDelaySeconds = 30 + option.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds = 15 + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterInitialSlowBackoffDelaySeconds"), 30, "The initial delay for the slow backoff stage must not exceed the maximum delay for the slow backoff stage"), + }, + }, + "slow backoff max delay exceeds fast backoff max delay": { + opt: newTestOptions(func(option *Options) { + option.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds = 1000 + option.ApplierOpts.RequeueRateLimiterMaxFastBackoffDelaySeconds = 900 + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterMaxSlowBackoffDelaySeconds"), 1000, "The maximum delay for the slow backoff stage must not exceed the maximum delay for the fast backoff stage"), + }, + }, + "fast backoff exponential base less than slow backoff exponential base": { + opt: newTestOptions(func(option *Options) { + option.ApplierOpts.RequeueRateLimiterExponentialBaseForSlowBackoff = 2.0 + option.ApplierOpts.RequeueRateLimiterExponentialBaseForFastBackoff = 1.5 + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterExponentialBaseForFastBackoff"), 1.5, "The exponential base for the fast backoff stage must be greater than or equal to the exponential base for the slow backoff stage"), + 
}, + }, + "multiple simultaneous violations": { + opt: newTestOptions(func(option *Options) { + option.CtrlManagerOptions.HubManagerOpts.QPS = 200 + option.CtrlManagerOptions.HubManagerOpts.Burst = 100 + option.CtrlManagerOptions.MemberManagerOpts.QPS = 500 + option.CtrlManagerOptions.MemberManagerOpts.Burst = 200 + option.ApplierOpts.RequeueRateLimiterInitialSlowBackoffDelaySeconds = 30 + option.ApplierOpts.RequeueRateLimiterMaxSlowBackoffDelaySeconds = 15 + }), + want: field.ErrorList{ + field.Invalid(newPath.Child("HubManagerOpts").Child("Burst"), 100, "The burst limit for hub cluster client-side throttling must be greater than or equal to its QPS limit"), + field.Invalid(newPath.Child("MemberManagerOpts").Child("Burst"), 200, "The burst limit for member cluster client-side throttling must be greater than or equal to its QPS limit"), + field.Invalid(newPath.Child("ApplierOpts").Child("RequeueRateLimiterInitialSlowBackoffDelaySeconds"), 30, "The initial delay for the slow backoff stage must not exceed the maximum delay for the slow backoff stage"), + }, + }, + } + + for name, tc := range testCases { + t.Run(name, func(t *testing.T) { + got := tc.opt.Validate() + if diff := cmp.Diff(got, tc.want); diff != "" { + t.Errorf("Validate() errs mismatch (-got, +want):\n%s", diff) + } + }) + } +} diff --git a/docker/member-agent.Dockerfile b/docker/member-agent.Dockerfile index 3a58cf33f..ca0786c2c 100644 --- a/docker/member-agent.Dockerfile +++ b/docker/member-agent.Dockerfile @@ -13,13 +13,13 @@ COPY go.sum go.sum RUN go mod download # Copy the go source -COPY cmd/memberagent/main.go main.go +COPY cmd/memberagent cmd/memberagent/ COPY apis/ apis/ COPY pkg/ pkg/ # Build RUN echo "Building images with GOOS=$GOOS GOARCH=$GOARCH" -RUN CGO_ENABLED=1 GOOS=$GOOS GOARCH=$GOARCH GOEXPERIMENT=systemcrypto GO111MODULE=on go build -o memberagent main.go +RUN CGO_ENABLED=1 GOOS=$GOOS GOARCH=$GOARCH GOEXPERIMENT=systemcrypto GO111MODULE=on go build -o memberagent 
cmd/memberagent/main.go # Use distroless as minimal base image to package the memberagent binary # Refer to https://github.com/GoogleContainerTools/distroless for more details From b871a060a104ce86875b9ae308ef08c55e1498a2 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes <145056127+britaniar@users.noreply.github.com> Date: Wed, 25 Mar 2026 11:15:24 -0700 Subject: [PATCH 08/15] fix: Update condition message for Placement when waiting for rollout to finish (#548) --- pkg/controllers/placement/controller.go | 2 +- pkg/controllers/placement/controller_test.go | 6 +++--- pkg/controllers/placement/placement_status.go | 10 ++++++++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pkg/controllers/placement/controller.go b/pkg/controllers/placement/controller.go index 5dcd05f22..a41d8f4c9 100644 --- a/pkg/controllers/placement/controller.go +++ b/pkg/controllers/placement/controller.go @@ -742,7 +742,7 @@ func (r *Reconciler) determineRolloutStateForPlacementWithExternalRolloutStrateg Type: getPlacementRolloutStartedConditionType(placementObj), Status: metav1.ConditionUnknown, Reason: condition.RolloutControlledByExternalControllerReason, - Message: "Rollout is controlled by an external controller and no resource snapshot name is observed across clusters, probably rollout has not started yet", + Message: fmt.Sprintf("Rollout is controlled by an external controller and no resource snapshot name is observed across clusters. To determine rollout status, check the %s resources with spec.placementName matching this placement", getStagedUpdateRunResourceName(placementObj)), ObservedGeneration: placementObj.GetGeneration(), }) // As placement status will refresh even if the spec has not changed, we reset any unused conditions to avoid confusion. 
diff --git a/pkg/controllers/placement/controller_test.go b/pkg/controllers/placement/controller_test.go index 2d1e37ff3..026621671 100644 --- a/pkg/controllers/placement/controller_test.go +++ b/pkg/controllers/placement/controller_test.go @@ -1725,7 +1725,7 @@ func TestDetermineRolloutStateForPlacementWithExternalRolloutStrategy(t *testing Type: string(fleetv1beta1.ClusterResourcePlacementRolloutStartedConditionType), Status: metav1.ConditionUnknown, Reason: "RolloutControlledByExternalController", - Message: "Rollout is controlled by an external controller and no resource snapshot name is observed across clusters, probably rollout has not started yet", + Message: "Rollout is controlled by an external controller and no resource snapshot name is observed across clusters. To determine rollout status, check the ClusterStagedUpdateRun resources with spec.placementName matching this placement", ObservedGeneration: 1, }, }, @@ -1800,7 +1800,7 @@ func TestDetermineRolloutStateForPlacementWithExternalRolloutStrategy(t *testing Type: string(fleetv1beta1.ClusterResourcePlacementRolloutStartedConditionType), Status: metav1.ConditionUnknown, Reason: "RolloutControlledByExternalController", - Message: "Rollout is controlled by an external controller and no resource snapshot name is observed across clusters, probably rollout has not started yet", + Message: "Rollout is controlled by an external controller and no resource snapshot name is observed across clusters. 
To determine rollout status, check the ClusterStagedUpdateRun resources with spec.placementName matching this placement", ObservedGeneration: 1, }, }, @@ -1834,7 +1834,7 @@ func TestDetermineRolloutStateForPlacementWithExternalRolloutStrategy(t *testing Type: string(fleetv1beta1.ClusterResourcePlacementRolloutStartedConditionType), Status: metav1.ConditionUnknown, Reason: "RolloutControlledByExternalController", - Message: "Rollout is controlled by an external controller and no resource snapshot name is observed across clusters, probably rollout has not started yet", + Message: "Rollout is controlled by an external controller and no resource snapshot name is observed across clusters. To determine rollout status, check the ClusterStagedUpdateRun resources with spec.placementName matching this placement", ObservedGeneration: 1, }, }, diff --git a/pkg/controllers/placement/placement_status.go b/pkg/controllers/placement/placement_status.go index 007b1a1af..68d6cb3d4 100644 --- a/pkg/controllers/placement/placement_status.go +++ b/pkg/controllers/placement/placement_status.go @@ -466,6 +466,16 @@ func getPlacementRolloutStartedConditionType(placementObj fleetv1beta1.Placement return string(fleetv1beta1.ResourcePlacementRolloutStartedConditionType) } +// getStagedUpdateRunResourceName returns the appropriate StagedUpdateRun resource name based on the placement type. +// For ClusterResourcePlacement, it returns "ClusterStagedUpdateRun". +// For ResourcePlacement, it returns "StagedUpdateRun". +func getStagedUpdateRunResourceName(placementObj fleetv1beta1.PlacementObj) string { + if isClusterScopedPlacement(placementObj) { + return "ClusterStagedUpdateRun" + } + return "StagedUpdateRun" +} + // getPlacementConditionType returns the appropriate placement condition type based on whether the placement is cluster-scoped or namespace-scoped. // If the placement namespace is empty (cluster-scoped), it returns the ClusterResourcePlacement condition type. 
// Otherwise (namespace-scoped), it returns the ResourcePlacement condition type. From 0c13d1ace2a1280ac5c24caf71e6792c7f5bfb97 Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Wed, 25 Mar 2026 19:13:55 -0400 Subject: [PATCH 09/15] feat: wire namespace affinity scheduler plugin (#461) --- charts/member-agent/README.md | 1 + charts/member-agent/templates/deployment.yaml | 3 + charts/member-agent/values.yaml | 2 + pkg/scheduler/profile/profile.go | 6 +- pkg/scheduler/profile/profile_test.go | 137 ++++++ test/e2e/enveloped_object_placement_test.go | 4 + test/e2e/join_and_leave_test.go | 4 + test/e2e/mixed_placement_test.go | 72 +--- .../resource_placement_apply_strategy_test.go | 40 ++ .../e2e/resource_placement_drift_diff_test.go | 40 ++ ...ource_placement_namespace_affinity_test.go | 396 ++++++++++++++++++ .../resource_placement_negative_cases_test.go | 12 + test/e2e/resource_placement_pickall_test.go | 44 ++ test/e2e/resource_placement_pickfixed_test.go | 16 + test/e2e/resource_placement_pickn_test.go | 44 ++ test/e2e/resource_placement_ro_test.go | 36 ++ test/e2e/resource_placement_rollout_test.go | 28 ++ ...ource_placement_scheduler_watchers_test.go | 92 ++++ ...urce_placement_selecting_resources_test.go | 44 ++ ...esource_placement_taint_toleration_test.go | 20 + ...ource_placement_with_custom_config_test.go | 3 + test/e2e/setup.sh | 2 + test/e2e/staged_updaterun_test.go | 42 ++ test/e2e/utils_test.go | 167 ++++++-- 24 files changed, 1172 insertions(+), 83 deletions(-) create mode 100644 pkg/scheduler/profile/profile_test.go create mode 100644 test/e2e/resource_placement_namespace_affinity_test.go diff --git a/charts/member-agent/README.md b/charts/member-agent/README.md index 027f95357..ccbcd1bc4 100644 --- a/charts/member-agent/README.md +++ b/charts/member-agent/README.md @@ -69,6 +69,7 @@ helm upgrade member-agent kubefleet/member-agent --namespace fleet-system | logVerbosity | Log level. 
Uses V logs (klog) | `3` | | propertyProvider | The property provider to use with the member agent; if none is specified, the Fleet member agent will start with no property provider (i.e., the agent will expose no cluster properties, and collect only limited resource usage information) | `` | | region | The region where the member cluster resides | `` | +| enableNamespaceCollectionInPropertyProvider | Enable namespace collection in the property provider; when enabled, the member agent will collect and report the list of namespaces present in the member cluster to the hub cluster for use in scheduling decisions | `false` | | workApplierRequeueRateLimiterAttemptsWithFixedDelay | This parameter is a set of values to control how frequent KubeFleet should reconcile (processed) manifests; it specifies then number of attempts to requeue with fixed delay before switching to exponential backoff | `1` | | workApplierRequeueRateLimiterFixedDelaySeconds | This parameter is a set of values to control how frequent KubeFleet should reconcile (process) manifests; it specifies the fixed delay in seconds for initial requeue attempts | `5` | | workApplierRequeueRateLimiterExponentialBaseForSlowBackoff | This parameter is a set of values to control how frequent KubeFleet should reconcile (process) manifests; it specifies the exponential base for the slow backoff stage | `1.2` | diff --git a/charts/member-agent/templates/deployment.yaml b/charts/member-agent/templates/deployment.yaml index 30bf4076b..ae83e5d95 100644 --- a/charts/member-agent/templates/deployment.yaml +++ b/charts/member-agent/templates/deployment.yaml @@ -73,6 +73,9 @@ spec: - --work-applier-priority-linear-equation-coeff-a={{ .Values.priorityQueue.priorityLinearEquationCoeffA }} - --work-applier-priority-linear-equation-coeff-b={{ .Values.priorityQueue.priorityLinearEquationCoeffB }} {{- end }} + {{- if .Values.enableNamespaceCollectionInPropertyProvider }} + - --enable-namespace-collection-in-property-provider={{ 
.Values.enableNamespaceCollectionInPropertyProvider }} + {{- end }} env: - name: HUB_SERVER_URL value: "{{ .Values.config.hubURL }}" diff --git a/charts/member-agent/values.yaml b/charts/member-agent/values.yaml index 8777af2ac..310007dd1 100644 --- a/charts/member-agent/values.yaml +++ b/charts/member-agent/values.yaml @@ -67,3 +67,5 @@ priorityQueue: enabled: false priorityLinearEquationCoeffA: -3 priorityLinearEquationCoeffB: 100 + +enableNamespaceCollectionInPropertyProvider: false diff --git a/pkg/scheduler/profile/profile.go b/pkg/scheduler/profile/profile.go index 494aaa5ed..31771ce1c 100644 --- a/pkg/scheduler/profile/profile.go +++ b/pkg/scheduler/profile/profile.go @@ -21,6 +21,7 @@ import ( "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework" "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/clusteraffinity" "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/clustereligibility" + "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/namespaceaffinity" "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/sameplacementaffinity" "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/tainttoleration" "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/topologyspreadconstraints" @@ -51,13 +52,14 @@ func NewProfile(opts Options) *framework.Profile { clusterAffinityPlugin = *opts.ClusterAffinityPlugin } clusterEligibilityPlugin := clustereligibility.New() + namespaceAffinityPlugin := namespaceaffinity.New() samePlacementAffinityPlugin := sameplacementaffinity.New() topologySpreadConstraintsPlugin := topologyspreadconstraints.New() taintTolerationPlugin := tainttoleration.New() p.WithPostBatchPlugin(&topologySpreadConstraintsPlugin). - WithPreFilterPlugin(&clusterAffinityPlugin).WithPreFilterPlugin(&topologySpreadConstraintsPlugin). 
- WithFilterPlugin(&clusterAffinityPlugin).WithFilterPlugin(&clusterEligibilityPlugin).WithFilterPlugin(&taintTolerationPlugin).WithFilterPlugin(&samePlacementAffinityPlugin).WithFilterPlugin(&topologySpreadConstraintsPlugin). + WithPreFilterPlugin(&clusterAffinityPlugin).WithPreFilterPlugin(&namespaceAffinityPlugin).WithPreFilterPlugin(&topologySpreadConstraintsPlugin). + WithFilterPlugin(&clusterAffinityPlugin).WithFilterPlugin(&clusterEligibilityPlugin).WithFilterPlugin(&namespaceAffinityPlugin).WithFilterPlugin(&taintTolerationPlugin).WithFilterPlugin(&samePlacementAffinityPlugin).WithFilterPlugin(&topologySpreadConstraintsPlugin). WithPreScorePlugin(&clusterAffinityPlugin).WithPreScorePlugin(&topologySpreadConstraintsPlugin). WithScorePlugin(&clusterAffinityPlugin).WithScorePlugin(&samePlacementAffinityPlugin).WithScorePlugin(&topologySpreadConstraintsPlugin) return p diff --git a/pkg/scheduler/profile/profile_test.go b/pkg/scheduler/profile/profile_test.go new file mode 100644 index 000000000..696bf58f4 --- /dev/null +++ b/pkg/scheduler/profile/profile_test.go @@ -0,0 +1,137 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package profile + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + + "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework" + "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/clusteraffinity" + "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/clustereligibility" + "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/namespaceaffinity" + "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/sameplacementaffinity" + "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/tainttoleration" + "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/topologyspreadconstraints" +) + +// TestNewDefaultProfile tests the creation of a default scheduling profile. +func TestNewDefaultProfile(t *testing.T) { + profile := NewDefaultProfile() + + if profile == nil { + t.Fatal("NewDefaultProfile() returned nil, want non-nil profile") + } + + // Verify profile name + if profile.Name() != defaultProfileName { + t.Errorf("NewDefaultProfile() profile name = %q, want %q", profile.Name(), defaultProfileName) + } + + // Use reflection to check the internal structure since the fields are unexported + // Create a comparable profile to verify the structure + wantProfile := framework.NewProfile(defaultProfileName) + + // Configure the expected profile with the same plugins + testClusterAffinityPlugin := clusteraffinity.New() + testClusterEligibilityPlugin := clustereligibility.New() + testNamespaceAffinityPlugin := namespaceaffinity.New() + testSamePlacementAffinityPlugin := sameplacementaffinity.New() + testTopologySpreadConstraintsPlugin := topologyspreadconstraints.New() + testTaintTolerationPlugin := tainttoleration.New() + + wantProfile.WithPostBatchPlugin(&testTopologySpreadConstraintsPlugin). + WithPreFilterPlugin(&testClusterAffinityPlugin).WithPreFilterPlugin(&testNamespaceAffinityPlugin).WithPreFilterPlugin(&testTopologySpreadConstraintsPlugin). 
+ WithFilterPlugin(&testClusterAffinityPlugin).WithFilterPlugin(&testClusterEligibilityPlugin).WithFilterPlugin(&testNamespaceAffinityPlugin).WithFilterPlugin(&testTaintTolerationPlugin).WithFilterPlugin(&testSamePlacementAffinityPlugin).WithFilterPlugin(&testTopologySpreadConstraintsPlugin). + WithPreScorePlugin(&testClusterAffinityPlugin).WithPreScorePlugin(&testTopologySpreadConstraintsPlugin). + WithScorePlugin(&testClusterAffinityPlugin).WithScorePlugin(&testSamePlacementAffinityPlugin).WithScorePlugin(&testTopologySpreadConstraintsPlugin) + + // Compare the profiles using cmp.Equal with AllowUnexported to access private fields + if diff := cmp.Diff(profile, wantProfile, + cmp.AllowUnexported(framework.Profile{}, + clusteraffinity.Plugin{}, + clustereligibility.Plugin{}, + namespaceaffinity.Plugin{}, + sameplacementaffinity.Plugin{}, + topologyspreadconstraints.Plugin{}, + tainttoleration.Plugin{})); diff != "" { + t.Errorf("NewDefaultProfile() mismatch (-got +want):\n%s", diff) + } +} + +// TestNewProfileWithOptions tests the creation of a scheduling profile with custom options. +// It verifies that: +// 1. Profile is created successfully with both empty and custom options +// 2. Profile name is set to the default value regardless of options +// 3. 
Custom ClusterAffinityPlugin option is accepted +func TestNewProfileWithOptions(t *testing.T) { + testCases := []struct { + name string + opts Options + wantName string + }{ + { + name: "EmptyOptions", + opts: Options{}, + wantName: defaultProfileName, + }, + { + name: "CustomClusterAffinityPlugin", + opts: Options{ + ClusterAffinityPlugin: &clusteraffinity.Plugin{}, + }, + wantName: defaultProfileName, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + profile := NewProfile(tc.opts) + + if profile == nil { + t.Fatal("NewProfile() returned nil, want non-nil profile") + } + + if profile.Name() != tc.wantName { + t.Errorf("NewProfile() profile name = %q, want %q", profile.Name(), tc.wantName) + } + }) + } +} + +// TestNewProfileWithCustomClusterAffinityPlugin tests that custom cluster affinity plugin is used when provided. +func TestNewProfileWithCustomClusterAffinityPlugin(t *testing.T) { + customPlugin := clusteraffinity.New() + opts := Options{ + ClusterAffinityPlugin: &customPlugin, + } + + profile1 := NewProfile(opts) + profile2 := NewProfile(Options{}) // Default profile + + // Both profiles should have the same name + if profile1.Name() != profile2.Name() { + t.Errorf("Profile names should be the same: custom=%q, default=%q", profile1.Name(), profile2.Name()) + } + + // Both profiles should be configured similarly, but with potentially different plugin instances + // This test verifies that the custom plugin option doesn't break the profile creation + if profile1 == nil || profile2 == nil { + t.Error("Both profiles should be non-nil") + } +} diff --git a/test/e2e/enveloped_object_placement_test.go b/test/e2e/enveloped_object_placement_test.go index b52451958..35e7b52e4 100644 --- a/test/e2e/enveloped_object_placement_test.go +++ b/test/e2e/enveloped_object_placement_test.go @@ -551,6 +551,10 @@ var _ = Describe("Test NamespaceOnly placement through CRP then resource placeme } }) + It("should wait for namespace collection to 
sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("Create the ResourcePlacement that selects the ResourceEnvelope", func() { rp := &placementv1beta1.ResourcePlacement{ ObjectMeta: metav1.ObjectMeta{ diff --git a/test/e2e/join_and_leave_test.go b/test/e2e/join_and_leave_test.go index d8a692809..ce715bf2d 100644 --- a/test/e2e/join_and_leave_test.go +++ b/test/e2e/join_and_leave_test.go @@ -272,6 +272,10 @@ var _ = Describe("Test member cluster join and leave flow", Label("joinleave"), }) Context("Test cluster join and leave flow with RP not deleted", Label("joinleave"), Ordered, Serial, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(workNamespaceName, allMemberClusterNames) + }) + It("Create the RP that select the config map and place it to all clusters", func() { resourceSelectors := []placementv1beta1.ResourceSelectorTerm{ { diff --git a/test/e2e/mixed_placement_test.go b/test/e2e/mixed_placement_test.go index dc8dae38a..539ac91fe 100644 --- a/test/e2e/mixed_placement_test.go +++ b/test/e2e/mixed_placement_test.go @@ -78,58 +78,13 @@ var _ = Describe("mixed ClusterResourcePlacement and ResourcePlacement negative ensureCRPAndRelatedResourcesDeleted(crpName, allMemberClusters) }) - It("should update RP status as expected", func() { - rpStatusUpdatedActual := func() error { - rp := &placementv1beta1.ResourcePlacement{} - if err := hubClient.Get(ctx, types.NamespacedName{Name: rpName, Namespace: workNamespaceName}, rp); err != nil { - return err - } + It("should wait for namespace collection to sync on clusters where CRP placed it", func() { + waitForNamespaceCollectionOnClusters(workNamespaceName, []string{memberCluster2EastCanaryName, memberCluster3WestProdName}) + }) - appConfigMapName := fmt.Sprintf(appConfigMapNameTemplate, GinkgoParallelProcess()) - wantStatus := placementv1beta1.PlacementStatus{ 
- Conditions: rpAppliedFailedConditions(rp.Generation), - PerClusterPlacementStatuses: []placementv1beta1.PerClusterPlacementStatus{ - { - ClusterName: memberCluster1EastProdName, - ObservedResourceIndex: "0", - FailedPlacements: []placementv1beta1.FailedResourcePlacement{ - { - ResourceIdentifier: placementv1beta1.ResourceIdentifier{ - Kind: "ConfigMap", - Name: appConfigMapName, - Version: "v1", - Namespace: workNamespaceName, - }, - Condition: metav1.Condition{ - Type: placementv1beta1.WorkConditionTypeApplied, - Status: metav1.ConditionFalse, - Reason: string(workapplier.ApplyOrReportDiffResTypeFailedToApply), - ObservedGeneration: 0, - }, - }, - }, - Conditions: perClusterApplyFailedConditions(rp.Generation), - }, - { - ClusterName: memberCluster2EastCanaryName, - ObservedResourceIndex: "0", - Conditions: perClusterRolloutCompletedConditions(rp.Generation, true, false), - }, - { - ClusterName: memberCluster3WestProdName, - ObservedResourceIndex: "0", - Conditions: perClusterRolloutCompletedConditions(rp.Generation, true, false), - }, - }, - SelectedResources: appConfigMapIdentifiers(), - ObservedResourceIndex: "0", - } - if diff := cmp.Diff(rp.Status, wantStatus, placementStatusCmpOptions...); diff != "" { - return fmt.Errorf("RP status diff (-got, +want): %s", diff) - } - return nil - } - Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP %s status as expected", rpName) + It("should update RP status as expected", func() { + rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), []string{memberCluster2EastCanaryName, memberCluster3WestProdName}, nil, "0") + Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP status as expected") }) It("should place resources on the specified clusters", func() { @@ -150,6 +105,10 @@ var _ = Describe("mixed ClusterResourcePlacement and ResourcePlacement negative Eventually(updateFunc, 
eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update the crp %s", crpName) }) + It("should wait for namespace collection to sync on the newly added member cluster", func() { + waitForNamespaceCollectionOnClusters(workNamespaceName, []string{memberCluster1EastProdName}) + }) + It("should update RP status as expected", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), allMemberClusterNames, nil, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP status as expected") @@ -175,6 +134,10 @@ var _ = Describe("mixed ClusterResourcePlacement and ResourcePlacement negative ensureCRPAndRelatedResourcesDeleted(crpName, allMemberClusters) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(workNamespaceName, allMemberClusterNames) + }) + It("creating an RP that uses PickAll strategy on the same namespace", func() { createRP(workNamespaceName, rpName) }) @@ -271,6 +234,9 @@ var _ = Describe("mixed ClusterResourcePlacement and ResourcePlacement negative Expect(memberCluster.KubeClient.Create(ctx, &ns)).To(Succeed()) } + By("waiting for namespace to be collected on all member clusters") + waitForNamespaceCollectionOnClusters(workNamespaceName, allMemberClusterNames) + By("creating an RP") createRP(workNamespaceName, rpName) @@ -442,6 +408,10 @@ var _ = Describe("mixed ClusterResourcePlacement and ResourcePlacement positive Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") }) + It("should wait for namespace collection to sync on selected member clusters", func() { + waitForNamespaceCollectionOnClusters(workNamespaceName, wantSelectedClusters) + }) + It("add labels to member clusters based on CRP placement decisions", func() { Eventually(func() error { // Add labels to the selected clusters diff --git 
a/test/e2e/resource_placement_apply_strategy_test.go b/test/e2e/resource_placement_apply_strategy_test.go index cc407ba4a..86cab0184 100644 --- a/test/e2e/resource_placement_apply_strategy_test.go +++ b/test/e2e/resource_placement_apply_strategy_test.go @@ -80,6 +80,10 @@ var _ = Describe("validating resource placement using different apply strategies }) Context("Test a RP place objects successfully (client-side-apply and allow co-own)", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { cm := appConfigMap() cm.SetOwnerReferences([]metav1.OwnerReference{ @@ -143,6 +147,10 @@ var _ = Describe("validating resource placement using different apply strategies }) Context("Test a RP place objects successfully (client-side-apply and disallow co-own) and existing resource has no owner reference", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { cm := appConfigMap() cm.Annotations = map[string]string{ @@ -194,6 +202,10 @@ var _ = Describe("validating resource placement using different apply strategies }) Context("Test a RP place objects successfully (server-side-apply and disallow co-own) and existing resource has no owner reference", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { cm := appConfigMap() cm.Annotations = map[string]string{ @@ -248,6 +260,10 @@ var _ = Describe("validating resource placement using different apply strategies }) Context("Test a RP fail to apply configmap (server-side-apply and disallow co-own) and existing resource is owned by others", Ordered, func() { + It("should 
wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { cm := appConfigMap() cm.SetOwnerReferences([]metav1.OwnerReference{ @@ -353,6 +369,10 @@ var _ = Describe("validating resource placement using different apply strategies }) Context("Test a RP able to apply configmap when the conflicted annotation is managed by others (force server-side-apply and allow co-own)", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { cm := appConfigMap() cm.SetOwnerReferences([]metav1.OwnerReference{ @@ -430,6 +450,10 @@ var _ = Describe("validating resource placement using different apply strategies }) Context("no dual placement", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) conflictedRPName := "rp-conflicted" workNamespaceName := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) @@ -606,6 +630,10 @@ var _ = Describe("validating resource placement using different apply strategies Describe("SSA", Ordered, func() { Context("use server-side apply to place resources (with changes)", func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + // The key here should match the one used in the default config map. 
cmDataKey := "data" cmDataVal1 := "foobar" @@ -716,6 +744,10 @@ var _ = Describe("validating resource placement using different apply strategies }) Context("fall back to server-side apply when client-side apply cannot be used", func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + cmDataKey := "randomBase64Str" BeforeAll(func() { @@ -901,6 +933,10 @@ var _ = Describe("validating resource placement using different apply strategies Describe("switching apply strategies", func() { Context("switch from client-side apply to report diff", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + anotherConfigMapName := types.NamespacedName{Name: fmt.Sprintf("another-config-map-%d", GinkgoParallelProcess()), Namespace: workNamespaceName} selectedResources := []placementv1beta1.ResourceIdentifier{ { @@ -1179,6 +1215,10 @@ var _ = Describe("validating resource placement using different apply strategies }) Context("switch from report diff to server side apply", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { createConfigMap() diff --git a/test/e2e/resource_placement_drift_diff_test.go b/test/e2e/resource_placement_drift_diff_test.go index 26856a8a8..92234da17 100644 --- a/test/e2e/resource_placement_drift_diff_test.go +++ b/test/e2e/resource_placement_drift_diff_test.go @@ -106,6 +106,10 @@ var _ = Describe("take over existing resources using RP", Label("resourceplaceme Context("always take over", Ordered, func() { rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) + It("should wait for namespace collection to sync on all member clusters", 
func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP in the same namespace selecting namespaced resources // with apply strategy. @@ -203,6 +207,10 @@ var _ = Describe("take over existing resources using RP", Label("resourceplaceme Context("take over if no diff, partial comparison", Ordered, func() { rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP in the same namespace selecting namespaced resources // with apply strategy. @@ -375,6 +383,10 @@ var _ = Describe("take over existing resources using RP", Label("resourceplaceme Context("take over if no diff, full comparison", Ordered, func() { rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP in the same namespace selecting namespaced resources // with apply strategy. @@ -623,6 +635,10 @@ var _ = Describe("detect drifts on placed resources using RP", Ordered, Label("r Context("always apply, full comparison", Ordered, func() { rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP in the same namespace selecting namespaced resources // with apply strategy. 
@@ -732,6 +748,10 @@ var _ = Describe("detect drifts on placed resources using RP", Ordered, Label("r Context("apply if no drift, partial comparison", Ordered, func() { rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP in the same namespace selecting namespaced resources // with apply strategy. @@ -931,6 +951,10 @@ var _ = Describe("detect drifts on placed resources using RP", Ordered, Label("r Context("apply if no drift, full comparison", Ordered, func() { rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP in the same namespace selecting namespaced resources // with apply strategy. @@ -1167,6 +1191,10 @@ var _ = Describe("report diff mode using RP", Label("resourceplacement"), func() cm1Name := fmt.Sprintf(appConfigMapNameTemplate, GinkgoParallelProcess()) cm2Name := fmt.Sprintf(appConfigMapNameTemplate+"-%d", GinkgoParallelProcess(), 2) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + BeforeAll(func() { // Create the resources on the hub cluster. 
createWorkResources() @@ -1252,6 +1280,10 @@ var _ = Describe("report diff mode using RP", Label("resourceplacement"), func() Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") }) + It("should wait for CRP to propagate namespace and namespace collection to sync", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should update RP status as expected", func() { buildWantRPStatus := func(rpGeneration int64) *placementv1beta1.PlacementStatus { return &placementv1beta1.PlacementStatus{ @@ -1600,6 +1632,10 @@ var _ = Describe("mixed diff and drift reportings using RP", Ordered, Label("res var lastDeployDriftObservedTimeOnCluster2 metav1.Time var firstDeployDriftObservedTimeOnCluster2 metav1.Time + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + BeforeAll(func() { // Create the resources on the hub cluster. // @@ -1690,6 +1726,10 @@ var _ = Describe("mixed diff and drift reportings using RP", Ordered, Label("res Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("can introduce drifts", func() { Eventually(func() error { deploy := &appsv1.Deployment{} diff --git a/test/e2e/resource_placement_namespace_affinity_test.go b/test/e2e/resource_placement_namespace_affinity_test.go new file mode 100644 index 000000000..e20ebed69 --- /dev/null +++ b/test/e2e/resource_placement_namespace_affinity_test.go @@ -0,0 +1,396 @@ +/* +Copyright 2025 The KubeFleet Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "fmt" + + "github.com/google/go-cmp/cmp" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" + + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" + "github.com/kubefleet-dev/kubefleet/test/e2e/framework" +) + +// The test suites below cover the namespace affinity scheduler plugin behavior. +// The plugin ensures that an RP is only scheduled onto clusters where the target +// namespace already exists (i.e. the namespace was placed there by a CRP). + +// Test scenario (PickAll): +// - A CRP with PickFixed policy places the namespace onto 2 out of 3 member clusters. +// - An RP with PickAll policy selects a ConfigMap in that namespace. +// - The namespace affinity plugin should restrict the RP to only the 2 clusters +// that actually have the namespace; the RP must still be considered successful. 
+var _ = Describe("placing namespaced scoped resources using an RP with PickAll policy and namespace affinity", Label("resourceplacement", "namespaceaffinity"), func() { + crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) + rpKey := types.NamespacedName{Name: rpName, Namespace: appNamespace().Name} + + BeforeEach(OncePerOrdered, func() { + // Create the work resources (namespace + configmap) on the hub cluster. + createWorkResources() + + // Create a CRP with PickFixed policy that places the namespace on only 2 of the 3 + // member clusters. The namespace affinity plugin will later use the namespace + // presence information propagated from these 2 clusters to filter RP scheduling. + createNamespaceOnlyCRPForTwoClusters(crpName) + + // Wait until the CRP has placed the namespace on exactly the 2 fixed clusters. + // With PickFixed, non-targeted clusters are not listed as unselected — pass nil. + crpStatusUpdatedActual := crpStatusUpdatedActual( + workNamespaceIdentifiers(), + []string{memberCluster1EastProdName, memberCluster2EastCanaryName}, + nil, + "0", + ) + Eventually(crpStatusUpdatedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") + }) + + AfterEach(OncePerOrdered, func() { + ensureRPAndRelatedResourcesDeleted(rpKey, allMemberClusters) + ensureCRPAndRelatedResourcesDeleted(crpName, allMemberClusters) + }) + + Context("namespace affinity restricts RP placement to clusters with the namespace", Ordered, func() { + It("should create RP with PickAll policy successfully", func() { + rp := &placementv1beta1.ResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: rpName, + Namespace: appNamespace().Name, + Finalizers: []string{customDeletionBlockerFinalizer}, + }, + Spec: placementv1beta1.PlacementSpec{ + ResourceSelectors: configMapSelector(), + Policy: &placementv1beta1.PlacementPolicy{ + PlacementType: 
placementv1beta1.PickAllPlacementType, + }, + Strategy: placementv1beta1.RolloutStrategy{ + Type: placementv1beta1.RollingUpdateRolloutStrategyType, + RollingUpdate: &placementv1beta1.RollingUpdateConfig{ + UnavailablePeriodSeconds: ptr.To(2), + }, + }, + }, + } + Expect(hubClient.Create(ctx, rp)).To(Succeed(), "Failed to create RP") + }) + + It("should update RP status showing placement only on clusters with the namespace", func() { + // The RP uses PickAll. Namespace affinity silently filters out cluster-3 (no namespace there), + // so the scheduler considers the policy fulfilled — only cluster-1 and cluster-2 appear as selected. + // cluster-3 does not appear as an unselected entry in RP status. + // Use a longer timeout since namespace collection status must propagate before scheduling. + rpStatusUpdatedActual := rpStatusUpdatedActual( + appConfigMapIdentifiers(), + []string{memberCluster1EastProdName, memberCluster2EastCanaryName}, + nil, + "0", + ) + Eventually(rpStatusUpdatedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP status as expected") + }) + + It("should place the ConfigMap on the clusters that have the namespace", func() { + for _, cluster := range []*framework.Cluster{memberCluster1EastProd, memberCluster2EastCanary} { + resourcePlacedActual := workNamespaceAndConfigMapPlacedOnClusterActual(cluster) + Eventually(resourcePlacedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to place resources on cluster %s", cluster.ClusterName) + } + }) + + It("should not place the ConfigMap on the cluster without the namespace", func() { + checkIfRemovedConfigMapFromMemberClusters([]*framework.Cluster{memberCluster3WestProd}) + }) + }) +}) + +// Test scenario (PickN): +// - A CRP with PickFixed policy places the namespace onto 2 out of 3 member clusters. +// - An RP with PickN=2 policy selects a ConfigMap in that namespace. 
+// - The namespace affinity plugin restricts eligible clusters to the 2 that have the namespace, +// and PickN=2 is exactly fulfilled — the RP is considered successful. +var _ = Describe("placing namespaced scoped resources using an RP with PickN policy and namespace affinity", Label("resourceplacement", "namespaceaffinity"), func() { + crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) + rpKey := types.NamespacedName{Name: rpName, Namespace: appNamespace().Name} + + BeforeEach(OncePerOrdered, func() { + // Create the work resources (namespace + configmap) on the hub cluster. + createWorkResources() + + // Create a CRP with PickFixed policy that places the namespace on only 2 of the 3 + // member clusters. The namespace affinity plugin will later use the namespace + // presence information propagated from these 2 clusters to filter RP scheduling. + createNamespaceOnlyCRPForTwoClusters(crpName) + + // Wait until the CRP has placed the namespace on exactly the 2 fixed clusters. + // With PickFixed, non-targeted clusters are not listed as unselected — pass nil. 
+ crpStatusUpdatedActual := crpStatusUpdatedActual( + workNamespaceIdentifiers(), + []string{memberCluster1EastProdName, memberCluster2EastCanaryName}, + nil, + "0", + ) + Eventually(crpStatusUpdatedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") + }) + + AfterEach(OncePerOrdered, func() { + ensureRPAndRelatedResourcesDeleted(rpKey, allMemberClusters) + ensureCRPAndRelatedResourcesDeleted(crpName, allMemberClusters) + }) + + Context("PickN=2 is exactly fulfilled by the clusters that have the namespace", Ordered, func() { + It("should create RP with PickN=2 policy successfully", func() { + rp := &placementv1beta1.ResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: rpName, + Namespace: appNamespace().Name, + Finalizers: []string{customDeletionBlockerFinalizer}, + }, + Spec: placementv1beta1.PlacementSpec{ + ResourceSelectors: configMapSelector(), + Policy: &placementv1beta1.PlacementPolicy{ + PlacementType: placementv1beta1.PickNPlacementType, + NumberOfClusters: ptr.To(int32(2)), + }, + Strategy: placementv1beta1.RolloutStrategy{ + Type: placementv1beta1.RollingUpdateRolloutStrategyType, + RollingUpdate: &placementv1beta1.RollingUpdateConfig{ + UnavailablePeriodSeconds: ptr.To(2), + }, + }, + }, + } + Expect(hubClient.Create(ctx, rp)).To(Succeed(), "Failed to create RP") + }) + + It("should update RP status showing placement only on the 2 clusters with the namespace", func() { + // Namespace affinity restricts eligible clusters to only cluster-1 and cluster-2. + // PickN=2 is exactly fulfilled — Scheduled=True, no unselected entries. 
+ rpStatusUpdatedActual := rpStatusUpdatedActual( + appConfigMapIdentifiers(), + []string{memberCluster1EastProdName, memberCluster2EastCanaryName}, + nil, + "0", + ) + Eventually(rpStatusUpdatedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP status as expected") + }) + + It("should place the ConfigMap on the 2 clusters that have the namespace", func() { + for _, cluster := range []*framework.Cluster{memberCluster1EastProd, memberCluster2EastCanary} { + resourcePlacedActual := workNamespaceAndConfigMapPlacedOnClusterActual(cluster) + Eventually(resourcePlacedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to place resources on cluster %s", cluster.ClusterName) + } + }) + + It("should not place the ConfigMap on the cluster without the namespace", func() { + checkIfRemovedConfigMapFromMemberClusters([]*framework.Cluster{memberCluster3WestProd}) + }) + }) +}) + +// Test scenario (PickN edge case): +// - A CRP with PickFixed policy places the namespace onto only 1 out of 3 member clusters. +// - An RP with PickN=2 policy selects a ConfigMap in that namespace. +// - The namespace affinity plugin restricts eligible clusters to only the 1 that has the namespace. +// - PickN=2 cannot be fulfilled with only 1 eligible cluster, so Scheduled=False but resources are still placed on the eligible cluster. +var _ = Describe("placing namespaced scoped resources using an RP with PickN=2 but namespace only on 1 cluster", Label("resourceplacement", "namespaceaffinity"), func() { + crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) + rpKey := types.NamespacedName{Name: rpName, Namespace: appNamespace().Name} + + BeforeEach(OncePerOrdered, func() { + // Create the work resources (namespace + configmap) on the hub cluster. 
+ createWorkResources() + + // Create a CRP with PickFixed policy that places the namespace on only 1 of the 3 + // member clusters. This creates a scenario where PickN=2 cannot be satisfied. + createNamespaceOnlyCRPForOneCluster(crpName) + + // Wait until the CRP has placed the namespace on exactly the 1 fixed cluster. + crpStatusUpdatedActual := crpStatusUpdatedActual( + workNamespaceIdentifiers(), + []string{memberCluster1EastProdName}, + nil, + "0", + ) + Eventually(crpStatusUpdatedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") + }) + + AfterEach(OncePerOrdered, func() { + ensureRPAndRelatedResourcesDeleted(rpKey, allMemberClusters) + ensureCRPAndRelatedResourcesDeleted(crpName, allMemberClusters) + }) + + Context("PickN=2 cannot be fulfilled by only 1 eligible cluster", Ordered, func() { + It("should create RP with PickN=2 policy successfully", func() { + rp := &placementv1beta1.ResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: rpName, + Namespace: appNamespace().Name, + Finalizers: []string{customDeletionBlockerFinalizer}, + }, + Spec: placementv1beta1.PlacementSpec{ + ResourceSelectors: configMapSelector(), + Policy: &placementv1beta1.PlacementPolicy{ + PlacementType: placementv1beta1.PickNPlacementType, + NumberOfClusters: ptr.To(int32(2)), + }, + Strategy: placementv1beta1.RolloutStrategy{ + Type: placementv1beta1.RollingUpdateRolloutStrategyType, + RollingUpdate: &placementv1beta1.RollingUpdateConfig{ + UnavailablePeriodSeconds: ptr.To(2), + }, + }, + }, + } + Expect(hubClient.Create(ctx, rp)).To(Succeed(), "Failed to create RP") + }) + + It("should update RP status showing Scheduled=False due to insufficient clusters", func() { + // Namespace affinity restricts eligible clusters to only cluster-1. + // PickN=2 cannot be fulfilled with only 1 eligible cluster — Scheduled=False. + // The scheduler places on the 1 available cluster but reports the policy as unfulfilled. 
+ + rpStatusUpdatedActual := func() error { + placement, err := retrievePlacement(rpKey) + if err != nil { + return fmt.Errorf("failed to get placement %s: %w", rpKey, err) + } + + // Build expected status to match actual scheduler behavior: + // - One selected cluster (kind-cluster-1) + // - One unselected entry (for PickN unfulfillment) + // - Overall Scheduled=False because PickN=2 cannot be satisfied + wantStatus := &placementv1beta1.PlacementStatus{ + SelectedResources: appConfigMapIdentifiers(), + ObservedResourceIndex: "0", + PerClusterPlacementStatuses: []placementv1beta1.PerClusterPlacementStatus{ + { + ClusterName: memberCluster1EastProdName, + ObservedResourceIndex: "0", + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.PerClusterAppliedConditionType), + Status: metav1.ConditionTrue, + ObservedGeneration: placement.GetGeneration(), + Reason: "AllWorkHaveBeenApplied", + }, + { + Type: string(placementv1beta1.PerClusterAvailableConditionType), + Status: metav1.ConditionTrue, + ObservedGeneration: placement.GetGeneration(), + Reason: "AllWorkAreAvailable", + }, + { + Type: string(placementv1beta1.PerClusterOverriddenConditionType), + Status: metav1.ConditionTrue, + ObservedGeneration: placement.GetGeneration(), + Reason: "NoOverrideSpecified", + }, + { + Type: string(placementv1beta1.PerClusterRolloutStartedConditionType), + Status: metav1.ConditionTrue, + ObservedGeneration: placement.GetGeneration(), + Reason: "RolloutStarted", + }, + { + Type: string(placementv1beta1.PerClusterScheduledConditionType), + Status: metav1.ConditionTrue, + ObservedGeneration: placement.GetGeneration(), + Reason: "Scheduled", + }, + { + Type: string(placementv1beta1.PerClusterWorkSynchronizedConditionType), + Status: metav1.ConditionTrue, + ObservedGeneration: placement.GetGeneration(), + Reason: "AllWorkSynced", + }, + }, + }, + { + // Unselected entry for PickN policy unfulfillment + Conditions: []metav1.Condition{ + { + Type: 
string(placementv1beta1.PerClusterScheduledConditionType), + Status: metav1.ConditionFalse, + ObservedGeneration: placement.GetGeneration(), + Reason: "ScheduleFailed", + }, + }, + }, + }, + Conditions: []metav1.Condition{ + { + Type: string(placementv1beta1.ResourcePlacementAppliedConditionType), + Status: metav1.ConditionTrue, + ObservedGeneration: placement.GetGeneration(), + Reason: "ApplySucceeded", + }, + { + Type: string(placementv1beta1.ResourcePlacementAvailableConditionType), + Status: metav1.ConditionTrue, + ObservedGeneration: placement.GetGeneration(), + Reason: "ResourceAvailable", + }, + { + Type: string(placementv1beta1.ResourcePlacementOverriddenConditionType), + Status: metav1.ConditionTrue, + ObservedGeneration: placement.GetGeneration(), + Reason: "NoOverrideSpecified", + }, + { + Type: string(placementv1beta1.ResourcePlacementRolloutStartedConditionType), + Status: metav1.ConditionTrue, + ObservedGeneration: placement.GetGeneration(), + Reason: "RolloutStarted", + }, + { + Type: string(placementv1beta1.ResourcePlacementScheduledConditionType), + Status: metav1.ConditionFalse, + ObservedGeneration: placement.GetGeneration(), + Reason: "SchedulingPolicyUnfulfilled", + }, + { + Type: string(placementv1beta1.ResourcePlacementWorkSynchronizedConditionType), + Status: metav1.ConditionTrue, + ObservedGeneration: placement.GetGeneration(), + Reason: "WorkSynchronized", + }, + }, + } + + if diff := cmp.Diff(placement.GetPlacementStatus(), wantStatus, placementStatusCmpOptionsOnCreate...); diff != "" { + return fmt.Errorf("Placement status diff (-got, +want): %s for placement %v", diff, rpKey) + } + return nil + } + Eventually(rpStatusUpdatedActual, workloadEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP status as expected") + }) + + It("should place the ConfigMap on the only 1 cluster that has the namespace", func() { + resourcePlacedActual := workNamespaceAndConfigMapPlacedOnClusterActual(memberCluster1EastProd) + 
Eventually(resourcePlacedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to place resources on cluster %s", memberCluster1EastProd.ClusterName) + }) + + It("should not place the ConfigMap on the clusters without the namespace", func() { + checkIfRemovedConfigMapFromMemberClusters([]*framework.Cluster{memberCluster2EastCanary, memberCluster3WestProd}) + }) + }) +}) diff --git a/test/e2e/resource_placement_negative_cases_test.go b/test/e2e/resource_placement_negative_cases_test.go index c08f7c352..2e1dd0fcc 100644 --- a/test/e2e/resource_placement_negative_cases_test.go +++ b/test/e2e/resource_placement_negative_cases_test.go @@ -149,6 +149,10 @@ var _ = Describe("handling errors and failures gracefully for resource placement Expect(hubClient.Create(ctx, rp)).To(Succeed(), "Failed to create ResourcePlacement") }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should update ResourcePlacement status as expected", func() { Eventually(func() error { rp := &placementv1beta1.ResourcePlacement{} @@ -311,6 +315,10 @@ var _ = Describe("handling errors and failures gracefully for resource placement Expect(hubClient.Create(ctx, rp)).To(Succeed(), "Failed to create ResourcePlacement") }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should update ResourcePlacement status as expected", func() { Eventually(func() error { rp := &placementv1beta1.ResourcePlacement{} @@ -441,6 +449,10 @@ var _ = Describe("handling errors and failures gracefully for resource placement Expect(hubClient.Create(ctx, rp)).To(Succeed(), "Failed to create ResourcePlacement") }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, 
allMemberClusterNames) + }) + It("should update ResourcePlacement status as expected", func() { rpStatusUpdatedActual := func() error { rp := &placementv1beta1.ResourcePlacement{} diff --git a/test/e2e/resource_placement_pickall_test.go b/test/e2e/resource_placement_pickall_test.go index 74830ca5a..8b43cb7e0 100644 --- a/test/e2e/resource_placement_pickall_test.go +++ b/test/e2e/resource_placement_pickall_test.go @@ -53,6 +53,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickAll po }) Context("with no placement policy specified", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("creating the RP should succeed", func() { // Create the RP in the same namespace selecting namespaced resources with no placement policy. rp := &placementv1beta1.ResourcePlacement{ @@ -83,6 +87,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickAll po }) Context("with no affinities specified", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("creating the RP should succeed", func() { // Create the RP in the same namespace selecting namespaced resources. rp := &placementv1beta1.ResourcePlacement{ @@ -116,6 +124,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickAll po }) Context("with affinities, label selector only, updated", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("creating the RP should succeed", func() { // Create the RP in the same namespace selecting namespaced resources. 
rp := &placementv1beta1.ResourcePlacement{ @@ -221,6 +233,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickAll po }) Context("with affinities, label selector only, no matching clusters", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("creating the RP should succeed", func() { // Create the RP in the same namespace selecting namespaced resources. rp := &placementv1beta1.ResourcePlacement{ @@ -278,6 +294,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickAll po }) Context("with affinities, property selector only (node count)", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("creating the RP should succeed", func() { if !isAzurePropertyProviderEnabled { Skip("Skipping this test spec as Azure property provider is not enabled in the test environment") @@ -342,6 +362,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickAll po }) Context("with affinities, property selector only (node count + CPU/memory capacity), updated", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("creating the RP should succeed", func() { if !isAzurePropertyProviderEnabled { Skip("Skipping this test spec as Azure property provider is not enabled in the test environment") @@ -485,6 +509,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickAll po }) Context("with affinities, property selector only (cost + CPU allocatable), no matching clusters", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + 
waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("creating the RP should succeed", func() { if !isAzurePropertyProviderEnabled { Skip("Skipping this test spec as Azure property provider is not enabled in the test environment") @@ -550,6 +578,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickAll po }) Context("with affinities, label and property selectors (node count)", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("creating the RP should succeed", func() { if !isAzurePropertyProviderEnabled { Skip("Skipping this test spec as Azure property provider is not enabled in the test environment") @@ -616,6 +648,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickAll po }) Context("with affinities, label and property selectors (node count, CPU/memory allocatable, memory capacity), updated", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("creating the RP should succeed", func() { if !isAzurePropertyProviderEnabled { Skip("Skipping this test spec as Azure property provider is not enabled in the test environment") @@ -783,6 +819,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickAll po }) Context("with affinities, label and property selectors (cost), no matching clusters", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("creating the RP should succeed", func() { if !isAzurePropertyProviderEnabled { Skip("Skipping this test spec as Azure property provider is not enabled in the test environment") @@ -851,6 +891,10 
@@ var _ = Describe("placing namespaced scoped resources using a RP with PickAll po }) Context("with affinities, label and property selectors (VM size), no matching clusters", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("creating the RP should succeed", func() { if !isAzurePropertyProviderEnabled { Skip("Skipping this test spec as Azure property provider is not enabled in the test environment") diff --git a/test/e2e/resource_placement_pickfixed_test.go b/test/e2e/resource_placement_pickfixed_test.go index 933cc4e69..1b19a6e8a 100644 --- a/test/e2e/resource_placement_pickfixed_test.go +++ b/test/e2e/resource_placement_pickfixed_test.go @@ -73,6 +73,10 @@ var _ = Describe("placing namespaced scoped resources using an RP with PickFixed }) Context("pick some clusters", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create rp with pickFixed policy successfully", func() { // Create the RP in the same namespace selecting namespaced resources. rp := &placementv1beta1.ResourcePlacement{ @@ -112,6 +116,10 @@ var _ = Describe("placing namespaced scoped resources using an RP with PickFixed }) Context("refreshing target clusters", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create an RP with pickFixed policy successfully", func() { // Create the RP in the same namespace selecting namespaced resources. 
rp := &placementv1beta1.ResourcePlacement{ @@ -171,6 +179,10 @@ var _ = Describe("placing namespaced scoped resources using an RP with PickFixed }) Context("pick unhealthy and non-existent clusters", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create RP with pickFixed policy targeting unhealthy and non-existent clusters", func() { // Create the RP in the same namespace selecting namespaced resources. rp := &placementv1beta1.ResourcePlacement{ @@ -211,6 +223,10 @@ var _ = Describe("placing namespaced scoped resources using an RP with PickFixed appConfigMapName := fmt.Sprintf(appConfigMapNameTemplate, GinkgoParallelProcess()) var currentConfigMap corev1.ConfigMap + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create RP with pickFixed policy successfully", func() { // Create the RP in the same namespace selecting namespaced resources. rp := &placementv1beta1.ResourcePlacement{ diff --git a/test/e2e/resource_placement_pickn_test.go b/test/e2e/resource_placement_pickn_test.go index 78d8e4e53..0897ae172 100644 --- a/test/e2e/resource_placement_pickn_test.go +++ b/test/e2e/resource_placement_pickn_test.go @@ -73,6 +73,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickN poli }) Context("picking N clusters with no affinities/topology spread constraints (pick by cluster names in alphanumeric order)", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create rp with pickN policy successfully", func() { // Create the RP in the same namespace selecting namespaced resources. 
rp := &placementv1beta1.ResourcePlacement{ @@ -110,6 +114,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickN poli }) Context("upscaling", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create rp with pickN policy for upscaling test", func() { // Create the RP in the same namespace selecting namespaced resources. rp := &placementv1beta1.ResourcePlacement{ @@ -168,6 +176,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickN poli }) Context("downscaling", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create rp with pickN policy for downscaling test", func() { // Create the RP in the same namespace selecting namespaced resources. rp := &placementv1beta1.ResourcePlacement{ @@ -229,6 +241,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickN poli }) Context("picking N clusters with affinities and topology spread constraints", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create rp with pickN policy and constraints successfully", func() { // Create the RP in the same namespace selecting namespaced resources. 
rp := &placementv1beta1.ResourcePlacement{ @@ -297,6 +313,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickN poli }) Context("affinities and topology spread constraints updated", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create rp with initial constraints", func() { // Create the RP in the same namespace selecting namespaced resources. rp := &placementv1beta1.ResourcePlacement{ @@ -407,6 +427,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickN poli }) Context("not enough clusters to pick", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create rp with pickN policy requesting more clusters than available", func() { // Create the RP in the same namespace selecting namespaced resources. rp := &placementv1beta1.ResourcePlacement{ @@ -471,6 +495,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickN poli }) Context("downscaling to zero", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create rp with pickN policy for downscaling to zero test", func() { // Create the RP in the same namespace selecting namespaced resources. 
rp := &placementv1beta1.ResourcePlacement{ @@ -528,6 +556,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickN poli }) Context("picking N clusters with single property sorter", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create rp with pickN policy and single property sorter", func() { // Have to add this check in each It() spec, instead of using BeforeAll(). // Otherwise, the AfterEach() would be skipped too and the namespace does not get cleaned up. @@ -603,6 +635,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickN poli }) Context("picking N clusters with multiple property sorters", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create rp with pickN policy and multiple property sorters", func() { if !isAzurePropertyProviderEnabled { Skip("Skipping this test spec as Azure property provider is not enabled in the test environment") @@ -685,6 +721,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickN poli }) Context("picking N clusters with label selector and property sorter", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create rp with pickN policy, label selector and property sorter", func() { if !isAzurePropertyProviderEnabled { Skip("Skipping this test spec as Azure property provider is not enabled in the test environment") @@ -777,6 +817,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with PickN poli }) Context("picking N clusters with required and preferred affinity terms", Ordered, 
func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should create rp with pickN policy, required and preferred affinity terms", func() { if !isAzurePropertyProviderEnabled { Skip("Skipping this test spec as Azure property provider is not enabled in the test environment") diff --git a/test/e2e/resource_placement_ro_test.go b/test/e2e/resource_placement_ro_test.go index 492684189..adbaa3c7b 100644 --- a/test/e2e/resource_placement_ro_test.go +++ b/test/e2e/resource_placement_ro_test.go @@ -54,6 +54,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with ResourceOv }) Context("creating resourceOverride (selecting all clusters) to override configMap for ResourcePlacement", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) roName := fmt.Sprintf(roNameTemplate, GinkgoParallelProcess()) workNamespace := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) @@ -222,6 +226,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with ResourceOv }) Context("creating resourceOverride with multiple jsonPatchOverrides to override configMap for ResourcePlacement", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) roName := fmt.Sprintf(roNameTemplate, GinkgoParallelProcess()) workNamespace := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) @@ -298,6 +306,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with ResourceOv }) Context("creating resourceOverride with 
different rules for each cluster to override configMap for ResourcePlacement", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) roName := fmt.Sprintf(roNameTemplate, GinkgoParallelProcess()) workNamespace := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) @@ -412,6 +424,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with ResourceOv }) Context("creating resourceOverride with incorrect path for ResourcePlacement", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) roName := fmt.Sprintf(roNameTemplate, GinkgoParallelProcess()) workNamespace := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) @@ -483,6 +499,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with ResourceOv }) Context("creating resourceOverride and resource becomes invalid after override for ResourcePlacement", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) roName := fmt.Sprintf(roNameTemplate, GinkgoParallelProcess()) workNamespace := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) @@ -580,6 +600,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with ResourceOv }) Context("creating resourceOverride with templated rules with cluster name to override configMap for ResourcePlacement", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { 
+ waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) roName := fmt.Sprintf(roNameTemplate, GinkgoParallelProcess()) workNamespace := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) @@ -678,6 +702,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with ResourceOv }) Context("creating resourceOverride with delete configMap for ResourcePlacement", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) roName := fmt.Sprintf(roNameTemplate, GinkgoParallelProcess()) workNamespace := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) @@ -786,6 +814,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with ResourceOv }) Context("creating resourceOverride with templated rules with cluster label key replacement for ResourcePlacement", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) roName := fmt.Sprintf(roNameTemplate, GinkgoParallelProcess()) workNamespace := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) @@ -943,6 +975,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with ResourceOv }) Context("creating resourceOverride with non-exist label for ResourcePlacement", Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) roName := fmt.Sprintf(roNameTemplate, 
GinkgoParallelProcess()) workNamespace := fmt.Sprintf(workNamespaceNameTemplate, GinkgoParallelProcess()) diff --git a/test/e2e/resource_placement_rollout_test.go b/test/e2e/resource_placement_rollout_test.go index d780e2d12..0c405e3c8 100644 --- a/test/e2e/resource_placement_rollout_test.go +++ b/test/e2e/resource_placement_rollout_test.go @@ -134,6 +134,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with rollout", Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") }) + It("should wait for namespace collection on all member clusters", func() { + waitForNamespaceCollectionOnClusters(workNamespace.Name, allMemberClusterNames) + }) + It("Create the RP that select the enveloped objects", func() { rp := &placementv1beta1.ResourcePlacement{ ObjectMeta: metav1.ObjectMeta{ @@ -253,6 +257,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with rollout", Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") }) + It("should wait for namespace collection on all member clusters", func() { + waitForNamespaceCollectionOnClusters(workNamespace.Name, allMemberClusterNames) + }) + It("create the RP that select the deployment", func() { rp := buildRPForSafeRollout(workNamespace.Name) rp.Spec.ResourceSelectors = []placementv1beta1.ResourceSelectorTerm{ @@ -332,6 +340,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with rollout", Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") }) + It("should wait for namespace collection on all member clusters", func() { + waitForNamespaceCollectionOnClusters(workNamespace.Name, allMemberClusterNames) + }) + It("create the RP that select the enveloped daemonset", func() { rp := buildRPForSafeRollout(workNamespace.Name) 
rp.Spec.ResourceSelectors = []placementv1beta1.ResourceSelectorTerm{ @@ -416,6 +428,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with rollout", Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") }) + It("should wait for namespace collection on all member clusters", func() { + waitForNamespaceCollectionOnClusters(workNamespace.Name, allMemberClusterNames) + }) + It("create the RP that select the enveloped statefulset", func() { rp := buildRPForSafeRollout(workNamespace.Name) rp.Spec.ResourceSelectors = []placementv1beta1.ResourceSelectorTerm{ @@ -499,6 +515,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with rollout", Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") }) + It("should wait for namespace collection on all member clusters", func() { + waitForNamespaceCollectionOnClusters(workNamespace.Name, allMemberClusterNames) + }) + It("create the RP that select the service", func() { rp := buildRPForSafeRollout(workNamespace.Name) rp.Spec.ResourceSelectors = []placementv1beta1.ResourceSelectorTerm{ @@ -579,6 +599,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with rollout", Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") }) + It("should wait for namespace collection on all member clusters", func() { + waitForNamespaceCollectionOnClusters(workNamespace.Name, allMemberClusterNames) + }) + It("create the RP that select the deployment", func() { rp := buildRPForSafeRollout(workNamespace.Name) rp.Spec.RevisionHistoryLimit = ptr.To(int32(1)) @@ -743,6 +767,10 @@ var _ = Describe("placing namespaced scoped resources using a RP with rollout", Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to 
update CRP status as expected") }) + It("should wait for namespace collection on all member clusters", func() { + waitForNamespaceCollectionOnClusters(workNamespace.Name, allMemberClusterNames) + }) + It("create the RP that select the job", func() { rp := buildRPForSafeRollout(workNamespace.Name) // the job we are trying to propagate takes 10s to complete. MaxUnavailable is set to 1. So setting UnavailablePeriodSeconds to 15s diff --git a/test/e2e/resource_placement_scheduler_watchers_test.go b/test/e2e/resource_placement_scheduler_watchers_test.go index 056e50eb1..96c7ef954 100644 --- a/test/e2e/resource_placement_scheduler_watchers_test.go +++ b/test/e2e/resource_placement_scheduler_watchers_test.go @@ -123,6 +123,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should place resources on all member clusters", checkIfPlacedWorkResourcesOnAllMemberClusters) It("rp should pick only healthy clusters in the system", func() { @@ -201,6 +205,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should propagate works for the new cluster; can mark them as available", func() { verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, crpName, workNamespaceIdentifiers()) }) @@ -287,6 +295,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, 
allMemberClusterNames) + }) + It("rp should not select any clusters", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), nil, nil, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Should not select any cluster") @@ -373,6 +385,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should place resources on all member clusters", checkIfPlacedWorkResourcesOnAllMemberClusters) It("rp should pick only healthy clusters in the system", func() { @@ -451,6 +467,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should propagate works for the new cluster; can mark them as applied", func() { verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, crpName, workNamespaceIdentifiers()) verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, rpName, appConfigMapIdentifiers()) @@ -519,6 +539,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should propagate works for the new cluster; can mark them as applied", func() { verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, crpName, workNamespaceIdentifiers()) verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, rpName, 
appConfigMapIdentifiers()) @@ -578,6 +602,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("rp should propagate works for the new cluster; can mark them as applied", func() { verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, crpName, workNamespaceIdentifiers()) verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, rpName, appConfigMapIdentifiers()) @@ -663,6 +691,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("rp should pick all clusters", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), allMemberClusterNames, nil, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Should select all clusters") @@ -744,6 +776,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should report in RP status that the cluster is not available", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), nil, []string{fakeClusterName1ForWatcherTests}, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP status as expected") @@ -805,6 +841,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label 
Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should report in RP status that the cluster is not available", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), nil, []string{fakeClusterName1ForWatcherTests}, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP status as expected") @@ -863,6 +903,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should propagate works for the new cluster; can mark them as applied", func() { verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, crpName, workNamespaceIdentifiers()) verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, rpName, appConfigMapIdentifiers()) @@ -922,6 +966,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should propagate works for the new cluster; can mark them as applied", func() { verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, crpName, workResourceIdentifiers()) verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, rpName, appConfigMapIdentifiers()) @@ -975,6 +1023,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all 
member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should place resources on all member clusters", checkIfPlacedWorkResourcesOnAllMemberClusters) It("rp should pick only healthy clusters in the system", func() { @@ -1054,6 +1106,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("rp should not pick any cluster", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), nil, []string{fakeClusterName1ForWatcherTests}, "0") Eventually(rpStatusUpdatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Should not select any cluster") @@ -1137,6 +1193,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("rp should not pick any cluster", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), nil, []string{fakeClusterName1ForWatcherTests}, "0") Eventually(rpStatusUpdatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Should not select any cluster") @@ -1197,6 +1257,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should place resources on all member clusters", checkIfPlacedWorkResourcesOnAllMemberClusters) It("rp should pick only healthy clusters in the system", func() { @@ -1294,6 
+1358,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("rp should not pick any cluster", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), nil, []string{memberCluster3WestProdName}, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Should not select any cluster") @@ -1397,6 +1465,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should place resources on all member clusters", checkIfPlacedWorkResourcesOnAllMemberClusters) It("should pick only healthy clusters in the system", func() { @@ -1452,6 +1524,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should propagate works for the new cluster; can mark them as applied", func() { verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, crpName, workNamespaceIdentifiers()) verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, rpName, appConfigMapIdentifiers()) @@ -1537,6 +1613,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, 
allMemberClusterNames) + }) + It("should propagate works for the new cluster; can mark them as applied", func() { verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, crpName, workNamespaceIdentifiers()) verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, rpName, appConfigMapIdentifiers()) @@ -1618,6 +1698,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should propagate works for the new cluster; can mark them as applied", func() { verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, crpName, workNamespaceIdentifiers()) verifyWorkPropagationAndMarkAsAvailable(fakeClusterName1ForWatcherTests, rpName, appConfigMapIdentifiers()) @@ -1685,6 +1769,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("should place resources on all real member clusters", checkIfPlacedWorkResourcesOnAllMemberClusters) It("should propagate works for both new clusters; can mark them as applied", func() { @@ -1780,6 +1868,10 @@ var _ = Describe("responding to specific member cluster changes using RP", Label Expect(hubClient.Create(ctx, rp)).To(Succeed()) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(nsName, allMemberClusterNames) + }) + It("rp should pick one cluster", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), []string{memberCluster3WestProdName}, nil, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, 
eventuallyInterval).Should(Succeed(), "Should not select any cluster") diff --git a/test/e2e/resource_placement_selecting_resources_test.go b/test/e2e/resource_placement_selecting_resources_test.go index 8083da339..17ed11c34 100644 --- a/test/e2e/resource_placement_selecting_resources_test.go +++ b/test/e2e/resource_placement_selecting_resources_test.go @@ -82,6 +82,10 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: appNamespace().Name}, allMemberClusters) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should update RP status as expected", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), allMemberClusterNames, nil, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP %s status as expected", rpName) @@ -150,6 +154,10 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: appNamespace().Name}, allMemberClusters) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should update RP status as expected", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), allMemberClusterNames, nil, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP %s status as expected", rpName) @@ -217,6 +225,10 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: appNamespace().Name}, allMemberClusters) }) + 
It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should update RP status as expected", func() { rpStatusUpdatedActual := rpStatusUpdatedActual([]placementv1beta1.ResourceIdentifier{}, allMemberClusterNames, nil, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP %s status as expected", rpName) @@ -309,6 +321,10 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: appNamespace().Name}, allMemberClusters) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should update RP status as expected", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), allMemberClusterNames, nil, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP %s status as expected", rpName) @@ -389,6 +405,10 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: appNamespace().Name}, allMemberClusters) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should update RP status as expected", func() { // The configMap should not be selected. 
rpStatusUpdatedActual := rpStatusUpdatedActual([]placementv1beta1.ResourceIdentifier{}, allMemberClusterNames, nil, "0") @@ -451,6 +471,10 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: appNamespace().Name}, allMemberClusters) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should update RP status as expected", func() { Eventually(func() error { gotRP := &placementv1beta1.ResourcePlacement{} @@ -534,6 +558,10 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: appNamespace().Name}, allMemberClusters) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should update RP status as expected", func() { rpStatusUpdatedActual := func() error { rp := &placementv1beta1.ResourcePlacement{} @@ -666,6 +694,10 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: workNamespace}, allMemberClusters) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should update RP status as expected", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(nil, allMemberClusterNames, nil, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP %s status as expected", rpName) @@ -759,6 +791,10 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f 
ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: workNamespace}, allMemberClusters) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should update RP status as expected", func() { rpStatusUpdatedActual := rpStatusUpdatedActual(nil, allMemberClusterNames, nil, "0") Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP %s status as expected", rpName) @@ -868,6 +904,10 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: workNamespace}, allMemberClusters, secrets...) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("check if created resource snapshots are as expected", func() { Eventually(multipleRPResourceSnapshotsCreatedActual("2", "2", "0"), largeEventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to check created resource snapshots", rpName) }) @@ -1011,6 +1051,10 @@ var _ = Describe("testing RP selecting resources", Label("resourceplacement"), f ensureRPAndRelatedResourcesDeleted(types.NamespacedName{Name: rpName, Namespace: nsName}, allMemberClusters, configMap, secret, pvc, role) }) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + It("should update RP status with the correct order of the selected resources", func() { // Define the expected resources in order. 
// Note: PVCs are not propagated, so they should not appear in selected resources diff --git a/test/e2e/resource_placement_taint_toleration_test.go b/test/e2e/resource_placement_taint_toleration_test.go index c2bd145da..e4ec5bbd5 100644 --- a/test/e2e/resource_placement_taint_toleration_test.go +++ b/test/e2e/resource_placement_taint_toleration_test.go @@ -51,6 +51,10 @@ var _ = Describe("validating RP for taint and toleration features", Label("resou }) Describe("placing resource using a resource placement with pickFixed placement policy specified, taint clusters, pick all specified clusters", Serial, Ordered, func() { + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { // Create the resources. createConfigMap() @@ -101,6 +105,10 @@ var _ = Describe("validating RP for taint and toleration features", Label("resou var taintClusterNames, noTaintClusterNames []string var taintClusters, noTaintClusters []*framework.Cluster + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { taintClusterNames = []string{memberCluster1EastProdName, memberCluster2EastCanaryName} taintClusters = []*framework.Cluster{memberCluster1EastProd, memberCluster2EastCanary} @@ -174,6 +182,10 @@ var _ = Describe("validating RP for taint and toleration features", Label("resou var taintClusterNames, noTaintClusterNames []string var taintClusters, noTaintClusters []*framework.Cluster + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { taintClusterNames = []string{memberCluster1EastProdName, memberCluster2EastCanaryName} taintClusters = []*framework.Cluster{memberCluster1EastProd, 
memberCluster2EastCanary} @@ -245,6 +257,10 @@ var _ = Describe("validating RP for taint and toleration features", Label("resou var taintClusterNames, tolerateClusterNames, unSelectedClusterNames []string var tolerateClusters, unSelectedClusters []*framework.Cluster + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { taintClusterNames = []string{memberCluster1EastProdName, memberCluster2EastCanaryName} @@ -341,6 +357,10 @@ var _ = Describe("validating RP for taint and toleration features", Label("resou Describe("picking all clusters using pickAll placement policy, add taint to a cluster that's already selected", Serial, Ordered, func() { taintClusterNames := []string{memberCluster1EastProdName} + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { // Create the resources. 
createConfigMap() diff --git a/test/e2e/resource_placement_with_custom_config_test.go b/test/e2e/resource_placement_with_custom_config_test.go index b9d713bcc..b632fe47b 100644 --- a/test/e2e/resource_placement_with_custom_config_test.go +++ b/test/e2e/resource_placement_with_custom_config_test.go @@ -71,6 +71,9 @@ var _ = Describe("validating RP when using customized resourceSnapshotCreationMi crpStatusUpdatedActual := crpStatusUpdatedActual(workNamespaceIdentifiers(), allMemberClusterNames, nil, "0") Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP status as expected") + By("waiting for namespace collection to sync on all member clusters") + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + By("creating RP") rp := &placementv1beta1.ResourcePlacement{ ObjectMeta: metav1.ObjectMeta{ diff --git a/test/e2e/setup.sh b/test/e2e/setup.sh index a1ecad710..bc1dcf9b7 100755 --- a/test/e2e/setup.sh +++ b/test/e2e/setup.sh @@ -216,6 +216,7 @@ do --set workApplierRequeueRateLimiterMaxFastBackoffDelaySeconds=5 \ --set propertyProvider=$PROPERTY_PROVIDER \ --set region=${REGIONS[$i]} \ + --set enableNamespaceCollectionInPropertyProvider=true \ $( [ "$PROPERTY_PROVIDER" = "azure" ] && echo "-f azure_valid_config.yaml" ) else helm install member-agent ../../charts/member-agent/ \ @@ -233,6 +234,7 @@ do --set workApplierRequeueRateLimiterMaxSlowBackoffDelaySeconds=5 \ --set workApplierRequeueRateLimiterMaxFastBackoffDelaySeconds=5 \ --set propertyProvider=$PROPERTY_PROVIDER \ + --set enableNamespaceCollectionInPropertyProvider=true \ $( [ "$PROPERTY_PROVIDER" = "azure" ] && echo "-f azure_valid_config.yaml" ) fi done diff --git a/test/e2e/staged_updaterun_test.go b/test/e2e/staged_updaterun_test.go index 3b8b52484..2ec6c4099 100644 --- a/test/e2e/staged_updaterun_test.go +++ b/test/e2e/staged_updaterun_test.go @@ -65,6 +65,10 @@ var _ = Describe("test RP rollout with staged update 
run", Label("resourceplacem var strategy *placementv1beta1.StagedUpdateStrategy var oldConfigMap, newConfigMap corev1.ConfigMap + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP with external rollout strategy. rp := &placementv1beta1.ResourcePlacement{ @@ -245,6 +249,10 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem var strategy *placementv1beta1.StagedUpdateStrategy var oldConfigMap, newConfigMap corev1.ConfigMap + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP with external rollout strategy. rp := &placementv1beta1.ResourcePlacement{ @@ -472,6 +480,10 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem var strategy *placementv1beta1.StagedUpdateStrategy updateRunNames := []string{} + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP with external rollout strategy and pick fixed policy. rp := &placementv1beta1.ResourcePlacement{ @@ -679,6 +691,12 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem updateRunNames := []string{} BeforeAll(func() { + // Ensure the namespace collection is synced on all member clusters before creating the RP. + // We expect kind-cluster-3 to be picked by PickN with N=1 policy due to the alphabetical order of cluster names. + // If kind-cluster-3 does not have namespace synced during schedule time, the RP might pick kind-cluster-1 or kind-cluster-2. 
+ By("should wait for namespace collection to sync on all member clusters") + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + // Create the RP with external rollout strategy and pick N=1 policy. rp := &placementv1beta1.ResourcePlacement{ ObjectMeta: metav1.ObjectMeta{ @@ -889,6 +907,10 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem var wantROs map[string][]placementv1beta1.NamespacedName var wantROAnnotations map[string]string + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { // Create the ro before rp so that the observed resource index is predictable. ro := &placementv1beta1.ResourceOverride{ @@ -1047,6 +1069,10 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem var applyStrategy *placementv1beta1.ApplyStrategy updateRunName := fmt.Sprintf(stagedUpdateRunNameWithSubIndexTemplate, GinkgoParallelProcess(), 0) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP with external rollout strategy, pickAll policy and reportDiff apply strategy. applyStrategy = &placementv1beta1.ApplyStrategy{ @@ -1142,6 +1168,10 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem updateRunName := fmt.Sprintf(stagedUpdateRunNameWithSubIndexTemplate, GinkgoParallelProcess(), 0) var oldConfigMap, newConfigMap corev1.ConfigMap + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP with rollingUpdate strategy initially. 
rp := &placementv1beta1.ResourcePlacement{ @@ -1258,6 +1288,10 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem var strategy *placementv1beta1.StagedUpdateStrategy updateRunName := fmt.Sprintf(stagedUpdateRunNameWithSubIndexTemplate, GinkgoParallelProcess(), 0) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP with external rollout strategy. rp := &placementv1beta1.ResourcePlacement{ @@ -1350,6 +1384,10 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem var strategy *placementv1beta1.StagedUpdateStrategy updateRunName := fmt.Sprintf(stagedUpdateRunNameWithSubIndexTemplate, GinkgoParallelProcess(), 0) + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP with external rollout strategy. rp := &placementv1beta1.ResourcePlacement{ @@ -1444,6 +1482,10 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem updateRunNames := []string{} var strategy *placementv1beta1.StagedUpdateStrategy + It("should wait for namespace collection to sync on all member clusters", func() { + waitForNamespaceCollectionOnClusters(appNamespace().Name, allMemberClusterNames) + }) + BeforeAll(func() { // Create the RP with external rollout strategy. 
rp := &placementv1beta1.ResourcePlacement{ diff --git a/test/e2e/utils_test.go b/test/e2e/utils_test.go index 66c6bca86..49a41f756 100644 --- a/test/e2e/utils_test.go +++ b/test/e2e/utils_test.go @@ -519,6 +519,11 @@ func summarizeAKSClusterProperties(memberCluster *framework.Cluster, mcObj *clus Status: metav1.ConditionTrue, ObservedGeneration: mcObj.Generation, }, + { + Type: propertyprovider.NamespaceCollectionSucceededCondType, + Status: metav1.ConditionTrue, + ObservedGeneration: mcObj.Generation, + }, }, } @@ -836,6 +841,70 @@ func createNamespace() { }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to create namespace %s", ns.Name) } +// isNamespaceCollectionEnabled checks (without waiting) whether namespace collection +// is currently enabled on the given cluster. Returns false if the cluster cannot be +// fetched or the condition is absent. +func isNamespaceCollectionEnabled(clusterName string) bool { + mc := &clusterv1beta1.MemberCluster{} + if err := hubClient.Get(ctx, types.NamespacedName{Name: clusterName}, mc); err != nil { + return false + } + for _, cond := range mc.Status.Conditions { + if cond.Type == propertyprovider.NamespaceCollectionSucceededCondType { + return true + } + } + return false +} + +// waitForNamespaceCollectionOnClusters waits for the specified namespace to appear in +// MemberCluster.Status.Namespaces for all specified clusters. This ensures namespace collection +// has synced before creating ResourcePlacements, avoiding race conditions where the scheduler +// might filter out clusters before they report having the namespace. +// +// This is a no-op when namespace collection is not enabled (i.e. no property provider is +// configured), preserving backward compatibility for tests that run without PROPERTY_PROVIDER=azure. 
+// +// This is particularly important when: +// - CRPs dynamically create namespaces on member clusters +// - ResourcePlacements need to use those newly created namespaces +// - Namespace affinity plugin filtering is enabled +func waitForNamespaceCollectionOnClusters(namespaceName string, clusterNames []string) { + GinkgoHelper() + + // Skip waiting entirely when namespace collection is not enabled on any cluster. + // The namespace affinity plugin skips filtering in this case (backward compatibility), + // so there is no race to protect against. + // Note: In the test environment, namespace collection is configured in an all-or-nothing + // manner across all clusters, so checking the first cluster is sufficient. + if len(clusterNames) == 0 || !isNamespaceCollectionEnabled(clusterNames[0]) { + return + } + + // Wait for the specific namespace to appear in the collection on every cluster. + for _, clusterName := range clusterNames { + Eventually(func() error { + mc := &clusterv1beta1.MemberCluster{} + if err := hubClient.Get(ctx, types.NamespacedName{Name: clusterName}, mc); err != nil { + return fmt.Errorf("failed to get member cluster %s: %w", clusterName, err) + } + + // Check if the namespace exists in the status + if mc.Status.Namespaces == nil { + return fmt.Errorf("cluster %s has namespace collection enabled but Status.Namespaces is nil", clusterName) + } + + if _, exists := mc.Status.Namespaces[namespaceName]; !exists { + return fmt.Errorf("namespace %s not yet collected on cluster %s (has %d namespaces)", + namespaceName, clusterName, len(mc.Status.Namespaces)) + } + + return nil + }, longEventuallyDuration, eventuallyInterval).Should(Succeed(), + "Failed to wait for namespace %s to be collected on cluster %s", namespaceName, clusterName) + } +} + +func createConfigMap() { + configMap := appConfigMap() + Expect(hubClient.Create(ctx, &configMap)).To(Succeed(), "Failed to create config map %s", configMap.Name) @@ -1059,17 +1128,11 @@ func 
checkIfRemovedConfigMapFromMemberClustersConsistently(clusters []*framework // createCRPWithSelectors creates a ClusterResourcePlacement with the given selectors. func createCRPWithSelectors(crpName string, selectors []placementv1beta1.ResourceSelectorTerm) { - crp := &placementv1beta1.ClusterResourcePlacement{ - ObjectMeta: metav1.ObjectMeta{ - Name: crpName, - Finalizers: []string{customDeletionBlockerFinalizer}, - }, - Spec: placementv1beta1.PlacementSpec{ - ResourceSelectors: selectors, - }, - } - By(fmt.Sprintf("creating CRP %s", crpName)) - Expect(hubClient.Create(ctx, crp)).To(Succeed(), "Failed to create CRP %s", crpName) + // Use default strategy (empty strategy will use API defaults) + defaultStrategy := placementv1beta1.RolloutStrategy{} + + // Use the generic CRP creator with custom selectors, nil policy, and default strategy + createGenericCRP(crpName, selectors, nil, defaultStrategy) } // checkNamespacePlacedOnClusters verifies a namespace is placed on all member clusters. @@ -1740,31 +1803,25 @@ func createRPWithApplyStrategy(rpNamespace, rpName string, applyStrategy *placem // createCRPWithApplyStrategy creates a ClusterResourcePlacement with the given name and apply strategy. func createCRPWithApplyStrategy(crpName string, applyStrategy *placementv1beta1.ApplyStrategy, resourceSelectors []placementv1beta1.ResourceSelectorTerm) { - crp := &placementv1beta1.ClusterResourcePlacement{ - ObjectMeta: metav1.ObjectMeta{ - Name: crpName, - // Add a custom finalizer; this would allow us to better observe - // the behavior of the controllers. 
- Finalizers: []string{customDeletionBlockerFinalizer}, - }, - Spec: placementv1beta1.PlacementSpec{ - ResourceSelectors: workResourceSelector(), - Strategy: placementv1beta1.RolloutStrategy{ - Type: placementv1beta1.RollingUpdateRolloutStrategyType, - RollingUpdate: &placementv1beta1.RollingUpdateConfig{ - UnavailablePeriodSeconds: ptr.To(2), - }, - }, + // Build the strategy with optional apply strategy + strategy := placementv1beta1.RolloutStrategy{ + Type: placementv1beta1.RollingUpdateRolloutStrategyType, + RollingUpdate: &placementv1beta1.RollingUpdateConfig{ + UnavailablePeriodSeconds: ptr.To(2), }, } if applyStrategy != nil { - crp.Spec.Strategy.ApplyStrategy = applyStrategy + strategy.ApplyStrategy = applyStrategy } + + // Use workResourceSelector as default if no selectors provided + selectors := workResourceSelector() if resourceSelectors != nil { - crp.Spec.ResourceSelectors = resourceSelectors + selectors = resourceSelectors } - By(fmt.Sprintf("creating placement %s", crpName)) - Expect(hubClient.Create(ctx, crp)).To(Succeed(), "Failed to create CRP %s", crpName) + + // Use the generic CRP creator with nil placement policy (defaults to PickAll) + createGenericCRP(crpName, selectors, nil, strategy) } // createRP creates a ResourcePlacement with the given name. 
@@ -1782,6 +1839,56 @@ func createNamespaceOnlyCRP(crpName string) { createCRPWithApplyStrategy(crpName, nil, namespaceOnlySelector()) } +// createGenericCRP creates a CRP with full customization as suggested by michaelawyu +// This provides the most flexible CRP creation utility for E2E tests +func createGenericCRP(name string, resourceSelectors []placementv1beta1.ResourceSelectorTerm, placementPolicy *placementv1beta1.PlacementPolicy, strategy placementv1beta1.RolloutStrategy) { + crp := &placementv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Finalizers: []string{customDeletionBlockerFinalizer}, + }, + Spec: placementv1beta1.PlacementSpec{ + ResourceSelectors: resourceSelectors, + Policy: placementPolicy, + Strategy: strategy, + }, + } + By(fmt.Sprintf("creating placement %s", name)) + Expect(hubClient.Create(ctx, crp)).To(Succeed(), "Failed to create CRP %s", name) +} + +// createNamespaceOnlyCRPWithFixedClusters creates a CRP with PickFixed policy that places +// the namespace on the specified clusters. Uses the new generic CRP creator. +func createNamespaceOnlyCRPWithFixedClusters(crpName string, clusterNames []string) { + policy := &placementv1beta1.PlacementPolicy{ + PlacementType: placementv1beta1.PickFixedPlacementType, + ClusterNames: clusterNames, + } + strategy := placementv1beta1.RolloutStrategy{ + Type: placementv1beta1.RollingUpdateRolloutStrategyType, + RollingUpdate: &placementv1beta1.RollingUpdateConfig{ + UnavailablePeriodSeconds: ptr.To(2), + }, + } + createGenericCRP(crpName, namespaceOnlySelector(), policy, strategy) +} + +// createNamespaceOnlyCRPForTwoClusters creates a CRP with PickFixed policy that places the +// namespace on cluster-1 and cluster-2 only. Used by namespace affinity e2e tests to set up +// a state where only 2 out of 3 member clusters have the target namespace. 
+func createNamespaceOnlyCRPForTwoClusters(crpName string) { + createNamespaceOnlyCRPWithFixedClusters(crpName, []string{ + memberCluster1EastProdName, + memberCluster2EastCanaryName, + }) +} + +func createNamespaceOnlyCRPForOneCluster(crpName string) { + createNamespaceOnlyCRPWithFixedClusters(crpName, []string{ + memberCluster1EastProdName, + }) +} + // ensureClusterStagedUpdateRunDeletion deletes the cluster staged update run with the given name and checks all related cluster approval requests are also deleted. func ensureClusterStagedUpdateRunDeletion(updateRunName string) { updateRun := &placementv1beta1.ClusterStagedUpdateRun{ From 8e938ae191cb81e4b1293b0574f8ff28d652948e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 26 Mar 2026 19:44:39 +1100 Subject: [PATCH 10/15] chore: bump azure/setup-helm from 4 to 5 (#551) Bumps [azure/setup-helm](https://github.com/azure/setup-helm) from 4 to 5. - [Release notes](https://github.com/azure/setup-helm/releases) - [Changelog](https://github.com/Azure/setup-helm/blob/main/CHANGELOG.md) - [Commits](https://github.com/azure/setup-helm/compare/v4...v5) --- updated-dependencies: - dependency-name: azure/setup-helm dependency-version: '5' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/code-lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-lint.yml b/.github/workflows/code-lint.yml index deeced9ec..c6e62b24a 100644 --- a/.github/workflows/code-lint.yml +++ b/.github/workflows/code-lint.yml @@ -79,7 +79,7 @@ jobs: uses: actions/checkout@v6.0.2 - name: Set up Helm - uses: azure/setup-helm@v4 + uses: azure/setup-helm@v5 with: version: v3.17.0 From f061d92493656db36d8037a10367489d8000089d Mon Sep 17 00:00:00 2001 From: michaelawyu Date: Thu, 26 Mar 2026 19:45:25 +1100 Subject: [PATCH 11/15] feat: check in performance test related code/utilities/scripts (1 of N) (#529) --- hack/perftest/1000stagedupdateruns/README.md | 33 ++++ .../1000stagedupdateruns/example-run.yaml | 8 + .../example-strategy.yaml | 21 +++ hack/perftest/1000stagedupdateruns/main.go | 96 ++++++++++ .../1000stagedupdateruns/utils/cleanup.go | 114 ++++++++++++ .../1000stagedupdateruns/utils/latency.go | 53 ++++++ .../1000stagedupdateruns/utils/poll.go | 86 +++++++++ .../1000stagedupdateruns/utils/run.go | 86 +++++++++ .../1000stagedupdateruns/utils/setup.go | 121 +++++++++++++ .../1000stagedupdateruns/utils/strategy.go | 61 +++++++ .../1000stagedupdateruns/utils/update.go | 164 ++++++++++++++++++ 11 files changed, 843 insertions(+) create mode 100644 hack/perftest/1000stagedupdateruns/README.md create mode 100644 hack/perftest/1000stagedupdateruns/example-run.yaml create mode 100644 hack/perftest/1000stagedupdateruns/example-strategy.yaml create mode 100644 hack/perftest/1000stagedupdateruns/main.go create mode 100644 hack/perftest/1000stagedupdateruns/utils/cleanup.go create mode 100644 hack/perftest/1000stagedupdateruns/utils/latency.go create mode 100644 hack/perftest/1000stagedupdateruns/utils/poll.go create mode 100644 hack/perftest/1000stagedupdateruns/utils/run.go create mode 100644 
hack/perftest/1000stagedupdateruns/utils/setup.go create mode 100644 hack/perftest/1000stagedupdateruns/utils/strategy.go create mode 100644 hack/perftest/1000stagedupdateruns/utils/update.go diff --git a/hack/perftest/1000stagedupdateruns/README.md b/hack/perftest/1000stagedupdateruns/README.md new file mode 100644 index 000000000..cb1d1b0cf --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/README.md @@ -0,0 +1,33 @@ +# KubeFleet Performance/Scalability Test Utility: Creating 1K Staged Update Runs Concurrently + +This directory contains a utility program that creates 1K staged update runs that run concurrently and polls +until their full completion. + +The program is added for the purpose of testing the performance and scalability of KubeFleet. + +## Before you begin + +* Set up 1K placements using the utility program in `../1000placements` before running this utility program. +* Make sure that all placements have been updated to use the `External` update strategy (via the staged update run APIs). +* Make sure that all member clusters are labelled with `env=[canary|staging|prod]` as appropriate so that KubeFleet +can assign them to their respective stages. + +> If you have followed the instructions in `../../README.md` and have used the given scripts and utility programs +> to run the performance/scalability test, all the steps above should have been done for you already. + +## Running the utility program + +Run the commands below to run the utility program: + +```bash +go run main.go +``` + +With the default setup, it might take 1-2 hours before all 1K staged update runs are fully completed. The program +reports the progress in the output. 
+ +After the program completes, run the commands below to clean up the created 1K staged update runs: + +```bash +CLEANUP=set go run main.go +``` diff --git a/hack/perftest/1000stagedupdateruns/example-run.yaml b/hack/perftest/1000stagedupdateruns/example-run.yaml new file mode 100644 index 000000000..94da747ba --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/example-run.yaml @@ -0,0 +1,8 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterStagedUpdateRun +metadata: + name: run-0 +spec: + placementName: crp-0 + stagedRolloutStrategyName: staging-canary-prod + state: Run diff --git a/hack/perftest/1000stagedupdateruns/example-strategy.yaml b/hack/perftest/1000stagedupdateruns/example-strategy.yaml new file mode 100644 index 000000000..af500455a --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/example-strategy.yaml @@ -0,0 +1,21 @@ +apiVersion: placement.kubernetes-fleet.io/v1beta1 +kind: ClusterStagedUpdateStrategy +metadata: + name: "staging-canary-prod" +spec: + stages: + - name: staging + labelSelector: + matchLabels: + env: staging + maxConcurrency: 25% + - name: canary + labelSelector: + matchLabels: + env: canary + maxConcurrency: 25% + - name: prod + labelSelector: + matchLabels: + env: prod + maxConcurrency: 25% diff --git a/hack/perftest/1000stagedupdateruns/main.go b/hack/perftest/1000stagedupdateruns/main.go new file mode 100644 index 000000000..9572614f7 --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/main.go @@ -0,0 +1,96 @@ +package main + +import ( + "context" + "os" + "time" + + "k8s.io/apimachinery/pkg/util/wait" + + "github.com/kubefleet-dev/kubefleet/hack/perftest/1000stagedupdateruns/utils" +) + +const ( + // The number of workers to create/delete resources concurrently. + resourceSetupWorkerCount = 15 + // The number of workers to long poll staged update runs concurrently. + longPollingWorkerCount = 15 + // The cool down period between two stages (e.g., creating resources and long polling them). 
+ betweenStageCoolDownPeriod = time.Second * 30 + // The cool down period between two polls. + longPollingCoolDownPeriod = time.Second * 45 + + // The max concurrency per stage setting for each staged update run. + maxConcurrencyPerStage = "50%" + // The number of placements (and thus the number of staged update runs) to create. + maxCRPToUpdateCount = 1000 +) + +var ( + retryOpsBackoff = wait.Backoff{ + Steps: 4, + Duration: 4 * time.Second, + Factor: 2.0, + Jitter: 0.1, + } +) + +func main() { + ctx := context.Background() + + // Read the arguments. + doCleanUp := false + cleanUpFlag := os.Getenv("CLEANUP") + if len(cleanUpFlag) != 0 { + doCleanUp = true + } + + runner := utils.New( + resourceSetupWorkerCount, + longPollingWorkerCount, + betweenStageCoolDownPeriod, + longPollingCoolDownPeriod, + maxConcurrencyPerStage, + maxCRPToUpdateCount, + retryOpsBackoff, + ) + + if doCleanUp { + runner.CleanUp(ctx) + return + } + + // Prepare the staged update run strategy. + println("Preparing the staged update run strategy...") + runner.CreateStagedUpdateRunStrategy(ctx) + + // Patch existing resources. + println("Updating existing resources...") + runner.UpdateResources(ctx) + + // Cool down. + println("Cooling down...") + runner.CoolDown() + + // Create the staged update runs. + println("Creating staged update runs...") + runner.CreateStagedUpdateRuns(ctx) + + // Cool down. + println("Cooling down...") + runner.CoolDown() + + // Long poll the staged update runs. + println("Long polling staged update runs...") + runner.LongPollStagedUpdateRuns(ctx) + + // Track the latency. + println("Tracking latency...") + runner.TrackLatency(ctx) + + // Tally the latency quantiles. 
+ println("Tallying latency quantiles...") + runner.TallyLatencyQuantiles() + + println("All staged update runs have been completed, exiting.") +} diff --git a/hack/perftest/1000stagedupdateruns/utils/cleanup.go b/hack/perftest/1000stagedupdateruns/utils/cleanup.go new file mode 100644 index 000000000..ebdbd945d --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/cleanup.go @@ -0,0 +1,114 @@ +package utils + +import ( + "context" + "fmt" + "sync" + + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" + + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" +) + +func (r *Runner) CleanUp(ctx context.Context) { + r.cleanUpStrategy(ctx) + r.cleanUpRuns(ctx) +} + +func (r *Runner) cleanUpStrategy(ctx context.Context) { + stagedUpdateRunStrategy := &placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: commonStagedUpdateRunStrategyName, + }, + } + + errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil && !errors.IsNotFound(err) + }, func() error { + return r.hubClient.Delete(ctx, stagedUpdateRunStrategy) + }) + if errAfterRetries != nil && !errors.IsNotFound(errAfterRetries) { + fmt.Printf("failed to delete staged update run strategy %s after retries: %v\n", commonStagedUpdateRunStrategyName, errAfterRetries) + } +} + +func (r *Runner) cleanUpRuns(ctx context.Context) { + wg := sync.WaitGroup{} + + // Run the producer. + wg.Add(1) + go func() { + defer wg.Done() + + for i := 0; i < r.maxCRPToUpdateCount; i++ { + select { + case r.toDeleteChan <- i: + case <-ctx.Done(): + close(r.toDeleteChan) + return + } + } + + close(r.toDeleteChan) + }() + + // Run the workers. + for i := 0; i < r.resourceSetupWorkerCount; i++ { + wg.Add(1) + go func(workerIdx int) { + defer wg.Done() + + for { + // Read from the channel. 
+ var resIdx int + var readOk bool + select { + case resIdx, readOk = <-r.toDeleteChan: + if !readOk { + fmt.Printf("worker %d exits\n", workerIdx) + return + } + case <-ctx.Done(): + return + } + + // Delete the staged update run. + stagedUpdateRun := &placementv1beta1.ClusterStagedUpdateRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(stagedUpdateRunNameFmt, resIdx), + }, + } + errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil && !errors.IsNotFound(err) + }, func() error { + return r.hubClient.Delete(ctx, stagedUpdateRun) + }) + if errAfterRetries != nil && !errors.IsNotFound(errAfterRetries) { + fmt.Printf("worker %d: failed to delete staged update run %s after retries: %v\n", workerIdx, stagedUpdateRun.Name, errAfterRetries) + continue + } + + // Wait until the staged update run is deleted. + errAfterRetries = retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil && !errors.IsNotFound(err) + }, func() error { + stagedUpdateRun := &placementv1beta1.ClusterStagedUpdateRun{} + err := r.hubClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf(stagedUpdateRunNameFmt, resIdx)}, stagedUpdateRun) + if err == nil { + return fmt.Errorf("staged update run %s still exists", stagedUpdateRun.Name) + } + return err + }) + if errAfterRetries == nil || !errors.IsNotFound(errAfterRetries) { + fmt.Printf("worker %d: failed to wait for staged update run %s to be deleted after retries: %v\n", workerIdx, stagedUpdateRun.Name, errAfterRetries) + } else { + fmt.Printf("worker %d: deleted staged update run %s\n", workerIdx, stagedUpdateRun.Name) + } + } + }(i) + } + wg.Wait() +} diff --git a/hack/perftest/1000stagedupdateruns/utils/latency.go b/hack/perftest/1000stagedupdateruns/utils/latency.go new file mode 100644 index 000000000..0d1885c88 --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/latency.go @@ -0,0 +1,53 @@ +package utils + +import ( + "context" + "fmt" + "math" + "sort" + "sync" +) + 
+func (r *Runner) TrackLatency(ctx context.Context) { + wg := sync.WaitGroup{} + + // Run the latency tracker. + wg.Add(1) + go func() { + defer wg.Done() + + for { + var attempt latencyTrackAttempt + var readOK bool + select { + case attempt, readOK = <-r.toTrackLatencyChan: + if !readOK { + return + } + fmt.Printf("latency tracker: staged update run run-%d has latency %v seconds\n", attempt.resIdx, attempt.latency.Seconds()) + r.stagedUpdateRunCompletionLatencyByRunName[fmt.Sprintf("run-%d", attempt.resIdx)] = attempt.latency + case <-ctx.Done(): + return + } + } + }() + wg.Wait() +} + +func (r *Runner) TallyLatencyQuantiles() { + latencies := make([]float64, 0, len(r.stagedUpdateRunCompletionLatencyByRunName)) + for _, latency := range r.stagedUpdateRunCompletionLatencyByRunName { + latencies = append(latencies, latency.Seconds()) + } + sort.Slice(latencies, func(i, j int) bool { + return latencies[i] < latencies[j] + }) + + q25 := int(math.Floor(float64(len(latencies)) * 0.25)) + q50 := int(math.Floor(float64(len(latencies)) * 0.50)) + q75 := int(math.Floor(float64(len(latencies)) * 0.75)) + q90 := int(math.Floor(float64(len(latencies)) * 0.90)) + q99 := int(math.Floor(float64(len(latencies)) * 0.99)) + fmt.Printf("latencies (seconds): 25th=%v, 50th=%v, 75th=%v, 90th=%v, 99th=%v\n", + latencies[q25], latencies[q50], latencies[q75], latencies[q90], latencies[q99]) +} diff --git a/hack/perftest/1000stagedupdateruns/utils/poll.go b/hack/perftest/1000stagedupdateruns/utils/poll.go new file mode 100644 index 000000000..62aaf6654 --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/poll.go @@ -0,0 +1,86 @@ +package utils + +import ( + "context" + "fmt" + "sync" + "time" + + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" +) + +func (r *Runner) LongPollStagedUpdateRuns(ctx context.Context) { + wg := 
sync.WaitGroup{} + + // Run the polling workers. + for i := 0; i < r.longPollingWorkerCount; i++ { + wg.Add(1) + + go func(workerIdx int) { + defer wg.Done() + + for { + // Read from the channel. + var resIdx int + var readOk bool + select { + case resIdx, readOk = <-r.toLongPollStagedUpdateRunsChan: + if !readOk { + fmt.Printf("long polling worker %d exits\n", workerIdx) + return + } + case <-ctx.Done(): + return + default: + if r.longPollingStagedUpdateRunsCount.Load() == r.completedStagedUpdateRunCount.Load() { + return + } + continue + } + + // Read the staged update run to check if it's completed. + var stagedUpdateRun placementv1beta1.ClusterStagedUpdateRun + if err := r.hubClient.Get(ctx, client.ObjectKey{Name: fmt.Sprintf(stagedUpdateRunNameFmt, resIdx)}, &stagedUpdateRun); err != nil { + fmt.Printf("long polling worker %d: failed to get staged update run run-%d: %v\n", workerIdx, resIdx, err) + // Requeue the run; no need to retry here. + r.toLongPollStagedUpdateRunsChan <- resIdx + time.Sleep(r.longPollingCoolDownPeriod) + continue + } + + // Check if the staged update run is completed. + runSucceededCond := meta.FindStatusCondition(stagedUpdateRun.Status.Conditions, string(placementv1beta1.StagedUpdateRunConditionSucceeded)) + if runSucceededCond == nil || runSucceededCond.Status != metav1.ConditionTrue || runSucceededCond.ObservedGeneration != stagedUpdateRun.Generation { + fmt.Printf("long polling worker %d: staged update run run-%d is not completed yet, requeue it\n", workerIdx, resIdx) + // Requeue the run. + r.toLongPollStagedUpdateRunsChan <- resIdx + time.Sleep(r.longPollingCoolDownPeriod) + continue + } else { + // The staged update run has been completed. + newCount := r.completedStagedUpdateRunCount.Add(1) + fmt.Printf("long polling worker %d: staged update run run-%d is completed, total completed count: %d\n", workerIdx, resIdx, newCount) + + // Track its latency. 
+ creationTimestamp := stagedUpdateRun.CreationTimestamp.Time + completionLatency := runSucceededCond.LastTransitionTime.Sub(creationTimestamp) + r.toTrackLatencyChan <- latencyTrackAttempt{ + latency: completionLatency, + resIdx: resIdx, + } + } + } + }(i) + } + + wg.Wait() + + // Do a sanity check report. + fmt.Printf("long polling %d staged update runs, with %d completed\n", r.longPollingStagedUpdateRunsCount.Load(), r.completedStagedUpdateRunCount.Load()) + + close(r.toTrackLatencyChan) +} diff --git a/hack/perftest/1000stagedupdateruns/utils/run.go b/hack/perftest/1000stagedupdateruns/utils/run.go new file mode 100644 index 000000000..819b5b6b5 --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/run.go @@ -0,0 +1,86 @@ +package utils + +import ( + "context" + "fmt" + "sync" + + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/util/retry" + + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" +) + +func (r *Runner) CreateStagedUpdateRuns(ctx context.Context) { + wg := sync.WaitGroup{} + + // Run the producer. + wg.Add(1) + go func() { + defer wg.Done() + + for i := 0; i < r.maxCRPToUpdateCount; i++ { + select { + case r.toCreateStagedUpdateRunsChan <- i: + case <-ctx.Done(): + close(r.toCreateStagedUpdateRunsChan) + return + } + } + + close(r.toCreateStagedUpdateRunsChan) + }() + + // Run the workers to create staged update runs. + for i := 0; i < r.resourceSetupWorkerCount; i++ { + wg.Add(1) + + go func(workerIdx int) { + defer wg.Done() + + for { + // Read from the channel. + var resIdx int + var readOk bool + select { + case resIdx, readOk = <-r.toCreateStagedUpdateRunsChan: + if !readOk { + fmt.Printf("worker %d exits\n", workerIdx) + return + } + case <-ctx.Done(): + return + } + + // Create the staged update run. 
+ stagedUpdateRun := placementv1beta1.ClusterStagedUpdateRun{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(stagedUpdateRunNameFmt, resIdx), + }, + Spec: placementv1beta1.UpdateRunSpec{ + PlacementName: fmt.Sprintf(placementNameFmt, resIdx), + StagedUpdateStrategyName: commonStagedUpdateRunStrategyName, + State: placementv1beta1.StateRun, + }, + } + errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil && !errors.IsAlreadyExists(err) + }, func() error { + return r.hubClient.Create(ctx, &stagedUpdateRun) + }) + if errAfterRetries != nil && !errors.IsAlreadyExists(errAfterRetries) { + fmt.Printf("worker %d: failed to create staged update run run-%d after retries: %v\n", workerIdx, resIdx, errAfterRetries) + continue + } + fmt.Printf("worker %d: successfully created staged update run run-%d\n", workerIdx, resIdx) + + // Submit the run for long polling. + r.toLongPollStagedUpdateRunsChan <- resIdx + r.longPollingStagedUpdateRunsCount.Add(1) + } + }(i) + } + + wg.Wait() +} diff --git a/hack/perftest/1000stagedupdateruns/utils/setup.go b/hack/perftest/1000stagedupdateruns/utils/setup.go new file mode 100644 index 000000000..d7437befe --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/setup.go @@ -0,0 +1,121 @@ +package utils + +import ( + "fmt" + "sync/atomic" + "time" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" +) + +const ( + nsNameFmt = "work-%d" + configMapNameFmt = "data-%d" + deployNameFmt = "app-%d" + + placementNameFmt = "crp-%d" + + stagedUpdateRunNameFmt = "run-%d" +) + +const ( + commonStagedUpdateRunStrategyName = "staging-canary-prod" + + envLabelKey = "env" + stagingEnvLabelValue = "staging" + canaryEnvLabelValue = "canary" + prodEnvLabelValue 
= "prod" + + randomUUIDLabelKey = "random-uuid" +) + +type latencyTrackAttempt struct { + latency time.Duration + resIdx int +} + +type Runner struct { + hubClient client.Client + + resourceSetupWorkerCount int + longPollingWorkerCount int + betweenStageCoolDownPeriod time.Duration + longPollingCoolDownPeriod time.Duration + + maxConcurrencyPerStage string + maxCRPToUpdateCount int + + retryOpsBackoff wait.Backoff + + toDeleteChan chan int + toPatchResourcesChan chan int + toCreateStagedUpdateRunsChan chan int + toLongPollStagedUpdateRunsChan chan int + toTrackLatencyChan chan latencyTrackAttempt + + resourcesPatchedCount atomic.Int32 + longPollingStagedUpdateRunsCount atomic.Int32 + completedStagedUpdateRunCount atomic.Int32 + + stagedUpdateRunCompletionLatencyByRunName map[string]time.Duration +} + +func New( + concurrentWorkerCount, longPollingWorkerCount int, + betweenStageCoolDownPeriod, longPollingCoolDownPeriod time.Duration, + maxConcurrencyPerStage string, + maxCRPToUpdateCount int, + retryOpsBackoff wait.Backoff, +) *Runner { + // Set up the K8s client for the hub cluster. 
+ hubClusterConfig := ctrl.GetConfigOrDie() + hubClusterConfig.QPS = 200 + hubClusterConfig.Burst = 400 + hubClient, err := client.New(hubClusterConfig, client.Options{ + Scheme: scheme.Scheme, + }) + if err != nil { + panic(fmt.Sprintf("Failed to create hub client: %v", err)) + } + + return &Runner{ + hubClient: hubClient, + resourceSetupWorkerCount: concurrentWorkerCount, + longPollingWorkerCount: longPollingWorkerCount, + betweenStageCoolDownPeriod: betweenStageCoolDownPeriod, + longPollingCoolDownPeriod: longPollingCoolDownPeriod, + maxConcurrencyPerStage: maxConcurrencyPerStage, + maxCRPToUpdateCount: maxCRPToUpdateCount, + retryOpsBackoff: retryOpsBackoff, + toDeleteChan: make(chan int, 20), + toPatchResourcesChan: make(chan int, maxCRPToUpdateCount), + toCreateStagedUpdateRunsChan: make(chan int, 20), + toLongPollStagedUpdateRunsChan: make(chan int, maxCRPToUpdateCount), + toTrackLatencyChan: make(chan latencyTrackAttempt, maxCRPToUpdateCount+1), + stagedUpdateRunCompletionLatencyByRunName: make(map[string]time.Duration, maxCRPToUpdateCount), + } +} + +func init() { + // Set up the scheme. 
+ if err := placementv1beta1.AddToScheme(scheme.Scheme); err != nil { + panic(fmt.Sprintf("Failed to add placement v1beta1 APIs to the scheme: %v", err)) + } + if err := corev1.AddToScheme(scheme.Scheme); err != nil { + panic(fmt.Sprintf("Failed to add core v1 APIs to the scheme: %v", err)) + } + if err := appsv1.AddToScheme(scheme.Scheme); err != nil { + panic(fmt.Sprintf("Failed to add apps v1 APIs to the scheme: %v", err)) + } +} + +func (r *Runner) CoolDown() { + time.Sleep(r.betweenStageCoolDownPeriod) +} diff --git a/hack/perftest/1000stagedupdateruns/utils/strategy.go b/hack/perftest/1000stagedupdateruns/utils/strategy.go new file mode 100644 index 000000000..3afdbea1c --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/strategy.go @@ -0,0 +1,61 @@ +package utils + +import ( + "context" + "fmt" + + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/util/retry" + "k8s.io/utils/ptr" + + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" +) + +func (r *Runner) CreateStagedUpdateRunStrategy(ctx context.Context) { + stagedUpdateRunStrategy := &placementv1beta1.ClusterStagedUpdateStrategy{ + ObjectMeta: metav1.ObjectMeta{ + Name: commonStagedUpdateRunStrategyName, + }, + Spec: placementv1beta1.UpdateStrategySpec{ + Stages: []placementv1beta1.StageConfig{ + { + Name: "staging", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + envLabelKey: stagingEnvLabelValue, + }, + }, + MaxConcurrency: ptr.To(intstr.FromString(r.maxConcurrencyPerStage)), + }, + { + Name: "canary", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + envLabelKey: canaryEnvLabelValue, + }, + }, + MaxConcurrency: ptr.To(intstr.FromString(r.maxConcurrencyPerStage)), + }, + { + Name: "prod", + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + envLabelKey: prodEnvLabelValue, + }, + }, + MaxConcurrency: 
ptr.To(intstr.FromString(r.maxConcurrencyPerStage)), + }, + }, + }, + } + errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil && !errors.IsAlreadyExists(err) + }, func() error { + return r.hubClient.Create(ctx, stagedUpdateRunStrategy) + }) + if errAfterRetries != nil && !errors.IsAlreadyExists(errAfterRetries) { + panic(fmt.Sprintf("Failed to create staged update run strategy: %v", errAfterRetries)) + } +} diff --git a/hack/perftest/1000stagedupdateruns/utils/update.go b/hack/perftest/1000stagedupdateruns/utils/update.go new file mode 100644 index 000000000..0cfe5317c --- /dev/null +++ b/hack/perftest/1000stagedupdateruns/utils/update.go @@ -0,0 +1,164 @@ +package utils + +import ( + "context" + "fmt" + "sync" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/uuid" + "k8s.io/client-go/util/retry" +) + +func (r *Runner) UpdateResources(ctx context.Context) { + wg := sync.WaitGroup{} + + // Run the producer. + wg.Add(1) + go func() { + defer wg.Done() + + for i := 0; i < r.maxCRPToUpdateCount; i++ { + select { + case r.toPatchResourcesChan <- i: + case <-ctx.Done(): + close(r.toPatchResourcesChan) + return + } + } + + close(r.toPatchResourcesChan) + }() + + // Run the workers to add a label to each existing resource. + for i := 0; i < r.resourceSetupWorkerCount; i++ { + wg.Add(1) + go func(workerIdx int) { + defer wg.Done() + + for { + // Read from the channel. 
+ var resIdx int + var readOk bool + select { + case resIdx, readOk = <-r.toPatchResourcesChan: + if !readOk { + fmt.Printf("worker %d exits\n", workerIdx) + return + } + case <-ctx.Done(): + return + } + + if err := r.updateNS(ctx, workerIdx, resIdx); err != nil { + fmt.Printf("worker %d: failed to update namespace work-%d: %v\n", workerIdx, resIdx, err) + continue + } + + if err := r.updateConfigMap(ctx, workerIdx, resIdx); err != nil { + fmt.Printf("worker %d: failed to update configmap data-%d: %v\n", workerIdx, resIdx, err) + continue + } + + if err := r.updateDeploy(ctx, workerIdx, resIdx); err != nil { + fmt.Printf("worker %d: failed to update deployment deploy-%d: %v\n", workerIdx, resIdx, err) + continue + } + + fmt.Printf("worker %d: successfully updated resources group of idx %d\n", workerIdx, resIdx) + + r.resourcesPatchedCount.Add(1) + } + }(i) + } + + wg.Wait() + + // Do a sanity check report. + fmt.Printf("patched %d out of %d resources in total\n", r.resourcesPatchedCount.Load(), r.maxCRPToUpdateCount) +} + +func (r *Runner) updateNS(ctx context.Context, workerIdx, resIdx int) error { + nsName := fmt.Sprintf(nsNameFmt, resIdx) + + // Update the namespace with the new label. 
+ errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil + }, func() error { + ns := &corev1.Namespace{} + if err := r.hubClient.Get(ctx, types.NamespacedName{Name: nsName}, ns); err != nil { + return fmt.Errorf("failed to get namespace %s before update: %w", nsName, err) + } + + labels := ns.GetLabels() + if labels == nil { + labels = make(map[string]string) + } + labels[randomUUIDLabelKey] = string(uuid.NewUUID()) + ns.SetLabels(labels) + + if err := r.hubClient.Update(ctx, ns); err != nil { + return fmt.Errorf("failed to update namespace %s: %w", nsName, err) + } + return nil + }) + return errAfterRetries +} + +func (r *Runner) updateConfigMap(ctx context.Context, workerIdx, resIdx int) error { + cmName := fmt.Sprintf(configMapNameFmt, resIdx) + cmNamespace := fmt.Sprintf(nsNameFmt, resIdx) + + // Update the configmap with the new label. + errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil + }, func() error { + cm := &corev1.ConfigMap{} + if err := r.hubClient.Get(ctx, types.NamespacedName{Name: cmName, Namespace: cmNamespace}, cm); err != nil { + return fmt.Errorf("failed to get configmap %s in namespace %s before update: %w", cmName, cmNamespace, err) + } + + labels := cm.GetLabels() + if labels == nil { + labels = make(map[string]string) + } + labels[randomUUIDLabelKey] = string(uuid.NewUUID()) + cm.SetLabels(labels) + + if err := r.hubClient.Update(ctx, cm); err != nil { + return fmt.Errorf("failed to update configmap %s in namespace %s: %w", cmName, cmNamespace, err) + } + return nil + }) + return errAfterRetries +} + +func (r *Runner) updateDeploy(ctx context.Context, workerIdx, resIdx int) error { + deployName := fmt.Sprintf(deployNameFmt, resIdx) + deployNamespace := fmt.Sprintf(nsNameFmt, resIdx) + + // Update the deployment with the new label. 
+ errAfterRetries := retry.OnError(r.retryOpsBackoff, func(err error) bool { + return err != nil + }, func() error { + deploy := &appsv1.Deployment{} + if err := r.hubClient.Get(ctx, types.NamespacedName{Name: deployName, Namespace: deployNamespace}, deploy); err != nil { + return fmt.Errorf("failed to get deployment %s in namespace %s before update: %w", deployName, deployNamespace, err) + } + + labels := deploy.GetLabels() + if labels == nil { + labels = make(map[string]string) + } + labels[randomUUIDLabelKey] = string(uuid.NewUUID()) + deploy.SetLabels(labels) + + if err := r.hubClient.Update(ctx, deploy); err != nil { + return fmt.Errorf("failed to update deployment %s in namespace %s: %w", deployName, deployNamespace, err) + } + return nil + }) + return errAfterRetries +} From 0e6d6236534f35e3f37c889127273abd6f584ebd Mon Sep 17 00:00:00 2001 From: Simon Waight Date: Thu, 26 Mar 2026 19:51:29 +1100 Subject: [PATCH 12/15] chore: fix quickstart script to join members (#542) * Updated maintainers and few other minor tweaks. Signed-off-by: Simon Waight * FIx up join member clusters quickstart script (*nix). Signed-off-by: Simon Waight * Refactor to put version first. Signed-off-by: Simon Waight * Add PowerShell version of join script for Windows users. Signed-off-by: Simon Waight * Fix broken API server lookup from kubeconfig Signed-off-by: Simon Waight * Update join-member-clusters.ps1 to require hub control plane URL as an argument Signed-off-by: Simon Waight * Update join-member-clusters.sh to require hub control plane URL as an argument Signed-off-by: Simon Waight * Tweaks per PR feedback. 
Signed-off-by: Simon Waight --------- Signed-off-by: Simon Waight --- hack/quickstart/join-member-clusters.ps1 | 207 +++++++++++++++++++++++ hack/quickstart/join-member-clusters.sh | 160 ++++++++++++++++++ 2 files changed, 367 insertions(+) create mode 100644 hack/quickstart/join-member-clusters.ps1 create mode 100755 hack/quickstart/join-member-clusters.sh diff --git a/hack/quickstart/join-member-clusters.ps1 b/hack/quickstart/join-member-clusters.ps1 new file mode 100644 index 000000000..2e229d926 --- /dev/null +++ b/hack/quickstart/join-member-clusters.ps1 @@ -0,0 +1,207 @@ +# Note: you must have at least one hub cluster and one member cluster. +# +# The reason we require the hub cluster API URL as an argument is that, in some environments (e.g. kind), the URL cannot be derived from kubeconfig and needs to be explicitly provided by users. +# +# For example, using Docker, you can get the right IP address for member clusters to use: +# docker inspect local-hub-01-control-plane --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' +# +# You can assume port 6443 unless you have explicitly changed the API server port for your hub cluster. Don't use the mapped Docker port (i.e. 50063) +# +# Example usage: +# ./join-member-clusters.ps1 0.2.2 demo-hub-01 https://172.18.0.2:6443 member-cluster-1 member-cluster-2 + +[CmdletBinding()] +param( + [Parameter(Position = 0)] + [string]$KubefleetVersion, + + [Parameter(Position = 1)] + [string]$HubClusterName, + + [Parameter(Position = 2)] + [string]$HubControlPlaneURL, + + [Parameter(Position = 3, ValueFromRemainingArguments = $true)] + [string[]]$MemberClusterNames, + + [switch]$Help +) + +function Show-Usage { + @' +Usage: + ./join-member-clusters.ps1 [ ...] 
+ +Example: + ./join-member-clusters.ps1 0.2.2 demo-hub-01 https://172.18.0.2:6443 member-cluster-1 member-cluster-2 + +Requirements: + - PowerShell 7, kubectl, and helm must be installed + - hub and member cluster names must exist in your kubeconfig +'@ +} + +function Fail-WithHelp { + param( + [Parameter(Mandatory = $true)] + [string]$Message + ) + + Write-Error $Message + Write-Host "" + Show-Usage + exit 1 +} + +function Test-CommandExists { + param( + [Parameter(Mandatory = $true)] + [string]$Name + ) + + return $null -ne (Get-Command $Name -ErrorAction SilentlyContinue) +} + +function Test-ClusterExists { + param( + [Parameter(Mandatory = $true)] + [string]$Name + ) + + $clusters = @(kubectl config get-clusters 2>$null) + if ($LASTEXITCODE -ne 0) { + return $false + } + + return $clusters | Select-Object -Skip 1 | Where-Object { $_.Trim() -eq $Name } | ForEach-Object { $true } | Select-Object -First 1 +} + +if ($Help) { + Show-Usage + exit 0 +} + +if ([string]::IsNullOrWhiteSpace($KubefleetVersion) -or [string]::IsNullOrWhiteSpace($HubClusterName) -or [string]::IsNullOrWhiteSpace($HubControlPlaneURL) -or $null -eq $MemberClusterNames -or $MemberClusterNames.Count -lt 1) { + $argumentCount = 0 + if (-not [string]::IsNullOrWhiteSpace($KubefleetVersion)) { + $argumentCount++ + } + if (-not [string]::IsNullOrWhiteSpace($HubClusterName)) { + $argumentCount++ + } + if (-not [string]::IsNullOrWhiteSpace($HubControlPlaneURL)) { + $argumentCount++ + } + if ($null -ne $MemberClusterNames) { + $argumentCount += $MemberClusterNames.Count + } + + Fail-WithHelp "expected at least 4 arguments, got $argumentCount" +} + +if (-not (Test-CommandExists -Name kubectl)) { + Fail-WithHelp "kubectl is not installed or not available in PATH" +} + +if (-not (Test-CommandExists -Name helm)) { + Fail-WithHelp "helm is not installed or not available in PATH" +} + +if (-not (Test-ClusterExists -Name $HubClusterName)) { + Fail-WithHelp "hub cluster '$HubClusterName' was not found in 
kubeconfig" +} + +[System.Uri]$parsedHubURL = $null +if (-not [System.Uri]::TryCreate($HubControlPlaneURL, [System.UriKind]::Absolute, [ref]$parsedHubURL)) { + Fail-WithHelp "hub control plane URL '$HubControlPlaneURL' is not a valid absolute URL" +} + +if ($parsedHubURL.Scheme -ne "https") { + Fail-WithHelp "hub control plane URL must use https" +} + +foreach ($memberClusterName in $MemberClusterNames) { + if ([string]::IsNullOrWhiteSpace($memberClusterName)) { + Fail-WithHelp "member cluster name cannot be empty" + } + + if ($memberClusterName -eq $HubClusterName) { + Fail-WithHelp "member cluster '$memberClusterName' cannot be the same as hub cluster" + } + + if (-not (Test-ClusterExists -Name $memberClusterName)) { + Fail-WithHelp "member cluster '$memberClusterName' was not found in kubeconfig" + } +} + +foreach ($memberClusterName in $MemberClusterNames) { + # Note that Fleet will recognize your cluster with this name once it joins. + $serviceAccount = "$memberClusterName-hub-cluster-access" + $serviceAccountSecret = "$memberClusterName-hub-cluster-access-token" + + Write-Host "Switching into hub cluster context..." + kubectl config use-context $HubClusterName + + # The service account can, in theory, be created in any namespace; for simplicity reasons, + # here you will use the namespace reserved by Fleet installation, `fleet-system`. + # + # Note that if you choose a different value, commands in some steps below need to be + # modified accordingly. + Write-Host "Creating member service account..." + kubectl create serviceaccount $serviceAccount -n fleet-system + + Write-Host "Creating member service account secret..." + @" +apiVersion: v1 +kind: Secret +metadata: + name: $serviceAccountSecret + namespace: fleet-system + annotations: + kubernetes.io/service-account.name: $serviceAccount +type: kubernetes.io/service-account-token +"@ | kubectl apply -f - + + Write-Host "Creating member cluster custom resource on hub cluster..." 
+ $encodedToken = kubectl get secret $serviceAccountSecret -n fleet-system -o "jsonpath={.data.token}" + $token = [System.Text.Encoding]::UTF8.GetString([System.Convert]::FromBase64String($encodedToken)) + + @" +apiVersion: cluster.kubernetes-fleet.io/v1 +kind: MemberCluster +metadata: + name: $memberClusterName +spec: + identity: + name: $memberClusterName-hub-cluster-access + kind: ServiceAccount + namespace: fleet-system + apiGroup: "" + heartbeatPeriodSeconds: 15 +"@ | kubectl apply -f - + + # Install the member agent helm chart on the member cluster. + Write-Host "Switching to member cluster context..." + kubectl config use-context $memberClusterName + + # Create the secret with the token extracted previously for member agent to use. + Write-Host "Creating secret..." + kubectl delete secret hub-kubeconfig-secret + kubectl create secret generic hub-kubeconfig-secret --from-literal="token=$token" + + Write-Host "Uninstalling any existing member-agent instances..." + helm uninstall member-agent -n fleet-system --wait + + Write-Host "Installing member-agent..." + helm install member-agent oci://ghcr.io/kubefleet-dev/kubefleet/charts/member-agent ` + --version $KubefleetVersion ` + --set "config.hubURL=$HubControlPlaneURL" ` + --set "config.memberClusterName=$memberClusterName" ` + --set logFileMaxSize=100000 ` + --namespace fleet-system ` + --create-namespace + + kubectl get pods -A + kubectl config use-context $HubClusterName + kubectl get membercluster $memberClusterName +} diff --git a/hack/quickstart/join-member-clusters.sh b/hack/quickstart/join-member-clusters.sh new file mode 100755 index 000000000..e941dca56 --- /dev/null +++ b/hack/quickstart/join-member-clusters.sh @@ -0,0 +1,160 @@ +#!/bin/bash +# Note: you must have at least one hub cluster and one member cluster. +# +# The reason we require the hub cluster API URL as an argument is that, in some environments (e.g. 
kind), the URL cannot be derived from kubeconfig and needs to be explicitly provided by users. +# +# For example, using Docker, you can get the right IP address for member clusters to use: +# docker inspect local-hub-01-control-plane --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' +# +# You can assume port 6443 unless you have explicitly changed the API server port for your hub cluster. Don't use the mapped Docker port (i.e. 50063) +# +# Example usage: ./join-member-clusters.sh 0.2.2 demo-hub-01 https://172.18.0.2:6443 member-cluster-1 [ ...] + +usage() { + cat <<'EOF' +Usage: + ./join-member-clusters.sh [ ...] + +Example: + ./join-member-clusters.sh 0.2.2 demo-hub-01 https://172.18.0.2:6443 member-cluster-1 member-cluster-2 + +Requirements: + - kubectl and helm must be installed + - hub and member cluster names must exist in your kubeconfig +EOF +} + +fail_with_help() { + echo "Error: $1" >&2 + echo >&2 + usage >&2 + exit 1 +} + +if [ "$1" = "-h" ] || [ "$1" = "--help" ]; then + usage + exit 0 +fi + +if [ "$#" -lt 4 ]; then + fail_with_help "expected at least 4 arguments, got $#" +fi + +if ! command -v kubectl >/dev/null 2>&1; then + fail_with_help "kubectl is not installed or not available in PATH" +fi + +if ! command -v helm >/dev/null 2>&1; then + fail_with_help "helm is not installed or not available in PATH" +fi + +export KUBEFLEET_VERSION="$1" +export HUB_CLUSTER_NAME="$2" +export HUB_CONTROL_PLANE_URL="$3" + +if [ -z "$KUBEFLEET_VERSION" ]; then + fail_with_help "kubefleet version cannot be empty" +fi + +if [ -z "$HUB_CLUSTER_NAME" ]; then + fail_with_help "hub cluster name cannot be empty" +fi + +if [ -z "$HUB_CONTROL_PLANE_URL" ]; then + fail_with_help "hub control plane URL cannot be empty" +fi + +case "$HUB_CONTROL_PLANE_URL" in + https://*) ;; + *) fail_with_help "hub control plane URL must use https" ;; +esac + +if ! 
kubectl config get-clusters 2>/dev/null | grep -Fxq "$HUB_CLUSTER_NAME"; then + fail_with_help "hub cluster '$HUB_CLUSTER_NAME' was not found in kubeconfig" +fi + +for MC in "${@:4}"; do + if [ -z "$MC" ]; then + fail_with_help "member cluster name cannot be empty" + fi + + if [ "$MC" = "$HUB_CLUSTER_NAME" ]; then + fail_with_help "member cluster '$MC' cannot be the same as hub cluster" + fi + + if ! kubectl config get-clusters 2>/dev/null | grep -Fxq "$MC"; then + fail_with_help "member cluster '$MC' was not found in kubeconfig" + fi +done + +for MC in "${@:4}"; do + +# Note that Fleet will recognize your cluster with this name once it joins. +export MEMBER_CLUSTER_NAME=$MC +export SERVICE_ACCOUNT="$MEMBER_CLUSTER_NAME-hub-cluster-access" + +echo "Switching into hub cluster context..." +kubectl config use $HUB_CLUSTER_NAME +# The service account can, in theory, be created in any namespace; for simplicity reasons, +# here you will use the namespace reserved by Fleet installation, `fleet-system`. +# +# Note that if you choose a different value, commands in some steps below need to be +# modified accordingly. +echo "Creating member service account..." +kubectl create serviceaccount $SERVICE_ACCOUNT -n fleet-system + +echo "Creating member service account secret..." 
+export SERVICE_ACCOUNT_SECRET="$MEMBER_CLUSTER_NAME-hub-cluster-access-token" +cat < Date: Thu, 26 Mar 2026 08:50:21 -0700 Subject: [PATCH 13/15] chore: upgrade go to 1.25.8 (#550) --- .github/workflows/ci.yml | 2 +- .github/workflows/code-lint.yml | 2 +- .github/workflows/release.yml | 2 +- .github/workflows/trivy.yml | 2 +- .github/workflows/upgrade.yml | 2 +- .golangci.yml | 2 +- Makefile | 2 +- docker/hub-agent.Dockerfile | 2 +- docker/member-agent.Dockerfile | 2 +- docker/refresh-token.Dockerfile | 2 +- go.mod | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb156fc39..14bc0ff3e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,7 @@ on: paths-ignore: [docs/**, "**.md", "**.mdx", "**.png", "**.jpg"] env: - GO_VERSION: '1.24.13' + GO_VERSION: '1.25.8' CERT_MANAGER_VERSION: 'v1.16.2' jobs: diff --git a/.github/workflows/code-lint.yml b/.github/workflows/code-lint.yml index c6e62b24a..17602a79c 100644 --- a/.github/workflows/code-lint.yml +++ b/.github/workflows/code-lint.yml @@ -14,7 +14,7 @@ on: env: # Common versions - GO_VERSION: "1.24.13" + GO_VERSION: "1.25.8" jobs: detect-noop: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index aa6c6e75b..8ad54f997 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -20,7 +20,7 @@ env: HUB_AGENT_IMAGE_NAME: hub-agent MEMBER_AGENT_IMAGE_NAME: member-agent REFRESH_TOKEN_IMAGE_NAME: refresh-token - GO_VERSION: "1.24.13" + GO_VERSION: "1.25.8" jobs: export-registry: diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml index 11aeafe4b..b5cf2b8a6 100644 --- a/.github/workflows/trivy.yml +++ b/.github/workflows/trivy.yml @@ -18,7 +18,7 @@ env: MEMBER_AGENT_IMAGE_NAME: member-agent REFRESH_TOKEN_IMAGE_NAME: refresh-token - GO_VERSION: '1.24.13' + GO_VERSION: '1.25.8' jobs: export-registry: diff --git a/.github/workflows/upgrade.yml 
b/.github/workflows/upgrade.yml index 2d6911bff..4ca166a0f 100644 --- a/.github/workflows/upgrade.yml +++ b/.github/workflows/upgrade.yml @@ -17,7 +17,7 @@ on: paths-ignore: [docs/**, "**.md", "**.mdx", "**.png", "**.jpg"] env: - GO_VERSION: '1.24.13' + GO_VERSION: '1.25.8' jobs: detect-noop: diff --git a/.golangci.yml b/.golangci.yml index 556731073..f6d620d1d 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,6 +1,6 @@ run: timeout: 15m - go: '1.24.13' + go: '1.25.8' linters-settings: stylecheck: diff --git a/Makefile b/Makefile index 16cfb16af..baa8230a3 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,7 @@ TOOLS_BIN_DIR := $(abspath $(TOOLS_DIR)/bin) # Binaries # Note: Need to use abspath so we can invoke these from subdirectories -CONTROLLER_GEN_VER := v0.16.0 +CONTROLLER_GEN_VER := v0.20.0 CONTROLLER_GEN_BIN := controller-gen CONTROLLER_GEN := $(abspath $(TOOLS_BIN_DIR)/$(CONTROLLER_GEN_BIN)-$(CONTROLLER_GEN_VER)) diff --git a/docker/hub-agent.Dockerfile b/docker/hub-agent.Dockerfile index 1018a87a2..104669ac4 100644 --- a/docker/hub-agent.Dockerfile +++ b/docker/hub-agent.Dockerfile @@ -1,5 +1,5 @@ # Build the hubagent binary -FROM mcr.microsoft.com/oss/go/microsoft/golang:1.24.13 AS builder +FROM mcr.microsoft.com/oss/go/microsoft/golang:1.25.8 AS builder ARG GOOS=linux ARG GOARCH=amd64 diff --git a/docker/member-agent.Dockerfile b/docker/member-agent.Dockerfile index ca0786c2c..13f9c1f64 100644 --- a/docker/member-agent.Dockerfile +++ b/docker/member-agent.Dockerfile @@ -1,5 +1,5 @@ # Build the memberagent binary -FROM mcr.microsoft.com/oss/go/microsoft/golang:1.24.13 AS builder +FROM mcr.microsoft.com/oss/go/microsoft/golang:1.25.8 AS builder ARG GOOS=linux ARG GOARCH=amd64 diff --git a/docker/refresh-token.Dockerfile b/docker/refresh-token.Dockerfile index 31a492e86..6191383c9 100644 --- a/docker/refresh-token.Dockerfile +++ b/docker/refresh-token.Dockerfile @@ -1,5 +1,5 @@ # Build the refreshtoken binary -FROM 
mcr.microsoft.com/oss/go/microsoft/golang:1.24.13 AS builder +FROM mcr.microsoft.com/oss/go/microsoft/golang:1.25.8 AS builder ARG GOOS="linux" ARG GOARCH="amd64" diff --git a/go.mod b/go.mod index 3c82f9f17..513fd3b5e 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/kubefleet-dev/kubefleet -go 1.24.13 +go 1.25.8 require ( github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0 From 7d800c9566d8635591541079119fa6fe69b61a4e Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes <145056127+britaniar@users.noreply.github.com> Date: Thu, 26 Mar 2026 11:29:11 -0700 Subject: [PATCH 14/15] fix: make manifests to fix CRDs with latest change (#552) --- .../cluster.kubernetes-fleet.io_internalmemberclusters.yaml | 2 +- .../crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml | 2 +- .../crd/bases/placement.kubernetes-fleet.io_appliedworks.yaml | 2 +- .../bases/placement.kubernetes-fleet.io_approvalrequests.yaml | 2 +- .../placement.kubernetes-fleet.io_clusterapprovalrequests.yaml | 2 +- .../placement.kubernetes-fleet.io_clusterresourcebindings.yaml | 2 +- .../placement.kubernetes-fleet.io_clusterresourceenvelopes.yaml | 2 +- .../placement.kubernetes-fleet.io_clusterresourceoverrides.yaml | 2 +- ...nt.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml | 2 +- ...etes-fleet.io_clusterresourceplacementdisruptionbudgets.yaml | 2 +- ...t.kubernetes-fleet.io_clusterresourceplacementevictions.yaml | 2 +- ...placement.kubernetes-fleet.io_clusterresourceplacements.yaml | 2 +- ...nt.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml | 2 +- .../placement.kubernetes-fleet.io_clusterresourcesnapshots.yaml | 2 +- ...nt.kubernetes-fleet.io_clusterschedulingpolicysnapshots.yaml | 2 +- .../placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml | 2 +- ...ement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml | 2 +- .../bases/placement.kubernetes-fleet.io_resourcebindings.yaml | 2 +- .../bases/placement.kubernetes-fleet.io_resourceenvelopes.yaml | 2 
+- .../bases/placement.kubernetes-fleet.io_resourceoverrides.yaml | 2 +- ...placement.kubernetes-fleet.io_resourceoverridesnapshots.yaml | 2 +- .../bases/placement.kubernetes-fleet.io_resourceplacements.yaml | 2 +- .../bases/placement.kubernetes-fleet.io_resourcesnapshots.yaml | 2 +- ...placement.kubernetes-fleet.io_schedulingpolicysnapshots.yaml | 2 +- .../bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml | 2 +- .../placement.kubernetes-fleet.io_stagedupdatestrategies.yaml | 2 +- config/crd/bases/placement.kubernetes-fleet.io_works.yaml | 2 +- 27 files changed, 27 insertions(+), 27 deletions(-) diff --git a/config/crd/bases/cluster.kubernetes-fleet.io_internalmemberclusters.yaml b/config/crd/bases/cluster.kubernetes-fleet.io_internalmemberclusters.yaml index 8742d05eb..7199ca697 100644 --- a/config/crd/bases/cluster.kubernetes-fleet.io_internalmemberclusters.yaml +++ b/config/crd/bases/cluster.kubernetes-fleet.io_internalmemberclusters.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: internalmemberclusters.cluster.kubernetes-fleet.io spec: group: cluster.kubernetes-fleet.io diff --git a/config/crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml b/config/crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml index cc28cb137..7e883d441 100644 --- a/config/crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml +++ b/config/crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: memberclusters.cluster.kubernetes-fleet.io spec: group: cluster.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_appliedworks.yaml 
b/config/crd/bases/placement.kubernetes-fleet.io_appliedworks.yaml index 21ae6b352..aceb00761 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_appliedworks.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_appliedworks.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: appliedworks.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_approvalrequests.yaml b/config/crd/bases/placement.kubernetes-fleet.io_approvalrequests.yaml index b56bbdae9..fadbc2e4b 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_approvalrequests.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_approvalrequests.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: approvalrequests.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterapprovalrequests.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterapprovalrequests.yaml index 02ddd96fe..f02ba1461 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterapprovalrequests.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterapprovalrequests.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterapprovalrequests.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcebindings.yaml 
b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcebindings.yaml index 0e12d6032..ddb481a31 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcebindings.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcebindings.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourcebindings.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceenvelopes.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceenvelopes.yaml index 9b00cfba7..bee507d23 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceenvelopes.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceenvelopes.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceenvelopes.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml index 71474ff9b..7d8e978d4 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverrides.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceoverrides.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git 
a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml index 336d3a861..f84768f50 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceoverridesnapshots.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceoverridesnapshots.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementdisruptionbudgets.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementdisruptionbudgets.yaml index fd1f78819..a3af13ea4 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementdisruptionbudgets.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementdisruptionbudgets.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceplacementdisruptionbudgets.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementevictions.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementevictions.yaml index ae5fea701..ae40aae6f 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementevictions.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementevictions.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - 
controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceplacementevictions.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml index 9299ee0af..630ad2a07 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacements.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceplacements.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml index 2c42689db..00b893dae 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourceplacementstatuses.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourceplacementstatuses.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcesnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcesnapshots.yaml index 72a3ef8a6..325f5b255 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcesnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterresourcesnapshots.yaml @@ -3,7 +3,7 @@ 
apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterresourcesnapshots.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterschedulingpolicysnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterschedulingpolicysnapshots.yaml index ffa988280..954a72bab 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterschedulingpolicysnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterschedulingpolicysnapshots.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterschedulingpolicysnapshots.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml index d4955b6ce..2b34db4d3 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdateruns.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterstagedupdateruns.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml index 1556c8946..9be8428dc 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml +++ 
b/config/crd/bases/placement.kubernetes-fleet.io_clusterstagedupdatestrategies.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: clusterstagedupdatestrategies.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourcebindings.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourcebindings.yaml index 046ce7ece..403e37e13 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourcebindings.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_resourcebindings.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: resourcebindings.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourceenvelopes.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourceenvelopes.yaml index 963e66c40..09e0f8a4d 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourceenvelopes.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_resourceenvelopes.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: resourceenvelopes.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourceoverrides.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourceoverrides.yaml index dc538218c..1296c68a5 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourceoverrides.yaml +++ 
b/config/crd/bases/placement.kubernetes-fleet.io_resourceoverrides.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: resourceoverrides.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourceoverridesnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourceoverridesnapshots.yaml index 0d5279618..dc634b84e 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourceoverridesnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_resourceoverridesnapshots.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: resourceoverridesnapshots.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml index 303f9e9b8..e233dc3bd 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_resourceplacements.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: resourceplacements.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_resourcesnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_resourcesnapshots.yaml index 1d6265e1c..933e26276 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_resourcesnapshots.yaml +++ 
b/config/crd/bases/placement.kubernetes-fleet.io_resourcesnapshots.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: resourcesnapshots.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_schedulingpolicysnapshots.yaml b/config/crd/bases/placement.kubernetes-fleet.io_schedulingpolicysnapshots.yaml index b0b6c5ba1..0485e0696 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_schedulingpolicysnapshots.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_schedulingpolicysnapshots.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: schedulingpolicysnapshots.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml index a0a5c1a4c..ff9152c9d 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdateruns.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: stagedupdateruns.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml index 487040869..5315f7322 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml +++ 
b/config/crd/bases/placement.kubernetes-fleet.io_stagedupdatestrategies.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: stagedupdatestrategies.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io diff --git a/config/crd/bases/placement.kubernetes-fleet.io_works.yaml b/config/crd/bases/placement.kubernetes-fleet.io_works.yaml index 9d6bcfb3b..4ae5e0adc 100644 --- a/config/crd/bases/placement.kubernetes-fleet.io_works.yaml +++ b/config/crd/bases/placement.kubernetes-fleet.io_works.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.0 + controller-gen.kubebuilder.io/version: v0.20.0 name: works.placement.kubernetes-fleet.io spec: group: placement.kubernetes-fleet.io From 35cd69ade9108b11bce4382e9aa7552a6083873e Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes Date: Thu, 26 Mar 2026 13:07:49 -0700 Subject: [PATCH 15/15] update crd-installer go version Signed-off-by: Britania Rodriguez Reyes --- docker/crd-installer.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/crd-installer.Dockerfile b/docker/crd-installer.Dockerfile index cf0bfd154..302c6e0a1 100644 --- a/docker/crd-installer.Dockerfile +++ b/docker/crd-installer.Dockerfile @@ -1,5 +1,5 @@ # Build the crdinstaller binary -FROM mcr.microsoft.com/oss/go/microsoft/golang:1.24.13 AS builder +FROM mcr.microsoft.com/oss/go/microsoft/golang:1.25.8 AS builder ARG GOOS=linux ARG GOARCH=amd64