From 62a69f9510a4c632543d404bdbfa0bf6650a948e Mon Sep 17 00:00:00 2001 From: Yuva Shankar <11082310+yuva29@users.noreply.github.com> Date: Thu, 18 Jun 2026 18:22:27 +0000 Subject: [PATCH] fix: resolve markdown lint errors across docs (CP from pensando #337) Fix markdownlint violations to pass ROCm CI linting checks: - Add blank lines around lists, fences, headings, and tables - Add language specifiers to fenced code blocks - Fix table column alignment and spacing - Fix trailing spaces and duplicate blank lines - Use descriptive link text instead of bare "here" Co-Authored-By: Claude Opus 4 (1M context) --- .wordlist.txt | 70 +++++---- docs/cluster_validation_framework/README.md | 16 +-- docs/device_plugin/deviceplugin.md | 15 +- docs/drivers/upgrading.md | 12 +- docs/index.md | 16 +-- docs/installation/kubernetes-helm.md | 2 +- docs/installation/networkconfig.md | 16 +-- docs/mpi_rccl/mpi_rccl.md | 136 ++++++++++++++++++ docs/openshift/installation-guide.md | 95 +++++++----- docs/secondary_network/amd-host-device-cni.md | 2 +- 10 files changed, 273 insertions(+), 107 deletions(-) create mode 100644 docs/mpi_rccl/mpi_rccl.md diff --git a/.wordlist.txt b/.wordlist.txt index ab2e90c5..12c68362 100644 --- a/.wordlist.txt +++ b/.wordlist.txt @@ -1,8 +1,8 @@ AGFHC AINIC AINICs -Allocatable allocatable +Allocatable amd amdgpu ANP @@ -16,47 +16,49 @@ caFile cardinality CatalogSource certFile +Chrony clientCAConfigMap clientName ClusterIP +ClusterServiceVersion CN CNI CNIPlugins CNIPluginsSpec CNIs -ClusterServiceVersion -ConfigMap -configMap configmap +configMap +ConfigMap ConfigMaps configs +containerd controllerManager -CoreOS coredns +CoreOS cpu -CRI CRD crds CRDs +CRI CronJob CronJobs CRs cryptographic CVEs cvf +daemonset DaemonSet -DaemonSets Daemonsets -DTK -daemonset +DaemonSets DeviceConfig DevicePlugin DevicePluginImage DevicePluginImagePullPolicy -DevicePluginSpec devicePluginSpec +DevicePluginSpec disableHttps DockerHub +DTK EnableNodeLabeller etcd ethtool @@ 
-76,13 +78,13 @@ ibverbs IfNotPresent imagePullPolicy imagePullSecrets -ImageStream imageRegistrySecret +ImageStream insecureSkipVerify installdefaultNFDRule io -ipc ipam +ipc IPC IPv json @@ -90,23 +92,24 @@ K8s kaniko keyFile keySecret -KMM kmm +KMM kube kubeconfig -Kubelet +kubectl kubelet +Kubelet kubernetes -kubectl -Labeller labeller -lifecycle +Labeller LIF +lifecycle MachineConfig MaxUnavailable MCO MetricsExporter MetricsExporterSpec +MPICH MPIJob mTLS multus @@ -118,30 +121,34 @@ NetworkAttachmentDefinitions NetworkConfig networkconfigs NFD -NodeFeatureDiscovery nic NICCTL nodeAffinity -NodeLabeller +NodeFeatureDiscovery Nodelabeller +NodeLabeller NodeLabellerImage NodeLabellerImagePullPolicy -NodePort nodePort +NodePort nodeSelector nodeSelectorTerms NotReady +NPL +NTP +NVMe OLM OnDelete OOM +OpenMPI OpenShift OpenShift's OperatorGroup OperatorHub oyaml pci -PDS pds +PDS Pensando Podman Pollara @@ -153,51 +160,52 @@ RCCL rdma RDMA relatedImageBuild -repo relatedImageBuildPullSecret -RPMs relatedImageSign relatedImageSignPullSecret relatedImageWorker relatedImageWorkerPullSecret -RoCE +repo roce +RoCE ROCm RollingUpdate +RPMs SAR SBR -serverName sdk +serverName ServiceAccount -ServiceAccounts serviceAccountNamespaceSelector +ServiceAccounts serviceAccountSelector ServiceMonitor serviceType -SR-IOV sriov +SR-IOV staticAuthorization SubjectAccessReview tawk +TBD techsupport TechSupport -tlsConfig TLS +tlsConfig TokenReview tolerations UI uncordoned Uncordoning upgradeCRD -UpgradePolicy upgradePolicy -UpgradeStrategy +UpgradePolicy upgradeStrategy +UpgradeStrategy VFs virtualized +vnic vNIC VNICs -vnic webhook webhook's webhookServer diff --git a/docs/cluster_validation_framework/README.md b/docs/cluster_validation_framework/README.md index 277c4ee3..4b241198 100644 --- a/docs/cluster_validation_framework/README.md +++ b/docs/cluster_validation_framework/README.md @@ -61,7 +61,7 @@ This framework supports Gang Scheduling by checking for Pod Running status 
and ## Key Components | Component | Description | -|------------|-------------| +| ------------ | ------------- | | **CronJob** | Periodically triggers node cluster node validation checks (e.g., every 24 hours). | | **ConfigMap** | Stores configuration, candidate selection script, Job and MPIJob manifest templates. | | **ServiceAccount + RBAC** | Grants permission to list/label nodes and create workloads. | @@ -143,11 +143,11 @@ kubectl logs job/cluster-validation-mpi-job-<20251110-0715>-launcher ## Example Output Labels -| Node | Label | Meaning | -|:--------|:--------------------------------------------|:-----------------------------------------------------------| -| node-a | `amd.com/cluster-validation-status=passed` | Node successfully passed all RCCL tests | -| node-b | `amd.com/cluster-validation-status=failed` | Node failed one or more RCCL tests | -| node-c | *(no label)* | Node not part of current candidate set | +| Node | Label | Meaning | +| ------ | ------------------------------------------- | ----------------------------------------------------------- | +| node-a | `amd.com/cluster-validation-status=passed` | Node successfully passed all RCCL tests | +| node-b | `amd.com/cluster-validation-status=failed` | Node failed one or more RCCL tests | +| node-c | *(no label)* | Node not part of current candidate set | --- @@ -155,8 +155,8 @@ kubectl logs job/cluster-validation-mpi-job-<20251110-0715>-launcher * Update image tags (**roce-workload**, **network-operator-utils**) as needed before deployment. * Modify `cluster-validation-config.yaml` to align with your deployment environment. -* Ensure `slotsPerWorker` and resource limits correspond to the underlying GPU and NIC configuration. -* Adjust `CronJob.spec` to set the job frequency. +* Ensure `slotsPerWorker` and resource limits correspond to the underlying GPU and NIC configuration. +* Adjust `CronJob.spec` to set the job frequency. * Set `debug_delay` to pause after job completion for debugging. 
* Configure `fluent_log_output` to define the log destination for Fluent sidecar-based centralized logging diff --git a/docs/device_plugin/deviceplugin.md b/docs/device_plugin/deviceplugin.md index 21591ff7..4a7c6d2a 100644 --- a/docs/device_plugin/deviceplugin.md +++ b/docs/device_plugin/deviceplugin.md @@ -32,13 +32,14 @@ spec: ### Field Description -| Field Name | Description | -|----------------------------------|-------------------------------------------------| -| **DevicePluginImage** | Device plugin image | -| **DevicePluginImagePullPolicy** | One of Always, Never, IfNotPresent. | -| **NodeLabellerImage** | Image to use for the Node Labeller | -| **NodeLabellerImagePullPolicy** | Image pull policy: Always, Never, IfNotPresent | -| **EnableNodeLabeller** | Enable or disable the Node Labeller (true/false)| +| Field Name | Description | +|----------------------------------|--------------------------------------------------| +| **DevicePluginImage** | Device plugin image | +| **DevicePluginImagePullPolicy** | One of Always, Never, IfNotPresent. | +| **NodeLabellerImage** | Image to use for the Node Labeller | +| **NodeLabellerImagePullPolicy** | Image pull policy: Always, Never, IfNotPresent | +| **EnableNodeLabeller** | Enable or disable the Node Labeller (true/false) | +
The `ImagePullPolicy` field defaults to `Always` if the image tag is `:latest`, or to `IfNotPresent` for other tags. This follows the default Kubernetes behavior for `ImagePullPolicy`. diff --git a/docs/drivers/upgrading.md b/docs/drivers/upgrading.md index 608c4f4c..44f9eebd 100644 --- a/docs/drivers/upgrading.md +++ b/docs/drivers/upgrading.md @@ -67,7 +67,7 @@ To check the full spec of upgrade configuration run kubectl get crds networkconf #### `driver.upgradePolicy` Parameters | Parameter | Description | Default | -|-----------|-------------|---------| +| --------- | ----------- | ------- | | `enable` | Enable this upgrade policy | `false` | | `maxParallelUpgrades` | Maximum number of nodes which will be upgraded in parallel | `1` | | `maxUnavailableNodes` | Maximum number (or Percentage) of nodes which can be unavailable (cordoned) in the cluster | `25%` | @@ -76,14 +76,14 @@ To check the full spec of upgrade configuration run kubectl get crds networkconf #### `driver.upgradePolicy.nodeDrainPolicy` Parameters | Parameter | Description | Default | -|-----------|-------------|---------| +| --------- | ----------- | ------- | | `force` | Allow drain to proceed on the node even if there are managed pods such as daemon-sets. In such cases drain will not proceed unless this option is set to true | `true` | | `timeout` | The length of time to wait before giving up. Zero means infinite | `300s` | #### `driver.upgradePolicy.podDeletionPolicy` Parameters | Parameter | Description | Default | -|-----------|-------------|---------| +| --------- | ----------- | ------- | | `force` | Force delete all pods that use amd nics | `true` | | `timeout` | The length of time to wait before giving up. 
Zero means infinite | `300s` | @@ -127,7 +127,7 @@ status: The following are the different node states during the upgrade process | State | Description | -|-----------|---------| +| ----- | ----------- | | `Install-In-Progress` | Driver is being installed on the node for the first time | | `Install-Complete` | Driver install is complete | | `Upgrade-Not-Started` | Automatic upgrade enabled and driver version change is detected. All nodes move to this state | @@ -136,7 +136,7 @@ The following are the different node states during the upgrade process | `Upgrade-Timed-Out` | Driver upgrade couldn't finish within 2 hours | | `Cordon-Failed` | Cordoning of the node failed | | `Uncordon-Failed` | Uncordoning of the node failed | -| `Drain-Failed` | Drain node or Delete pods operation failed| +| `Drain-Failed` | Drain node or Delete pods operation failed | | `Reboot-In-Progress` | Driver upgrade is done and reboot is in progress | | `Reboot-Failed` | Driver upgrade is done and reboot attempt failed | | `Upgrade-Failed` | Driver upgrade failed for any other reasons | @@ -202,7 +202,7 @@ The operator will automatically: The operator uses specific tag formats based on the OS: | OS | Tag Format | Example | -|----|------------|---------| +| -- | ---------- | ------- | | Ubuntu | `ubuntu---` | `ubuntu-22.04-6.8.0-40-generic-6.1.3` | | RHEL CoreOS | `coreos---` | `coreos-416.94-5.14.0-427.28.1.el9_4.x86_64-6.2.2` | diff --git a/docs/index.md b/docs/index.md index 50ee41d6..1f8c4e0c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,7 +16,7 @@ AMD Network Operator simplifies the use of AMD AINICs in Kubernetes environments ### **Supported Hardware** | Hardware | Status | -|-----------|---------| +| -------- | ------ | | AMD Pensando™ Pollara AI NIC | ✅ Supported | ### OS & Platform Support Matrix @@ -25,18 +25,18 @@ Below is a list of operating systems and Kubernetes versions validated with the Additional versions will be added in future releases. 
| Operating System | Kubernetes Versions | -|------------------|---------------------| +| ---------------- | ------------------- | | Ubuntu 22.04 LTS | 1.29 – 1.34 | | Ubuntu 24.04 LTS | 1.29 – 1.34 | ### Software Version Compatibility Matrix -| Network Operator | AINIC Firmware | Supported NICs | -|------------------|----------------|----------------| -| v1.0.0 | 1.117.1-a-63 | Pollara 400 | -| v1.0.1 | 1.117.1-a-63 | Pollara 400 | -| v1.1.0 | 1.117.5-a-56 | Pollara 400 | -| v1.2.0 | 1.117.5-a-56
1.117.5-a-77 | Pollara 400 | +| Network Operator | AINIC Firmware | Supported NICs | +|------------------|--------------------------------|----------------| +| v1.0.0 | 1.117.1-a-63 | Pollara 400 | +| v1.0.1 | 1.117.1-a-63 | Pollara 400 | +| v1.1.0 | 1.117.5-a-56 | Pollara 400 | +| v1.2.0 | 1.117.5-a-56
1.117.5-a-77 | Pollara 400 | ## Prerequisites diff --git a/docs/installation/kubernetes-helm.md b/docs/installation/kubernetes-helm.md index 48b12390..8991ad47 100644 --- a/docs/installation/kubernetes-helm.md +++ b/docs/installation/kubernetes-helm.md @@ -141,7 +141,7 @@ helm show values rocm/network-operator-charts ``` | Key | Type | Default | Description | -|-----|------|---------|-------------| +| ----- | ------ | --------- | ------------- | | controllerManager.manager.image.repository | string | `"docker.io/rocm/network-operator"` | AMD Network operator controller manager image repository | | controllerManager.manager.image.tag | string | `"v1.2.0"` | AMD Network operator controller manager image tag | | controllerManager.manager.imagePullPolicy | string | `"Always"` | Image pull policy for AMD Network operator controller manager pod | diff --git a/docs/installation/networkconfig.md b/docs/installation/networkconfig.md index 86552c9a..bf324e03 100644 --- a/docs/installation/networkconfig.md +++ b/docs/installation/networkconfig.md @@ -38,7 +38,7 @@ spec: nodePort: 32501 hostNetwork: true image: docker.io/rocm/device-metrics-exporter:nic-v1.2.0 - + # Secondary network config secondaryNetwork: cniPlugins: @@ -61,14 +61,14 @@ To check the full spec of `NetworkConfig` definition, run `kubectl get crds netw #### `metadata` Parameters | Parameter | Description | -|-----------|-------------| +| --------- | ----------- | | `name` | Unique identifier for the resource | | `namespace` | Namespace where the operator is running | #### `spec.driver` Parameters | Parameter | Description | Default | -|-----------|-------------|---------| +| --------- | ----------- | ------- | | `enable` | Enable/disable driver installation | `true` | | `image` | Image URL to pull/push kernel modules images | | | `version` | Driver version for source code builds | | @@ -79,7 +79,7 @@ To check the full spec of `NetworkConfig` definition, run `kubectl get crds netw #### `spec.devicePlugin` 
Parameters | Parameter | Description | Default | -|-----------|-------------|---------| +| --------- | ----------- | ------- | | `devicePluginImage` | AMD Network device plugin image | `docker.io/rocm/k8s-network-device-plugin:v1.2.0` | | `nodeLabellerImage` | Node labeller image | `docker.io/rocm/k8s-network-node-labeller:v1.2.0` | | `imageRegistrySecret.name` | Name of registry credentials secret
to pull device plugin / node labeller image | | @@ -88,7 +88,7 @@ To check the full spec of `NetworkConfig` definition, run `kubectl get crds netw #### `spec.metricsExporter` Parameters | Parameter | Description | Default | -|-----------|-------------|---------| +| --------- | ----------- | ------- | | `enable` | Enable/disable metrics exporter | `false` | | `imageRegistrySecret.name` | Name of registry credentials secret
to pull metrics exporter image | | | `serviceType` | Service type for metrics endpoint
Options: "ClusterIP" or "NodePort" | `ClusterIP` | @@ -99,15 +99,15 @@ To check the full spec of `NetworkConfig` definition, run `kubectl get crds netw #### `spec.secondaryNetwork` Parameters | Parameter | Description | Default | -|-----------|-------------|---------| +| --------- | ----------- | ------- | | `cniPlugins.enable` | Enable/disable CNI plugins | `false` | -| `cniPlugins.image` | CNI plugins image | `docker.io/rocm/k8s-cni-plugins:v1.2.0`| +| `cniPlugins.image` | CNI plugins image | `docker.io/rocm/cni-plugins:v1.2.0` | | `cniPlugins.imageRegistrySecret.name` | Name of registry credentials secret
to pull metrics exporter image | | #### `spec.selector` Parameters | Parameter | Description | Default | -|-----------|-------------|---------| +| --------- | ----------- | ------- | | `selector` | Labels to select nodes for driver installation | `feature.node.kubernetes.io/amd-nic: "true"` | ### Registry Secret Configuration diff --git a/docs/mpi_rccl/mpi_rccl.md b/docs/mpi_rccl/mpi_rccl.md new file mode 100644 index 00000000..0adaa273 --- /dev/null +++ b/docs/mpi_rccl/mpi_rccl.md @@ -0,0 +1,136 @@ +# MPI Operator Specification for RCCL Performance Testing + +## Table of Contents + +1. [Overview](#overview) +2. [Architecture](#architecture) +3. [Prerequisites](#prerequisites) +4. [Installation](#installation) + +## Overview + +To validate the performance of AMD GPU clusters for distributed AI/ML workloads, it is essential to benchmark the inter-node and intra-node communication capabilities. +The MPI Operator for Kubernetes enables the execution of Message Passing Interface (MPI) workloads on AMD GPU clusters. This specification defines how to deploy and configure the MPI Operator specifically for running RCCL (ROCm Collective Communications Library) performance tests to validate cluster performance and benchmark AMD GPU interconnect capabilities. 
+ +### Purpose + +This specification provides a comprehensive guide for: + +- Deploying MPI Operator in Kubernetes clusters with AMD GPUs +- Creating MPIJob resources to run RCCL performance tests +- Validating cluster performance for distributed AI/ML workloads +- Benchmarking inter-node and intra-node GPU communication + +### Scope + +The specification covers: + +- MPI Operator installation and configuration +- Integration with AMD Network Operator and AMD GPU Operator +- Recipes for running RCCL performance tests + +## Architecture + +### Component Overview + +```text +┌─────────────────────────────────────────────────────────────┐ +│ Kubernetes Cluster │ +├─────────────────────────────────────────────────────────────┤ +│ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ MPI Operator │ │ AMD Network │ │ +│ │ Controller │◄──►│ Operator │ │ +│ └─────────────────┘ └─────────────────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ MPIJob CRD │ │ NetworkConfig │ │ +│ └─────────────────┘ │ CRD │ │ +│ │ └─────────────────┘ │ +│ ▼ │ │ +│ ┌─────────────────┐ ▼ │ +│ │ Launcher Pod │ ┌─────────────────┐ │ +│ │ - mpirun │ │ Device Plugin │ │ +│ │ - RCCL tests │ │ Node Labeller │ │ +│ └─────────────────┘ │ Metrics Export │ │ +│ │ └─────────────────┘ │ +│ ▼ │ │ +│ ┌─────────────────────────────────────────┐ │ +│ │ Worker Pods │ │ +│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │ +│ │ │Worker-0 │ │Worker-1 │ │Worker-N │ │ │ +│ │ │AMD GPU │ │AMD GPU │ │AMD GPU │ │ │ +│ │ │RCCL lib │ │RCCL lib │ │RCCL lib │ │ │ +│ │ └─────────┘ └─────────┘ └─────────┘ │ │ +│ └─────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────┘ +``` + +### Key Components + +1. **MPI Operator Controller**: Manages MPIJob lifecycle +2. **MPIJob Custom Resource**: Defines MPI workload specifications +3. **Launcher Pod**: Executes `mpirun` command to coordinate worker processes +4. **Worker Pods**: Run RCCL test processes on AMD GPUs +5. 
**AMD Network Operator**: Provides AINIC driver installation and network configuration +6. **AMD GPU Operator**: Manages GPU driver installation and management + +## Prerequisites + +### Cluster Requirements + +- Kubernetes cluster version 1.31+ +- AMD GPU nodes with ROCm drivers installed +- AMD AINIC network for high-performance interconnect +- Container runtime with GPU support (Docker/containerd with AMD GPU Operator) +- Network connectivity between nodes (InfiniBand/Ethernet for multi-node tests) + +### Software Dependencies + +- ROCm >= 7.0.0 +- RCCL >= 2.15.0 +- MPI implementation (OpenMPI, MPICH, or Intel MPI) +- AMD Network Operator (for AMD AINIC driver management) +- AMD GPU Operator (for GPU driver management) +- AINIC RCCL Container image (for RCCL test binaries) + - Include ROCm stack, RCCL, and MPI libraries + - AINIC NPL Plugin (for AINIC driver performance parameters) + +### Hardware Requirements + +- AMD GPUs (MI3XX series recommended) +- High-bandwidth interconnect (AMD AINIC preferred) +- NVMe storage for fast I/O + +### Network Configuration + +- Kubernetes pod network (CNI) +- High-performance secondary network for MPI communication +- Proper firewall rules for MPI port ranges +- Network time synchronization (NTP/Chrony) + +## Installation + +### 1. Network Operator Installation + +The AMD Network Operator is required to manage the installation and configuration of the AINIC driver for high-performance interconnect. + +[Install Instructions](../installation/kubernetes-helm.md#installing-the-amd-network-operator) + +### 2. GPU Operator Installation + +The AMD GPU Operator is required to manage the installation and configuration of the amdgpu drivers for AMD GPUs. + +[Install Instructions](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/installation/kubernetes-helm.html#) + +### 3. MPI Operator Installation + +TBD + +## Test Recipes + +TBD + +### 4. 
Test Result Verification + +TBD diff --git a/docs/openshift/installation-guide.md b/docs/openshift/installation-guide.md index 0f509f24..534dac79 100644 --- a/docs/openshift/installation-guide.md +++ b/docs/openshift/installation-guide.md @@ -31,7 +31,7 @@ This guide provides production-ready steps for deploying the AMD Network Operator on OpenShift clusters using OLM (Operator Lifecycle Manager). This operator manages AMD network drivers (ionic, ionic_rdma, pds_core, tawk_ipc) using Kernel Module Management (KMM). -**What this operator does:** +**What this operator does**: - Automatically loads AMD network drivers on OpenShift CoreOS nodes - Manages kernel module lifecycle through KMM @@ -57,20 +57,24 @@ This guide provides production-ready steps for deploying the AMD Network Operato **Key Requirements**: -- ✅ OpenShift 4.16+ with CoreOS -- ✅ NFD and KMM operators installed -- ✅ Container registry (insecure registry configured if internal) -- ✅ AMD Pensando NICs installed on nodes +- OpenShift 4.16+ with CoreOS +- NFD and KMM operators installed +- Container registry (insecure registry configured if internal) +- AMD Pensando NICs installed on nodes --- ## Important Notes -> 💡 **TIP**: This guide uses production-style versioning (`v1.0.0-netop-beta`). Replace with your actual version tags. -> -> ⚠️ **WARNING**: Only install KMM operator **ONCE** in `openshift-kmm` namespace. Multiple instances cause conflicts. -> -> 🌐 **REGISTRY**: Configure insecure registries at cluster level before starting. Images won't pull otherwise. +> **TIP**: This guide uses production-style versioning (`v1.0.0-netop-beta`). Replace with your actual version tags. + + + +> **WARNING**: Only install KMM operator **ONCE** in `openshift-kmm` namespace. Multiple instances cause conflicts. + + + +> **REGISTRY**: Configure insecure registries at cluster level before starting. Images won't pull otherwise. 
--- @@ -85,13 +89,13 @@ This guide provides production-ready steps for deploying the AMD Network Operato ### Required Operators Installation -**These operators must be installed BEFORE deploying the AMD Network Operator:** +**These operators must be installed BEFORE deploying the AMD Network Operator**: #### 1. Install Node Feature Discovery (NFD) NFD detects hardware features on nodes and labels them accordingly. -**Installation via OpenShift Web Console:** +**Installation via OpenShift Web Console**: 1. Log in to OpenShift Web Console 2. Navigate to **Operators** → **OperatorHub** @@ -105,14 +109,14 @@ NFD detects hardware features on nodes and labels them accordingly. - **Update Approval**: Automatic 7. Click **Install** and wait for the operator to become ready -**Verification:** +**Verification**: ```bash kubectl get csv -n openshift-nfd | grep nfd # Expected: nfd.x.x.x Node Feature Discovery x.x.x Succeeded ``` -**Create a NodeFeatureDiscovery instance to activate NFD:** +**Create a NodeFeatureDiscovery instance to activate NFD**: After installing the NFD operator, create a `NodeFeatureDiscovery` CR to start NFD workers on the cluster: @@ -162,7 +166,7 @@ kubectl get pods -n openshift-nfd | grep worker KMM manages out-of-tree kernel modules on OpenShift clusters. -**Installation via OpenShift Web Console:** +**Installation via OpenShift Web Console**: 1. Log in to OpenShift Web Console 2. Navigate to **Operators** → **OperatorHub** @@ -176,7 +180,7 @@ KMM manages out-of-tree kernel modules on OpenShift clusters. - **Update Approval**: Automatic 7. 
Click **Install** and wait for the operator to become ready -**Verification:** +**Verification**: ```bash kubectl get csv -n openshift-kmm | grep kernel-module-management @@ -298,7 +302,7 @@ export REPO_URL="https://repo.radeon.com" export DTK_IMAGE="quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:288b3574a5524121c139b846e98a223da793305560f8b42dcd8d2aa712912998" ``` -**Finding Version Values:** +**Finding Version Values**: ```bash # Get node kernel version @@ -408,7 +412,7 @@ make bundle-build \ make bundle-push BUNDLE_IMG=${BUNDLE_IMG} ``` -**What this does:** +**What this does**: - Generates CSV (ClusterServiceVersion) with operator metadata - Creates RBAC manifests for all service accounts @@ -433,7 +437,7 @@ make bundle-push -n openshift-amd-network ``` -**Flags explained:** +**Flags explained**: - `--use-http`: Use HTTP instead of HTTPS for registry communication - `--skip-tls`: Skip TLS verification (for insecure registries) @@ -480,14 +484,14 @@ kubectl get sa -n openshift-amd-network > ⚠️ **THIS SECTION IS OPTIONAL**: For most users with connected clusters, you can **skip this entire section** and proceed directly to [Deploying NetworkConfig CR](#deploying-networkconfig-cr). When you create a NetworkConfig CR, KMM will automatically build driver images in-cluster using the OpenShift internal registry. 
-**When to use this section:** +**When to use this section**: - **Air-gapped/disconnected environments**: No internet access during runtime - **Pre-staging images**: Want driver images ready before deployment - **External registry requirements**: Need images in a specific external registry - **Custom build pipelines**: Integrating with CI/CD systems -**When to skip this section:** +**When to skip this section**: - **Connected clusters**: Have internet access to `repo.radeon.com` - **Quick start/trial**: Want the fastest path to running drivers @@ -501,14 +505,14 @@ kubectl get sa -n openshift-amd-network The operator supports two methods for building driver images, controlled by the `useSourceImage` field in NetworkConfig CR: -**Method 1: RPM-based Build** (`useSourceImage: false`) - **Recommended** +**Method 1: RPM-based Build** (`useSourceImage: false`) - **Recommended**: - Downloads pre-compiled RPM packages from repo.radeon.com - Installs drivers directly from RPMs - Faster build process - Uses: `DockerfileTemplate.rpm.ionic.coreos` -**Method 2: Source Image Build** (`useSourceImage: true`) - **Advanced** +**Method 2: Source Image Build** (`useSourceImage: true`) - **Advanced**: - Requires building a source image first containing driver source code - KMM compiles modules from source against specific kernel @@ -597,14 +601,14 @@ kubectl start-build amd-driver-build -n openshift-amd-network kubectl logs -f build/amd-driver-build-1 -n openshift-amd-network ``` -**Build Arguments Explained:** +**Build Arguments Explained**: - `DTK_AUTO`: Driver Toolkit image matching your OpenShift version and kernel - `KERNEL_VERSION`: Target kernel version from node - `DRIVERS_VERSION`: AMD driver package version from repo.radeon.com - `REPO_URL`: AMD repository URL -**Finding the Correct DTK Image:** +**Finding the Correct DTK Image**: ```bash # Get node kernel version (already set in env vars) @@ -637,7 +641,7 @@ sudo podman push --tls-verify=false \ 
${REGISTRY_URL}/amdnetwork_kmod:coreos-${RHEL_VERSION}-${KERNEL_VERSION}-${DRIVERS_VERSION} ``` -**Why Push to External Registry?** +Why push to an external registry? - OpenShift's internal registry may not be accessible during module loading - External registry provides consistent access across cluster operations @@ -671,14 +675,14 @@ These images are automatically built and published by the GitHub Actions workflo If you need to build source images yourself (e.g., for a custom driver version or internal registry): -##### Option 1: Using the automated builder script +**Option 1: Using the automated builder script**: ```bash cd internal-example/driverSrcImage ./build-all-source-images.sh --version ${DRIVERS_VERSION} --registry your-registry.com ``` -##### Option 2: Using OpenShift BuildConfig +**Option 2: Using OpenShift BuildConfig**: ```bash cat > /tmp/source-image-build.yaml << EOF @@ -727,7 +731,7 @@ kubectl start-build amd-source-image-build -n openshift-amd-network kubectl logs -f build/amd-source-image-build-1 -n openshift-amd-network ``` -**What source images contain:** +**What source images contain**: - `/ionic_src/driver/` - Source code for ionic, pds, tawk-ipc modules - `/ionic_src/firmware/` - Firmware files @@ -768,7 +772,7 @@ spec: > 💡 **Note**: If using a custom/internal source image registry, replace `docker.io/amdpsdo/amdnic-drivers` with your registry path. -**How it works:** +**How it works**: 1. KMM uses `DockerfileTemplate.srcimg.ionic.coreos` 2. 
Copies source code from your source image (`sourceImageRepo`) @@ -777,6 +781,23 @@ spec: --- +## Verifying Service Accounts + +The operator creates multiple service accounts for different components: + +```bash +kubectl get sa -n openshift-amd-network + +# Expected service accounts: +# - amd-network-operator-controller-manager +# - amd-network-operator-device-plugin +# - amd-network-operator-kmm-module-loader +# - amd-network-operator-node-labeller +# - amd-network-operator-metrics-exporter +# - amd-network-operator-config-manager +# - amd-network-operator-utils-container +``` + ## Deploying NetworkConfig CR ### 1. Create NFD Rule for NIC Detection @@ -840,7 +861,7 @@ metadata: spec: selector: feature.node.kubernetes.io/amd-nic: "true" - + driver: enable: true version: "${DRIVERS_VERSION}" @@ -856,12 +877,12 @@ spec: insecure: true insecureSkipTLSVerify: true AMDNetworkInstallerRepoURL: "${REPO_URL}" - + devicePlugin: enableNodeLabeller: true devicePluginImage: docker.io/rocm/k8s-device-plugin:rhubi-latest nodeLabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest - + metricsExporter: enable: true image: docker.io/rocm/device-metrics-exporter:v1.2.0 @@ -880,7 +901,7 @@ metadata: spec: selector: feature.node.kubernetes.io/amd-nic: "true" - + driver: enable: true version: "${DRIVERS_VERSION}" @@ -897,12 +918,12 @@ spec: insecure: true insecureSkipTLSVerify: true AMDNetworkInstallerRepoURL: "${REPO_URL}" - + devicePlugin: enableNodeLabeller: true devicePluginImage: docker.io/rocm/k8s-device-plugin:rhubi-latest nodeLabellerImage: docker.io/rocm/k8s-device-plugin:labeller-rhubi-latest - + metricsExporter: enable: true image: docker.io/rocm/device-metrics-exporter:v1.2.0 @@ -1191,5 +1212,5 @@ All service accounts use consistent naming without platform-specific suffixes to --- -**Document Version**: 1.0.0 +**Document Version**: 1.0.0
**Target Platform**: OpenShift 4.16+ with CoreOS diff --git a/docs/secondary_network/amd-host-device-cni.md b/docs/secondary_network/amd-host-device-cni.md index 5045b59e..4d1df1bd 100644 --- a/docs/secondary_network/amd-host-device-cni.md +++ b/docs/secondary_network/amd-host-device-cni.md @@ -50,7 +50,7 @@ spec: }' ``` -For detailed information on how this resource is allocated and how the CNI is invoked, please refer to the documentation [here](./integration-flow.md). +For detailed information on how this resource is allocated and how the CNI is invoked, please refer to the [integration flow documentation](./integration-flow.md). ## Verification