From f20e1093864777d21cb686657f58a6cd0edcc503 Mon Sep 17 00:00:00 2001 From: Yuvarani Shankar Date: Mon, 8 Jun 2026 13:00:45 -0700 Subject: [PATCH 1/4] feat: add NFD rule documentation and remove stale NodeFeatureDiscovery config (NETOP-172) (#329) Update OpenShift installation guide to use NFD-based node selector instead of hostname. Add ready-to-apply NodeFeatureRule YAML for automatic AMD Pensando NIC detection (vendor 1dd8, PF device 1002, VF device 1003). - Add NFD rule YAML to docs for users to apply manually - Update NetworkConfig examples to use feature.node.kubernetes.io/amd-nic selector instead of kubernetes.io/hostname - Remove incorrect nodefeaturediscovery.yaml template (wrong resource type, wrong PCI IDs pointing to GPU instead of NIC) - Remove all nodefeaturediscoveries RBAC (Go markers, generated config, CSV, Helm chart) since the operator does not manage that resource - Clean up pre-delete hook Job and Makefile uninstall references Co-authored-by: Yuva Shankar <11082310+yuva29@users.noreply.github.com> Co-authored-by: Claude Opus 4 (1M context) # Conflicts: # bundle/manifests/amd-network-operator.clusterserviceversion.yaml # docs/openshift/installation-guide.md --- config/rbac/role.yaml | 16 ---- .../template-patch/nodefeaturediscovery.yaml | 94 ------------------- .../template-patch/pre-delete-hook.yaml | 46 --------- helm-charts-k8s/templates/manager-rbac.yaml | 16 ---- .../controllers/network_config_reconciler.go | 3 - 5 files changed, 175 deletions(-) delete mode 100644 hack/openshift-patch/template-patch/nodefeaturediscovery.yaml diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index fddd999f..5ab985f0 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -178,19 +178,3 @@ rules: - patch - update - watch -- apiGroups: - - nfd.openshift.io - resources: - - nodefeaturediscoveries - verbs: - - delete - - get - - list -- apiGroups: - - nfd.openshift.io - resources: - - nodefeaturediscoveries/finalizers - - 
nodefeaturediscoveries/status - verbs: - - get - - update diff --git a/hack/openshift-patch/template-patch/nodefeaturediscovery.yaml b/hack/openshift-patch/template-patch/nodefeaturediscovery.yaml deleted file mode 100644 index b0a7be41..00000000 --- a/hack/openshift-patch/template-patch/nodefeaturediscovery.yaml +++ /dev/null @@ -1,94 +0,0 @@ -{{- if .Values.nfd.enabled }} -apiVersion: nfd.openshift.io/v1 -kind: NodeFeatureDiscovery -metadata: - name: {{ .Release.Name }}-nfd-instance - namespace: {{ .Release.Namespace }} -spec: - #instance: "" # instance is empty by default - #labelWhiteList: "" - #extraLabelNs: - # - "example.com" - #resourceLabels: - # - "example.com/resource" - operand: - image: quay.io/openshift/origin-node-feature-discovery:4.16 - imagePullPolicy: IfNotPresent - servicePort: 12000 - workerConfig: - configData: | - core: - # labelWhiteList: - # noPublish: false - sleepInterval: 60s - # sources: [all] - # klog: - # addDirHeader: false - # alsologtostderr: false - # logBacktraceAt: - # logtostderr: true - # skipHeaders: false - # stderrthreshold: 2 - # v: 0 - # vmodule: - ## NOTE: the following options are not dynamically run-time - ## configurable and require a nfd-worker restart to take effect - ## after being changed - # logDir: - # logFile: - # logFileMaxSize: 1800 - # skipLogHeaders: false - sources: - # cpu: - # cpuid: - ## NOTE: whitelist has priority over blacklist - # attributeBlacklist: - # - "BMI1" - # - "BMI2" - # - "CLMUL" - # - "CMOV" - # - "CX16" - # - "ERMS" - # - "F16C" - # - "HTT" - # - "LZCNT" - # - "MMX" - # - "MMXEXT" - # - "NX" - # - "POPCNT" - # - "RDRAND" - # - "RDSEED" - # - "RDTSCP" - # - "SGX" - # - "SSE" - # - "SSE2" - # - "SSE3" - # - "SSE4.1" - # - "SSE4.2" - # - "SSSE3" - # attributeWhitelist: - # kernel: - # kconfigFile: "/path/to/kconfig" - # configOpts: - # - "NO_HZ" - # - "X86" - # - "DMI" - pci: - deviceClassWhitelist: - - "0200" - - "03" - - "12" - deviceLabelFields: - - "vendor" - - "device" - custom: - - 
name: amd-nic - labels: - feature.node.kubernetes.io/amd-nic: "true" - matchAny: - - matchFeatures: - - feature: pci.device - matchExpressions: - vendor: {op: In, value: ["1002"]} - device: {op: In, value: ["74a0"]} # MI300A -{{- end }} diff --git a/hack/openshift-patch/template-patch/pre-delete-hook.yaml b/hack/openshift-patch/template-patch/pre-delete-hook.yaml index 8429bf44..16dab3f6 100644 --- a/hack/openshift-patch/template-patch/pre-delete-hook.yaml +++ b/hack/openshift-patch/template-patch/pre-delete-hook.yaml @@ -31,12 +31,6 @@ rules: verbs: - get - list - - apiGroups: - - nfd.openshift.io - resources: - - nodefeaturediscoveries - verbs: - - delete --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -104,43 +98,3 @@ spec: {{- end }} restartPolicy: Never ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: pre-uninstall-remove-nodefeaturediscovery - namespace: {{ .Release.Namespace }} - annotations: - # hook with lower weight value will run firstly - "helm.sh/hook-weight": "3" - # hook will be executed before helm uninstall - "helm.sh/hook": pre-delete - # remove the resource created by the hook whether it succeeded or failed - "helm.sh/hook-delete-policy": before-hook-creation, hook-succeeded -spec: - backoffLimit: 0 # once the job finished first run, don't retry to create another pod - ttlSecondsAfterFinished: 60 # job info will be kept for 1 min then deleted - template: - spec: - serviceAccountName: {{ include "helm-charts-openshift.fullname" . 
}}-pre-delete - containers: - - name: pre-uninstall-remove-nodefeaturediscovery - image: {{ .Values.controllerManager.manager.image.repository }}:{{ .Values.controllerManager.manager.image.tag }} - command: - - /bin/sh - - -c - - | - kubectl delete nodefeaturediscoveries --all -n {{ .Release.Namespace }} - {{- if .Values.controllerManager.manager.imagePullSecrets }} - imagePullSecrets: - - name: {{ .Values.controllerManager.manager.imagePullSecrets }} - {{- end}} - {{- with .Values.controllerManager.manager.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.controllerManager.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - restartPolicy: Never diff --git a/helm-charts-k8s/templates/manager-rbac.yaml b/helm-charts-k8s/templates/manager-rbac.yaml index c75e0467..24e96131 100644 --- a/helm-charts-k8s/templates/manager-rbac.yaml +++ b/helm-charts-k8s/templates/manager-rbac.yaml @@ -181,22 +181,6 @@ rules: - patch - update - watch -- apiGroups: - - nfd.openshift.io - resources: - - nodefeaturediscoveries - verbs: - - delete - - get - - list -- apiGroups: - - nfd.openshift.io - resources: - - nodefeaturediscoveries/finalizers - - nodefeaturediscoveries/status - verbs: - - get - - update --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/internal/controllers/network_config_reconciler.go b/internal/controllers/network_config_reconciler.go index 4fff5ee9..1d5ef830 100644 --- a/internal/controllers/network_config_reconciler.go +++ b/internal/controllers/network_config_reconciler.go @@ -174,9 +174,6 @@ func (r *NetworkConfigReconciler) init(ctx context.Context) { //+kubebuilder:rbac:groups=kmm.sigs.x-k8s.io,resources=nodemodulesconfigs,verbs=get;list;watch //+kubebuilder:rbac:groups=kmm.sigs.x-k8s.io,resources=nodemodulesconfigs/status,verbs=get;list;watch //+kubebuilder:rbac:groups=kmm.sigs.x-k8s.io,resources=nodemodulesconfigs/finalizers,verbs=get;update;watch 
-//+kubebuilder:rbac:groups=nfd.openshift.io,resources=nodefeaturediscoveries,verbs=list;get;delete -//+kubebuilder:rbac:groups=nfd.openshift.io,resources=nodefeaturediscoveries/status,verbs=get;update -//+kubebuilder:rbac:groups=nfd.openshift.io,resources=nodefeaturediscoveries/finalizers,verbs=get;update //+kubebuilder:rbac:groups=core,resources=configmaps,verbs=create;delete;get;list;patch;watch;create //+kubebuilder:rbac:groups=core,resources=nodes,verbs=get;patch;list;watch //+kubebuilder:rbac:groups=core,resources=nodes/status,verbs=get;update;watch From 003bc37ac6577836427f79e7d4e409003b2b4184 Mon Sep 17 00:00:00 2001 From: Yuva Shankar <11082310+yuva29@users.noreply.github.com> Date: Thu, 11 Jun 2026 22:22:29 +0000 Subject: [PATCH 2/4] docs: add OpenShift installation guide Bring docs/openshift/installation-guide.md from pensando main. Includes NFD-based node selector documentation and NodeFeatureRule YAML for automatic AMD Pensando NIC detection. Co-Authored-By: Claude Opus 4 (1M context) --- docs/openshift/installation-guide.md | 1132 ++++++++++++++++++++++++++ 1 file changed, 1132 insertions(+) create mode 100644 docs/openshift/installation-guide.md diff --git a/docs/openshift/installation-guide.md b/docs/openshift/installation-guide.md new file mode 100644 index 00000000..7e8edb77 --- /dev/null +++ b/docs/openshift/installation-guide.md @@ -0,0 +1,1132 @@ +# AMD Network Operator - Production Deployment Guide + +## Table of Contents +1. [Overview](#overview) +2. [Prerequisites](#prerequisites) + - [Infrastructure Requirements](#infrastructure-requirements) + - [Required Operators Installation](#required-operators-installation) + - [Development Tools](#development-tools) +3. [Architecture](#architecture) +4. [Cluster Configuration](#cluster-configuration) +5. 
[Installing the AMD Network Operator](#installing-the-amd-network-operator) + - [Official Installation (Production)](#official-installation-production) + - [Development Installation (Build from Source)](#development-installation-build-from-source) +6. [Post-Installation Verification](#post-installation-verification) +7. [Preparing Pre-Compiled Driver Images (Optional)](#preparing-pre-compiled-driver-images-optional) + - [Two Driver Image Build Methods](#two-driver-image-build-methods) + - [Method 1: RPM-based Build (Recommended)](#method-1-rpm-based-build-recommended) + - [Method 2: Source Image Build (Advanced)](#method-2-source-image-build-advanced) +8. [Deploying NetworkConfig CR](#deploying-networkconfig-cr) +9. [Verification](#verification) +10. [Updating the Operator](#updating-the-operator) +11. [Cleanup](#cleanup) +12. [Troubleshooting](#troubleshooting) +13. [Key Implementation Details](#key-implementation-details) +14. [Production Checklist](#production-checklist) + +--- + +## Overview + +This guide provides production-ready steps for deploying the AMD Network Operator on OpenShift clusters using OLM (Operator Lifecycle Manager). This operator manages AMD network drivers (ionic, ionic_rdma, pds_core, tawk_ipc) using Kernel Module Management (KMM). + +**What this operator does:** +- Automatically loads AMD network drivers on OpenShift CoreOS nodes +- Manages kernel module lifecycle through KMM +- Deploys device plugins for GPU-NIC integration +- Provides metrics and monitoring capabilities +- Supports RDMA and high-performance networking + +### Quick Start Summary + +**Time Required**: 30-45 minutes (excluding build time) + +**High-Level Steps**: +1. Install NFD and KMM operators from OperatorHub *(5 min)* +2. Configure insecure registry (if needed) *(2 min)* +3. **Install AMD Network Operator**: + - **Production**: Install from OperatorHub *(5 min)* + - **Development**: Build and deploy from source *(15 min)* +4. Create NetworkConfig CR *(2 min)* +5. 
Verify drivers loaded on nodes *(5 min)* + +> 💡 **Quick Start**: For connected environments, you can skip directly to step 4 after installing the operator. KMM will automatically build driver images in-cluster using the OpenShift internal registry. + +**Key Requirements**: +- ✅ OpenShift 4.16+ with CoreOS +- ✅ NFD and KMM operators installed +- ✅ Container registry (insecure registry configured if internal) +- ✅ AMD Pensando NICs installed on nodes + +--- + +## Important Notes + +> 💡 **TIP**: This guide uses production-style versioning (`v1.0.0-netop-beta`). Replace with your actual version tags. + +> ⚠️ **WARNING**: Only install KMM operator **ONCE** in `openshift-kmm` namespace. Multiple instances cause conflicts. + +> 🌐 **REGISTRY**: Configure insecure registries at cluster level before starting. Images won't pull otherwise. + +--- + +## Prerequisites + +### Infrastructure Requirements +- OpenShift 4.16+ cluster with CoreOS nodes +- AMD Pensando network hardware +- Container registry accessible from the cluster +- Administrative access to OpenShift cluster + +### Required Operators Installation + +**These operators must be installed BEFORE deploying the AMD Network Operator:** + +#### 1. Install Node Feature Discovery (NFD) + +NFD detects hardware features on nodes and labels them accordingly. + +**Installation via OpenShift Web Console:** +1. Log in to OpenShift Web Console +2. Navigate to **Operators** → **OperatorHub** +3. Search for **"Node Feature Discovery"** +4. Click on the operator from Red Hat +5. Click **Install** +6. Keep default settings: + - **Update Channel**: Select the latest stable channel + - **Installation Mode**: All namespaces on the cluster + - **Installed Namespace**: `openshift-nfd` (auto-created) + - **Update Approval**: Automatic +7. 
Click **Install** and wait for the operator to become ready + +**Verification:** +```bash +kubectl get csv -n openshift-nfd | grep nfd +# Expected: nfd.x.x.x Node Feature Discovery x.x.x Succeeded +``` + +**Create a NodeFeatureDiscovery instance to activate NFD:** + +After installing the NFD operator, create a `NodeFeatureDiscovery` CR to start NFD workers on the cluster: + +1. Navigate to **Operators** → **Installed Operators** → **Node Feature Discovery** +2. Click the **NodeFeatureDiscovery** tab +3. Click **Create NodeFeatureDiscovery** +4. Accept defaults and click **Create** + +Or via CLI: + +```bash +cat < **Note**: Without a `NodeFeatureDiscovery` instance, the NFD operator is installed but idle — no node feature detection or labeling occurs. + +#### 2. Install Kernel Module Management (KMM) + +KMM manages out-of-tree kernel modules on OpenShift clusters. + +**Installation via OpenShift Web Console:** +1. Log in to OpenShift Web Console +2. Navigate to **Operators** → **OperatorHub** +3. Search for **"Kernel Module Management"** +4. Click on the operator from Red Hat +5. Click **Install** +6. Configure installation settings: + - **Update Channel**: Select `stable` or latest channel + - **Installation Mode**: All namespaces on the cluster + - **Installed Namespace**: `openshift-kmm` (auto-created) + - **Update Approval**: Automatic +7. Click **Install** and wait for the operator to become ready + +**Verification:** +```bash +kubectl get csv -n openshift-kmm | grep kernel-module-management +# Expected: kernel-module-management.v2.5.1 Kernel Module Management 2.5.1 Succeeded + +kubectl get deployment -n openshift-kmm +# Expected: +# NAME READY UP-TO-DATE AVAILABLE +# kmm-operator-controller 1/1 1 1 +# kmm-operator-webhook 1/1 1 1 +``` + +**⚠️ IMPORTANT**: Only install KMM **once** in the `openshift-kmm` namespace. Multiple KMM instances cause conflicts and module loading failures. 
+ +### Development Tools (for building) +- Docker or Podman +- Go 1.23+ +- make +- operator-sdk v1.32.0+ +- Git + +## Architecture + +``` +NetworkConfig CR → AMD Network Operator → KMM Module CR → KMM Operator → Driver Pods → Node (drivers loaded) +``` + +## Cluster Configuration + +### 1. Configure Insecure Registry (if using internal registry) + +OpenShift needs to trust your internal registry for pulling images without TLS: + +```bash +# Check current configuration +kubectl get image.config.openshift.io/cluster -o yaml + +# If your registry is not listed, add it: +kubectl patch image.config.openshift.io/cluster --type=merge \ + -p "{\"spec\":{\"registrySources\":{\"insecureRegistries\":[\"${REGISTRY_URL}\"]}}}" +``` + +**Note**: This configuration allows all nodes to pull from the specified registry without TLS verification. + +### 2. Verify KMM Installation + +```bash +# Verify KMM is running in openshift-kmm namespace +kubectl get csv -n openshift-kmm | grep kernel-module-management + +# Check KMM deployments +kubectl get deployment -n openshift-kmm +# Expected output: +# NAME READY UP-TO-DATE AVAILABLE +# kmm-operator-controller 1/1 1 1 +# kmm-operator-webhook 1/1 1 1 +``` + +**Troubleshooting**: If KMM exists in multiple namespaces, keep only the one in `openshift-kmm` to avoid conflicts. + +### 3. 
Set Environment Variables + +Set version variables that will be used throughout the deployment: + +```bash +# Driver and firmware versions +export DRIVERS_VERSION="1.117.5-a-56" +export KERNEL_VERSION="5.14.0-570.76.1.el9_6.x86_64" +export RHEL_VERSION="9.6" + +# Operator versions +export OPERATOR_VERSION="v1.0.0-netop-beta" + +# Registry configuration +export REGISTRY_URL="registry.test.pensando.io:5000" +export REPO_URL="https://repo.radeon.com" + +# DTK image (get from: kubectl get is -n openshift driver-toolkit -o yaml) +export DTK_IMAGE="quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:288b3574a5524121c139b846e98a223da793305560f8b42dcd8d2aa712912998" +``` + +**Finding Version Values:** + +```bash +# Get node kernel version +export KERNEL_VERSION=$(kubectl get nodes -o jsonpath='{.items[0].status.nodeInfo.kernelVersion}') + +# Find matching DTK image +export DTK_IMAGE=$(kubectl get is -n openshift driver-toolkit -o jsonpath="{.spec.tags[?(@.name=='${KERNEL_VERSION}')].from.name}") +``` + +--- + +## Installing the AMD Network Operator + +Choose the appropriate installation method based on your use case: +- **Official Installation**: Install published operator from OperatorHub (recommended for production) +- **Development Installation**: Build and install from source (for development and testing) + +--- + +### Official Installation (Production) + +This method installs the AMD Network Operator from Red Hat OperatorHub. Use this for production deployments when the operator is officially published. + +#### Step 1: Install from OperatorHub via Web Console + +1. Log in to **OpenShift Web Console** +2. Navigate to **Operators** → **OperatorHub** +3. Search for **"AMD Network Operator"** +4. Click on the **AMD Network Operator** tile +5. Click **Install** +6. 
Configure installation settings: + - **Update Channel**: Select `stable` or latest channel + - **Installation Mode**: Select namespace (e.g., `openshift-amd-network`) + - **Installed Namespace**: `openshift-amd-network` (create if doesn't exist) + - **Update Approval**: Automatic (recommended) or Manual +7. Click **Install** and wait for the operator to become ready + +#### Step 2: Verify Installation + +```bash +# Check operator is installed +kubectl get csv -n openshift-amd-network | grep amd-network-operator +# Expected: amd-network-operator.vX.Y.Z AMD Network Operator X.Y.Z Succeeded + +# Verify operator pod is running +kubectl get pods -n openshift-amd-network -l control-plane=controller-manager +# Expected: STATUS Running + +# Check operator logs +kubectl logs -f deployment/amd-network-operator-controller-manager -n openshift-amd-network +``` + +#### Step 3: Proceed to Deployment + +Once the operator is installed, proceed to [Deploying NetworkConfig CR](#deploying-networkconfig-cr) to start using the operator. + +> 💡 **Optional**: If you need to pre-build driver images (for air-gapped environments or external registries), see [Preparing Pre-Compiled Driver Images](#preparing-pre-compiled-driver-images-optional). + +--- + +### Development Installation (Build from Source) + +This method is for developers and testers who need to build and deploy the operator from source code. 
+ +#### Prerequisites + +Ensure you have the following tools installed: +- Docker or Podman +- Go 1.23+ +- make +- operator-sdk v1.32.0+ +- Git + +#### Step 1: Build Operator Image + +```bash +# Clone repository +git clone https://github.com/ROCm/network-operator.git +cd network-operator +git checkout <branch-or-tag> + +# Set image tags +export OPERATOR_IMG=${REGISTRY_URL}/amd-network-operator:${OPERATOR_VERSION} +export BUNDLE_IMG=${REGISTRY_URL}/amd-network-operator-bundle:${OPERATOR_VERSION} + +# Build operator image +make docker-build IMG=${OPERATOR_IMG} + +# Push to registry +docker push ${OPERATOR_IMG} +``` + +#### Step 2: Build OLM Bundle + +The OLM bundle packages the operator for deployment via Operator Lifecycle Manager: + +```bash +# Build bundle with your version +make bundle-build \ + IMG=${OPERATOR_IMG} \ + BUNDLE_IMG=${BUNDLE_IMG} \ + PROJECT_VERSION=${OPERATOR_VERSION} + +# Push bundle image +make bundle-push BUNDLE_IMG=${BUNDLE_IMG} +``` + +**What this does:** +- Generates CSV (ClusterServiceVersion) with operator metadata +- Creates RBAC manifests for all service accounts +- Packages CRDs and required resources +- Builds and pushes a container image with the bundle + +#### Step 3: Deploy via OLM + +Deploy the bundle image using operator-sdk: + +```bash +# Build the OLM bundle +make bundle-build + +# Push bundle image to registry +make bundle-push + +# Deploy using operator-sdk (automatically creates CatalogSource, Subscription, etc.) +./bin/operator-sdk run bundle ${BUNDLE_IMG} \ + --use-http \ + --skip-tls \ + -n openshift-amd-network +``` + +**Flags explained:** +- `--use-http`: Use HTTP instead of HTTPS for registry communication +- `--skip-tls`: Skip TLS verification (for insecure registries) +- `-n`: Target namespace for operator deployment + +> 💡 **What `operator-sdk run bundle` does**: This command automatically creates the CatalogSource, OperatorGroup, and Subscription resources needed by OLM. You don't need to create them manually!
+ +#### Step 4: Verify Installation + +```bash +# Verify CSV is in Succeeded phase +kubectl get csv -n openshift-amd-network + +# Check operator pod +kubectl get pods -n openshift-amd-network -l control-plane=controller-manager + +# View operator logs +kubectl logs -f deployment/amd-network-operator-controller-manager \ + -n openshift-amd-network +``` + +--- + +## Post-Installation Verification + +The operator creates multiple service accounts for different components: + +```bash +kubectl get sa -n openshift-amd-network + +# Expected service accounts: +# - amd-network-operator-controller-manager +# - amd-network-operator-device-plugin +# - amd-network-operator-kmm-module-loader +# - amd-network-operator-node-labeller +# - amd-network-operator-metrics-exporter +# - amd-network-operator-config-manager +# - amd-network-operator-utils-container +``` + +--- + +## Preparing Pre-Compiled Driver Images (Optional) + +> ⚠️ **THIS SECTION IS OPTIONAL**: For most users with connected clusters, you can **skip this entire section** and proceed directly to [Deploying NetworkConfig CR](#deploying-networkconfig-cr). When you create a NetworkConfig CR, KMM will automatically build driver images in-cluster using the OpenShift internal registry. + +**When to use this section:** +- **Air-gapped/disconnected environments**: No internet access during runtime +- **Pre-staging images**: Want driver images ready before deployment +- **External registry requirements**: Need images in a specific external registry +- **Custom build pipelines**: Integrating with CI/CD systems + +**When to skip this section:** +- **Connected clusters**: Have internet access to `repo.radeon.com` +- **Quick start/trial**: Want the fastest path to running drivers +- **Using internal registry**: OpenShift's built-in registry is sufficient + +> 💡 **WORKFLOW TIP**: If you do choose to pre-build images, you can do this in parallel while the operator deploys. 
The operator will wait idle until you create a NetworkConfig CR. + +--- + +### Two Driver Image Build Methods + +The operator supports two methods for building driver images, controlled by the `useSourceImage` field in NetworkConfig CR: + +**Method 1: RPM-based Build** (`useSourceImage: false`) - **Recommended** +- Downloads pre-compiled RPM packages from repo.radeon.com +- Installs drivers directly from RPMs +- Faster build process +- Uses: `DockerfileTemplate.rpm.ionic.coreos` + +**Method 2: Source Image Build** (`useSourceImage: true`) - **Advanced** +- Requires building a source image first containing driver source code +- KMM compiles modules from source against specific kernel +- More flexible for custom builds +- Uses: `DockerfileTemplate.srcimg.ionic.coreos` + source image from `internal-example/driverSrcImage/Dockerfile.ionic.coreos` + +--- + +### Method 1: RPM-based Build (Recommended) + +#### Step 1: Build Driver Image Using BuildConfig + +**Prerequisites**: The BuildConfig requires a pull secret for accessing the Driver Toolkit (DTK) image from `quay.io`: + +```bash +# The global-pull-secret typically contains credentials for: +# - quay.io (for DTK images) +# - registry.redhat.io (for UBI base images) +# - cloud.openshift.com (for OpenShift pull-through cache) + +# Verify pull secret exists +kubectl get secret global-pull-secret -n openshift-amd-network + +# If missing, copy from openshift-config or create new one +kubectl get secret pull-secret -n openshift-config -o yaml | \ + sed 's/namespace: openshift-config/namespace: openshift-amd-network/' | \ + sed 's/name: pull-secret/name: global-pull-secret/' | \ + kubectl apply -f - +``` + +This method downloads RPM packages and installs pre-compiled drivers: + +```bash +cat > /tmp/driver-buildconfig.yaml << EOF +apiVersion: image.openshift.io/v1 +kind: ImageStream +metadata: + name: amdnetwork_kmod + namespace: openshift-amd-network +spec: + lookupPolicy: + local: true +--- +apiVersion: 
build.openshift.io/v1 +kind: BuildConfig +metadata: + name: amd-driver-build + namespace: openshift-amd-network +spec: + output: + to: + kind: ImageStreamTag + name: amdnetwork_kmod:coreos-${RHEL_VERSION}-${KERNEL_VERSION}-${DRIVERS_VERSION} + source: + type: Git + git: + uri: "https://github.com/ROCm/network-operator.git" + ref: "<branch-or-tag>" + contextDir: "internal/kmmmodule/dockerfiles" + strategy: + type: Docker + dockerStrategy: + dockerfilePath: "DockerfileTemplate.rpm.ionic.coreos" + pullSecret: + name: global-pull-secret + buildArgs: + - name: DTK_AUTO + value: "${DTK_IMAGE}" + - name: KERNEL_VERSION + value: "${KERNEL_VERSION}" + - name: DRIVERS_VERSION + value: "${DRIVERS_VERSION}" + - name: REPO_URL + value: "${REPO_URL}" + forcePull: true + triggers: [] +EOF + +# Apply BuildConfig +kubectl apply -f /tmp/driver-buildconfig.yaml + +# Start the build (start-build is an OpenShift CLI command) +oc start-build amd-driver-build -n openshift-amd-network + +# Follow the build logs (this takes 10-15 minutes) +oc logs -f build/amd-driver-build-1 -n openshift-amd-network +``` + +**Build Arguments Explained:** +- `DTK_AUTO`: Driver Toolkit image matching your OpenShift version and kernel +- `KERNEL_VERSION`: Target kernel version from node +- `DRIVERS_VERSION`: AMD driver package version from repo.radeon.com +- `REPO_URL`: AMD repository URL + +**Finding the Correct DTK Image:** + +```bash +# Get node kernel version (already set in env vars) +echo $KERNEL_VERSION + +# Find matching DTK image +kubectl get is -n openshift driver-toolkit -o yaml | grep "${KERNEL_VERSION}" +``` + +#### Step 2: Push Driver Image to External Registry + +The BuildConfig creates an ImageStream in OpenShift's internal registry.
Push it to an external registry for KMM to access: + +```bash +# Get the node IP +NODE_IP=<node-ip> + +# SSH to the node +ssh core@${NODE_IP} + +# Find the built image +sudo podman images | grep amdnetwork_kmod + +# Tag with external registry +sudo podman tag <image-id> \ + ${REGISTRY_URL}/amdnetwork_kmod:coreos-${RHEL_VERSION}-${KERNEL_VERSION}-${DRIVERS_VERSION} + +# Push to external registry +sudo podman push --tls-verify=false \ + ${REGISTRY_URL}/amdnetwork_kmod:coreos-${RHEL_VERSION}-${KERNEL_VERSION}-${DRIVERS_VERSION} +``` + +**Why Push to External Registry?** +- OpenShift's internal registry may not be accessible during module loading +- External registry provides consistent access across cluster operations +- Simplifies image management and versioning + +--- + +### Method 2: Source Image Build (Advanced) + +This method first builds a source container image, then KMM compiles modules from that source against the specific kernel. + +> **AIR-GAPPED ENVIRONMENTS**: This approach is designed for air-gapped or disconnected environments where direct access to external repositories (like `repo.radeon.com`) is restricted. By building a source image first, all required driver sources are packaged into a container that can be transferred and used in isolated environments without internet access during module compilation. + +#### Step 5a: Use Pre-Built Source Images (Recommended) + +Pre-built source images are available on Docker Hub for all published driver versions: + +```bash +# Available at: +docker.io/amdpsdo/amdnic-drivers:<version> + +# Example versions: +# docker.io/amdpsdo/amdnic-drivers:1.117.5-a-56 +# docker.io/amdpsdo/amdnic-drivers:1.117.5 +# docker.io/amdpsdo/amdnic-drivers:1.117.1 +``` + +These images are automatically built and published by the GitHub Actions workflow. You can skip to [Step 5c](#step-5c-configure-networkconfig-to-use-source-image) and use these images directly.
+ +#### Step 5a (Alternative): Build Source Image Manually + +If you need to build source images yourself (e.g., for a custom driver version or internal registry): + +**Option 1: Using the automated builder script** + +```bash +cd internal-example/driverSrcImage +./build-all-source-images.sh --version ${DRIVERS_VERSION} --registry your-registry.com +``` + +**Option 2: Using OpenShift BuildConfig** + +```bash +cat > /tmp/source-image-build.yaml << EOF +apiVersion: image.openshift.io/v1 +kind: ImageStream +metadata: + name: amdainic-driver-source + namespace: openshift-amd-network +spec: + lookupPolicy: + local: true +--- +apiVersion: build.openshift.io/v1 +kind: BuildConfig +metadata: + name: amd-source-image-build + namespace: openshift-amd-network +spec: + output: + to: + kind: ImageStreamTag + name: amdainic-driver-source:latest + source: + type: Git + git: + uri: "https://github.com/ROCm/network-operator.git" + ref: "main" + contextDir: "internal-example/driverSrcImage" + strategy: + type: Docker + dockerStrategy: + dockerfilePath: "Dockerfile.ionic.coreos" + buildArgs: + - name: REPO_URL + value: "${REPO_URL}" + - name: MAJOR_VERSION + value: "9" + - name: DRIVERS_VERSION + value: "${DRIVERS_VERSION}" + forcePull: true + triggers: [] +EOF + +kubectl apply -f /tmp/source-image-build.yaml +oc start-build amd-source-image-build -n openshift-amd-network +oc logs -f build/amd-source-image-build-1 -n openshift-amd-network +``` + +**What source images contain:** +- `/ionic_src/driver/` - Source code for ionic, pds, tawk-ipc modules +- `/ionic_src/firmware/` - Firmware files + +#### Step 5b: Push Source Image to External Registry (If Built Manually) + +Skip this step if using pre-built images from `docker.io/amdpsdo/amdnic-drivers`.
+ +```bash +# SSH to a node +NODE_IP=<node-ip> +ssh core@${NODE_IP} + +# Find and push the source image +sudo podman images | grep amdainic-driver-source + +sudo podman tag <image-id> \ + ${REGISTRY_URL}/amdainic-driver-source:${DRIVERS_VERSION} + +sudo podman push --tls-verify=false \ + ${REGISTRY_URL}/amdainic-driver-source:${DRIVERS_VERSION} +``` + +#### Step 5c: Configure NetworkConfig to Use Source Image + +When creating your NetworkConfig CR, set `useSourceImage: true` and provide the source image repository: + +```yaml +spec: + driver: + enable: true + useSourceImage: true # Enable source image build + version: "${DRIVERS_VERSION}" + image: ${REGISTRY_URL}/amdnetwork_kmod # Final driver image (compiled .ko files) + imageBuild: + sourceImageRepo: "docker.io/amdpsdo/amdnic-drivers" # Pre-built source images +``` + +> 💡 **Note**: If using a custom/internal source image registry, replace `docker.io/amdpsdo/amdnic-drivers` with your registry path. + +**How it works:** +1. KMM uses `DockerfileTemplate.srcimg.ionic.coreos` +2. Copies source code from your source image (`sourceImageRepo`) +3. Compiles modules against the Driver Toolkit (DTK) for the specific kernel version +4. Creates final driver image with compiled `.ko` files + +--- + +## Post-Installation Verification + +The operator creates multiple service accounts for different components: + +```bash +kubectl get sa -n openshift-amd-network + +# Expected service accounts: +# - amd-network-operator-controller-manager +# - amd-network-operator-device-plugin +# - amd-network-operator-kmm-module-loader +# - amd-network-operator-node-labeller +# - amd-network-operator-metrics-exporter +# - amd-network-operator-config-manager +# - amd-network-operator-utils-container +``` + +## Deploying NetworkConfig CR + +### 1.
Create NFD Rule for NIC Detection + +Create a `NodeFeatureRule` to instruct NFD to automatically label nodes that have AMD Pensando NICs: + +```bash +cat < **Note**: This requires the NFD operator to be installed and a `NodeFeatureDiscovery` CR to be created (see [Prerequisites](#required-operators-installation)). NFD will automatically apply the `feature.node.kubernetes.io/amd-nic: "true"` label to any node with AMD Pensando PCI devices (vendor `1dd8`). No node reboot is required. + +### 2. Create NetworkConfig + +Create the NetworkConfig CR to deploy drivers on your nodes. Choose the configuration based on which build method you used: + +**Option 1: RPM-based Build** (if you used Method 1): + +```bash +cat < -- chroot /host lsmod | grep -E '^(ionic|pds_core|tawk_ipc)' +# Expected: ionic, ionic_rdma, pds_core, tawk_ipc modules loaded + +# 7. RDMA Devices Available +kubectl debug node/ -- chroot /host ls /sys/class/infiniband/ +# Expected: ionic_0, ionic_1, ... (one per NIC) +``` + +## Updating the Operator + +### Update to New Version + +```bash +# Build new operator image +export NEW_VERSION=v1.0.1-netop-beta +export OPERATOR_IMG=${REGISTRY_URL}/amd-network-operator:${NEW_VERSION} +export BUNDLE_IMG=${REGISTRY_URL}/amd-network-operator-bundle:${NEW_VERSION} + +make docker-build IMG=${OPERATOR_IMG} +docker push ${OPERATOR_IMG} + +# Build new bundle +make bundle-build \ + IMG=${OPERATOR_IMG} \ + BUNDLE_IMG=${BUNDLE_IMG} \ + PROJECT_VERSION=${NEW_VERSION} + +make bundle-push BUNDLE_IMG=${BUNDLE_IMG} + +# Update via operator-sdk +./bin/operator-sdk run bundle-upgrade ${BUNDLE_IMG} \ + --use-http \ + --skip-tls \ + -n openshift-amd-network +``` + +## Cleanup + +### Remove Operator Completely + +```bash +# Delete all NetworkConfig CRs first +kubectl delete networkconfigs.amd.com -n openshift-amd-network --all + +# Clean up using operator-sdk +./bin/operator-sdk cleanup amd-network-operator -n openshift-amd-network + +# Or manually delete subscription and CSV +kubectl 
delete subscription amd-network-operator -n openshift-amd-network +kubectl delete csv amd-network-operator.<version> -n openshift-amd-network + +# Delete CatalogSource (if using catalog method) +kubectl delete catalogsource amd-network-operator-catalog -n openshift-marketplace + +# Delete namespace (optional) +kubectl delete namespace openshift-amd-network +``` + +## Troubleshooting + +### Common Issues and Solutions + +#### 1. operator-sdk: TLS Error with Insecure Registry + +**Problem**: +``` +http: server gave HTTP response to HTTPS client +``` + +**Root Cause**: operator-sdk running locally doesn't know about the cluster's insecure registry configuration. + +**Solution**: Use `--use-http` and `--skip-tls` flags: +```bash +./bin/operator-sdk run bundle ${BUNDLE_IMG} --use-http --skip-tls -n <namespace> +``` + +--- + +#### 2. OperatorGroup Conflict + +**Problem**: +``` +csv failed: reason: "InterOperatorGroupOwnerConflict" +intersecting operatorgroups provide the same apis +``` + +**Root Cause**: Another operator instance already exists providing the same CRDs. + +**Solution**: Only one operator instance is allowed per cluster: +1. Check existing operators: `kubectl get csv -A | grep amd-network` +2. Either use the existing namespace or clean up the old deployment first +3. Delete test namespaces if created during troubleshooting + +--- + +#### 3. Image Pull Failures + +**Problem**: Pods stuck in `ImagePullBackOff` or `ErrImagePull` + +**Root Cause**: Registry not accessible or missing credentials. + +**Solution**: +1. **For insecure registries**: Verify configuration + ```bash + kubectl get image.config.openshift.io/cluster -o yaml | grep insecureRegistries + ``` +2. **For authenticated registries**: Check pull secrets + ```bash + kubectl get secret -n openshift-amd-network | grep pull + ``` +3. **Test registry access** from node: + ```bash + kubectl debug node/<node-name> -- chroot /host podman pull --tls-verify=false <image> + ``` + +--- + +#### 4.
Driver Modules Not Loading + +**Problem**: `lsmod` shows no ionic modules on node + +**Diagnostic Steps**: +```bash +# 1. Check KMM Module status +kubectl get module -n openshift-amd-network -o yaml + +# 2. Check if worker pods ran +kubectl get pods -n openshift-amd-network | grep worker + +# 3. Check worker pod logs +kubectl logs <worker-pod-name> -n openshift-amd-network + +# 4. Verify node selector +kubectl get module -n openshift-amd-network -o jsonpath='{.items[*].spec.selector}' +kubectl get nodes --show-labels | grep