From bc94810d71135e2ff7a0486953809ad71ad0d25d Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Wed, 31 Dec 2025 07:27:08 +0000 Subject: [PATCH 01/47] modify templates for doca ofed Signed-off-by: Vrinda_Marwah --- ...-group-login_compiler_node_aarch64.yaml.j2 | 7 ++ ...i-group-login_compiler_node_x86_64.yaml.j2 | 7 ++ .../ci-group-login_node_aarch64.yaml.j2 | 7 ++ .../ci-group-login_node_x86_64.yaml.j2 | 7 ++ ...ce_kube_control_plane_first_x86_64.yaml.j2 | 7 ++ ...-service_kube_control_plane_x86_64.yaml.j2 | 7 ++ .../ci-group-service_kube_node_x86_64.yaml.j2 | 7 ++ ...ci-group-slurm_control_node_x86_64.yaml.j2 | 15 +++- .../ci-group-slurm_node_aarch64.yaml.j2 | 14 +++- .../ci-group-slurm_node_x86_64.yaml.j2 | 15 +++- .../templates/doca-ofed/doca-install.sh.j2 | 77 +++++++++++++++++++ .../roles/configure_ochami/vars/main.yml | 2 +- .../slurm_config/tasks/create_slurm_dir.yml | 60 ++++++++++++++- discovery/roles/slurm_config/vars/main.yml | 19 +++++ .../aarch64/rhel/10.0/slurm_custom.json | 6 +- .../config/x86_64/rhel/10.0/slurm_custom.json | 6 +- 16 files changed, 250 insertions(+), 13 deletions(-) create mode 100644 discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 9d182a23db..50fd55f498 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -18,6 +18,12 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -169,6 +175,7 @@ 
{{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} runcmd: + - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - groupadd -r {{ slurm_group_name }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 5b9ad2dcd1..ae84d3b32a 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -18,6 +18,12 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -171,6 +177,7 @@ {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} runcmd: + - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - groupadd -r {{ slurm_group_name }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index cf46e66b95..abc1103242 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -20,6 +20,12 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ 
file_mode_755 }}' content: | @@ -92,6 +98,7 @@ {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} runcmd: + - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - groupadd -r {{ slurm_group_name }} - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index a1dfc3708a..c921d3fe86 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -20,6 +20,12 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -91,6 +97,7 @@ {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} runcmd: + - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - groupadd -r {{ slurm_group_name }} - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 39068b4f40..95ef161a26 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -20,6 +20,12 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + 
owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -311,6 +317,7 @@ {% endif %} runcmd: + - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - "systemctl enable chronyd" - "systemctl restart chronyd" diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index 7d04398cca..b6d9c99bd2 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -20,6 +20,12 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -218,6 +224,7 @@ chmod 0755 /etc/bash_completion.d/helm.sh runcmd: + - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - "systemctl enable chronyd" - "systemctl restart chronyd" diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index 6b52f12c55..8643e6c539 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -20,6 +20,12 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ 
lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -128,6 +134,7 @@ location = "{{ pulp_mirror }}" runcmd: + - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - "systemctl enable chronyd" - "systemctl restart chronyd" diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 8e9d66b214..ff64b8175f 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -20,6 +20,12 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -243,7 +249,7 @@ - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} # Create directories for nfs and mount all - - mkdir -p /var/log/slurm /etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track + - mkdir -p /var/log/slurm /etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm /etc/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/my.cnf.d /etc/my.cnf.d nfs defaults,_netdev 0 0" >> /etc/fstab @@ -253,8 +259,12 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ 
cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh - chown -R {{ user }}:{{ slurm_group_name }} {{ home_dir }} - chmod {{ file_mode_755 }} {{ home_dir }} @@ -277,9 +287,6 @@ - systemctl enable sshd - systemctl start sshd - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - {% if hostvars['localhost']['openldap_support'] %} - /usr/local/bin/update_ldap_conf.sh - mkdir /ldapcerts diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 45e5f11386..7b36e7fc87 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -20,6 +20,12 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -220,7 +226,7 @@ echo "[INFO] ===== Starting directory creation and NFS mounts for Slurm and Munge (aarch64) =====" echo "[INFO] Creating base directories for Slurm and Munge" - mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track + mkdir -pv 
/var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages echo "[INFO] Updating /etc/fstab with NFS entries for Slurm and Munge paths" echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab @@ -229,6 +235,9 @@ echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab + chmod {{ file_mode }} /etc/fstab echo "[INFO] Mounting all NFS entries from /etc/fstab" @@ -376,6 +385,9 @@ - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - /usr/local/bin/configure_dirs_and_mounts.sh + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 94c12fd6d2..61424158d3 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -20,6 +20,12 @@ disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - 
path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -229,7 +235,7 @@ echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge =====" echo "[INFO] Creating base directories for Pulp cert, Slurm and Munge" - mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track + mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages echo "[INFO] Updating /etc/fstab with NFS entries for Pulp cert, Slurm and Munge paths" echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab @@ -238,6 +244,7 @@ echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab chmod {{ file_mode }} /etc/fstab echo "[INFO] Mounting all NFS entries from /etc/fstab" @@ -379,11 +386,11 @@ - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - /usr/local/bin/configure_dirs_and_mounts.sh - - /usr/local/bin/configure_slurmd_setup.sh - - /usr/local/bin/configure_munge_and_pam.sh - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh + - /usr/local/bin/configure_slurmd_setup.sh + - /usr/local/bin/configure_munge_and_pam.sh - setenforce 0 - /usr/local/bin/configure_firewall_and_services.sh diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 new file mode 100644 index 
0000000000..72ea31ac5a --- /dev/null +++ b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 @@ -0,0 +1,77 @@ +#!/bin/bash +# Optimize firewall ports declaration later +DOCA_FIREWALL_PORTS=( + "18515-18520/tcp" + "18515-18520/udp" + "18515/tcp" + "18515/udp" +) + +echo "Checking for Mellanox / ConnectX / InfiniBand card..." + +if ! lspci | grep -i 'mellanox'; then + echo "No Mellanox RDMA hardware detected. Skipping DOCA-OFED installation." + exit 0 +fi + +echo "Mellanox RDMA hardware detected. Proceeding with DOCA-OFED installation." + +sys_arch="$(uname -m)" +case "${sys_arch}" in + x86_64|amd64) arch="x86_64" ;; + aarch64|arm64) arch="aarch64" ;; + *) + echo "Unsupported architecture: ${sys_arch}" + exit 1 + ;; +esac + +echo "Check if kernel-devel package is present" +if rpm -q kernel-devel-$(uname -r) >/dev/null 2>&1; then + echo "kernel-devel package is already installed." +else + echo "kernel-devel package is not installed. Installing..." + dnf install -y kernel-devel-$(uname -r) +fi + +echo "Check if kernel-headers package is present" +if rpm -q kernel-headers-$(uname -r) >/dev/null 2>&1; then + echo "kernel-headers package is already installed." +else + echo "kernel-headers package is not installed. Installing..." + dnf install -y kernel-headers-$(uname -r) +fi + +echo "Bootstrap doca-ofed package..." +rpm -i "/var/lib/packages/${arch}/doca-ofed/doca-host-3.2.1-044000_25.10_rhel10.${arch}.rpm" + +echo "Installing doca-ofed..." +dnf install -y doca-ofed + +echo "Loading RDMA kernel modules..." +modprobe mlx5_core || true +modprobe mlx5_ib || true +modprobe ib_core || true +modprobe ib_uverbs || true +modprobe ib_umad || true +modprobe ib_cm || true +modprobe rdma_cm || true +modprobe rdma_ucm || true +modprobe xpmem || true +modprobe knem || true +modprobe ib_ipoib || true + +if command -v firewall-cmd &>/dev/null; then + echo "Adding firewall ports..." 
+ + for port in "${DOCA_FIREWALL_PORTS[@]}"; do + firewall-cmd --zone=public --add-port="$port" --permanent || true + done + + firewall-cmd --reload || true +else + echo "firewalld not running. Skipping firewall configuration." +fi + +echo "DOCA-OFED installation completed successfully." + \ No newline at end of file diff --git a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml index 3f82454590..e705580e11 100644 --- a/discovery/roles/configure_ochami/vars/main.yml +++ b/discovery/roles/configure_ochami/vars/main.yml @@ -38,7 +38,7 @@ common_cloud_init_groups: bss_template: bss/bss.yaml.j2 bss_dir: "{{ openchami_work_dir }}/boot" bss_params_cloud_init: 'ds=nocloud;s=http://{{ cluster_boot_ip }}:8081/cloud-init/' -bss_params_opts: 'ip=dhcp rd.live.image rd.live.ram rd.neednet=1 rd.driver.blacklist=ccp,edac_core,power_meter,ahci,megaraid_sas modprobe.blacklist=ccp,edac_core,power_meter,ahci,megaraid_sas libata.force=1:disable,2:disable,3:disable,4:disable rd.luks=0 rd.md=0 rd.dm=0 console=tty0 console=ttyS0,115200 selinux=0 apparmor=0 ip6=off cloud-init=enabled' # noqa: yaml[line-length] +bss_params_opts: 'ip=dhcp rd.live.image rd.live.ram rd.neednet=1 rd.driver.blacklist=ccp,edac_core,power_meter,ahci,megaraid_sas,ib_core,ib_uverbs,ib_umad,ib_cm,ib_ipoib,ib_iser,rdma_cm,mlx5_ib,mlx4_ib,bnxt_re,mlx5_core,ib_isert,ib_srp,scsi_transport_srp,knem,xpmem modprobe.blacklist=ccp,edac_core,power_meter,ahci,megaraid_sas,ib_core,ib_uverbs,ib_umad,ib_cm,ib_ipoib,ib_iser,rdma_cm,mlx5_ib,mlx4_ib,bnxt_re,mlx5_core,ib_isert,ib_srp,scsi_transport_srp,knem,xpmem libata.force=1:disable,2:disable,3:disable,4:disable rd.luks=0 rd.md=0 rd.dm=0 console=tty0 console=ttyS0,115200 selinux=0 apparmor=0 ip6=off cloud-init=enabled' # noqa: yaml[line-length] image_missing_fail_msg: "Failed to set kernel or initrd. Create the image using build_image.yml and try again." 
# Usage: configure_cloud_init_group.yml, configure_bss_cloud_init.yml diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index f3547fee71..e4755604b6 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -103,6 +103,7 @@ mode: "{{ file_mode }}" become: true +# Move to packages directory moving forward - name: Create the cuda directory on share ansible.builtin.file: path: "{{ slurm_config_path }}/cuda" @@ -111,6 +112,64 @@ group: root mode: "{{ common_mode }}" +- name: Create x86_64 package base directory + ansible.builtin.file: + path: "{{ packages_base_dir_x86_64 }}" + state: directory + mode: '{{ common_mode }}' + +- name: Create aarch64 package base directory + ansible.builtin.file: + path: "{{ packages_base_dir_aarch64 }}" + state: directory + mode: '{{ common_mode }}' + +- name: Create x86_64 package layout directories + ansible.builtin.file: + path: "{{ packages_base_dir_x86_64 }}/{{ item }}" + state: directory + mode: '{{ common_mode }}' + loop: "{{ packages_layout_x86_64 }}" + +- name: Create aarch64 package layout directories + ansible.builtin.file: + path: "{{ packages_base_dir_aarch64 }}/{{ item }}" + state: directory + mode: '{{ common_mode }}' + loop: "{{ packages_layout_aarch64 }}" + +- name: Print copy paths for x86_64 + ansible.builtin.debug: + msg: "{{ print_copy_msg }}" + loop: "{{ offline_path_x86_64 }}" + +- name: Print copy paths for aarch64 + ansible.builtin.debug: + msg: "{{ print_copy_msg }}" + loop: "{{ offline_path_aarch64 }}" + +- name: Copy x86_64 offline packages + ansible.builtin.copy: + src: "{{ item.source_path }}/" + dest: "{{ item.dest_path }}/" + remote_src: true + mode: preserve + loop: "{{ offline_path_x86_64 }}" + when: + - item.source_path | length > 0 + - item.dest_path | length > 0 + +- name: Copy aarch64 offline packages + ansible.builtin.copy: + src: "{{ item.source_path }}/" + dest: "{{ 
item.dest_path }}/" + remote_src: true + mode: preserve + loop: "{{ offline_path_aarch64 }}" + when: + - item.source_path | length > 0 + - item.dest_path | length > 0 + - name: Create the runfile directory on share ansible.builtin.file: path: "{{ slurm_config_path }}/runfile" @@ -128,7 +187,6 @@ path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso/cuda-run/" register: src_dir_check_x86_64 - - name: Check if source directory exists ansible.builtin.stat: path: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso/cuda-run/" diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 2c93ad32cb..1ca1ed0805 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -103,3 +103,22 @@ auth_tls_certs_path: "/opt/omnia/auth/tls_certs/ldapserver.crt" slurm_installation_type: configless pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" controller_empty_msg: "Slurm controller functional group is missing from PXE mapping file. Please update the file and rerun discovery.yml." 
+packages_base_dir_x86_64: "{{ slurm_config_path }}/packages/x86_64" +packages_base_dir_aarch64: "{{ slurm_config_path }}/packages/aarch64" +offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" +offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" +packages_layout_x86_64: + - doca-ofed + - cuda +packages_layout_aarch64: + - doca-ofed + - cuda +print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" +offline_path_x86_64: + - name: doca-ofed + source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" + dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" +offline_path_aarch64: + - name: doca-ofed + source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" + dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index 1571a82198..77e4ab3eb4 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -6,7 +6,11 @@ {"package": "firewalld", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "python3-firewall", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "aarch64_appstream"}, - {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"} + {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, + {"package": "doca-ofed", + "type": "iso", + "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.aarch64.rpm" + } ] }, "slurm_control_node": { diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index b52ca5540b..90cb8ce541 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -4,7 +4,11 @@ {"package": 
"munge", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "firewalld", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "python3-firewall", "type": "rpm", "repo_name": "x86_64_baseos"}, - {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"} + {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"}, + {"package": "doca-ofed", + "type": "iso", + "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm" + } ] }, "slurm_control_node": { From 2766c979703ac46ced0ad2c5c9ba2650d3abadbe Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Wed, 31 Dec 2025 14:03:43 +0000 Subject: [PATCH 02/47] doca ofed installation changes for k8s Signed-off-by: Vrinda_Marwah --- ...ce_kube_control_plane_first_x86_64.yaml.j2 | 11 +-- ...-service_kube_control_plane_x86_64.yaml.j2 | 11 +-- .../ci-group-service_kube_node_x86_64.yaml.j2 | 11 +-- .../tasks/create_k8s_config_nfs.yml | 73 +++++++++++++++++++ discovery/roles/k8s_config/vars/main.yml | 21 ++++++ .../slurm_config/tasks/create_slurm_dir.yml | 38 +++++++--- .../config/x86_64/rhel/10.0/service_k8s.json | 3 +- 7 files changed, 140 insertions(+), 28 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 95ef161a26..72e52e4812 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -96,6 +96,7 @@ {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/kubelet /var/lib/kubelet nfs noatime,nolock 0 0 {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 
}}{% endraw %}/kubernetes /etc/kubernetes nfs noatime,nolock 0 0 {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/pod-logs /var/log/pods nfs noatime,nolock 0 0 + {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/packages /var/lib/packages nfs noatime,nolock 0 0 tmpfs /tmp/crio-storage tmpfs size={{ k8s_crio_storage_size }},noatime,nodev,nosuid 0 0 permissions: '0644' @@ -317,7 +318,6 @@ {% endif %} runcmd: - - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - "systemctl enable chronyd" - "systemctl restart chronyd" @@ -376,7 +376,7 @@ - sudo modprobe nf_conntrack || true - sudo modprobe vxlan || true - sysctl --system - - mkdir -p /tmp/crio-storage {{ k8s_client_mount_path }} /var/lib/etcd /var/lib/kubelet /etc/kubernetes /var/log/pods + - mkdir -p /tmp/crio-storage {{ k8s_client_mount_path }} /var/lib/etcd /var/lib/kubelet /etc/kubernetes /var/log/pods /var/lib/packages - | tmpfile=$(mktemp) # Extract the first 'search' line only (ignore duplicates) @@ -398,14 +398,15 @@ chattr +i /etc/resolv.conf || true fi - mount -a + - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors + - update-ca-trust extract + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - mv /tmp/generate-control-plane-join.sh {{ k8s_client_mount_path }} - - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - - update-ca-trust extract - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - systemctl daemon-reload - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 
b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index b6d9c99bd2..38d9ca7741 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -83,6 +83,7 @@ {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/kubelet /var/lib/kubelet nfs noatime,nolock 0 0 {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/kubernetes /etc/kubernetes nfs noatime,nolock 0 0 {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/pod-logs /var/log/pods nfs noatime,nolock 0 0 + {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/packages /var/lib/packages nfs noatime,nolock 0 0 tmpfs /tmp/crio-storage tmpfs size={{ k8s_crio_storage_size }},noatime,nodev,nosuid 0 0 permissions: '0644' - path: /etc/containers/storage.conf @@ -224,7 +225,6 @@ chmod 0755 /etc/bash_completion.d/helm.sh runcmd: - - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - "systemctl enable chronyd" - "systemctl restart chronyd" @@ -283,7 +283,7 @@ - sudo modprobe nf_conntrack || true - sudo modprobe vxlan || true - sysctl --system - - mkdir -p /tmp/crio-storage {{ k8s_client_mount_path }} /var/lib/etcd /var/lib/kubelet /etc/kubernetes /var/log/pods + - mkdir -p /tmp/crio-storage {{ k8s_client_mount_path }} /var/lib/etcd /var/lib/kubelet /etc/kubernetes /var/log/pods /var/lib/packages - | tmpfile=$(mktemp) @@ -306,15 +306,16 @@ chattr +i /etc/resolv.conf || true fi - mount -a + - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors + - update-ca-trust extract + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh - 
systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - - update-ca-trust extract - systemctl daemon-reload - systemctl restart crio - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} - echo "Installing helm" - /usr/local/bin/install-helm.sh diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index 8643e6c539..752b408e63 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -82,6 +82,7 @@ {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/kubelet /var/lib/kubelet nfs noatime,nolock 0 0 {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/kubernetes /etc/kubernetes nfs noatime,nolock 0 0 {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}/pod-logs /var/log/pods nfs noatime,nolock 0 0 + {{ k8s_nfs_server_ip }}:{{ k8s_server_share_path }}/packages /var/lib/packages nfs noatime,nolock 0 0 tmpfs /tmp/crio-storage tmpfs size={{ k8s_crio_storage_size }},noatime,nodev,nosuid 0 0 permissions: '0644' - path: /etc/containers/storage.conf @@ -134,7 +135,6 @@ location = "{{ pulp_mirror }}" runcmd: - - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - "systemctl enable chronyd" - "systemctl restart chronyd" @@ -185,7 +185,7 @@ - sudo modprobe nf_conntrack || true - sudo modprobe vxlan || 
true - sysctl --system - - mkdir -p /tmp/crio-storage {{ k8s_client_mount_path }} /var/lib/kubelet /etc/kubernetes /var/log/pods + - mkdir -p /tmp/crio-storage {{ k8s_client_mount_path }} /var/lib/kubelet /etc/kubernetes /var/log/pods /var/lib/packages - | tmpfile=$(mktemp) @@ -209,14 +209,15 @@ fi - systemctl restart rpcbind - mount -a + - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors + - update-ca-trust extract + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet - mv /tmp/crio.conf /etc/containers/registries.conf.d/crio.conf - - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - - update-ca-trust extract - systemctl daemon-reload - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - systemctl restart crio - kubeadm config images pull --kubernetes-version={{ service_k8s_version }} - | diff --git a/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml index f83250f7e8..0560b86642 100644 --- a/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -104,6 +104,79 @@ - name: Creating the persist folders in nfs share ansible.builtin.include_tasks: create_node_dir.yml +# additional packages +- name: Create x86_64 package base directory + ansible.builtin.file: + path: "{{ packages_base_dir_x86_64 }}" + state: directory + mode: '{{ common_mode }}' + +- name: Create aarch64 package base directory + ansible.builtin.file: + path: "{{ packages_base_dir_aarch64 }}" + state: directory + mode: '{{ common_mode }}' + +- name: Create x86_64 package layout directories + ansible.builtin.file: + path: "{{ packages_base_dir_x86_64 }}/{{ item }}" + state: directory + mode: '{{ common_mode }}' + loop: "{{ packages_layout_x86_64 }}" + 
+- name: Create aarch64 package layout directories + ansible.builtin.file: + path: "{{ packages_base_dir_aarch64 }}/{{ item }}" + state: directory + mode: '{{ common_mode }}' + loop: "{{ packages_layout_aarch64 }}" + +- name: Print copy paths for x86_64 + ansible.builtin.debug: + msg: "{{ print_copy_msg }}" + loop: "{{ offline_path_x86_64 | default([]) }}" + +- name: Print copy paths for aarch64 + ansible.builtin.debug: + msg: "{{ print_copy_msg }}" + loop: "{{ offline_path_aarch64 | default([]) }}" + +- name: Check x86_64 offline package sources + ansible.builtin.stat: + path: "{{ item.source_path }}" + loop: "{{ offline_path_x86_64 | default([]) }}" + register: x86_64_offline_pkg_sources + +- name: Check aarch64 offline package sources + ansible.builtin.stat: + path: "{{ item.source_path }}" + loop: "{{ offline_path_aarch64 | default([]) }}" + register: aarch64_offline_pkg_sources + +- name: Copy x86_64 offline packages + copy: + src: "{{ item.item.source_path }}/" + dest: "{{ item.item.dest_path }}/" + remote_src: true + mode: preserve + loop: "{{ x86_64_offline_pkg_sources.results | default([]) }}" + when: + - item.stat.exists + - item.item.source_path | length > 0 + - item.item.dest_path | length > 0 + +- name: Copy aarch64 offline packages + copy: + src: "{{ item.item.source_path }}/" + dest: "{{ item.item.dest_path }}/" + remote_src: true + mode: preserve + loop: "{{ aarch64_offline_pkg_sources.results | default([]) }}" + when: + - item.stat.exists + - item.item.source_path | length > 0 + - item.item.dest_path | length > 0 + - name: Include local repo access variable file ansible.builtin.include_vars: "{{ local_repo_access_config_file }}" diff --git a/discovery/roles/k8s_config/vars/main.yml b/discovery/roles/k8s_config/vars/main.yml index 1e5400c270..ef843e2c28 100644 --- a/discovery/roles/k8s_config/vars/main.yml +++ b/discovery/roles/k8s_config/vars/main.yml @@ -71,3 +71,24 @@ nfs_export_help_msg: | 1) Run 'exportfs -ra' on the NFS server and verify 
permissions/mounts 2) Execute 'systemctl restart nfs-server' 3) Rerun the playbook. + +# Usage create_k8s_config_nfs.yml +packages_base_dir_x86_64: "{{ k8s_client_mount_path }}/packages/x86_64" +packages_base_dir_aarch64: "{{ k8s_client_mount_path }}/packages/aarch64" +offline_repo_basepath_x86_64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/x86_64/rhel/10.0/iso" +offline_repo_basepath_aarch64: "{{ oim_shared_path }}/omnia/offline_repo/cluster/aarch64/rhel/10.0/iso" +packages_layout_x86_64: + - doca-ofed + - cuda +packages_layout_aarch64: + - doca-ofed + - cuda +print_copy_msg: "Copying {{ item.name }} from {{ item.source_path }} to {{ item.dest_path }}" +offline_path_x86_64: + - name: doca-ofed + source_path: "{{ offline_repo_basepath_x86_64 }}/doca-ofed" + dest_path: "{{ packages_base_dir_x86_64 }}/doca-ofed" +offline_path_aarch64: + - name: doca-ofed + source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" + dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index e4755604b6..1afa04dbff 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -141,34 +141,48 @@ - name: Print copy paths for x86_64 ansible.builtin.debug: msg: "{{ print_copy_msg }}" - loop: "{{ offline_path_x86_64 }}" + loop: "{{ offline_path_x86_64 | default([]) }}" - name: Print copy paths for aarch64 ansible.builtin.debug: msg: "{{ print_copy_msg }}" - loop: "{{ offline_path_aarch64 }}" + loop: "{{ offline_path_aarch64 | default([]) }}" + +- name: Check x86_64 offline package sources + ansible.builtin.stat: + path: "{{ item.source_path }}" + loop: "{{ offline_path_x86_64 | default([]) }}" + register: x86_64_offline_pkg_sources + +- name: Check aarch64 offline package sources + ansible.builtin.stat: + path: "{{ item.source_path }}" + loop: "{{ offline_path_aarch64 | default([]) 
}}" + register: aarch64_offline_pkg_sources - name: Copy x86_64 offline packages copy: - src: "{{ item.source_path }}/" - dest: "{{ item.dest_path }}/" + src: "{{ item.item.source_path }}/" + dest: "{{ item.item.dest_path }}/" remote_src: true mode: preserve - loop: "{{ offline_path_x86_64 }}" + loop: "{{ x86_64_offline_pkg_sources.results | default([]) }}" when: - - item.source_path | length > 0 - - item.dest_path | length > 0 + - item.stat.exists + - item.item.source_path | length > 0 + - item.item.dest_path | length > 0 - name: Copy aarch64 offline packages copy: - src: "{{ item.source_path }}/" - dest: "{{ item.dest_path }}/" + src: "{{ item.item.source_path }}/" + dest: "{{ item.item.dest_path }}/" remote_src: true mode: preserve - loop: "{{ offline_path_aarch64 }}" + loop: "{{ aarch64_offline_pkg_sources.results | default([]) }}" when: - - item.source_path | length > 0 - - item.dest_path | length > 0 + - item.stat.exists + - item.item.source_path | length > 0 + - item.item.dest_path | length > 0 - name: Create the runfile directory on share ansible.builtin.file: diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index 62dc041faa..9512ae4d4d 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json @@ -31,7 +31,8 @@ { "package": "quay.io/strimzi/kafka", "tag": "0.48.0-kafka-4.1.0", "type": "image" }, { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, - { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" } + { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + {"package": "doca-ofed", "type": "iso", "url": 
"https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm"} ] }, "service_kube_control_plane": { From a71819d445822b44c884d03a709a28d0a5f29295 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Fri, 2 Jan 2026 09:43:58 +0000 Subject: [PATCH 03/47] add ansible builtin Signed-off-by: Vrinda_Marwah --- discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml | 4 ++-- discovery/roles/slurm_config/tasks/create_slurm_dir.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml index 0560b86642..0b5eee7e4a 100644 --- a/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -154,7 +154,7 @@ register: aarch64_offline_pkg_sources - name: Copy x86_64 offline packages - copy: + ansible.builtin.copy: src: "{{ item.item.source_path }}/" dest: "{{ item.item.dest_path }}/" remote_src: true @@ -166,7 +166,7 @@ - item.item.dest_path | length > 0 - name: Copy aarch64 offline packages - copy: + ansible.builtin.copy: src: "{{ item.item.source_path }}/" dest: "{{ item.item.dest_path }}/" remote_src: true diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 1afa04dbff..bbca6f9ee6 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -161,7 +161,7 @@ register: aarch64_offline_pkg_sources - name: Copy x86_64 offline packages - copy: + ansible.builtin.copy: src: "{{ item.item.source_path }}/" dest: "{{ item.item.dest_path }}/" remote_src: true @@ -173,7 +173,7 @@ - item.item.dest_path | length > 0 - name: Copy aarch64 offline packages - copy: + ansible.builtin.copy: src: "{{ item.item.source_path }}/" dest: "{{ item.item.dest_path }}/" remote_src: true From 
30d5135bcf4a6842e8f9bc5d398909f5bb2e7fce Mon Sep 17 00:00:00 2001 From: Vrinda Marwah Date: Mon, 5 Jan 2026 15:31:11 +0530 Subject: [PATCH 04/47] Update ansible-lint.yml Signed-off-by: Vrinda Marwah --- .github/workflows/ansible-lint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ansible-lint.yml b/.github/workflows/ansible-lint.yml index 2d851373d1..7597ea0905 100644 --- a/.github/workflows/ansible-lint.yml +++ b/.github/workflows/ansible-lint.yml @@ -11,6 +11,7 @@ on: - pub/ochami - pub/ochami_aarch64 - pub/k8s_telemetry + - pub/ib_support jobs: build: From b20eb562537f6f7e8cf9972c5a47d0ead6647fe8 Mon Sep 17 00:00:00 2001 From: Vrinda Marwah Date: Mon, 5 Jan 2026 15:31:27 +0530 Subject: [PATCH 05/47] Update pylint.yml Signed-off-by: Vrinda Marwah --- .github/workflows/pylint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index b7e5c822d0..41d43ab2c1 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -10,6 +10,7 @@ on: - pub/ochami - pub/ochami_aarch64 - pub/k8s_telemetry + - pub/ib_support jobs: build: From 8f7dec7a4f62f9b4cb586f9b0480c617a6589d00 Mon Sep 17 00:00:00 2001 From: balajikumaran-c-s Date: Tue, 6 Jan 2026 12:54:29 +0530 Subject: [PATCH 06/47] Update image-build to use docker.io/dellhpcomniaaisolution/image-build-el10:1.0 --- .../roles/image_creation/tasks/build_image_tag.yml | 12 ++---------- .../roles/image_creation/vars/main.yml | 8 ++------ 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml b/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml index c344f59d8f..4e0a4489ab 100644 --- a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml +++ b/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml @@ -13,9 +13,9 @@ # limitations under the License. 
--- -- name: Pull specific OpenCHAMI image by version tag +- name: Pull image-build image ansible.builtin.command: - cmd: "podman pull {{ openchami_image_sha }}" + cmd: "podman pull {{ image_build_el10 }}" register: pull_result changed_when: "'Image is up to date' not in pull_result.stdout" @@ -23,11 +23,3 @@ ansible.builtin.fail: msg: "{{ pull_result.stdout }}" when: pull_result.rc != 0 - -- name: Tagging OpenCHAMI image with stable name - ansible.builtin.command: - cmd: "{{ ochami_stable_image_tag }}" - args: - creates: "{{ ochami_stable_image_path }}" - register: tag_result - changed_when: "'Tagged' in tag_result.stdout" diff --git a/build_image_x86_64/roles/image_creation/vars/main.yml b/build_image_x86_64/roles/image_creation/vars/main.yml index 66b7b2538d..6ddf55bb38 100644 --- a/build_image_x86_64/roles/image_creation/vars/main.yml +++ b/build_image_x86_64/roles/image_creation/vars/main.yml @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
--- -openchami_image_sha: "ghcr.io/openchami/image-build@sha256:52dd9d546951ce4f2f6f9febd08a228cfcb5b9e8e204ca4f5ee232f6be65d3a4" +image_build_el10: "docker.io/dellhpcomniaaisolution/image-build-el10:1.0" input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" omnia_metadata_file: "/opt/omnia/.data/oim_metadata.yml" dir_permissions_644: "0644" @@ -33,7 +33,7 @@ ochami_compute_mounts: ochami_x86_64_image: - --entrypoint /bin/bash - - ghcr.io/openchami/image-build:stable + - docker.io/dellhpcomniaaisolution/image-build-el10:1.0 ochami_base_command: - -c 'update-ca-trust extract && image-build --config /home/builder/config.yaml --log-level DEBUG' @@ -52,7 +52,3 @@ compute_image_failure_msg: | # build_compute_image.yml openchami_compute_image_vars_template: "{{ role_path }}/templates/compute_images_templates.j2" openchami_compute_image_vars_path: "/opt/omnia/openchami/compute_images_template.yaml" - -# build_image_tag.yml -ochami_stable_image_tag: "podman tag {{ openchami_image_sha }} ghcr.io/openchami/image-build:stable" -ochami_stable_image_path: "/var/lib/containers/storage/overlay-images/{{ openchami_image_sha }}" From 94e7e5e585ce82172a4ede43955c4f81bd90acf8 Mon Sep 17 00:00:00 2001 From: balajikumaran-c-s Date: Tue, 6 Jan 2026 14:10:11 +0530 Subject: [PATCH 07/47] Remove rpmdb rebuild commands from base_image_commands --- common/vars/openchami_image_cmd.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/common/vars/openchami_image_cmd.yml b/common/vars/openchami_image_cmd.yml index 4746bb4037..96cd3abcb2 100644 --- a/common/vars/openchami_image_cmd.yml +++ b/common/vars/openchami_image_cmd.yml @@ -20,8 +20,6 @@ rhel_aarch64_base_image_name: "rhel-aarch64_base" base_image_commands: - "dracut --add 'dmsquash-live livenet network-manager' --install '/usr/lib/systemd/systemd-sysroot-fstab-check' --kver $(basename /lib/modules/*) -N -f --logfile /tmp/dracut.log 2>/dev/null" # noqa: yaml[line-length] - "echo DRACUT LOG:; cat /tmp/dracut.log" - - "rm 
-f /var/lib/rpm/__db*" - - "rpmdb --rebuilddb" # x86_64 compute commands default_x86_64_compute_commands: From f120e16d8c875938924ecfb4c1bea42d6be7947d Mon Sep 17 00:00:00 2001 From: balajikumaran-c-s Date: Tue, 6 Jan 2026 14:15:38 +0530 Subject: [PATCH 08/47] Add retry logic for image pull with pull_image_retries and pull_image_delay variables --- .../roles/image_creation/tasks/build_image_tag.yml | 3 +++ build_image_x86_64/roles/image_creation/vars/main.yml | 2 ++ 2 files changed, 5 insertions(+) diff --git a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml b/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml index 4e0a4489ab..0b7a56072d 100644 --- a/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml +++ b/build_image_x86_64/roles/image_creation/tasks/build_image_tag.yml @@ -17,6 +17,9 @@ ansible.builtin.command: cmd: "podman pull {{ image_build_el10 }}" register: pull_result + retries: "{{ pull_image_retries }}" + delay: "{{ pull_image_delay }}" + until: pull_result.rc == 0 changed_when: "'Image is up to date' not in pull_result.stdout" - name: Fail if image not pulled successfully diff --git a/build_image_x86_64/roles/image_creation/vars/main.yml b/build_image_x86_64/roles/image_creation/vars/main.yml index 6ddf55bb38..a05a39d37d 100644 --- a/build_image_x86_64/roles/image_creation/vars/main.yml +++ b/build_image_x86_64/roles/image_creation/vars/main.yml @@ -13,6 +13,8 @@ # limitations under the License. 
--- image_build_el10: "docker.io/dellhpcomniaaisolution/image-build-el10:1.0" +pull_image_retries: "3" +pull_image_delay: "10" input_project_dir: "{{ hostvars['localhost']['input_project_dir'] }}" omnia_metadata_file: "/opt/omnia/.data/oim_metadata.yml" dir_permissions_644: "0644" From 50c87bd04623cae65975a579e450001de5834e1e Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Wed, 7 Jan 2026 11:32:01 +0000 Subject: [PATCH 09/47] doca changes to build image Signed-off-by: Vrinda_Marwah --- common/vars/openchami_image_cmd.yml | 3 +++ .../templates/doca-ofed/doca-install.sh.j2 | 7 ++++++- discovery/roles/configure_ochami/vars/main.yml | 2 +- input/config/aarch64/rhel/10.0/slurm_custom.json | 3 +++ input/config/x86_64/rhel/10.0/service_k8s.json | 3 +++ input/config/x86_64/rhel/10.0/slurm_custom.json | 3 +++ input/local_repo_config.yml | 2 ++ 7 files changed, 21 insertions(+), 2 deletions(-) diff --git a/common/vars/openchami_image_cmd.yml b/common/vars/openchami_image_cmd.yml index 96cd3abcb2..827ef0bd5a 100644 --- a/common/vars/openchami_image_cmd.yml +++ b/common/vars/openchami_image_cmd.yml @@ -42,6 +42,9 @@ service_kube_node_x86_64_compute_commands: service_kube_control_plane_first_x86_64_compute_commands: - "echo 'Service kube control plane first x86_64 compute'" + - "modprobe mlx5_ib || true" + - "modprobe ib_uverbs || true" + - "modprobe ib_ipoib || true" service_kube_control_plane_x86_64_compute_commands: - "echo 'Service kube control plane x86_64 compute'" diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 index 72ea31ac5a..80f1feb1d0 100644 --- a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 +++ b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 @@ -46,7 +46,12 @@ echo "Bootstrap doca-ofed package..." 
rpm -i "/var/lib/packages/${arch}/doca-ofed/doca-host-3.2.1-044000_25.10_rhel10.${arch}.rpm" echo "Installing doca-ofed..." -dnf install -y doca-ofed +if rpm -q doca-ofed >/dev/null 2>&1; then + echo "doca-ofed package is already installed." +else + echo "doca-ofed package is not installed. Installing..." + dnf install -y doca-ofed +fi echo "Loading RDMA kernel modules..." modprobe mlx5_core || true diff --git a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml index e705580e11..3f82454590 100644 --- a/discovery/roles/configure_ochami/vars/main.yml +++ b/discovery/roles/configure_ochami/vars/main.yml @@ -38,7 +38,7 @@ common_cloud_init_groups: bss_template: bss/bss.yaml.j2 bss_dir: "{{ openchami_work_dir }}/boot" bss_params_cloud_init: 'ds=nocloud;s=http://{{ cluster_boot_ip }}:8081/cloud-init/' -bss_params_opts: 'ip=dhcp rd.live.image rd.live.ram rd.neednet=1 rd.driver.blacklist=ccp,edac_core,power_meter,ahci,megaraid_sas,ib_core,ib_uverbs,ib_umad,ib_cm,ib_ipoib,ib_iser,rdma_cm,mlx5_ib,mlx4_ib,bnxt_re,mlx5_core,ib_isert,ib_srp,scsi_transport_srp,knem,xpmem modprobe.blacklist=ccp,edac_core,power_meter,ahci,megaraid_sas,ib_core,ib_uverbs,ib_umad,ib_cm,ib_ipoib,ib_iser,rdma_cm,mlx5_ib,mlx4_ib,bnxt_re,mlx5_core,ib_isert,ib_srp,scsi_transport_srp,knem,xpmem libata.force=1:disable,2:disable,3:disable,4:disable rd.luks=0 rd.md=0 rd.dm=0 console=tty0 console=ttyS0,115200 selinux=0 apparmor=0 ip6=off cloud-init=enabled' # noqa: yaml[line-length] +bss_params_opts: 'ip=dhcp rd.live.image rd.live.ram rd.neednet=1 rd.driver.blacklist=ccp,edac_core,power_meter,ahci,megaraid_sas modprobe.blacklist=ccp,edac_core,power_meter,ahci,megaraid_sas libata.force=1:disable,2:disable,3:disable,4:disable rd.luks=0 rd.md=0 rd.dm=0 console=tty0 console=ttyS0,115200 selinux=0 apparmor=0 ip6=off cloud-init=enabled' # noqa: yaml[line-length] image_missing_fail_msg: "Failed to set kernel or initrd. 
Create the image using build_image.yml and try again." # Usage: configure_cloud_init_group.yml, configure_bss_cloud_init.yml diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index 77e4ab3eb4..04c6d72ec3 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -7,6 +7,9 @@ {"package": "python3-firewall", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "aarch64_appstream"}, {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, + { "package": "doca-ofed", "type": "rpm", "repo_name": "doca-ofed"}, + { "package": "perftest", "type": "rpm", "repo_name": "aarch64_baseos"}, + { "package": "librdmacm-utils", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "doca-ofed", "type": "iso", "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.aarch64.rpm" diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index 9512ae4d4d..041c5d3baf 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json @@ -12,6 +12,9 @@ { "package": "kubelet-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, { "package": "container-selinux", "type": "rpm", "repo_name": "x86_64_appstream"}, { "package": "cri-o-1.34.1", "type": "rpm", "repo_name": "cri-o"}, + { "package": "doca-ofed", "type": "rpm", "repo_name": "doca-ofed"}, + { "package": "perftest", "type": "rpm", "repo_name": "x86_64_baseos"}, + { "package": "librdmacm-utils", "type": "rpm", "repo_name": "x86_64_baseos"}, { "package": "docker.io/victoriametrics/victoria-metrics", "type": "image", "tag": "v1.128.0" }, { "package": "docker.io/victoriametrics/vmagent", "type": "image", "tag": "v1.128.0" }, { "package": "docker.io/victoriametrics/vmstorage", "type": "image", "tag": "v1.128.0-cluster" }, 
diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 90cb8ce541..b2cf3525a7 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -5,6 +5,9 @@ {"package": "firewalld", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "python3-firewall", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"}, + { "package": "doca-ofed", "type": "rpm", "repo_name": "doca-ofed"}, + { "package": "perftest", "type": "rpm", "repo_name": "x86_64_baseos"}, + { "package": "librdmacm-utils", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "doca-ofed", "type": "iso", "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm" diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 118b563e66..83b4f6739a 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -101,6 +101,8 @@ omnia_repo_url_rhel_x86_64: - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/x86_64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} - { url: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "kubernetes"} - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key'", name: "cri-o"} + - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/", gpgkey: "", name: "doca-ofed"} omnia_repo_url_rhel_aarch64: - { url: "https://download.docker.com/linux/centos/10/aarch64/stable/", gpgkey: "https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/aarch64/", gpgkey: 
"https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} + - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/", gpgkey: "", name: "doca-ofed"} From 4261525878d941b17b39f3c7f0ac4e3ac2ba21fa Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Wed, 7 Jan 2026 18:35:42 +0530 Subject: [PATCH 10/47] slurm user uid set to 6001 --- ...-group-login_compiler_node_aarch64.yaml.j2 | 17 ++++++---- ...i-group-login_compiler_node_x86_64.yaml.j2 | 17 ++++++---- .../ci-group-login_node_aarch64.yaml.j2 | 17 ++++++---- .../ci-group-login_node_x86_64.yaml.j2 | 17 ++++++---- ...ci-group-slurm_control_node_x86_64.yaml.j2 | 33 ++++++++++--------- .../ci-group-slurm_node_aarch64.yaml.j2 | 18 +++++----- .../ci-group-slurm_node_x86_64.yaml.j2 | 21 +++++++----- .../roles/configure_ochami/vars/main.yml | 1 - 8 files changed, 81 insertions(+), 60 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 50fd55f498..727eb70323 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -15,6 +15,11 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: @@ -178,8 +183,6 @@ - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track - 
echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab @@ -192,17 +195,17 @@ - mount -a - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index ae84d3b32a..0d82c75c23 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -15,6 +15,11 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin 
disable_root: false write_files: @@ -180,8 +185,6 @@ - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab @@ -195,17 +198,17 @@ - mount -a - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index abc1103242..3eeb555b9e 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ 
b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -17,6 +17,11 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: @@ -100,8 +105,6 @@ runcmd: - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab @@ -114,17 +117,17 @@ - mount -a - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff 
--git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index c921d3fe86..a6fcccea4f 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -17,6 +17,11 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: @@ -99,8 +104,6 @@ runcmd: - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab @@ -114,17 +117,17 @@ - mount -a - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} 
/etc/slurm/epilog.d/ - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd - setenforce 0 - systemctl enable firewalld - systemctl start firewalld diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index ff64b8175f..1468f0fd17 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -17,6 +17,11 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: @@ -114,8 +119,8 @@ content: | #!/bin/bash SLURMDBD_CONF="/etc/slurm/slurmdbd.conf" - SLURM_USER="{{ user }}" - SLURM_GROUP="{{ slurm_group_name }}" + SLURM_USER="{{ slurm_user }}" + SLURM_GROUP="{{ slurm_user }}" # Function to extract value from slurm.conf get_value_slurm_conf() { local key="$1" @@ -125,8 +130,8 @@ echo "${value:-$default}" } chown -R {{ mysql_user }}:{{ mysql_group }} /var/lib/mysql - chown -R {{ user }}:{{ slurm_group_name }} /var/log/mariadb - chown -R {{ user }}:{{ slurm_group_name }} /etc/my.cnf.d # Required? why slurm user for my.cnf?? + chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/mariadb + chown -R {{ slurm_user }}:{{ slurm_user }} /etc/my.cnf.d # Required? why slurm user for my.cnf?? 
chmod {{ file_mode_755 }} /etc/my.cnf.d /var/lib/mysql /var/log/mariadb #firewall systemctl enable firewalld @@ -143,8 +148,8 @@ content: | #!/bin/bash SLURMDBD_CONF="/etc/slurm/slurmdbd.conf" - SLURM_USER="{{ user }}" - SLURM_GROUP="{{ slurm_group_name }}" + SLURM_USER="{{ slurm_user }}" + SLURM_GROUP="{{ slurm_user }}" # Function to extract value from slurm.conf get_value_slurm_conf() { local key="$1" @@ -154,7 +159,7 @@ echo "${value:-$default}" } chmod {{ file_mode_600 }} /etc/slurm/slurmdbd.conf - chown {{ user }}:{{ slurm_group_name }} /etc/slurm/slurmdbd.conf + chown {{ slurm_user }}:{{ slurm_user }} /etc/slurm/slurmdbd.conf #file PidFile PidFile=$(get_value_slurm_conf "PidFile" "/var/run/slurmdbd.pid") mkdir -pv $(dirname "$PidFile") @@ -181,8 +186,8 @@ content: | #!/bin/bash SLURM_CONF="/etc/slurm/slurm.conf" - SLURM_USER="{{ user }}" - SLURM_GROUP="{{ slurm_group_name }}" + SLURM_USER="{{ slurm_user }}" + SLURM_GROUP="{{ slurm_user }}" # Function to extract value from slurm.conf get_value_slurm_conf() { local key="$1" @@ -244,10 +249,8 @@ runcmd: - /usr/local/bin/set-ssh.sh - - useradd -mG wheel -p '$6$VHdSKZNm$O3iFYmRiaFQCemQJjhfrpqqV7DdHBi5YpY6Aq06JSQpABPw.3d8PQ8bNY9NuZSmDv7IL/TsrhRJ6btkgKaonT.' testuser # Required?? 
- - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - + # slurm user and group created in the users module + # Create directories for nfs and mount all - mkdir -p /var/log/slurm /etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab @@ -266,10 +269,10 @@ - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh - - chown -R {{ user }}:{{ slurm_group_name }} {{ home_dir }} + - chown -R {{ slurm_user }}:{{ slurm_user }} {{ home_dir }} - chmod {{ file_mode_755 }} {{ home_dir }} - - chown -R {{ user }}:{{ slurm_group_name }} /etc/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /etc/slurm - chmod {{ file_mode_755 }} /etc/slurm - chmod {{ file_mode }} /etc/slurm/slurm.conf diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 7b36e7fc87..addd0df64c 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -17,6 +17,11 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: @@ -259,10 +264,10 @@ bash /usr/local/bin/check_slurm_controller_status.sh echo "[INFO] Setting ownership for Slurm directories" - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - chown -R {{ user }}:{{ slurm_group_name }} 
/var/lib/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm echo "[INFO] Setting permissions for Slurm directories" chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm @@ -274,7 +279,7 @@ echo "[INFO] Creating and configuring /var/spool/slurmd" mkdir -p /var/spool/slurmd chmod {{ file_mode_755 }} /var/spool/slurmd - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd echo "[INFO] ===== Completed slurmd setup (aarch64) =====" @@ -380,9 +385,6 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_nvidia_driver.sh - - useradd -mG wheel -p '$6$VHdSKZNm$O3iFYmRiaFQCemQJjhfrpqqV7DdHBi5YpY6Aq06JSQpABPw.3d8PQ8bNY9NuZSmDv7IL/TsrhRJ6btkgKaonT.' testuser - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 61424158d3..c503974343 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -17,6 +17,12 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin + disable_root: false write_files: @@ -266,10 +272,10 @@ bash /usr/local/bin/check_slurm_controller_status.sh echo 
"[INFO] Setting ownership for Slurm directories" - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm echo "[INFO] Setting permissions for Slurm directories" chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm @@ -281,7 +287,7 @@ echo "[INFO] Creating and configuring /var/spool/slurmd" mkdir -p /var/spool/slurmd chmod {{ file_mode_755 }} /var/spool/slurmd - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd echo "[INFO] ===== Completed slurmd setup =====" @@ -381,13 +387,12 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_nvidia_driver.sh - - useradd -mG wheel -p '$6$VHdSKZNm$O3iFYmRiaFQCemQJjhfrpqqV7DdHBi5YpY6Aq06JSQpABPw.3d8PQ8bNY9NuZSmDv7IL/TsrhRJ6btkgKaonT.' 
testuser - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} + # slurm user and group created in the users module - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml index e705580e11..6ec3fe9d5d 100644 --- a/discovery/roles/configure_ochami/vars/main.yml +++ b/discovery/roles/configure_ochami/vars/main.yml @@ -74,7 +74,6 @@ ldap_starttls_port: 389 ldap_ssl_port: 636 # Usage: ci-group-slurm_control_node_x86_64.yaml.j2 -slurm_group_name: slurm home_dir: /var/lib/slurm user: slurm munge_user: munge From 27974c5a8a6b1c8b41ee0ce06d587792bac890ae Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Thu, 8 Jan 2026 11:19:12 +0000 Subject: [PATCH 11/47] add static ip for ib interface Signed-off-by: Vrinda_Marwah --- ...-group-login_compiler_node_aarch64.yaml.j2 | 7 ++ ...i-group-login_compiler_node_x86_64.yaml.j2 | 7 ++ .../ci-group-login_node_aarch64.yaml.j2 | 7 ++ .../ci-group-login_node_x86_64.yaml.j2 | 7 ++ ...ce_kube_control_plane_first_x86_64.yaml.j2 | 7 ++ ...-service_kube_control_plane_x86_64.yaml.j2 | 7 ++ .../ci-group-service_kube_node_x86_64.yaml.j2 | 7 ++ ...ci-group-slurm_control_node_x86_64.yaml.j2 | 7 ++ .../ci-group-slurm_node_aarch64.yaml.j2 | 7 ++ .../ci-group-slurm_node_x86_64.yaml.j2 | 7 ++ .../doca-ofed/configure-ib-network.sh.j2 | 64 +++++++++++++++++++ .../templates/doca-ofed/doca-install.sh.j2 | 10 ++- .../tasks/include_software_config.yml | 1 + input/network_spec.yml | 9 +++ 14 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 
discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 50fd55f498..47e75cabf4 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -24,6 +24,12 @@ content: | {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -176,6 +182,7 @@ runcmd: - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - groupadd -r {{ slurm_group_name }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index ae84d3b32a..fc193b1f84 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -24,6 +24,12 @@ content: | {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -178,6 
+184,7 @@ runcmd: - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - groupadd -r {{ slurm_group_name }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index abc1103242..63c7df4836 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -26,6 +26,12 @@ content: | {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -99,6 +105,7 @@ runcmd: - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/set-ssh.sh - groupadd -r {{ slurm_group_name }} - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index c921d3fe86..9f3b56c849 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -26,6 +26,12 @@ content: | {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 
'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -98,6 +104,7 @@ runcmd: - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/set-ssh.sh - groupadd -r {{ slurm_group_name }} - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 72e52e4812..1aa0cb8f8f 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -26,6 +26,12 @@ content: | {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -402,6 +408,7 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index 38d9ca7741..a874237740 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ 
b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -26,6 +26,12 @@ content: | {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -310,6 +316,7 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index 752b408e63..b32ca53886 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -26,6 +26,12 @@ content: | {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -213,6 +219,7 @@ - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet diff --git 
a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index ff64b8175f..26b9f9ac22 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -26,6 +26,12 @@ content: | {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -265,6 +271,7 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - chown -R {{ user }}:{{ slurm_group_name }} {{ home_dir }} - chmod {{ file_mode_755 }} {{ home_dir }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 7b36e7fc87..fa7dc29cad 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -26,6 +26,12 @@ content: | {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' 
content: | @@ -388,6 +394,7 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 61424158d3..60cd8a9fd8 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -26,6 +26,12 @@ content: | {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '{{ file_mode_755 }}' content: | @@ -389,6 +395,7 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 new file mode 100644 index 0000000000..0e09676b9b --- /dev/null +++ b/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 @@ -0,0 +1,64 @@ +#!/bin/bash +set -euo pipefail + +ADMIN_NIC_IP="{% raw %}{{ ds.meta_data.instance_data.local_ipv4 }}{% endraw %}" +NETMASK_BITS="{{ 
hostvars['localhost']['admin_netmask_bits'] }}" +IB_NETWORK_SUBNET="{{ hostvars['localhost']['ib_network_subnet'] }}" + +ip_to_int() { + local IFS=. + read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) +} + +int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" +} + + +ADMIN_IP_INT=$(ip_to_int "$ADMIN_NIC_IP") +IB_NET_INT=$(ip_to_int "$IB_NETWORK_SUBNET") + +HOST_BITS=$(( 32 - NETMASK_BITS )) +HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + +HOST_OFFSET=$(( ADMIN_IP_INT & HOST_MASK )) +IB_IP_INT=$(( IB_NET_INT + HOST_OFFSET )) + +IB_IP=$(int_to_ip "$IB_IP_INT") + +echo "Derived IB IP : $IB_IP/$NETMASK_BITS" + + +IB_NIC="" + +for nic in $(ip -o link show | awk -F': ' '{print $2}' | grep '^ib'); do + if ip link show "$nic" | grep -q "UP,LOWER_UP"; then + IB_NIC="$nic" + break + fi +done + +if [[ -z "$IB_NIC" ]]; then + echo "No active InfiniBand interface found. Exiting." + exit 0 +fi + +echo "Using IB interface: $IB_NIC" + +if command -v nmcli >/dev/null 2>&1; then + echo "Configuring IB interface using NetworkManager" + nmcli con delete "$IB_NIC" &>/dev/null || true + nmcli con add type infiniband ifname "$IB_NIC" con-name "$IB_NIC" + nmcli con modify "$IB_NIC" ipv4.method manual ipv4.addresses "$IB_IP/$NETMASK_BITS" + nmcli con up "$IB_NIC" +else + echo "Configuring IB interface using iproute2" + ip addr flush dev "$IB_NIC" + ip addr add "$IB_IP/$NETMASK_BITS" dev "$IB_NIC" + ip link set "$IB_NIC" up +fi + +echo "SUCCESS: Assigned $IB_IP/$NETMASK_BITS to $IB_NIC" + \ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 index 80f1feb1d0..58bc88c25e 100644 --- a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 +++ b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 @@ -53,6 +53,14 @@ else dnf install -y 
doca-ofed fi +echo "Unloading RDMA kernel modules..." +rmmod bnxt_re || true +rmmod mlx5_ib || true +rmmod ib_uverbs || true +rmmod xpmem || true +rmmod ib_core || true +rmmod mlx5_core || true + echo "Loading RDMA kernel modules..." modprobe mlx5_core || true modprobe mlx5_ib || true @@ -79,4 +87,4 @@ else fi echo "DOCA-OFED installation completed successfully." - \ No newline at end of file + diff --git a/discovery/roles/discovery_validations/tasks/include_software_config.yml b/discovery/roles/discovery_validations/tasks/include_software_config.yml index 321e74df41..f4b8b40466 100644 --- a/discovery/roles/discovery_validations/tasks/include_software_config.yml +++ b/discovery/roles/discovery_validations/tasks/include_software_config.yml @@ -41,6 +41,7 @@ admin_nic_ip: "{{ network_data.admin_network.primary_oim_admin_ip }}" admin_nic: "{{ network_data.admin_network.oim_nic_name }}" admin_netmask_bits: "{{ network_data.admin_network.netmask_bits }}" + ib_network_subnet: "{{ network_data.ib_network.subnet }}" dns: "{{ network_data.admin_network.dns }}" - name: Initialise variables diff --git a/input/network_spec.yml b/input/network_spec.yml index 0bb3a5e196..4720c9bc35 100644 --- a/input/network_spec.yml +++ b/input/network_spec.yml @@ -33,6 +33,11 @@ # ntp_servers: # - { address: "172.16.10.80", type: "server" } +# 'ib_network' is a mandatory field, essential for IB network configuration. +# The 'ib_network' section contains the following variables: +# - 'subnet': The subnet of the IB network. +# - 'netmask_bits': The number of bits in the subnet mask. 
+ Networks: - admin_network: oim_nic_name: "eno1" @@ -42,3 +47,7 @@ Networks: dynamic_range: "172.16.107.201-172.16.107.250" dns: [] ntp_servers: [] + +- ib_network: + subnet: "192.168.0.0" + netmask_bits: "24" From 40f36b9c317004e97970304c9a8a3b822aeccc22 Mon Sep 17 00:00:00 2001 From: Vrinda Marwah Date: Thu, 8 Jan 2026 17:50:13 +0530 Subject: [PATCH 12/47] Update openchami_image_cmd.yml Signed-off-by: Vrinda Marwah --- common/vars/openchami_image_cmd.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/common/vars/openchami_image_cmd.yml b/common/vars/openchami_image_cmd.yml index 827ef0bd5a..96cd3abcb2 100644 --- a/common/vars/openchami_image_cmd.yml +++ b/common/vars/openchami_image_cmd.yml @@ -42,9 +42,6 @@ service_kube_node_x86_64_compute_commands: service_kube_control_plane_first_x86_64_compute_commands: - "echo 'Service kube control plane first x86_64 compute'" - - "modprobe mlx5_ib || true" - - "modprobe ib_uverbs || true" - - "modprobe ib_ipoib || true" service_kube_control_plane_x86_64_compute_commands: - "echo 'Service kube control plane x86_64 compute'" From 5096861b1b45f297e172e15a14a45643d6c9e6c0 Mon Sep 17 00:00:00 2001 From: Vrinda Marwah Date: Thu, 8 Jan 2026 17:51:57 +0530 Subject: [PATCH 13/47] Update slurm_custom.json Signed-off-by: Vrinda Marwah --- input/config/aarch64/rhel/10.0/slurm_custom.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index 04c6d72ec3..77e4ab3eb4 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -7,9 +7,6 @@ {"package": "python3-firewall", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "aarch64_appstream"}, {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, - { "package": "doca-ofed", "type": "rpm", "repo_name": "doca-ofed"}, - { "package": "perftes", "type": "rpm", 
"repo_name": "x86_64_baseos"}, - { "package": "librdmacm-utils", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "doca-ofed", "type": "iso", "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.aarch64.rpm" From 43827ef09d2d7b31adb091bb640996d999b95dbe Mon Sep 17 00:00:00 2001 From: Vrinda Marwah Date: Thu, 8 Jan 2026 17:52:14 +0530 Subject: [PATCH 14/47] Update slurm_custom.json Signed-off-by: Vrinda Marwah --- input/config/x86_64/rhel/10.0/slurm_custom.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index b2cf3525a7..90cb8ce541 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -5,9 +5,6 @@ {"package": "firewalld", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "python3-firewall", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"}, - { "package": "doca-ofed", "type": "rpm", "repo_name": "doca-ofed"}, - { "package": "perftes", "type": "rpm", "repo_name": "x86_64_baseos"}, - { "package": "librdmacm-utils", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "doca-ofed", "type": "iso", "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm" From 6df651502216e004e128721d287b943aaa931aa1 Mon Sep 17 00:00:00 2001 From: Vrinda Marwah Date: Thu, 8 Jan 2026 17:52:32 +0530 Subject: [PATCH 15/47] Update service_k8s.json Signed-off-by: Vrinda Marwah --- input/config/x86_64/rhel/10.0/service_k8s.json | 3 --- 1 file changed, 3 deletions(-) diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index 041c5d3baf..9512ae4d4d 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json @@ -12,9 +12,6 @@ { 
"package": "kubelet-1.34.1", "type": "rpm", "repo_name": "kubernetes"}, { "package": "container-selinux", "type": "rpm", "repo_name": "x86_64_appstream"}, { "package": "cri-o-1.34.1", "type": "rpm", "repo_name": "cri-o"}, - { "package": "doca-ofed", "type": "rpm", "repo_name": "doca-ofed"}, - { "package": "perftes", "type": "rpm", "repo_name": "x86_64_baseos"}, - { "package": "librdmacm-utils", "type": "rpm", "repo_name": "x86_64_baseos"}, { "package": "docker.io/victoriametrics/victoria-metrics", "type": "image", "tag": "v1.128.0" }, { "package": "docker.io/victoriametrics/vmagent", "type": "image", "tag": "v1.128.0" }, { "package": "docker.io/victoriametrics/vmstorage", "type": "image", "tag": "v1.128.0-cluster" }, From 6630ec7e4c8e747ba3466409d83ca9fc26a2bd1e Mon Sep 17 00:00:00 2001 From: Vrinda Marwah Date: Thu, 8 Jan 2026 17:52:52 +0530 Subject: [PATCH 16/47] Update local_repo_config.yml Signed-off-by: Vrinda Marwah --- input/local_repo_config.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/input/local_repo_config.yml b/input/local_repo_config.yml index 83b4f6739a..118b563e66 100644 --- a/input/local_repo_config.yml +++ b/input/local_repo_config.yml @@ -101,8 +101,6 @@ omnia_repo_url_rhel_x86_64: - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/x86_64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} - { url: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/", gpgkey: "https://pkgs.k8s.io/core:/stable:/v1.34/rpm/repodata/repomd.xml.key", name: "kubernetes"} - { url: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/", gpgkey: "https://download.opensuse.org/repositories/isv:/cri-o:/stable:/v1.34/rpm/repodata/repomd.xml.key'", name: "cri-o"} - - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/x86_64/", gpgkey: "", name: "doca-ofed"} omnia_repo_url_rhel_aarch64: - { url: "https://download.docker.com/linux/centos/10/aarch64/stable/", gpgkey: 
"https://download.docker.com/linux/centos/gpg", name: "docker-ce"} - { url: "https://dl.fedoraproject.org/pub/epel/10/Everything/aarch64/", gpgkey: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-10", name: "epel"} - - { url: "https://linux.mellanox.com/public/repo/doca/3.2.1/rhel10/arm64-sbsa/", gpgkey: "", name: "doca-ofed"} From f03b32cbd12df081a1667a9c6f3a5059ddd89d2c Mon Sep 17 00:00:00 2001 From: "balajikumaran.cs" Date: Sat, 10 Jan 2026 09:07:22 +0530 Subject: [PATCH 17/47] remove unused vars main.yml Signed-off-by: balajikumaran.cs --- build_image_aarch64/roles/prepare_arm_node/vars/main.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml index a72369a092..d240f27de4 100644 --- a/build_image_aarch64/roles/prepare_arm_node/vars/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/vars/main.yml @@ -22,7 +22,6 @@ aarch64_regctl_url: "https://github.com/regclient/regclient/releases/latest/down pulp_repo_file_path: "/etc/yum.repos.d/pulp.repo" pulp_webserver_cert_path: "/opt/omnia/pulp/settings/certs/pulp_webserver.crt" anchors_path: "/etc/pki/ca-trust/source/anchors/pulp_webserver.crt" -regctl_tar_path: "omnia/offline_repo/cluster/aarch64/rhel/10.0/tarball/regctl-linux-arm64/regctl-linux-arm64.tar.gz" regctl_bin_path: "/usr/local/bin/regctl" # Error messages From 322ccd0e821f2630586d34b906f0e63a5fc87ccc Mon Sep 17 00:00:00 2001 From: "balajikumaran.cs" Date: Sat, 10 Jan 2026 09:07:48 +0530 Subject: [PATCH 18/47] Updated image tag in main.yml Signed-off-by: balajikumaran.cs --- build_image_aarch64/roles/prepare_arm_node/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml index d7a5a4467e..941d575ebf 100644 --- a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml +++ 
b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml @@ -167,7 +167,7 @@ - name: Build full Podman image path ansible.builtin.set_fact: - pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/dellhpcomniaaisolution/image-build-aarch64:latest" + pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/dellhpcomniaaisolution/image-build-aarch64:1.0" - name: Pull aarch64 image using Podman ansible.builtin.command: From 0407906dc58d6c603e396d40ad9b66e3d1b83573 Mon Sep 17 00:00:00 2001 From: "balajikumaran.cs" Date: Sat, 10 Jan 2026 09:08:21 +0530 Subject: [PATCH 19/47] Update image tag in default_packages.json Signed-off-by: balajikumaran.cs --- input/config/aarch64/rhel/10.0/default_packages.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/input/config/aarch64/rhel/10.0/default_packages.json b/input/config/aarch64/rhel/10.0/default_packages.json index 61a9048690..3a49bf8f88 100644 --- a/input/config/aarch64/rhel/10.0/default_packages.json +++ b/input/config/aarch64/rhel/10.0/default_packages.json @@ -59,7 +59,7 @@ {"package": "kexec-tools", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "which", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "iperf3", "type": "rpm", "repo_name": "aarch64_appstream"}, - { "package": "docker.io/dellhpcomniaaisolution/image-build-aarch64", "tag": "latest", "type": "image" } + { "package": "docker.io/dellhpcomniaaisolution/image-build-aarch64", "tag": "1.0", "type": "image" } ] } } From 9974216ceff1f83eec2846b36c33933fa5793d3e Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Sun, 11 Jan 2026 09:19:55 +0000 Subject: [PATCH 20/47] add package mounts for doca installation Signed-off-by: Vrinda_Marwah --- .../ci-group-login_compiler_node_aarch64.yaml.j2 | 10 +++++++--- .../ci-group-login_compiler_node_x86_64.yaml.j2 | 10 +++++++--- .../cloud_init/ci-group-login_node_aarch64.yaml.j2 | 10 +++++++--- .../cloud_init/ci-group-login_node_x86_64.yaml.j2 | 10 +++++++--- 4 
files changed, 28 insertions(+), 12 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 604c82d620..c596b01661 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -186,20 +186,24 @@ {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} runcmd: - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a + - cp /cert/pulp_webserver.crt 
/etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index fe53e1258d..1dd4a68e5e 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -188,12 +188,10 @@ {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} runcmd: - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab @@ -201,8 +199,14 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 
0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index b3be1a0c6b..40db6718ca 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -109,19 +109,23 @@ {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} runcmd: - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/set-ssh.sh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool 
nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index deaadef96c..bb871c9d11 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -108,11 +108,9 @@ {{ lookup('template', 'templates/slurm/check_slurm_controller_status.sh.j2') | indent(12) }} runcmd: - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/set-ssh.sh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> 
/etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab @@ -120,8 +118,14 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm From 2a86f1cf7f87197a12372141a517f478b627f5c7 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Sun, 11 Jan 2026 09:22:50 +0000 Subject: [PATCH 21/47] updating comments in network_spec Signed-off-by: Vrinda_Marwah --- input/network_spec.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/input/network_spec.yml b/input/network_spec.yml index 4720c9bc35..e46d36a0f5 100644 --- a/input/network_spec.yml +++ b/input/network_spec.yml @@ -36,7 +36,7 @@ # 'ib_network' is a mandatory field, essential for IB network configuration. # The 'ib_network' section contains the following variables: # - 'subnet': The subnet of the IB network. -# - 'netmask_bits': The number of bits in the subnet mask. +# - 'netmask_bits': The number of bits in the subnet mask. This value must be same as the admin_network netmask_bits. 
Networks: - admin_network: From 3abb36c3f39896e3d9ba69f46ffa6f8793a81998 Mon Sep 17 00:00:00 2001 From: mcas Date: Mon, 12 Jan 2026 10:48:08 +0530 Subject: [PATCH 22/47] passwordless_ssh changes --- discovery/discovery.yml | 19 ++- .../tasks/configure_bss_cloud_init.yml | 8 +- ...i-group-login_compiler_node_x86_64.yaml.j2 | 17 ++- .../ci-group-login_node_x86_64.yaml.j2 | 16 +- ...ce_kube_control_plane_first_x86_64.yaml.j2 | 7 + ...-service_kube_control_plane_x86_64.yaml.j2 | 7 + .../ci-group-service_kube_node_x86_64.yaml.j2 | 7 + ...ci-group-slurm_control_node_x86_64.yaml.j2 | 19 ++- .../ci-group-slurm_node_x86_64.yaml.j2 | 14 +- .../roles/configure_ochami/vars/main.yml | 16 ++ .../tasks/create_k8s_config_nfs.yml | 18 ++- .../tasks/build_host_lists.yml | 129 ++++++++++++++++ .../tasks/configure_oim_ssh.yml | 83 ++++++++++ .../roles/passwordless_ssh/tasks/main.yml | 23 +++ .../tasks/read_nodes_yaml.yml | 143 ++++++++++++++++++ .../roles/passwordless_ssh/vars/main.yml | 43 ++++++ .../slurm_config/tasks/create_slurm_dir.yml | 16 ++ 17 files changed, 573 insertions(+), 12 deletions(-) create mode 100644 discovery/roles/passwordless_ssh/tasks/build_host_lists.yml create mode 100644 discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml create mode 100644 discovery/roles/passwordless_ssh/tasks/main.yml create mode 100644 discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml create mode 100644 discovery/roles/passwordless_ssh/vars/main.yml diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 877a137e34..2a20cbb0ca 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -75,6 +75,18 @@ name: discovery_validations tasks_from: validate_oim_timezone.yml +- name: Build cluster host lists from PXE mapping + hosts: localhost + connection: local + roles: + - passwordless_ssh + +- name: Configure OIM SSH from cluster host lists + hosts: oim + connection: ssh + roles: + - passwordless_ssh + - name: Validate discovery parameters hosts: oim 
connection: ssh @@ -102,10 +114,15 @@ ansible.builtin.include_role: name: configure_ochami tasks_from: discover_mapping_nodes.yml + + - name: Read nodes.yaml and derive Omnia node facts + ansible.builtin.include_role: + name: passwordless_ssh + tasks_from: read_nodes_yaml.yml roles: - nfs_client - k8s_config - slurm_config - openldap - telemetry - - configure_ochami + - configure_ochami \ No newline at end of file diff --git a/discovery/roles/configure_ochami/tasks/configure_bss_cloud_init.yml b/discovery/roles/configure_ochami/tasks/configure_bss_cloud_init.yml index 2d7858b16c..eb40c89faf 100644 --- a/discovery/roles/configure_ochami/tasks/configure_bss_cloud_init.yml +++ b/discovery/roles/configure_ochami/tasks/configure_bss_cloud_init.yml @@ -66,6 +66,12 @@ register: read_ssh_key no_log: true +- name: Read the ssh private key + ansible.builtin.command: cat {{ ssh_private_key_path }} + changed_when: false + register: read_ssh_private_key + no_log: true + - name: Hash the password ansible.builtin.command: openssl passwd -6 "{{ hostvars['localhost']['provision_password'] }}" changed_when: false @@ -102,4 +108,4 @@ - name: Set openchami SELinux context ansible.builtin.command: chcon -R system_u:object_r:container_file_t:s0 "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami" changed_when: true - failed_when: false + failed_when: false \ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 0d82c75c23..7b71eaa8ca 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -63,7 +63,13 @@ fi done fi - + + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + 
IdentityFile {{ client_mount_path }}/ssh/oim_rsa + IdentitiesOnly yes - path: /usr/local/bin/install_cuda_toolkit.sh permissions: '0755' @@ -185,7 +191,11 @@ - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - + - groupadd -r {{ slurm_group_name }} + - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} + + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) + - mkdir -p {{ client_mount_path }}/ssh - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab @@ -194,6 +204,7 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ @@ -329,4 +340,4 @@ - /root/ldms_sampler.sh {% endif %} - - echo "Cloud-Init has completed successfully." + - echo "Cloud-Init has completed successfully." 
\ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index a6fcccea4f..62c6a9d745 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -66,6 +66,13 @@ fi done fi + + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/ssh/oim_rsa + IdentitiesOnly yes {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf @@ -104,7 +111,11 @@ runcmd: - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - + - groupadd -r {{ slurm_group_name }} + - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} + + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. 
/share_omnia) + - mkdir -p {{ client_mount_path }}/ssh - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab @@ -113,6 +124,7 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ @@ -183,4 +195,4 @@ - /root/ldms_sampler.sh {% endif %} - - echo "Cloud-Init has completed successfully." + - echo "Cloud-Init has completed successfully." 
\ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 72e52e4812..4fc9bb430d 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -60,6 +60,13 @@ fi done fi + + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ k8s_control_ssh_patterns }} + IdentityFile {{ k8s_client_mount_path }}/ssh/oim_rsa + IdentitiesOnly yes - path: /etc/chrony.conf permissions: '0644' diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index 38d9ca7741..a26f3d9865 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -61,6 +61,13 @@ done fi + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ k8s_control_ssh_patterns }} + IdentityFile {{ k8s_client_mount_path }}/ssh/oim_rsa + IdentitiesOnly yes + - path: /etc/modules-load.d/k8s.conf content: | br_netfilter diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index 752b408e63..ce9fa420b4 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -61,6 
+61,13 @@ done fi + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ k8s_control_ssh_patterns }} + IdentityFile {{ k8s_client_mount_path }}/ssh/oim_rsa + IdentitiesOnly yes + - path: /etc/modules-load.d/k8s.conf content: | br_netfilter diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 1468f0fd17..37987686fe 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -65,6 +65,13 @@ done fi + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/ssh/oim_rsa + IdentitiesOnly yes + {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf owner: root:root @@ -249,8 +256,15 @@ runcmd: - /usr/local/bin/set-ssh.sh + + - useradd -mG wheel -p '$6$VHdSKZNm$O3iFYmRiaFQCemQJjhfrpqqV7DdHBi5YpY6Aq06JSQpABPw.3d8PQ8bNY9NuZSmDv7IL/TsrhRJ6btkgKaonT.' testuser # Required?? + - groupadd -r {{ slurm_group_name }} + - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} + + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. 
/share_omnia) + - mkdir -p {{ client_mount_path }}/ssh + # slurm user and group created in the users module - # Create directories for nfs and mount all - mkdir -p /var/log/slurm /etc/slurm {{ home_dir }} /etc/my.cnf.d /etc/munge /var/lib/mysql /var/log/mariadb /cert /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab @@ -263,6 +277,7 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust @@ -323,4 +338,4 @@ - /root/ldms_sampler.sh {% endif %} - - echo "Cloud-Init has completed successfully." + - echo "Cloud-Init has completed successfully." 
\ No newline at end of file diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index c503974343..98760e25f3 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -66,6 +66,13 @@ fi done fi + + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/ssh/oim_rsa + IdentitiesOnly yes - path: /usr/local/bin/install_nvidia_driver.sh permissions: '0755' @@ -239,7 +246,9 @@ exec > >(tee -a "$LOGFILE") 2>&1 echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge =====" - + + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) + mkdir -p {{ client_mount_path }}/ssh echo "[INFO] Creating base directories for Pulp cert, Slurm and Munge" mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages @@ -251,6 +260,7 @@ echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/ssh nfs defaults,_netdev 0 0" >> /etc/fstab chmod {{ file_mode }} /etc/fstab echo "[INFO] Mounting all NFS entries from /etc/fstab" @@ -437,4 +447,4 @@ - /root/ldms_sampler.sh {% endif %} - - echo "Cloud-Init has completed successfully." + - echo "Cloud-Init has completed successfully." 
\ No newline at end of file diff --git a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml index 6ec3fe9d5d..d558992701 100644 --- a/discovery/roles/configure_ochami/vars/main.yml +++ b/discovery/roles/configure_ochami/vars/main.yml @@ -49,6 +49,7 @@ ci_defaults_dest: '{{ cloud_init_dir }}/ci-defaults.yaml' ci_group_load_fail_msg: | "Template loading failed. Ensure the template exists in the specified path and is compatible with the defined functional groups." default_file_path: "{{ playbook_dir }}/roles/slurm_config/defaults/main.yml" +ssh_private_key_path: /root/.ssh/oim_rsa # Usage: configure_cloud_init_common.yml ci_group_common_template: cloud_init/ci-group-common.yaml.j2 @@ -85,3 +86,18 @@ file_mode_755: "0755" file_mode_600: "0600" ip_timeout: 10 ip_wait_loop: 60 + +# Hostname lists for stack-specific SSH configs (populated by passwordless_ssh role) +k8s_cluster_hostnames: "{{ hostvars['localhost']['k8s_cluster_hostnames'] | default([]) }}" +slurm_cluster_hostnames: "{{ hostvars['localhost']['slurm_cluster_hostnames'] | default([]) }}" + +# IP wildcard lists for stack-specific SSH configs +k8s_cluster_ip_patterns: "{{ hostvars['localhost']['k8s_cluster_ip_patterns'] | default([]) }}" +slurm_cluster_ip_patterns: "{{ hostvars['localhost']['slurm_cluster_ip_patterns'] | default([]) }}" + +# SSH Host patterns precomputed on OIM by passwordless_ssh/read_nodes_yaml.yml +slurm_control_ssh_patterns: "{{ hostvars['oim']['slurm_ssh_patterns'] | default('*') }}" +k8s_control_ssh_patterns: "{{ hostvars['oim']['k8s_ssh_patterns'] | default('*') }}" + +# Passwordless SSH mode flag derived from nodes.yaml (set on OIM by passwordless_ssh role) +all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | default(false) }}" \ No newline at end of file diff --git a/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml index 0b5eee7e4a..1a4e7663c3 100644 
--- a/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -39,6 +39,22 @@ k8s_nfs_server_ip: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).server_ip }}" k8s_server_share_path: "{{ (nfs_client_params | selectattr('nfs_name', 'equalto', nfs_storage_name) | first).server_share_path }}" +- name: Ensure SSH key directory exists on K8s share + ansible.builtin.file: + path: "{{ k8s_client_mount_path }}/ssh" + state: directory + owner: root + group: root + mode: '0700' + +- name: Copy OIM private key to K8s share for node-to-node SSH + ansible.builtin.copy: + src: /root/.ssh/oim_rsa + dest: "{{ k8s_client_mount_path }}/ssh/oim_rsa" + owner: root + group: root + mode: '0600' + - name: Set admin network nic and ip ansible.builtin.set_fact: admin_nic_ip: "{{ hostvars['localhost']['admin_nic_ip'] }}" @@ -269,4 +285,4 @@ - name: Include PowerScale CSI dependency tasks ansible.builtin.include_tasks: get_powerscale_dependencies.yml - when: hostvars['localhost']['csi_driver_powerscale_support'] | bool + when: hostvars['localhost']['csi_driver_powerscale_support'] | bool \ No newline at end of file diff --git a/discovery/roles/passwordless_ssh/tasks/build_host_lists.yml b/discovery/roles/passwordless_ssh/tasks/build_host_lists.yml new file mode 100644 index 0000000000..58de280f78 --- /dev/null +++ b/discovery/roles/passwordless_ssh/tasks/build_host_lists.yml @@ -0,0 +1,129 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +--- +# tasks/build_host_lists.yml + +- name: Ensure PXE mapping file path is set + ansible.builtin.assert: + that: pxe_mapping_file_path is defined + fail_msg: "pxe_mapping_file_path is not defined. Check provision_config.yml." + +- name: Read PXE mapping file (FUNCTIONAL_GROUP_NAME, HOSTNAME, ...) + community.general.read_csv: + path: "{{ pxe_mapping_file_path }}" + key: ADMIN_MAC + register: pxe_mapping_dict + +- name: Initialize per-stack hostname lists and IP wildcard patterns + ansible.builtin.set_fact: + k8s_cluster_hostnames: [] + slurm_cluster_hostnames: [] + k8s_cluster_ip_patterns: [] + slurm_cluster_ip_patterns: [] + omnia_cluster_ip_patterns: [] + omnia_hosts_map: {} + when: inventory_hostname == 'localhost' + +- name: Build per-stack hostname lists and IP wildcard patterns from PXE mapping + ansible.builtin.set_fact: + k8s_cluster_hostnames: >- + {{ + (k8s_cluster_hostnames + [item.value.HOSTNAME]) + if item.value.FUNCTIONAL_GROUP_NAME in k8s_functional_groups + else k8s_cluster_hostnames + }} + slurm_cluster_hostnames: >- + {{ + (slurm_cluster_hostnames + [item.value.HOSTNAME]) + if item.value.FUNCTIONAL_GROUP_NAME in slurm_functional_groups + else slurm_cluster_hostnames + }} + k8s_cluster_ip_patterns: >- + {{ + (k8s_cluster_ip_patterns + [ (item.value.ADMIN_IP | regex_replace('\\.[0-9]+$', '.*')) ]) + if ( + item.value.ADMIN_IP | default('') | length > 0 and + item.value.FUNCTIONAL_GROUP_NAME in k8s_functional_groups + ) + else k8s_cluster_ip_patterns + }} + slurm_cluster_ip_patterns: >- + {{ + (slurm_cluster_ip_patterns + [ 
(item.value.ADMIN_IP | regex_replace('\\.[0-9]+$', '.*')) ]) + if ( + item.value.ADMIN_IP | default('') | length > 0 and + item.value.FUNCTIONAL_GROUP_NAME in slurm_functional_groups + ) + else slurm_cluster_ip_patterns + }} + omnia_cluster_ip_patterns: >- + {{ + (omnia_cluster_ip_patterns + [ (item.value.ADMIN_IP | regex_replace('\\.[0-9]+$', '.*')) ]) + if ( + item.value.ADMIN_IP | default('') | length > 0 and + (item.value.FUNCTIONAL_GROUP_NAME in k8s_functional_groups or + item.value.FUNCTIONAL_GROUP_NAME in slurm_functional_groups) + ) + else omnia_cluster_ip_patterns + }} + omnia_hosts_map: >- + {{ + (omnia_hosts_map | default({})) + | combine( + ({ (item.value.HOSTNAME): item.value.ADMIN_IP } + if (item.value.HOSTNAME | default('') | length > 0 and + item.value.ADMIN_IP | default('') | length > 0) + else {}), + recursive=False + ) + }} + loop: "{{ pxe_mapping_dict.dict | dict2items }}" + loop_control: + label: "{{ item.value.FUNCTIONAL_GROUP_NAME }} -> {{ item.value.HOSTNAME }} ({{ item.value.ADMIN_IP | default('no-ip') }})" + +- name: Deduplicate host lists and IP wildcard patterns + ansible.builtin.set_fact: + k8s_cluster_hostnames: "{{ k8s_cluster_hostnames | unique }}" + slurm_cluster_hostnames: "{{ slurm_cluster_hostnames | unique }}" + k8s_cluster_ip_patterns: >- + {{ + (k8s_cluster_ip_patterns | default([])) + | map('regex_replace', '\\.[0-9]+$', '.*') + | list + | unique + }} + slurm_cluster_ip_patterns: >- + {{ + (slurm_cluster_ip_patterns | default([])) + | map('regex_replace', '\\.[0-9]+$', '.*') + | list + | unique + }} + omnia_cluster_ip_patterns: >- + {{ + (omnia_cluster_ip_patterns | default([])) + | map('regex_replace', '\\.[0-9]+$', '.*') + | list + | unique + }} + +- name: DEBUG passwordless_ssh facts built from PXE mapping + ansible.builtin.debug: + msg: + k8s_cluster_hostnames: "{{ k8s_cluster_hostnames | default([]) }}" + slurm_cluster_hostnames: "{{ slurm_cluster_hostnames | default([]) }}" + k8s_cluster_ip_patterns: "{{ 
k8s_cluster_ip_patterns | default([]) }}" + slurm_cluster_ip_patterns: "{{ slurm_cluster_ip_patterns | default([]) }}" + omnia_cluster_ip_patterns: "{{ omnia_cluster_ip_patterns | default([]) }}" + omnia_hosts_map: "{{ omnia_hosts_map | default({}) }}" \ No newline at end of file diff --git a/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml b/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml new file mode 100644 index 0000000000..d1b0a2cc10 --- /dev/null +++ b/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml @@ -0,0 +1,83 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# tasks/configure_oim_ssh.yml + +- name: Gather cluster hostnames and IP wildcard patterns from localhost facts + ansible.builtin.set_fact: + k8s_cluster_hostnames: "{{ hostvars['localhost']['k8s_cluster_hostnames'] | default([]) }}" + slurm_cluster_hostnames: "{{ hostvars['localhost']['slurm_cluster_hostnames'] | default([]) }}" + omnia_cluster_ip_patterns_raw: "{{ hostvars['localhost']['omnia_cluster_ip_patterns'] | default([]) }}" + omnia_hosts_map: "{{ hostvars['localhost']['omnia_hosts_map'] | default({}) }}" + +- name: Normalize OIM cluster IP patterns to wildcard subnets (x.x.x.*) + ansible.builtin.set_fact: + omnia_cluster_ip_patterns: >- + {{ + (omnia_cluster_ip_patterns_raw | default([])) + | map('regex_replace', '\\.[0-9]+$', '.*') + | list + | unique + }} + +- name: Build hostname wildcard patterns from actual cluster hostnames + ansible.builtin.set_fact: + omnia_cluster_hostname_patterns: >- + {{ + ( + (k8s_cluster_hostnames | default([])) + + + (slurm_cluster_hostnames | default([])) + ) + | map('regex_replace', '[0-9]+$', '*') + | list + | unique + }} + +- name: Build combined OIM SSH match list (hostname patterns + IP wildcard patterns) + ansible.builtin.set_fact: + omnia_cluster_ssh_matches: >- + {{ + (omnia_cluster_hostname_patterns + omnia_cluster_ip_patterns) + | map('regex_replace', '\.[0-9]+$', '.*') + | list + | unique + }} + +- name: DEBUG OIM SSH match list + ansible.builtin.debug: + var: omnia_cluster_ssh_matches + +- name: Manage /etc/hosts entries on OIM for Omnia cluster nodes + ansible.builtin.blockinfile: + path: /etc/hosts + create: true + mode: '0644' + marker: "# {mark} OMNIA_CLUSTER_NODES" + block: | + {% for h in omnia_hosts_map | dict2items %} + {{ h.value }} {{ h.key }} + {% endfor %} + when: omnia_hosts_map | default({}) | length > 0 + +- name: DEBUG configure_oim_ssh facts + ansible.builtin.debug: + msg: + k8s_cluster_hostnames: "{{ k8s_cluster_hostnames | default([]) }}" + slurm_cluster_hostnames: "{{ 
slurm_cluster_hostnames | default([]) }}" + omnia_cluster_ip_patterns_raw: "{{ omnia_cluster_ip_patterns_raw | default([]) }}" + omnia_cluster_ip_patterns: "{{ omnia_cluster_ip_patterns | default([]) }}" + omnia_cluster_hostname_patterns: "{{ omnia_cluster_hostname_patterns | default([]) }}" + omnia_cluster_ssh_matches: "{{ omnia_cluster_ssh_matches | default([]) }}" + omnia_hosts_map: "{{ omnia_hosts_map | default({}) }}" \ No newline at end of file diff --git a/discovery/roles/passwordless_ssh/tasks/main.yml b/discovery/roles/passwordless_ssh/tasks/main.yml new file mode 100644 index 0000000000..b195106e1f --- /dev/null +++ b/discovery/roles/passwordless_ssh/tasks/main.yml @@ -0,0 +1,23 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# tasks/main.yml + +- name: Build cluster host lists from PXE mapping (run on localhost/omnia_core) + when: inventory_hostname == 'localhost' + ansible.builtin.include_tasks: build_host_lists.yml + +- name: Configure OIM SSH based on PXE mapping (run on oim) + when: inventory_hostname == 'oim' + ansible.builtin.include_tasks: configure_oim_ssh.yml \ No newline at end of file diff --git a/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml b/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml new file mode 100644 index 0000000000..d28d72a907 --- /dev/null +++ b/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml @@ -0,0 +1,143 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# tasks/read_nodes_yaml.yml +--- + +- name: DEBUG passwordless_ssh facts from PXE mapping flow + ansible.builtin.debug: + msg: + k8s_cluster_hostnames: "{{ hostvars['localhost']['k8s_cluster_hostnames'] | default([]) }}" + slurm_cluster_hostnames: "{{ hostvars['localhost']['slurm_cluster_hostnames'] | default([]) }}" + k8s_cluster_ip_patterns: "{{ hostvars['localhost']['k8s_cluster_ip_patterns'] | default([]) }}" + slurm_cluster_ip_patterns: "{{ hostvars['localhost']['slurm_cluster_ip_patterns'] | default([]) }}" + omnia_cluster_ip_patterns: "{{ hostvars['localhost']['omnia_cluster_ip_patterns'] | default([]) }}" + omnia_hosts_map: "{{ hostvars['localhost']['omnia_hosts_map'] | default({}) }}" + +- name: Set nodes.yaml path for nodes.yaml debugging + ansible.builtin.set_fact: + omnia_nodes_yaml_path: "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami/workdir/nodes/nodes.yaml" + +- name: Read nodes.yaml for group/host/IP data + ansible.builtin.slurp: + src: "{{ omnia_nodes_yaml_path }}" + register: omnia_nodes_yaml_raw + +- name: Parse nodes.yaml content + ansible.builtin.set_fact: + omnia_nodes_data: "{{ omnia_nodes_yaml_raw.content | b64decode | from_yaml }}" + +- name: Build groups, hostnames and admin IPs from nodes.yaml + ansible.builtin.set_fact: + omnia_nodes_groups_from_yaml: >- + {{ + (omnia_nodes_data.nodes | default([])) + | map(attribute='group') + | list + | unique + }} + +- name: Initialize all_group_names_present flag + ansible.builtin.set_fact: + all_group_names_present: false + +- name: Set all_group_names_present when all required and optional groups are present + ansible.builtin.set_fact: + all_group_names_present: true + when: >- + ( + omnia_required_groups_from_nodes_yaml + | difference(omnia_nodes_groups_from_yaml | default([])) + ) | length == 0 + +- name: Build SSH Host pattern strings for k8s and slurm based on nodes.yaml completeness + ansible.builtin.set_fact: + k8s_ssh_patterns: >- + {{ + '*' + if (all_group_names_present | 
default(false)) + else ( + ( + (hostvars['localhost']['k8s_cluster_hostnames'] | default([])) + | map('regex_replace', '[0-9]+$', '*') + | list + | unique + ) + + (hostvars['localhost']['k8s_cluster_ip_patterns'] | default([])) + ) + | unique + | join(' ') + }} + slurm_ssh_patterns: >- + {{ + '*' + if (all_group_names_present | default(false)) + else ( + ( + (hostvars['localhost']['slurm_cluster_hostnames'] | default([])) + | map('regex_replace', '[0-9]+$', '*') + | list + | unique + ) + + (hostvars['localhost']['slurm_cluster_ip_patterns'] | default([])) + ) + | unique + | join(' ') + }} + +- name: DEBUG nodes.yaml groups, hostnames and admin IPs + ansible.builtin.debug: + msg: + omnia_nodes_yaml_path: "{{ omnia_nodes_yaml_path }}" + omnia_nodes_groups_from_yaml: "{{ omnia_nodes_groups_from_yaml | default([]) }}" + all_group_names_present: "{{ all_group_names_present | default(false) }}" + +- name: Configure SSH on OIM with Host * when all groups are present in nodes.yaml + ansible.builtin.blockinfile: + path: /root/.ssh/config + create: true + mode: '0600' + marker: "# {mark} OMNIA_CLUSTER_SSH" + block: | + Host * + IdentityFile ~/.ssh/oim_rsa + IdentitiesOnly yes + when: all_group_names_present + +- name: Configure SSH on OIM with derived hostname/IP patterns when groups are incomplete + ansible.builtin.blockinfile: + path: /root/.ssh/config + create: true + mode: '0600' + marker: "# {mark} OMNIA_CLUSTER_SSH" + block: | + Host {{ omnia_cluster_ssh_matches + | default([]) + | list + | unique + | join(' ') }} + IdentityFile ~/.ssh/oim_rsa + IdentitiesOnly yes + when: + - not all_group_names_present | default(false) | bool + - omnia_cluster_ssh_matches | default([]) | length > 0 + +- name: DEBUG summary from read_nodes_yaml flow + ansible.builtin.debug: + msg: + omnia_nodes_yaml_path: "{{ omnia_nodes_yaml_path }}" + omnia_nodes_groups_from_yaml: "{{ omnia_nodes_groups_from_yaml | default([]) }}" + all_group_names_present: "{{ all_group_names_present | 
default(false) }}" + omnia_cluster_ssh_matches: "{{ omnia_cluster_ssh_matches | default([]) }}" + k8s_ssh_patterns: "{{ k8s_ssh_patterns | default('') }}" + slurm_ssh_patterns: "{{ slurm_ssh_patterns | default('') }}" \ No newline at end of file diff --git a/discovery/roles/passwordless_ssh/vars/main.yml b/discovery/roles/passwordless_ssh/vars/main.yml new file mode 100644 index 0000000000..2804fc4bb8 --- /dev/null +++ b/discovery/roles/passwordless_ssh/vars/main.yml @@ -0,0 +1,43 @@ +# Copyright 2025 Dell Inc. or its subsidiaries. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+--- +# vars/main.yml + +# K8s functional groups (x86_64 example; extend if you have aarch64 variants) +k8s_functional_groups: + - service_kube_control_plane_first_x86_64 + - service_kube_control_plane_x86_64 + - service_kube_node_x86_64 + +# Slurm / login functional groups +slurm_functional_groups: + - slurm_control_node_x86_64 + - slurm_node_x86_64 + - login_node_x86_64 + - login_compiler_node_x86_64 + +# Nodes.yaml group completeness checks +omnia_required_groups_from_nodes_yaml: + - service_kube_control_plane_first_x86_64 + - service_kube_control_plane_x86_64 + - service_kube_node_x86_64 + - slurm_control_node_x86_64 + - slurm_node_x86_64 + - login_node_x86_64 + - login_compiler_node_x86_64 + +omnia_optional_groups_from_nodes_yaml: + - service_kube_control_plane_first_aarch64 + - service_kube_control_plane_aarch64 + - service_kube_node_aarch64 \ No newline at end of file diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index bbca6f9ee6..0c81a1a26c 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -372,3 +372,19 @@ ansible.builtin.set_fact: cloud_init_slurm_nfs_path: "{{ nfs_server_ip }}:{{ nfs_server_path }}" client_mount_path: "{{ share_path }}" + +- name: Ensure SSH key directory exists on Slurm share + ansible.builtin.file: + path: "{{ slurm_config_path }}/ssh" + state: directory + owner: root + group: root + mode: '0700' + +- name: Copy OIM private key to Slurm share for node-to-node SSH + ansible.builtin.copy: + src: /root/.ssh/oim_rsa + dest: "{{ slurm_config_path }}/ssh/oim_rsa" + owner: root + group: root + mode: '0600' \ No newline at end of file From 216a06c7b85bf8a5acce528ec6c94ec4f522d28f Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Mon, 12 Jan 2026 05:27:59 +0000 Subject: [PATCH 23/47] ansible lint fixes --- discovery/discovery.yml | 2 +- 
.../roles/configure_ochami/tasks/configure_bss_cloud_init.yml | 2 +- discovery/roles/configure_ochami/vars/main.yml | 2 +- discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml | 2 +- discovery/roles/passwordless_ssh/tasks/build_host_lists.yml | 2 +- discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml | 2 +- discovery/roles/passwordless_ssh/tasks/main.yml | 2 +- discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml | 2 +- discovery/roles/passwordless_ssh/vars/main.yml | 2 +- discovery/roles/slurm_config/tasks/create_slurm_dir.yml | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/discovery/discovery.yml b/discovery/discovery.yml index 2a20cbb0ca..75efadb47c 100644 --- a/discovery/discovery.yml +++ b/discovery/discovery.yml @@ -125,4 +125,4 @@ - slurm_config - openldap - telemetry - - configure_ochami \ No newline at end of file + - configure_ochami diff --git a/discovery/roles/configure_ochami/tasks/configure_bss_cloud_init.yml b/discovery/roles/configure_ochami/tasks/configure_bss_cloud_init.yml index eb40c89faf..96a0cbd556 100644 --- a/discovery/roles/configure_ochami/tasks/configure_bss_cloud_init.yml +++ b/discovery/roles/configure_ochami/tasks/configure_bss_cloud_init.yml @@ -108,4 +108,4 @@ - name: Set openchami SELinux context ansible.builtin.command: chcon -R system_u:object_r:container_file_t:s0 "{{ hostvars['localhost']['oim_shared_path'] }}/omnia/openchami" changed_when: true - failed_when: false \ No newline at end of file + failed_when: false diff --git a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml index d558992701..76bb80a6e0 100644 --- a/discovery/roles/configure_ochami/vars/main.yml +++ b/discovery/roles/configure_ochami/vars/main.yml @@ -100,4 +100,4 @@ slurm_control_ssh_patterns: "{{ hostvars['oim']['slurm_ssh_patterns'] | default( k8s_control_ssh_patterns: "{{ hostvars['oim']['k8s_ssh_patterns'] | default('*') }}" # Passwordless SSH mode flag derived from 
nodes.yaml (set on OIM by passwordless_ssh role) -all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | default(false) }}" \ No newline at end of file +all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | default(false) }}" diff --git a/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml index 1a4e7663c3..f4360648df 100644 --- a/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -285,4 +285,4 @@ - name: Include PowerScale CSI dependency tasks ansible.builtin.include_tasks: get_powerscale_dependencies.yml - when: hostvars['localhost']['csi_driver_powerscale_support'] | bool \ No newline at end of file + when: hostvars['localhost']['csi_driver_powerscale_support'] | bool diff --git a/discovery/roles/passwordless_ssh/tasks/build_host_lists.yml b/discovery/roles/passwordless_ssh/tasks/build_host_lists.yml index 58de280f78..53b734ac89 100644 --- a/discovery/roles/passwordless_ssh/tasks/build_host_lists.yml +++ b/discovery/roles/passwordless_ssh/tasks/build_host_lists.yml @@ -126,4 +126,4 @@ k8s_cluster_ip_patterns: "{{ k8s_cluster_ip_patterns | default([]) }}" slurm_cluster_ip_patterns: "{{ slurm_cluster_ip_patterns | default([]) }}" omnia_cluster_ip_patterns: "{{ omnia_cluster_ip_patterns | default([]) }}" - omnia_hosts_map: "{{ omnia_hosts_map | default({}) }}" \ No newline at end of file + omnia_hosts_map: "{{ omnia_hosts_map | default({}) }}" diff --git a/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml b/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml index d1b0a2cc10..57df8990a9 100644 --- a/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml +++ b/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml @@ -80,4 +80,4 @@ omnia_cluster_ip_patterns: "{{ omnia_cluster_ip_patterns | default([]) }}" omnia_cluster_hostname_patterns: "{{ 
omnia_cluster_hostname_patterns | default([]) }}" omnia_cluster_ssh_matches: "{{ omnia_cluster_ssh_matches | default([]) }}" - omnia_hosts_map: "{{ omnia_hosts_map | default({}) }}" \ No newline at end of file + omnia_hosts_map: "{{ omnia_hosts_map | default({}) }}" diff --git a/discovery/roles/passwordless_ssh/tasks/main.yml b/discovery/roles/passwordless_ssh/tasks/main.yml index b195106e1f..aff8bee7e7 100644 --- a/discovery/roles/passwordless_ssh/tasks/main.yml +++ b/discovery/roles/passwordless_ssh/tasks/main.yml @@ -20,4 +20,4 @@ - name: Configure OIM SSH based on PXE mapping (run on oim) when: inventory_hostname == 'oim' - ansible.builtin.include_tasks: configure_oim_ssh.yml \ No newline at end of file + ansible.builtin.include_tasks: configure_oim_ssh.yml diff --git a/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml b/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml index d28d72a907..c10e03155a 100644 --- a/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml +++ b/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml @@ -140,4 +140,4 @@ all_group_names_present: "{{ all_group_names_present | default(false) }}" omnia_cluster_ssh_matches: "{{ omnia_cluster_ssh_matches | default([]) }}" k8s_ssh_patterns: "{{ k8s_ssh_patterns | default('') }}" - slurm_ssh_patterns: "{{ slurm_ssh_patterns | default('') }}" \ No newline at end of file + slurm_ssh_patterns: "{{ slurm_ssh_patterns | default('') }}" diff --git a/discovery/roles/passwordless_ssh/vars/main.yml b/discovery/roles/passwordless_ssh/vars/main.yml index 2804fc4bb8..737e68431b 100644 --- a/discovery/roles/passwordless_ssh/vars/main.yml +++ b/discovery/roles/passwordless_ssh/vars/main.yml @@ -40,4 +40,4 @@ omnia_required_groups_from_nodes_yaml: omnia_optional_groups_from_nodes_yaml: - service_kube_control_plane_first_aarch64 - service_kube_control_plane_aarch64 - - service_kube_node_aarch64 \ No newline at end of file + - service_kube_node_aarch64 diff --git 
a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 0c81a1a26c..bc533351b7 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -387,4 +387,4 @@ dest: "{{ slurm_config_path }}/ssh/oim_rsa" owner: root group: root - mode: '0600' \ No newline at end of file + mode: '0600' From cd729f54cbb3c926bf9640168c8ad226aacad6fc Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Mon, 12 Jan 2026 06:13:51 +0000 Subject: [PATCH 24/47] input validation for ib network --- .../common_utils/en_us_validation_msg.py | 7 +++ .../input_validation/schema/network_spec.json | 26 ++++++++++ .../validation_flows/provision_validation.py | 50 +++++++++++++++++++ input/network_spec.yml | 9 ++++ 4 files changed, 92 insertions(+) diff --git a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py index 77bc8c3544..e72c474513 100644 --- a/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py +++ b/common/library/module_utils/input_validation/common_utils/en_us_validation_msg.py @@ -326,6 +326,12 @@ def json_file_mandatory(file_path): "Please ensure the CSV file has the required headers." ) NETWORK_SPEC_FILE_NOT_FOUND_MSG = "network_spec.yml file not found in input folder." +IB_NETMASK_BITS_MISMATCH_MSG = ( + "netmask_bits configured for ib_network must match admin_network netmask_bits in network_spec.yml." +) +IB_SUBNET_IN_ADMIN_RANGE_MSG = ( + "ib_network subnet must be outside the admin network range derived from primary_oim_admin_ip/netmask_bits in network_spec.yml." 
+) # telemetry MANDATORY_FIELD_FAIL_MSG = "must not be empty" @@ -427,3 +433,4 @@ def get_logic_failed(input_file_path): def get_logic_success(input_file_path): """Returns a formatted message indicating logic validation success for a file.""" return f"{'#' * 10} Logic validation successful for {input_file_path} {'#' * 10}" + diff --git a/common/library/module_utils/input_validation/schema/network_spec.json b/common/library/module_utils/input_validation/schema/network_spec.json index 64fe70f407..bea5622095 100644 --- a/common/library/module_utils/input_validation/schema/network_spec.json +++ b/common/library/module_utils/input_validation/schema/network_spec.json @@ -100,9 +100,35 @@ } }, "additionalProperties": false + }, + { + "type": "object", + "required": ["ib_network"], + "properties": { + "ib_network": { + "type": "object", + "required": [ + "subnet", + "netmask_bits" + ], + "properties": { + "subnet": { + "type": "string", + "pattern": "^(?:(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})\\.){3}(?:25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})$" + }, + "netmask_bits": { + "type": "string", + "pattern": "^(1[0-9]|2[0-9]|[1-9])$|^3[0-2]$" + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false } ] } } } } + diff --git a/common/library/module_utils/input_validation/validation_flows/provision_validation.py b/common/library/module_utils/input_validation/validation_flows/provision_validation.py index e598bc155e..7eef7bef20 100644 --- a/common/library/module_utils/input_validation/validation_flows/provision_validation.py +++ b/common/library/module_utils/input_validation/validation_flows/provision_validation.py @@ -21,6 +21,7 @@ import itertools import csv import yaml +import ipaddress from ansible.module_utils.input_validation.common_utils import validation_utils from ansible.module_utils.input_validation.common_utils import config from ansible.module_utils.input_validation.common_utils import en_us_validation_msg @@ -744,6 +745,54 @@ def validate_network_spec( ) 
return errors + # Extract admin and IB parameters for cross-validation + admin_netmask_bits = None + admin_primary_ip = None + ib_netmask_bits = None + ib_subnet = None + ib_present = False + + for network in data["Networks"]: + if "admin_network" in network and isinstance(network["admin_network"], dict): + admin_net = network["admin_network"] + admin_netmask_bits = admin_net.get("netmask_bits", admin_netmask_bits) + admin_primary_ip = admin_net.get("primary_oim_admin_ip", admin_primary_ip) + + if "ib_network" in network and isinstance(network["ib_network"], dict): + ib_net = network["ib_network"] + # Consider IB network present only when config is non-empty + if ib_net: + ib_present = True + ib_netmask_bits = ib_net.get("netmask_bits", ib_netmask_bits) + ib_subnet = ib_net.get("subnet", ib_subnet) + + # If IB network is configured and both netmask bits are available, they must match + if ib_present and ib_netmask_bits and admin_netmask_bits and ib_netmask_bits != admin_netmask_bits: + errors.append( + create_error_msg( + "ib_network.netmask_bits", + ib_netmask_bits, + en_us_validation_msg.IB_NETMASK_BITS_MISMATCH_MSG, + ) + ) + + # If IB subnet and admin primary IP are available, ensure IB subnet is not in admin range + if ib_present and ib_subnet and admin_primary_ip and admin_netmask_bits: + try: + admin_network = ipaddress.IPv4Network(f"{admin_primary_ip}/{admin_netmask_bits}", strict=False) + ib_ip = ipaddress.IPv4Address(ib_subnet) + if ib_ip in admin_network: + errors.append( + create_error_msg( + "ib_network.subnet", + ib_subnet, + en_us_validation_msg.IB_SUBNET_IN_ADMIN_RANGE_MSG, + ) + ) + except ValueError: + # If IPs/netmask are invalid, rely on existing validations to report issues + pass + for network in data["Networks"]: errors.extend(_validate_admin_network(network)) @@ -941,3 +990,4 @@ def _validate_ip_ranges(dynamic_range, network_type, netmask_bits): ) return errors + diff --git a/input/network_spec.yml b/input/network_spec.yml index 
0bb3a5e196..4720c9bc35 100644 --- a/input/network_spec.yml +++ b/input/network_spec.yml @@ -33,6 +33,11 @@ # ntp_servers: # - { address: "172.16.10.80", type: "server" } +# 'ib_network' is a mandatory field, essential for IB network configuration. +# The 'ib_network' section contains the following variables: +# - 'subnet': The subnet of the IB network. +# - 'netmask_bits': The number of bits in the subnet mask. + Networks: - admin_network: oim_nic_name: "eno1" @@ -42,3 +47,7 @@ Networks: dynamic_range: "172.16.107.201-172.16.107.250" dns: [] ntp_servers: [] + +- ib_network: + subnet: "192.168.0.0" + netmask_bits: "24" From f015e987fb2481f149ee87b950d59ba22c47e271 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Tue, 13 Jan 2026 05:43:06 +0000 Subject: [PATCH 25/47] removing duplicate code --- .../cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 | 4 +--- .../templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 | 4 +--- .../cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 | 5 +---- input/config/x86_64/rhel/10.0/service_k8s.json | 1 + input/config/x86_64/rhel/10.0/slurm_custom.json | 1 + 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 7b71eaa8ca..722a77bda7 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -191,8 +191,6 @@ - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} # Ensure Slurm NFS root is mounted at client_mount_path (e.g. 
/share_omnia) - mkdir -p {{ client_mount_path }}/ssh @@ -340,4 +338,4 @@ - /root/ldms_sampler.sh {% endif %} - - echo "Cloud-Init has completed successfully." \ No newline at end of file + - echo "Cloud-Init has completed successfully." diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 62c6a9d745..6a1a890550 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -111,8 +111,6 @@ runcmd: - bash /usr/local/bin/doca-install.sh - /usr/local/bin/set-ssh.sh - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) - mkdir -p {{ client_mount_path }}/ssh @@ -195,4 +193,4 @@ - /root/ldms_sampler.sh {% endif %} - - echo "Cloud-Init has completed successfully." \ No newline at end of file + - echo "Cloud-Init has completed successfully." diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 37987686fe..d4a8c43808 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -257,9 +257,6 @@ runcmd: - /usr/local/bin/set-ssh.sh - - useradd -mG wheel -p '$6$VHdSKZNm$O3iFYmRiaFQCemQJjhfrpqqV7DdHBi5YpY6Aq06JSQpABPw.3d8PQ8bNY9NuZSmDv7IL/TsrhRJ6btkgKaonT.' testuser # Required?? 
- - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) - mkdir -p {{ client_mount_path }}/ssh @@ -338,4 +335,4 @@ - /root/ldms_sampler.sh {% endif %} - - echo "Cloud-Init has completed successfully." \ No newline at end of file + - echo "Cloud-Init has completed successfully." diff --git a/input/config/x86_64/rhel/10.0/service_k8s.json b/input/config/x86_64/rhel/10.0/service_k8s.json index 9512ae4d4d..afc073a19f 100644 --- a/input/config/x86_64/rhel/10.0/service_k8s.json +++ b/input/config/x86_64/rhel/10.0/service_k8s.json @@ -32,6 +32,7 @@ { "package": "docker.io/dellhpcomniaaisolution/ubuntu-ldms", "tag": "1.0", "type": "image" }, { "package": "strimzi-kafka-operator-helm-3-chart-0.48.0", "type": "tarball", "url": "https://github.com/strimzi/strimzi-kafka-operator/releases/download/0.48.0/strimzi-kafka-operator-helm-3-chart-0.48.0.tgz" }, { "package": "quay.io/strimzi/kafka-bridge", "tag": "0.33.1", "type": "image" }, + { "package": "apptainer", "type": "rpm", "repo_name": "epel" }, {"package": "doca-ofed", "type": "iso", "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm"} ] }, diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 90cb8ce541..71bf1bd809 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -5,6 +5,7 @@ {"package": "firewalld", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "python3-firewall", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"}, + {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, {"package": "doca-ofed", "type": "iso", "url": 
"https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.x86_64.rpm" From 684cfc6b7332d195981c84cb1e6dd91fc9d75cc7 Mon Sep 17 00:00:00 2001 From: vasanthsathya <106577201+vasanthsathya@users.noreply.github.com> Date: Tue, 13 Jan 2026 11:27:42 +0530 Subject: [PATCH 26/47] Update README.md Update README.md Signed-off-by: vasanthsathya <106577201+vasanthsathya@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 02fe78df05..248842a4c3 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Omnia 1.x Documentation is hosted on [Read The Docs 1.x](https://omnia-doc.readt Omnia 2.x Documentation is hosted on [Read The Docs 2.x](https://omnia.readthedocs.io/en/latest/index.html). -Current Status: ![GitHub](https://readthedocs.org/projects/omnia-doc/badge/?version=latest) +Current Status: ![GitHub](https://readthedocs.org/projects/omnia/badge/?version=latest) ## Licensing From e770d864d3136c3311105e8dcc9c175135d686fb Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Tue, 13 Jan 2026 13:08:57 +0000 Subject: [PATCH 27/47] variablize filenames --- .../tasks/create_k8s_config_nfs.yml | 2 +- discovery/roles/k8s_config/vars/main.yml | 2 ++ .../tasks/configure_oim_ssh.yml | 23 +++++++-------- .../tasks/read_nodes_yaml.yml | 29 +++++++------------ .../roles/passwordless_ssh/vars/main.yml | 2 ++ .../slurm_config/tasks/create_slurm_dir.yml | 2 +- discovery/roles/slurm_config/vars/main.yml | 2 ++ 7 files changed, 29 insertions(+), 33 deletions(-) diff --git a/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml b/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml index f4360648df..40e9328cdd 100644 --- a/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml +++ b/discovery/roles/k8s_config/tasks/create_k8s_config_nfs.yml @@ -49,7 +49,7 @@ - name: Copy OIM private key to K8s share for node-to-node SSH ansible.builtin.copy: - src: 
/root/.ssh/oim_rsa + src: "{{ ssh_private_key_path }}" dest: "{{ k8s_client_mount_path }}/ssh/oim_rsa" owner: root group: root diff --git a/discovery/roles/k8s_config/vars/main.yml b/discovery/roles/k8s_config/vars/main.yml index ef843e2c28..433b8e9f76 100644 --- a/discovery/roles/k8s_config/vars/main.yml +++ b/discovery/roles/k8s_config/vars/main.yml @@ -92,3 +92,5 @@ offline_path_aarch64: - name: doca-ofed source_path: "{{ offline_repo_basepath_aarch64 }}/doca-ofed" dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" + +ssh_private_key_path: /root/.ssh/oim_rsa diff --git a/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml b/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml index 57df8990a9..6c7c297724 100644 --- a/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml +++ b/discovery/roles/passwordless_ssh/tasks/configure_oim_ssh.yml @@ -55,9 +55,6 @@ | unique }} -- name: DEBUG OIM SSH match list - ansible.builtin.debug: - var: omnia_cluster_ssh_matches - name: Manage /etc/hosts entries on OIM for Omnia cluster nodes ansible.builtin.blockinfile: @@ -71,13 +68,13 @@ {% endfor %} when: omnia_hosts_map | default({}) | length > 0 -- name: DEBUG configure_oim_ssh facts - ansible.builtin.debug: - msg: - k8s_cluster_hostnames: "{{ k8s_cluster_hostnames | default([]) }}" - slurm_cluster_hostnames: "{{ slurm_cluster_hostnames | default([]) }}" - omnia_cluster_ip_patterns_raw: "{{ omnia_cluster_ip_patterns_raw | default([]) }}" - omnia_cluster_ip_patterns: "{{ omnia_cluster_ip_patterns | default([]) }}" - omnia_cluster_hostname_patterns: "{{ omnia_cluster_hostname_patterns | default([]) }}" - omnia_cluster_ssh_matches: "{{ omnia_cluster_ssh_matches | default([]) }}" - omnia_hosts_map: "{{ omnia_hosts_map | default({}) }}" +# - name: DEBUG configure_oim_ssh facts + # ansible.builtin.debug: + # msg: + # k8s_cluster_hostnames: "{{ k8s_cluster_hostnames | default([]) }}" + # slurm_cluster_hostnames: "{{ slurm_cluster_hostnames | default([]) }}" + 
# omnia_cluster_ip_patterns_raw: "{{ omnia_cluster_ip_patterns_raw | default([]) }}" + # omnia_cluster_ip_patterns: "{{ omnia_cluster_ip_patterns | default([]) }}" + # omnia_cluster_hostname_patterns: "{{ omnia_cluster_hostname_patterns | default([]) }}" + # omnia_cluster_ssh_matches: "{{ omnia_cluster_ssh_matches | default([]) }}" + # omnia_hosts_map: "{{ omnia_hosts_map | default({}) }}" diff --git a/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml b/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml index c10e03155a..093ce44790 100644 --- a/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml +++ b/discovery/roles/passwordless_ssh/tasks/read_nodes_yaml.yml @@ -95,16 +95,9 @@ | join(' ') }} -- name: DEBUG nodes.yaml groups, hostnames and admin IPs - ansible.builtin.debug: - msg: - omnia_nodes_yaml_path: "{{ omnia_nodes_yaml_path }}" - omnia_nodes_groups_from_yaml: "{{ omnia_nodes_groups_from_yaml | default([]) }}" - all_group_names_present: "{{ all_group_names_present | default(false) }}" - - name: Configure SSH on OIM with Host * when all groups are present in nodes.yaml ansible.builtin.blockinfile: - path: /root/.ssh/config + path: "{{ ssh_private_key_path }}" create: true mode: '0600' marker: "# {mark} OMNIA_CLUSTER_SSH" @@ -116,7 +109,7 @@ - name: Configure SSH on OIM with derived hostname/IP patterns when groups are incomplete ansible.builtin.blockinfile: - path: /root/.ssh/config + path: "{{ ssh_private_key_path }}" create: true mode: '0600' marker: "# {mark} OMNIA_CLUSTER_SSH" @@ -132,12 +125,12 @@ - not all_group_names_present | default(false) | bool - omnia_cluster_ssh_matches | default([]) | length > 0 -- name: DEBUG summary from read_nodes_yaml flow - ansible.builtin.debug: - msg: - omnia_nodes_yaml_path: "{{ omnia_nodes_yaml_path }}" - omnia_nodes_groups_from_yaml: "{{ omnia_nodes_groups_from_yaml | default([]) }}" - all_group_names_present: "{{ all_group_names_present | default(false) }}" - omnia_cluster_ssh_matches: "{{ 
omnia_cluster_ssh_matches | default([]) }}" - k8s_ssh_patterns: "{{ k8s_ssh_patterns | default('') }}" - slurm_ssh_patterns: "{{ slurm_ssh_patterns | default('') }}" +# - name: DEBUG summary from read_nodes_yaml flow + # ansible.builtin.debug: + # msg: + # omnia_nodes_yaml_path: "{{ omnia_nodes_yaml_path }}" + # omnia_nodes_groups_from_yaml: "{{ omnia_nodes_groups_from_yaml | default([]) }}" + # all_group_names_present: "{{ all_group_names_present | default(false) }}" + # omnia_cluster_ssh_matches: "{{ omnia_cluster_ssh_matches | default([]) }}" + # k8s_ssh_patterns: "{{ k8s_ssh_patterns | default('') }}" + # SLURM_SSH_patterns: "{{ slurm_ssh_patterns | default('') }}" diff --git a/discovery/roles/passwordless_ssh/vars/main.yml b/discovery/roles/passwordless_ssh/vars/main.yml index 737e68431b..2aa99da9e0 100644 --- a/discovery/roles/passwordless_ssh/vars/main.yml +++ b/discovery/roles/passwordless_ssh/vars/main.yml @@ -41,3 +41,5 @@ omnia_optional_groups_from_nodes_yaml: - service_kube_control_plane_first_aarch64 - service_kube_control_plane_aarch64 - service_kube_node_aarch64 + +ssh_private_key_path: /root/.ssh/oim_rsa diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index bc533351b7..8986434547 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -383,7 +383,7 @@ - name: Copy OIM private key to Slurm share for node-to-node SSH ansible.builtin.copy: - src: /root/.ssh/oim_rsa + src: "{{ ssh_private_key_path }}" dest: "{{ slurm_config_path }}/ssh/oim_rsa" owner: root group: root diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 1ca1ed0805..ca9f650168 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -122,3 +122,5 @@ offline_path_aarch64: - name: doca-ofed source_path: "{{ 
offline_repo_basepath_aarch64 }}/doca-ofed" dest_path: "{{ packages_base_dir_aarch64 }}/doca-ofed" + +ssh_private_key_path: /root/.ssh/oim_rsa From 078997ed1276d978e3bc5b0febc1877c93cc2dd0 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Wed, 14 Jan 2026 04:26:11 -0600 Subject: [PATCH 28/47] extract cuda in nfs --- ...-group-login_compiler_node_aarch64.yaml.j2 | 83 +++++++--------- ...i-group-login_compiler_node_x86_64.yaml.j2 | 95 +++++++------------ 2 files changed, 64 insertions(+), 114 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index c596b01661..95036b1add 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -15,26 +15,9 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" - - name: {{ slurm_user }} - uid: {{ slurm_uid }} - system: true - no_create_home: true - shell: /sbin/nologin disable_root: false write_files: - - path: /usr/local/bin/doca-install.sh - owner: root:root - permissions: '{{ file_mode_755 }}' - content: | - {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} - - - path: /usr/local/bin/configure-ib-network.sh - owner: root:root - permissions: '{{ file_mode_755 }}' - content: | - {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} - - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -95,25 +78,37 @@ exit 1 fi - echo "[INFO] Installing CUDA toolkit..." + echo "[INFO] Setting up shared CUDA directory..." 
+ # Create and mount shared directory for compute nodes + mkdir -p /shared-cuda-toolkit + mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit + + if [ $? -ne 0 ]; then + echo "[ERROR] Failed to mount NFS cuda share. Exiting." + umount /cuda-runfile 2>/dev/null + exit 1 + fi + + echo "[INFO] Installing CUDA toolkit directly to shared NFS location..." if [ -f "/cuda-runfile/cuda_13.0.2_580.95.05_linux_sbsa.run" ]; then - # Install only the toolkit component - bash /cuda-runfile/cuda_13.0.2_580.95.05_linux_sbsa.run --silent --toolkit --toolkitpath=/usr/local/cuda --override + mkdir -p /shared-cuda-toolkit/tmp + # Install toolkit directly to the NFS-mounted shared location + bash /cuda-runfile/cuda_13.0.2_580.95.05_linux_sbsa.run --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override if [ $? -eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully." + echo "[SUCCESS] CUDA toolkit installed successfully to shared location." - # Set up environment variables + # Set up environment variables pointing to shared location cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/usr/local/cuda/bin:$PATH - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/usr/local/cuda + export PATH=/shared-cuda-toolkit/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit ENDOFFILE # Apply environment variables for current session - export PATH=/usr/local/cuda/bin:$PATH - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/usr/local/cuda + export PATH=/shared-cuda-toolkit/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit echo "[INFO] CUDA environment configured" else @@ -132,17 +127,6 @@ echo "[ERROR] CUDA toolkit (nvcc) not found after installation." fi - echo "[INFO] Setting up shared CUDA directory for compute nodes..." 
- # Create shared directory for compute nodes to mount - mkdir -p /shared-cuda-toolkit - # Mount the shared NFS location where compute nodes will access the toolkit - mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit - - echo "[INFO] Copying CUDA toolkit to shared location..." - # Copy the installed CUDA toolkit to the shared location for compute nodes - #rsync -av /usr/local/cuda/ /shared-cuda-toolkit/ --exclude='*.a' --exclude='doc/' - cp -r /usr/local/cuda/* /shared-cuda-toolkit/ 2>/dev/null || true - echo "[INFO] Cleaning up temporary mounts..." umount /cuda-runfile 2>/dev/null rmdir /cuda-runfile 2>/dev/null @@ -188,35 +172,31 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh + - groupadd -r {{ slurm_group_name }} + - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track /var/lib/packages + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab 
- mount -a - - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm + - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm + - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm + - chown -R {{ user }}:{{ slurm_group_name }} /var/spool + - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd + - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd - setenforce 0 - systemctl enable firewalld - systemctl start firewalld @@ -336,3 +316,4 @@ - /root/ldms_sampler.sh {% endif %} - echo "Cloud-Init has completed successfully." 
+ diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 45735d0122..d64e91ab30 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -15,26 +15,9 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" - - name: {{ slurm_user }} - uid: {{ slurm_uid }} - system: true - no_create_home: true - shell: /sbin/nologin disable_root: false write_files: - - path: /usr/local/bin/doca-install.sh - owner: root:root - permissions: '{{ file_mode_755 }}' - content: | - {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} - - - path: /usr/local/bin/configure-ib-network.sh - owner: root:root - permissions: '{{ file_mode_755 }}' - content: | - {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} - - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -69,13 +52,7 @@ fi done fi - - - path: /root/.ssh/config - permissions: '0600' - content: | - Host {{ slurm_control_ssh_patterns }} - IdentityFile {{ client_mount_path }}/ssh/oim_rsa - IdentitiesOnly yes + - path: /usr/local/bin/install_cuda_toolkit.sh permissions: '0755' @@ -102,25 +79,37 @@ exit 1 fi - echo "[INFO] Installing CUDA toolkit..." + echo "[INFO] Setting up shared CUDA directory..." + # Create and mount shared directory for compute nodes + mkdir -p /shared-cuda-toolkit + mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit + + if [ $? -ne 0 ]; then + echo "[ERROR] Failed to mount NFS cuda share. Exiting." + umount /cuda-runfile 2>/dev/null + exit 1 + fi + + echo "[INFO] Installing CUDA toolkit directly to shared NFS location..." 
if [ -f "/cuda-runfile/cuda_13.0.2_580.95.05_linux.run" ]; then - # Install only the toolkit component - bash /cuda-runfile/cuda_13.0.2_580.95.05_linux.run --silent --toolkit --toolkitpath=/usr/local/cuda --override + mkdir -p /shared-cuda-toolkit/tmp + # Install toolkit directly to the NFS-mounted shared location + bash /cuda-runfile/cuda_13.0.2_580.95.05_linux.run --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override if [ $? -eq 0 ]; then - echo "[SUCCESS] CUDA toolkit installed successfully." + echo "[SUCCESS] CUDA toolkit installed successfully to shared location." - # Set up environment variables + # Set up environment variables pointing to shared location cat > /etc/profile.d/cuda.sh << 'ENDOFFILE' - export PATH=/usr/local/cuda/bin:$PATH - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/usr/local/cuda + export PATH=/shared-cuda-toolkit/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit ENDOFFILE # Apply environment variables for current session - export PATH=/usr/local/cuda/bin:$PATH - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export CUDA_HOME=/usr/local/cuda + export PATH=/shared-cuda-toolkit/bin:$PATH + export LD_LIBRARY_PATH=/shared-cuda-toolkit/lib64:$LD_LIBRARY_PATH + export CUDA_HOME=/shared-cuda-toolkit echo "[INFO] CUDA environment configured" else @@ -139,17 +128,6 @@ echo "[ERROR] CUDA toolkit (nvcc) not found after installation." fi - echo "[INFO] Setting up shared CUDA directory for compute nodes..." - # Create shared directory for compute nodes to mount - mkdir -p /shared-cuda-toolkit - # Mount the shared NFS location where compute nodes will access the toolkit - mount -t nfs {{ cloud_init_nfs_path }}/cuda/ /shared-cuda-toolkit - - echo "[INFO] Copying CUDA toolkit to shared location..." 
- # Copy the installed CUDA toolkit to the shared location for compute nodes - #rsync -av /usr/local/cuda/ /shared-cuda-toolkit/ --exclude='*.a' --exclude='doc/' - cp -r /usr/local/cuda/* /shared-cuda-toolkit/ 2>/dev/null || true - echo "[INFO] Cleaning up temporary mounts..." umount /cuda-runfile 2>/dev/null rmdir /cuda-runfile 2>/dev/null @@ -196,11 +174,10 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh + - groupadd -r {{ slurm_group_name }} + - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) - - mkdir -p {{ client_mount_path }}/ssh - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages - + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab @@ -208,29 +185,21 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - - chmod {{ file_mode }} /etc/fstab - mount -a - - cp /cert/pulp_webserver.crt 
/etc/pki/ca-trust/source/anchors && update-ca-trust - - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm + - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm + - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm + - chown -R {{ user }}:{{ slurm_group_name }} /var/spool + - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd + - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd - setenforce 0 - systemctl enable firewalld - systemctl start firewalld From ddc00f89bdf848459b21823bc08ddb8de124605a Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 14 Jan 2026 10:26:18 +0000 Subject: [PATCH 29/47] making path changes --- .../ci-group-login_compiler_node_x86_64.yaml.j2 | 6 +++--- .../cloud_init/ci-group-login_node_x86_64.yaml.j2 | 6 +++--- .../cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 | 6 +++--- .../cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 
b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 45735d0122..8fa57102ff 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -74,7 +74,7 @@ permissions: '0600' content: | Host {{ slurm_control_ssh_patterns }} - IdentityFile {{ client_mount_path }}/ssh/oim_rsa + IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa IdentitiesOnly yes - path: /usr/local/bin/install_cuda_toolkit.sh @@ -198,7 +198,7 @@ - /usr/local/bin/install_cuda_toolkit.sh # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) - - mkdir -p {{ client_mount_path }}/ssh + - mkdir -p {{ client_mount_path }}/slurm/ssh - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab @@ -208,7 +208,7 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/ssh nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 
ae5fe22eae..310c4914b3 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -77,7 +77,7 @@ permissions: '0600' content: | Host {{ slurm_control_ssh_patterns }} - IdentityFile {{ client_mount_path }}/ssh/oim_rsa + IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa IdentitiesOnly yes {% if hostvars['localhost']['openldap_support'] %} @@ -119,7 +119,7 @@ # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) - - mkdir -p {{ client_mount_path }}/ssh + - mkdir -p {{ client_mount_path }}/slurm/ssh - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab @@ -128,7 +128,7 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/ssh nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 94b0d286c0..fa13e9e5ac 100644 --- 
a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -75,7 +75,7 @@ permissions: '0600' content: | Host {{ slurm_control_ssh_patterns }} - IdentityFile {{ client_mount_path }}/ssh/oim_rsa + IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa IdentitiesOnly yes {% if hostvars['localhost']['openldap_support'] %} @@ -265,7 +265,7 @@ # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) - - mkdir -p {{ client_mount_path }}/ssh + - mkdir -p {{ client_mount_path }}/slurm/ssh # slurm user and group created in the users module # Create directories for nfs and mount all @@ -280,7 +280,7 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/ssh nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 4085e52918..d9c64c48fd 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -77,7 +77,7 @@ permissions: '0600' content: | Host {{ slurm_control_ssh_patterns }} - IdentityFile {{ client_mount_path }}/ssh/oim_rsa + IdentityFile {{ 
client_mount_path }}/slurm/ssh/oim_rsa IdentitiesOnly yes - path: /usr/local/bin/install_nvidia_driver.sh @@ -254,7 +254,7 @@ echo "[INFO] ===== Starting directory creation and NFS mounts for Pulp cert, Slurm and Munge =====" # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) - mkdir -p {{ client_mount_path }}/ssh + mkdir -p {{ client_mount_path }}/slurm/ssh echo "[INFO] Creating base directories for Pulp cert, Slurm and Munge" mkdir -pv /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages @@ -266,7 +266,7 @@ echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/ssh nfs defaults,_netdev 0 0" >> /etc/fstab + echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/slurm/ssh nfs defaults,_netdev 0 0" >> /etc/fstab chmod {{ file_mode }} /etc/fstab echo "[INFO] Mounting all NFS entries from /etc/fstab" @@ -454,4 +454,4 @@ - /root/ldms_sampler.sh {% endif %} - - echo "Cloud-Init has completed successfully." \ No newline at end of file + - echo "Cloud-Init has completed successfully." 
From 64d4b28944b7a0a9980473ce5d9bfd4d01ff80cf Mon Sep 17 00:00:00 2001 From: Nagachandan P Date: Wed, 14 Jan 2026 16:38:16 +0530 Subject: [PATCH 30/47] Update ci-group-login_compiler_node_aarch64.yaml.j2 Signed-off-by: Nagachandan P --- ...-group-login_compiler_node_aarch64.yaml.j2 | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 95036b1add..7824101db8 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -15,9 +15,26 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -172,31 +189,35 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /var/log/track + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d 
/etc/munge /var/log/track /var/lib/packages - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm/epilog.d /etc/slurm/epilog.d nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab - chmod {{ file_mode }} /etc/fstab - mount -a + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ 
file_mode_755 }} /etc/slurm/epilog.d/ - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd - setenforce 0 - systemctl enable firewalld - systemctl start firewalld @@ -316,4 +337,3 @@ - /root/ldms_sampler.sh {% endif %} - echo "Cloud-Init has completed successfully." - From 34aea3713606e3857bc681c2793bfae08a66fff7 Mon Sep 17 00:00:00 2001 From: Nagachandan P Date: Wed, 14 Jan 2026 16:45:00 +0530 Subject: [PATCH 31/47] Update ci-group-login_compiler_node_x86_64.yaml.j2 Signed-off-by: Nagachandan P --- ...i-group-login_compiler_node_x86_64.yaml.j2 | 50 +++++++++++++++---- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index d64e91ab30..88a6be7eee 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -15,9 +15,26 @@ ssh_authorized_keys: "{{ read_ssh_key.stdout }}" lock_passwd: false hashed_passwd: "{{ hashed_password_output.stdout }}" + - name: {{ slurm_user }} + uid: {{ slurm_uid }} + system: true + no_create_home: true + shell: /sbin/nologin disable_root: false write_files: + - path: /usr/local/bin/doca-install.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/doca-install.sh.j2') | indent(12) }} + + - path: /usr/local/bin/configure-ib-network.sh + owner: root:root + permissions: '{{ file_mode_755 }}' + content: | + {{ lookup('template', 'templates/doca-ofed/configure-ib-network.sh.j2') | indent(12) }} + - path: /usr/local/bin/set-ssh.sh permissions: '0755' content: | @@ -52,7 
+69,13 @@ fi done fi - + + - path: /root/.ssh/config + permissions: '0600' + content: | + Host {{ slurm_control_ssh_patterns }} + IdentityFile {{ client_mount_path }}/ssh/oim_rsa + IdentitiesOnly yes - path: /usr/local/bin/install_cuda_toolkit.sh permissions: '0755' @@ -174,10 +197,11 @@ runcmd: - /usr/local/bin/set-ssh.sh - /usr/local/bin/install_cuda_toolkit.sh - - groupadd -r {{ slurm_group_name }} - - useradd -r -g {{ slurm_group_name }} -d {{ home_dir }} -s /sbin/nologin {{ user }} - - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track + # Ensure Slurm NFS root is mounted at client_mount_path (e.g. /share_omnia) + - mkdir -p {{ client_mount_path }}/ssh + - mkdir -p /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm /etc/slurm/epilog.d /etc/munge /cert /var/log/track /var/lib/packages + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab @@ -185,21 +209,29 @@ - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/ssh {{ client_mount_path }}/ssh nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab + - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab + - chmod {{ file_mode }} /etc/fstab - mount -a + - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust + - sed -i 's/^gpgcheck=1/gpgcheck=0/' 
/etc/dnf/dnf.conf + - bash /usr/local/bin/doca-install.sh + - bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - - chown -R {{ user }}:{{ slurm_group_name }} /var/log/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/run/slurm - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool - - chown -R {{ user }}:{{ slurm_group_name }} /var/lib/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/run/slurm + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/lib/slurm - chown -R {{ munge_user }}:{{ munge_group }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /var/log/slurm /var/run/slurm /var/spool /var/lib/slurm - chmod {{ file_mode_400 }} /etc/munge/munge.key - chmod {{ file_mode_755 }} /etc/slurm/epilog.d/ - mkdir -p /var/spool/slurmd - chmod {{ file_mode_755 }} /var/spool/slurmd - - chown -R {{ user }}:{{ slurm_group_name }} /var/spool/slurmd + - chown -R {{ slurm_user }}:{{ slurm_user }} /var/spool/slurmd - setenforce 0 - systemctl enable firewalld - systemctl start firewalld From e3dc75a5b43dfdb1977a152beb417dff9f8f8eda Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Wed, 14 Jan 2026 11:22:07 +0000 Subject: [PATCH 32/47] adding the repo for apptainer --- input/config/aarch64/rhel/10.0/slurm_custom.json | 1 + 1 file changed, 1 insertion(+) diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index 77e4ab3eb4..3292aeab7d 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -7,6 +7,7 @@ {"package": "python3-firewall", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "pmix", "type": "rpm", "repo_name": "aarch64_appstream"}, {"package": "pmix-devel", "type": "rpm", "repo_name": 
"aarch64_appstream"}, + {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, {"package": "doca-ofed", "type": "iso", "url": "https://www.mellanox.com/downloads/DOCA/DOCA_v3.2.1/host/doca-host-3.2.1-044000_25.10_rhel10.aarch64.rpm" From 66661de049e478125ea78f187e85cb5339c41ad7 Mon Sep 17 00:00:00 2001 From: Vrinda_Marwah Date: Wed, 14 Jan 2026 11:26:27 +0000 Subject: [PATCH 33/47] add set pipefail to doca-ofed script Signed-off-by: Vrinda_Marwah --- .../cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 | 3 +-- .../cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 | 3 +-- .../templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 | 3 +-- .../templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 | 3 +-- .../ci-group-service_kube_control_plane_first_x86_64.yaml.j2 | 3 +-- .../ci-group-service_kube_control_plane_x86_64.yaml.j2 | 3 +-- .../cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 | 3 +-- .../cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 | 3 +-- .../templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 | 3 +-- .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 3 +-- .../configure_ochami/templates/doca-ofed/doca-install.sh.j2 | 2 ++ 11 files changed, 12 insertions(+), 20 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 7824101db8..569b0c7772 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -203,8 +203,7 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh + - bash 
/usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 88a6be7eee..1474d9cd67 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -217,8 +217,7 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index 40db6718ca..3b67afc108 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -124,8 +124,7 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash 
/usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index ae5fe22eae..182a88fe20 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -137,8 +137,7 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - yes | cp /etc/slurm/epilog.d/slurmd.service /usr/lib/systemd/system/ - /usr/local/bin/check_slurm_controller_status.sh - chown -R {{ slurm_user }}:{{ slurm_user }} /var/log/slurm diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 index 700d645d91..14616e9226 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_first_x86_64.yaml.j2 @@ -414,8 +414,7 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash 
/usr/local/bin/configure-ib-network.sh - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 index 340055cf04..c27216fcdf 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_control_plane_x86_64.yaml.j2 @@ -322,8 +322,7 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 index 0316b19537..7f115c766d 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-service_kube_node_x86_64.yaml.j2 @@ -225,8 +225,7 @@ - cp {{ k8s_client_mount_path }}/pulp_webserver.crt /etc/pki/ca-trust/source/anchors - update-ca-trust extract - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - systemctl start crio.service - systemctl enable crio.service - sudo systemctl enable --now kubelet diff 
--git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index 94b0d286c0..9e4d786808 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -285,8 +285,7 @@ - mount -a - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - chown -R {{ slurm_user }}:{{ slurm_user }} {{ home_dir }} - chmod {{ file_mode_755 }} {{ home_dir }} diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 08c92ea069..aa4763dea5 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -395,8 +395,7 @@ - /usr/local/bin/configure_dirs_and_mounts.sh - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 
4085e52918..5a53f28ae0 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -409,8 +409,7 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - - bash /usr/local/bin/doca-install.sh - - bash /usr/local/bin/configure-ib-network.sh + - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh - /usr/local/bin/configure_slurmd_setup.sh - /usr/local/bin/configure_munge_and_pam.sh diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 index 58bc88c25e..111abcb3a1 100644 --- a/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 +++ b/discovery/roles/configure_ochami/templates/doca-ofed/doca-install.sh.j2 @@ -1,4 +1,6 @@ #!/bin/bash +set -euo pipefail + # Optimize firewall ports declaration later DOCA_FIREWALL_PORTS=( "18515-18520/tcp" From 667006180056962c68de0d2c2be8b7aad99e9eb0 Mon Sep 17 00:00:00 2001 From: Vrinda Marwah Date: Wed, 14 Jan 2026 17:03:01 +0530 Subject: [PATCH 34/47] Update ansible-lint.yml Signed-off-by: Vrinda Marwah --- .github/workflows/ansible-lint.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ansible-lint.yml b/.github/workflows/ansible-lint.yml index 7597ea0905..35ab68f037 100644 --- a/.github/workflows/ansible-lint.yml +++ b/.github/workflows/ansible-lint.yml @@ -12,6 +12,7 @@ on: - pub/ochami_aarch64 - pub/k8s_telemetry - pub/ib_support + - pub/v2.1_rc1 jobs: build: From 05c11462724f8caea899f7d31af18f4262dd2e1f Mon Sep 17 00:00:00 2001 From: Vrinda Marwah Date: Wed, 14 Jan 2026 17:03:18 +0530 Subject: [PATCH 35/47] Update pylint.yml Signed-off-by: Vrinda Marwah --- .github/workflows/pylint.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 41d43ab2c1..c9544dc16c 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -11,6 +11,7 @@ on: - pub/ochami_aarch64 - pub/k8s_telemetry - pub/ib_support + - pub/v2.1_rc1 jobs: build: From be913495538b90fd1439abd2046040a0b1b64681 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Fri, 16 Jan 2026 06:22:40 +0000 Subject: [PATCH 36/47] variablize the cuda version --- .../cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 | 4 ++-- .../cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 | 4 ++-- .../cloud_init/ci-group-slurm_node_aarch64.yaml.j2 | 4 ++-- .../templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 | 4 ++-- discovery/roles/configure_ochami/vars/main.yml | 6 ++++++ 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 7824101db8..83d1d59420 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -107,10 +107,10 @@ fi echo "[INFO] Installing CUDA toolkit directly to shared NFS location..." - if [ -f "/cuda-runfile/cuda_13.0.2_580.95.05_linux_sbsa.run" ]; then + if [ -f "/cuda-runfile/{{ cuda_runfile_aarch64 }}" ]; then mkdir -p /shared-cuda-toolkit/tmp # Install toolkit directly to the NFS-mounted shared location - bash /cuda-runfile/cuda_13.0.2_580.95.05_linux_sbsa.run --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override + bash /cuda-runfile/{{ cuda_runfile_aarch64 }} --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override if [ $? 
-eq 0 ]; then echo "[SUCCESS] CUDA toolkit installed successfully to shared location." diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 88a6be7eee..ae6c25ed60 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -114,10 +114,10 @@ fi echo "[INFO] Installing CUDA toolkit directly to shared NFS location..." - if [ -f "/cuda-runfile/cuda_13.0.2_580.95.05_linux.run" ]; then + if [ -f "/cuda-runfile/{{ cuda_runfile_x86_64 }}" ]; then mkdir -p /shared-cuda-toolkit/tmp # Install toolkit directly to the NFS-mounted shared location - bash /cuda-runfile/cuda_13.0.2_580.95.05_linux.run --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override + bash /cuda-runfile/{{ cuda_runfile_x86_64 }} --silent --toolkit --tmpdir=/shared-cuda-toolkit/tmp --toolkitpath=/shared-cuda-toolkit --override if [ $? -eq 0 ]; then echo "[SUCCESS] CUDA toolkit installed successfully to shared location." diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 08c92ea069..8c47ad1827 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -104,8 +104,8 @@ fi echo "[INFO] Installing NVIDIA driver..." 
- if [ -f "/gpu-runfile/cuda_13.0.2_580.95.05_linux_sbsa.run" ]; then - bash /gpu-runfile/cuda_13.0.2_580.95.05_linux_sbsa.run --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build + if [ -f "/gpu-runfile/{{ cuda_runfile_aarch64 }}" ]; then + bash /gpu-runfile/{{ cuda_runfile_aarch64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then echo "[SUCCESS] NVIDIA driver installed successfully." else diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index 4085e52918..cfa9f3f520 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -112,8 +112,8 @@ fi echo "[INFO] Installing NVIDIA driver..." - if [ -f "/gpu-runfile/cuda_13.0.2_580.95.05_linux.run" ]; then - bash /gpu-runfile/cuda_13.0.2_580.95.05_linux.run --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build + if [ -f "/gpu-runfile/{{ cuda_runfile_x86_64 }}" ]; then + bash /gpu-runfile/{{ cuda_runfile_x86_64 }} --silent --driver --no-opengl-libs --kernel-source-path=/lib/modules/$(uname -r)/build if [ $? -eq 0 ] && command -v nvidia-smi &>/dev/null; then echo "[SUCCESS] NVIDIA driver installed successfully." 
else diff --git a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml index aee44aaf4e..6c6678fdae 100644 --- a/discovery/roles/configure_ochami/vars/main.yml +++ b/discovery/roles/configure_ochami/vars/main.yml @@ -101,3 +101,9 @@ k8s_control_ssh_patterns: "{{ hostvars['oim']['k8s_ssh_patterns'] | default('*') # Passwordless SSH mode flag derived from nodes.yaml (set on OIM by passwordless_ssh role) all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | default(false) }}" + +# CUDA/NVIDIA versions +cuda_toolkit_version: "13.0.2" +cuda_driver_version: "580.95.05" +cuda_runfile_x86_64: "cuda_{{ cuda_toolkit_version }}_{{ cuda_driver_version }}_linux.run" +cuda_runfile_aarch64: "cuda_{{ cuda_toolkit_version }}_{{ cuda_driver_version }}_linux_sbsa.run" From eeda08fc3300a155057941920cf43844548e3238 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Mon, 19 Jan 2026 06:41:08 +0000 Subject: [PATCH 37/47] dynamic extraction of cuda version --- .../roles/configure_ochami/vars/main.yml | 8 +++---- .../slurm_config/tasks/create_slurm_dir.yml | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/discovery/roles/configure_ochami/vars/main.yml b/discovery/roles/configure_ochami/vars/main.yml index 6c6678fdae..8c7ff96a82 100644 --- a/discovery/roles/configure_ochami/vars/main.yml +++ b/discovery/roles/configure_ochami/vars/main.yml @@ -102,8 +102,6 @@ k8s_control_ssh_patterns: "{{ hostvars['oim']['k8s_ssh_patterns'] | default('*') # Passwordless SSH mode flag derived from nodes.yaml (set on OIM by passwordless_ssh role) all_group_names_present: "{{ hostvars['oim']['all_group_names_present'] | default(false) }}" -# CUDA/NVIDIA versions -cuda_toolkit_version: "13.0.2" -cuda_driver_version: "580.95.05" -cuda_runfile_x86_64: "cuda_{{ cuda_toolkit_version }}_{{ cuda_driver_version }}_linux.run" -cuda_runfile_aarch64: "cuda_{{ cuda_toolkit_version }}_{{ cuda_driver_version 
}}_linux_sbsa.run" +# CUDA/NVIDIA runfile names (extracted from slurm_custom.json in slurm_config role) +cuda_runfile_x86_64: "{{ hostvars['oim']['cuda_runfile_x86_64'] | default('cuda_13.0.2_580.95.05_linux.run') }}" +cuda_runfile_aarch64: "{{ hostvars['oim']['cuda_runfile_aarch64'] | default('cuda_13.0.2_580.95.05_linux_sbsa.run') }}" diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index 8986434547..a9911ea42b 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -18,6 +18,30 @@ - name: Include storage vars ansible.builtin.include_vars: "{{ input_project_dir }}/storage_config.yml" +- name: Load slurm_custom.json for x86_64 + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/config/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" + name: slurm_custom_x86_64 + ignore_errors: true + +- name: Load slurm_custom.json for aarch64 + ansible.builtin.include_vars: + file: "{{ input_project_dir }}/config/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" + name: slurm_custom_aarch64 + ignore_errors: true + +- name: Extract CUDA runfile name for x86_64 from slurm_custom.json + ansible.builtin.set_fact: + cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" + when: slurm_custom_x86_64 is defined and slurm_custom_x86_64.slurm_node is defined + ignore_errors: true + +- name: Extract CUDA runfile name for aarch64 from slurm_custom.json + ansible.builtin.set_fact: + cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" + when: slurm_custom_aarch64 is defined and slurm_custom_aarch64.slurm_node is defined + ignore_errors: true + - name: Set facts for slurm 
ansible.builtin.set_fact: nfs_storage_name: "{{ slurm_cluster[0].nfs_storage_name }}" From e392595038cbb4be25188d26f422f664794e9662 Mon Sep 17 00:00:00 2001 From: Nagachandan-P Date: Mon, 19 Jan 2026 06:49:47 +0000 Subject: [PATCH 38/47] lint issue fixed --- .../slurm_config/tasks/create_slurm_dir.yml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml index a9911ea42b..48b39097fe 100644 --- a/discovery/roles/slurm_config/tasks/create_slurm_dir.yml +++ b/discovery/roles/slurm_config/tasks/create_slurm_dir.yml @@ -22,25 +22,29 @@ ansible.builtin.include_vars: file: "{{ input_project_dir }}/config/x86_64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" name: slurm_custom_x86_64 - ignore_errors: true + failed_when: false - name: Load slurm_custom.json for aarch64 ansible.builtin.include_vars: file: "{{ input_project_dir }}/config/aarch64/rhel/{{ hostvars['localhost']['cluster_os_version'] }}/slurm_custom.json" name: slurm_custom_aarch64 - ignore_errors: true + failed_when: false - name: Extract CUDA runfile name for x86_64 from slurm_custom.json ansible.builtin.set_fact: cuda_runfile_x86_64: "{{ (slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" - when: slurm_custom_x86_64 is defined and slurm_custom_x86_64.slurm_node is defined - ignore_errors: true + when: + - slurm_custom_x86_64 is defined + - slurm_custom_x86_64.slurm_node is defined + - slurm_custom_x86_64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 - name: Extract CUDA runfile name for aarch64 from slurm_custom.json ansible.builtin.set_fact: cuda_runfile_aarch64: "{{ (slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | first).url | basename }}" - when: slurm_custom_aarch64 is defined and 
slurm_custom_aarch64.slurm_node is defined - ignore_errors: true + when: + - slurm_custom_aarch64 is defined + - slurm_custom_aarch64.slurm_node is defined + - slurm_custom_aarch64.slurm_node.cluster | selectattr('package', 'equalto', 'cuda-run') | list | length > 0 - name: Set facts for slurm ansible.builtin.set_fact: From 83a56253b8eb8ca43c6556f50cf56e1cddb98fc2 Mon Sep 17 00:00:00 2001 From: sakshi-singla-1735 Date: Tue, 20 Jan 2026 06:52:39 +0000 Subject: [PATCH 39/47] file path change --- discovery/roles/passwordless_ssh/vars/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discovery/roles/passwordless_ssh/vars/main.yml b/discovery/roles/passwordless_ssh/vars/main.yml index 2aa99da9e0..edd72e1e90 100644 --- a/discovery/roles/passwordless_ssh/vars/main.yml +++ b/discovery/roles/passwordless_ssh/vars/main.yml @@ -42,4 +42,4 @@ omnia_optional_groups_from_nodes_yaml: - service_kube_control_plane_aarch64 - service_kube_node_aarch64 -ssh_private_key_path: /root/.ssh/oim_rsa +ssh_private_key_path: /root/.ssh/config From 503a295d963b73192376d69da142b64e4703767f Mon Sep 17 00:00:00 2001 From: "balajikumaran.cs" Date: Tue, 20 Jan 2026 16:25:59 +0530 Subject: [PATCH 40/47] Update image-builder version to 1.1 Signed-off-by: balajikumaran.cs --- build_image_aarch64/roles/prepare_arm_node/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml index 941d575ebf..1801448611 100644 --- a/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml +++ b/build_image_aarch64/roles/prepare_arm_node/tasks/main.yml @@ -167,7 +167,7 @@ - name: Build full Podman image path ansible.builtin.set_fact: - pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] }}:2225/dellhpcomniaaisolution/image-build-aarch64:1.0" + pulp_aarch_image: "{{ hostvars['localhost']['oim_pxe_ip'] 
}}:2225/dellhpcomniaaisolution/image-build-aarch64:1.1" - name: Pull aarch64 image using Podman ansible.builtin.command: From 2d74de0e6d528eaab8a69b25b42ceaf68f923439 Mon Sep 17 00:00:00 2001 From: "balajikumaran.cs" Date: Tue, 20 Jan 2026 16:26:32 +0530 Subject: [PATCH 41/47] Update image-builder version to 1.1 in default_packages.json Signed-off-by: balajikumaran.cs --- input/config/aarch64/rhel/10.0/default_packages.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/input/config/aarch64/rhel/10.0/default_packages.json b/input/config/aarch64/rhel/10.0/default_packages.json index 3a49bf8f88..84709a7c66 100644 --- a/input/config/aarch64/rhel/10.0/default_packages.json +++ b/input/config/aarch64/rhel/10.0/default_packages.json @@ -59,7 +59,7 @@ {"package": "kexec-tools", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "which", "type": "rpm", "repo_name": "aarch64_baseos"}, {"package": "iperf3", "type": "rpm", "repo_name": "aarch64_appstream"}, - { "package": "docker.io/dellhpcomniaaisolution/image-build-aarch64", "tag": "1.0", "type": "image" } + { "package": "docker.io/dellhpcomniaaisolution/image-build-aarch64", "tag": "1.1", "type": "image" } ] } } From f5f4f572b24a1eb1a6bc631248b1483ac818cad7 Mon Sep 17 00:00:00 2001 From: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> Date: Wed, 21 Jan 2026 16:34:21 +0530 Subject: [PATCH 42/47] Update configure-ib-network for fixing race condition Signed-off-by: Katakam Rakesh Naga Sai <125246792+Katakam-Rakesh@users.noreply.github.com> --- .../doca-ofed/configure-ib-network.sh.j2 | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 b/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 index 0e09676b9b..1cb95d6f9b 100644 --- a/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 +++ 
b/discovery/roles/configure_ochami/templates/doca-ofed/configure-ib-network.sh.j2 @@ -30,18 +30,26 @@ IB_IP=$(int_to_ip "$IB_IP_INT") echo "Derived IB IP : $IB_IP/$NETMASK_BITS" - +MAX_WAIT=120 # total wait time in seconds (2 minutes) +INTERVAL=10 # check every 10 seconds +ELAPSED=0 IB_NIC="" -for nic in $(ip -o link show | awk -F': ' '{print $2}' | grep '^ib'); do - if ip link show "$nic" | grep -q "UP,LOWER_UP"; then - IB_NIC="$nic" - break - fi +while [[ $ELAPSED -lt $MAX_WAIT ]]; do + for nic in $(ip -o link show | awk -F': ' '{print $2}' | grep '^ib'); do + if ip link show "$nic" | grep -q "UP,LOWER_UP"; then + IB_NIC="$nic" + break 2 + fi + done + + echo "IB interface not ready yet. Waiting..." + sleep $INTERVAL + ELAPSED=$((ELAPSED + INTERVAL)) done if [[ -z "$IB_NIC" ]]; then - echo "No active InfiniBand interface found. Exiting." + echo "No active InfiniBand interface found after ${MAX_WAIT}s. Exiting." exit 0 fi @@ -61,4 +69,4 @@ else fi echo "SUCCESS: Assigned $IB_IP/$NETMASK_BITS to $IB_NIC" - \ No newline at end of file + From 112681fa38b9f41d369cf6ce7f705783bbb4ded3 Mon Sep 17 00:00:00 2001 From: balajikumaran-c-s Date: Thu, 22 Jan 2026 15:20:26 +0000 Subject: [PATCH 43/47] added powervault packages --- input/config/x86_64/rhel/10.0/slurm_custom.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 71bf1bd809..8ab1f60295 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -17,7 +17,10 @@ {"package": "slurm-slurmctld", "type": "rpm", "repo_name": "x86_64_slurm_custom"}, {"package": "slurm-slurmdbd", "type": "rpm", "repo_name": "x86_64_slurm_custom"}, {"package": "python3-PyMySQL", "type": "rpm", "repo_name": "x86_64_appstream"}, - {"package": "mariadb-server", "type": "rpm", "repo_name": "x86_64_appstream"} + {"package": "mariadb-server", "type": "rpm", 
"repo_name": "x86_64_appstream"}, + {"package": "iscsi-initiator-utils", "type": "rpm", "repo_name": "x86_64_baseos"}, + {"package": "device-mapper-multipath", "type": "rpm", "repo_name": "x86_64_baseos"}, + {"package": "sg3_utils", "type": "rpm", "repo_name": "x86_64_baseos"} ] }, "slurm_node": { From 7640fa75d7731f95043dc18bd8e1f7e1ad53e6ed Mon Sep 17 00:00:00 2001 From: Jagadeesh N V Date: Thu, 22 Jan 2026 20:26:02 +0530 Subject: [PATCH 44/47] Added powervault input --- .../schema/storage_config.json | 30 ++++++++++++++++++ .../config/x86_64/rhel/10.0/slurm_custom.json | 3 +- input/storage_config.yml | 31 +++++++++++++++++-- 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/common/library/module_utils/input_validation/schema/storage_config.json b/common/library/module_utils/input_validation/schema/storage_config.json index 9cae297a43..41746905f1 100644 --- a/common/library/module_utils/input_validation/schema/storage_config.json +++ b/common/library/module_utils/input_validation/schema/storage_config.json @@ -49,6 +49,36 @@ ] }, "minItems": 1 + }, + "powervault_config": { + "required": ["ip", "isci_initiators", "volume_id"], + "properties": { + "ip": { + "description": "List of target controller IP addresses", + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "format": "ipv4" + }, + "uniqueItems": true + }, + + "port": { + "description": "TCP port for iSCSI (default 3260)", + "type": "integer" + }, + + "isci_initiators": { + "description": "iSCSI initiator IQN", + "type": "string" + }, + + "volume_id": { + "description": "Volume identifier (hex string)", + "type": "string" + } + } } }, "required": [ diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 8ab1f60295..fc08673a34 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -20,7 +20,8 @@ {"package": "mariadb-server", "type": "rpm", "repo_name": 
"x86_64_appstream"}, {"package": "iscsi-initiator-utils", "type": "rpm", "repo_name": "x86_64_baseos"}, {"package": "device-mapper-multipath", "type": "rpm", "repo_name": "x86_64_baseos"}, - {"package": "sg3_utils", "type": "rpm", "repo_name": "x86_64_baseos"} + {"package": "sg3_utils", "type": "rpm", "repo_name": "x86_64_baseos"}, + {"package": "lsscsi", "type": "rpm", "repo_name": "x86_64_baseos"} ] }, "slurm_node": { diff --git a/input/storage_config.yml b/input/storage_config.yml index 563ae0eb65..685edf1344 100644 --- a/input/storage_config.yml +++ b/input/storage_config.yml @@ -12,12 +12,38 @@ # See the License for the specific language governing permissions and # limitations under the License. --- - # *********************************************************************** # DO NOT REMOVE OR COMMENT OUT ANY LINES IN THIS FILE. # SIMPLY APPEND THE REQUIRED VALUES AGAINST THE PARAMETER OF YOUR CHOICE. # *********************************************************************** +# -----------------------------Powervault------------------------------------------- +# powervault_config +# ip: ipv4 +# A list of PowerVault controller IP addresses used for iSCSI target discovery and login. +# In this configuration, a single controller portal is provided. + +# port: +# Defines the TCP port for the iSCSI target service. +# Port 3260 is the standard port for iSCSI communication. + +# isci_initiators: +# Specifies the InitiatorName used by the host when connecting to the iSCSI target. +# This IQN uniquely identifies the host to the storage array. + +# volume_id: +# This is the unique LUN serial/identifier for the +# specific volume that should be used for persistent storage. 
+# The script uses this value during multipath scanning to select the correct mapped device + +powervault_config: + ip: + - 172.1.2.3 + port: 3260 + isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7 + volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 + + # -----------------------------NFS------------------------------------------------ # This variable is used for mounting NFS share on slurm_control_node, slurm_node, login_node @@ -49,4 +75,5 @@ nfs_client_params: server_share_path: "/mnt/share/omnia_k8s" # Provide server share path of the NFS Server client_share_path: /share_omnia_k8s client_mount_options: "nosuid,rw,sync,hard,intr" - nfs_name: nfs_k8s \ No newline at end of file + nfs_name: nfs_k8s + \ No newline at end of file From ed551a79ee19ffdf182e936fea9fdf5b0d5ce776 Mon Sep 17 00:00:00 2001 From: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> Date: Thu, 22 Jan 2026 21:42:30 +0530 Subject: [PATCH 45/47] Update storage_config.yml Signed-off-by: Jagadeesh N V <39791839+jagadeeshnv@users.noreply.github.com> --- input/storage_config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/input/storage_config.yml b/input/storage_config.yml index 685edf1344..4da6336e08 100644 --- a/input/storage_config.yml +++ b/input/storage_config.yml @@ -32,7 +32,7 @@ # This IQN uniquely identifies the host to the storage array. # volume_id: -# This is the unique LUN serial/identifier for the +# This is the unique WWN/identifier for the # specific volume that should be used for persistent storage. 
# The script uses this value during multipath scanning to select the correct mapped device @@ -76,4 +76,4 @@ nfs_client_params: client_share_path: /share_omnia_k8s client_mount_options: "nosuid,rw,sync,hard,intr" nfs_name: nfs_k8s - \ No newline at end of file + From 0c28ab631743420ea7757bfde44e50436ae6d17e Mon Sep 17 00:00:00 2001 From: "balajikumaran.cs" Date: Thu, 22 Jan 2026 22:15:39 +0530 Subject: [PATCH 46/47] Commented powervault details Signed-off-by: balajikumaran.cs --- input/storage_config.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/input/storage_config.yml b/input/storage_config.yml index 4da6336e08..48eac2d5cc 100644 --- a/input/storage_config.yml +++ b/input/storage_config.yml @@ -36,12 +36,12 @@ # specific volume that should be used for persistent storage. # The script uses this value during multipath scanning to select the correct mapped device -powervault_config: - ip: - - 172.1.2.3 - port: 3260 - isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7 - volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 +#powervault_config: +# ip: +# - 172.1.2.3 +# port: 3260 +# isci_initiators: iqn.initiator.com.example:7d7d7d7d7d7 +# volume_id: 00c0ff4343f1f1f1001c8c4e6901000000 # -----------------------------NFS------------------------------------------------ From 38a7bc2a29b0fbde7b7ccb90222ec90fa788aab1 Mon Sep 17 00:00:00 2001 From: balajikumaran-c-s Date: Thu, 22 Jan 2026 17:10:29 +0000 Subject: [PATCH 47/47] powervault cloud-init changes --- ...ci-group-slurm_control_node_x86_64.yaml.j2 | 202 +++++++++++++++++- 1 file changed, 201 insertions(+), 1 deletion(-) diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 index e787f6a296..35079fb0f5 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 +++ 
b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_control_node_x86_64.yaml.j2 @@ -78,6 +78,201 @@ IdentityFile {{ client_mount_path }}/slurm/ssh/oim_rsa IdentitiesOnly yes +{% if powervault_config is defined %} + - path: /usr/local/bin/setup_iscsi_storage.sh + permissions: '{{ file_mode_755 }}' + content: | + #!/bin/bash + set -euo pipefail + + LOGFILE="/var/log/omnia_iscsi_setup.log" + exec > >(tee -a "$LOGFILE") 2>&1 + + log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*"; } + + PORTALS=({% for ip in powervault_config.ip %}"{{ ip }}" {% endfor %}) + PORT="{{ powervault_config.port | default(3260) }}" + INITIATOR_IQN="{{ powervault_config.isci_initiators | default('') }}" + VOLUME_ID="{{ powervault_config.volume_id | default('') }}" + FS_TYPE="{{ powervault_config.fs_type | default('xfs') }}" + MOUNT_OPTS="{{ powervault_config.mount_options | default('defaults,_netdev,noatime') }}" + + PERSIST_MOUNT="/mnt/slurm-persist" + MYSQL_SUBDIR="${PERSIST_MOUNT}/mysql" + SPOOL_SUBDIR="${PERSIST_MOUNT}/spool" + + log "Enabling iSCSI daemon" + systemctl enable --now iscsid + /sbin/mpathconf --enable || true + + if [[ -n "${INITIATOR_IQN}" ]]; then + log "Setting InitiatorName to ${INITIATOR_IQN}" + if [[ -f /etc/iscsi/initiatorname.iscsi ]] && grep -q "^InitiatorName=${INITIATOR_IQN}$" /etc/iscsi/initiatorname.iscsi; then + log "InitiatorName already set; not changing" + else + printf "InitiatorName=%s\n" "${INITIATOR_IQN}" > /etc/iscsi/initiatorname.iscsi + log "Restarting iscsid after InitiatorName change" + systemctl restart iscsid + fi + else + log "INITIATOR_IQN not set; leaving /etc/iscsi/initiatorname.iscsi unchanged" + fi + + log "Current initiatorname:" + cat /etc/iscsi/initiatorname.iscsi || true + + log "Discovering iSCSI targets from all portals" + TARGET_IQN="" + + for ip in "${PORTALS[@]}"; do + log "Trying discovery on ${ip}:${PORT}" + OUT=$(iscsiadm -m discovery -t sendtargets -p "${ip}:${PORT}" 2>/dev/null || true) + echo "$OUT" + if [[ -z 
"${TARGET_IQN}" ]]; then + CANDIDATE_IQN=$(echo "$OUT" | awk '{print $2}' | head -1) + if [[ -n "${CANDIDATE_IQN}" ]]; then + TARGET_IQN="${CANDIDATE_IQN}" + fi + fi + done + + if [[ -z "${TARGET_IQN}" ]]; then + log "ERROR: Unable to determine target IQN from discovery output" + exit 1 + fi + log "Discovered TARGET_IQN=${TARGET_IQN}" + + log "Logging in to ALL discovered iSCSI targets" + iscsiadm -m node --login || true + + log "Setting automatic startup for all nodes" + iscsiadm -m node --op update -n node.startup -v automatic || true + + log "Waiting for devices to settle..." + sleep 5 + + log "Enabling multipathd" + systemctl enable --now multipathd || true + + log "Rescanning iSCSI sessions" + iscsiadm -m session --rescan || true + + log "Reloading multipath configuration" + multipath -r || true + + sleep 3 + + log "Verifying disks" + lsblk || true + lsscsi -s 2>/dev/null | grep -iE "ME|DELL" || true + + log "Multipath devices:" + multipath -ll || true + + LATEST_MPATH="" + + if [[ -n "${VOLUME_ID}" ]]; then + log "Selecting multipath using VOLUME_ID match: ${VOLUME_ID}" + LATEST_MPATH=$(multipath -ll 2>/dev/null | grep -iF "${VOLUME_ID}" | awk '{print $1}' | head -1 || true) + fi + + if [[ -z "${LATEST_MPATH}" ]]; then + log "Selecting multipath using vendor match DellEMC,ME5" + LATEST_MPATH=$(multipath -ll 2>/dev/null | grep -i "DellEMC,ME5" | awk '{print $1}' | head -1 || true) + fi + + if [[ -z "${LATEST_MPATH}" ]]; then + log "Selecting multipath using vendor match DellEMC,ME4" + LATEST_MPATH=$(multipath -ll 2>/dev/null | grep -i "DellEMC,ME4" | awk '{print $1}' | head -1 || true) + fi + + if [[ -z "${LATEST_MPATH}" ]]; then + log "Selecting multipath using latest dm-* fallback" + LATEST=$(multipath -ll 2>/dev/null | grep -oP 'dm-\d+' | sort -t- -k2 -n | tail -1 || true) + if [[ -z "${LATEST}" ]]; then + log "ERROR: No multipath dm-* devices detected" + exit 1 + fi + LATEST_MPATH=$(multipath -ll 2>/dev/null | grep "${LATEST}" | awk '{print $1}' | head -1 
|| true) + fi + + if [[ -z "${LATEST_MPATH}" ]]; then + log "ERROR: Unable to determine multipath device" + exit 1 + fi + + MPATH_DEV="/dev/mapper/${LATEST_MPATH}" + log "Using multipath device: ${MPATH_DEV}" + + PART_DEV="/dev/mapper/${LATEST_MPATH}1" + + if [[ ! -e "${PART_DEV}" ]]; then + log "Creating GPT label and partition on ${MPATH_DEV}" + parted -s "${MPATH_DEV}" mklabel gpt + parted -s "${MPATH_DEV}" mkpart primary "${FS_TYPE}" 0% 100% + sleep 2 + partprobe "${MPATH_DEV}" || true + kpartx -av "${MPATH_DEV}" || true + sleep 2 + fi + + log "Using partition device: ${PART_DEV}" + + if ! blkid -s TYPE -o value "${PART_DEV}" 2>/dev/null | grep -q .; then + log "Formatting ${PART_DEV} with ${FS_TYPE}" + mkfs."${FS_TYPE}" -f "${PART_DEV}" + else + log "Filesystem already exists on ${PART_DEV}; skipping format" + fi + + mkdir -p "${PERSIST_MOUNT}" + + UUID=$(blkid -s UUID -o value "${PART_DEV}" 2>/dev/null || true) + + if [[ -n "${UUID}" ]]; then + log "Using UUID=${UUID} for fstab" + FSTAB_ENTRY="UUID=${UUID}" + FSTAB_MATCH="^UUID=${UUID}\\s" + else + log "UUID not available, using device path ${PART_DEV} for fstab" + FSTAB_ENTRY="${PART_DEV}" + FSTAB_MATCH="^${PART_DEV}\\s" + fi + + if ! grep -qE "${FSTAB_MATCH}" /etc/fstab; then + log "Adding persistent mount to /etc/fstab" + echo "${FSTAB_ENTRY} ${PERSIST_MOUNT} ${FS_TYPE} ${MOUNT_OPTS} 0 0" >> /etc/fstab + fi + + if ! 
mountpoint -q "${PERSIST_MOUNT}"; then + log "Mounting ${PERSIST_MOUNT}" + mount "${PART_DEV}" "${PERSIST_MOUNT}" + fi + + df -h "${PERSIST_MOUNT}" || true + + mkdir -p "${MYSQL_SUBDIR}" "${SPOOL_SUBDIR}" /var/lib/mysql /var/spool + + grep -qE "\s+/var/lib/mysql\s+none\s+bind" /etc/fstab || echo "${MYSQL_SUBDIR} /var/lib/mysql none bind 0 0" >> /etc/fstab + grep -qE "\s+/var/spool\s+none\s+bind" /etc/fstab || echo "${SPOOL_SUBDIR} /var/spool none bind 0 0" >> /etc/fstab + + mount /var/lib/mysql || true + mount /var/spool || true + + chown -R {{ mysql_user }}:{{ mysql_group }} /var/lib/mysql + + log "Final mount summary:" + mount | grep -E "/mnt/slurm-persist|/var/lib/mysql|/var/spool" || true + + log "iSCSI sessions:" + iscsiadm -m session || true + + log "Multipath status:" + multipath -ll || true + + log "iSCSI/multipath setup complete. Log saved to ${LOGFILE}" +{% endif %} + {% if hostvars['localhost']['openldap_support'] %} - path: /etc/sssd/sssd.conf owner: root:root @@ -273,10 +468,12 @@ - echo "{{ cloud_init_nfs_path }}/cert /cert nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/slurm /etc/slurm nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/etc/my.cnf.d /etc/my.cnf.d nfs defaults,_netdev 0 0" >> /etc/fstab - - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/lib/mysql /var/lib/mysql nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/mariadb /var/log/mariadb nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/log/slurm /var/log/slurm nfs defaults,_netdev 0 0" >> /etc/fstab +{% if powervault_config is not defined %} + - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/lib/mysql /var/lib/mysql nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/$(hostname -s)/var/spool /var/spool nfs defaults,_netdev 0 0" >> /etc/fstab +{% endif %} - echo "{{ cloud_init_nfs_path 
}}/$(hostname -s)/etc/munge /etc/munge nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ trackfile_nfs_path }} /var/log/track nfs defaults,_netdev 0 0" >> /etc/fstab - echo "{{ cloud_init_nfs_path }}/packages /var/lib/packages nfs defaults,_netdev 0 0" >> /etc/fstab @@ -286,6 +483,9 @@ - cp /cert/pulp_webserver.crt /etc/pki/ca-trust/source/anchors && update-ca-trust - sed -i 's/^gpgcheck=1/gpgcheck=0/' /etc/dnf/dnf.conf - bash /usr/local/bin/doca-install.sh && bash /usr/local/bin/configure-ib-network.sh +{% if powervault_config is defined %} + - /usr/local/bin/setup_iscsi_storage.sh +{% endif %} - chown -R {{ slurm_user }}:{{ slurm_user }} {{ home_dir }} - chmod {{ file_mode_755 }} {{ home_dir }}