diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 index 36332685ac..fe6966c4be 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_aarch64.yaml.j2 @@ -315,6 +315,38 @@ - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp + + # Add PXE network to trusted zone for ORTE communication + - echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + - | + bash -c ' + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + ' + - firewall-cmd --reload - systemctl enable sshd - systemctl start sshd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 index 51121a2e82..1ee1fce5e1 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_compiler_node_x86_64.yaml.j2 @@ -317,6 +317,38 @@ - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp + + # Add PXE network to trusted zone for ORTE communication + - echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + - | + bash -c ' + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + ' + - firewall-cmd --reload - systemctl enable sshd - systemctl start sshd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 index d6071116ac..cdea0cd340 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_aarch64.yaml.j2 @@ -171,6 +171,38 @@ - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp + + # Add PXE network to trusted zone for ORTE communication + - echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + - | + bash -c ' + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + ' + - firewall-cmd --reload - systemctl enable sshd - systemctl start sshd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 index 524553bd55..b744859381 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-login_node_x86_64.yaml.j2 @@ -174,6 +174,38 @@ - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp - firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp + + # Add PXE network to trusted zone for ORTE communication + - echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + - | + bash -c ' + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + ' + - firewall-cmd --reload - systemctl enable sshd - systemctl start sshd diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 index 0f25c81307..981b283a99 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_aarch64.yaml.j2 @@ -386,6 +386,36 @@ firewall-cmd --permanent --add-service=ssh firewall-cmd --permanent --add-port="${SRUN_RANGE}"/tcp firewall-cmd --permanent --add-port="${SLURMD_PORT}"/tcp + + # Add PXE network to trusted zone for ORTE communication + echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + # Calculate PXE subnet using admin IP and netmask bits + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + firewall-cmd --reload echo "[INFO] Unmounting controller slurm.conf directory from $CTLD_SLURM_DIR_MNT" diff --git a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 index d21fcf9c5c..b914abdecd 100644 --- a/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 +++ b/discovery/roles/configure_ochami/templates/cloud_init/ci-group-slurm_node_x86_64.yaml.j2 @@ -403,6 +403,36 @@ firewall-cmd --permanent --add-service=ssh firewall-cmd --permanent --add-port="${SRUN_RANGE}"/tcp firewall-cmd --permanent --add-port="${SLURMD_PORT}"/tcp + + # Add PXE network to trusted zone for ORTE communication + echo "[INFO] Adding PXE network to trusted zone for ORTE communication" + # Calculate PXE subnet using admin IP and netmask bits + ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}" + NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}" + + # Convert IP to integer and calculate network address + ip_to_int() { + local IFS=. 
+ read -r a b c d <<< "$1" + echo $(( (a << 24) + (b << 16) + (c << 8) + d )) + } + + int_to_ip() { + local ip=$1 + echo "$(( (ip >> 24) & 255 )).$(( (ip >> 16) & 255 )).$(( (ip >> 8) & 255 )).$(( ip & 255 ))" + } + + ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP") + HOST_BITS=$(( 32 - NETMASK_BITS )) + HOST_MASK=$(( (1 << HOST_BITS) - 1 )) + NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF )) + NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK )) + NETWORK_IP=$(int_to_ip "$NETWORK_INT") + + PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS" + echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET" + firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent + firewall-cmd --reload echo "[INFO] Unmounting controller slurm.conf directory from $CTLD_SLURM_DIR_MNT" diff --git a/discovery/roles/slurm_config/defaults/main.yml b/discovery/roles/slurm_config/defaults/main.yml index edee927fc9..40840d5c44 100644 --- a/discovery/roles/slurm_config/defaults/main.yml +++ b/discovery/roles/slurm_config/defaults/main.yml @@ -19,7 +19,7 @@ slurmctld_service_default_path: '/usr/lib/systemd/system/slurmctld.service' slurmd_service_default_path: '/usr/lib/systemd/system/slurmd.service' slurmdbd_service_default_path: '/usr/lib/systemd/system/slurmdbd.service' sys_env_path: '/etc/environment' -default_real_memory: 884736 +default_real_memory: 864 default_threadspercore: 1 default_corespersocket: 72 default_sockets: 2 @@ -51,7 +51,10 @@ __default_config: PrologFlags: contain JobAcctGatherType: jobacct_gather/linux JobAcctGatherFrequency: 30 - SelectType: select/linear + SelectType: select/cons_tres + GresTypes: gpu + SelectTypeParameters: CR_Core_Memory + SlurmdParameters: l3cache_as_socket # Requires hwloc v2. 
SlurmctldLogFile: "/var/log/slurm/slurmctld.log" SlurmdLogFile: "/var/log/slurm/slurmd.log" SlurmctldPidFile: /var/run/slurmctld.pid @@ -62,6 +65,7 @@ __default_config: SlurmdTimeout: 300 Epilog: "/etc/slurm/epilog.d/logout_user.sh" PluginDir: "{{ plugin_slurm_dir }}" + MaxNodeCount: 65000 NodeSet: - NodeSet: "{{ slurm_partition_name }}" Feature: "{{ slurm_partition_name }}" @@ -73,9 +77,6 @@ __default_config: Nodes: ALL MaxTime: INFINITE State: UP - # S_P_ARRAY type paramater to be provided this way - # Epilog: - # - Epilog: "/etc/slurm/epilog.d/logout_user.sh" slurmdbd: AuthType: auth/munge LogFile: "/var/log/slurm/slurmdbd.log" diff --git a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml index e322c4f3f1..d930f64418 100644 --- a/discovery/roles/slurm_config/tasks/build_slurm_conf.yml +++ b/discovery/roles/slurm_config/tasks/build_slurm_conf.yml @@ -46,12 +46,6 @@ when: node_params is defined and node_params no_log: "{{ _no_log }}" -- name: Add gpu parameters to slurm conf - ansible.builtin.set_fact: - apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}" - when: gpu_params is defined and gpu_params - no_log: "{{ _no_log }}" - - name: Add dbd parameters to slurm conf ansible.builtin.set_fact: apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(dbd_slurm_conf))}) }}" diff --git a/discovery/roles/slurm_config/tasks/confs.yml b/discovery/roles/slurm_config/tasks/confs.yml index 1601a81334..2378682b7d 100644 --- a/discovery/roles/slurm_config/tasks/confs.yml +++ b/discovery/roles/slurm_config/tasks/confs.yml @@ -321,6 +321,14 @@ - busy_nodes | length > 0 - user_input.user_input | default('') | trim | upper == 'A' +- name: Empty Busy nodes for Force removal + ansible.builtin.set_fact: + busy_nodes: {} + when: + - busy_nodes is defined + - busy_nodes | length > 0 + - user_input.user_input | 
default('') | trim | upper == 'F' + - name: Remove nodes ansible.builtin.include_tasks: remove_node.yml when: @@ -384,26 +392,17 @@ - name: Handle user choice - ABORT ansible.builtin.fail: - msg: | - ================================================================================ - PLAYBOOK ABORTED BY USER - ================================================================================ - - You chose to abort the playbook (Option A). - - Next Steps: - 1. Cancel running jobs manually: - {% for node in busy_nodes.keys() %} - scancel -w {{ node }} - {% endfor %} - - 2. Or wait for jobs to complete naturally - - 3. Re-run this playbook to remove the nodes - - Idle nodes (if any) have already been removed from the cluster. - - ================================================================================ + msg: + - "===============================================================================" + - "PLAYBOOK ABORTED BY USER" + - "===============================================================================" + - "You chose to abort the playbook (Option A)." + - "Next Steps:" + - "1. Cancel running jobs manually: 'scancel -w <node_name>'" + - "2. Or wait for jobs to complete naturally" + - "3. Re-run this playbook to remove the nodes" + - "Idle nodes (if any) have already been removed from the cluster." 
+ - "===============================================================================" when: - busy_nodes is defined - busy_nodes | length > 0 diff --git a/discovery/roles/slurm_config/tasks/remove_node.yml b/discovery/roles/slurm_config/tasks/remove_node.yml index da6b2a72ae..ceb766898b 100644 --- a/discovery/roles/slurm_config/tasks/remove_node.yml +++ b/discovery/roles/slurm_config/tasks/remove_node.yml @@ -47,11 +47,19 @@ - "'slurm' in conf_merge_dict" - filtered_nodenames is defined +- name: Set partition nodes exactly as cmpt_list minus removed nodes + ansible.builtin.set_fact: + partition_nodes: "{{ cmpt_list | difference(nodes_in_normal_not_in_cmpt) | union(busy_nodes.keys() | default([]) | list) }}" + when: + - "'slurm' in conf_merge_dict" + - nodes_in_normal_not_in_cmpt is defined + - nodes_in_normal_not_in_cmpt | length > 0 + - name: Update normal partition Nodes to match cmpt_list ansible.builtin.set_fact: updated_partitions: "{{ updated_partitions | default([]) - + [item | combine({'Nodes': ((cmpt_list | difference(nodes_in_normal_not_in_cmpt)) | join(',')) - if (cmpt_list | difference(nodes_in_normal_not_in_cmpt)) | length > 0 else slurm_partition_name}) + + [item | combine({'Nodes': (partition_nodes | join(',')) + if partition_nodes | length > 0 else slurm_partition_name}) if item.PartitionName == slurm_partition_name else item] }}" loop: "{{ slurm_conf_dict.PartitionName | default([]) }}" when: diff --git a/discovery/roles/slurm_config/vars/main.yml b/discovery/roles/slurm_config/vars/main.yml index 37e6ff9869..3cc8bdd6af 100644 --- a/discovery/roles/slurm_config/vars/main.yml +++ b/discovery/roles/slurm_config/vars/main.yml @@ -61,11 +61,6 @@ db_dir: cmpt_dir: - /etc/slurm/epilog.d -gpu_slurm_conf: - GresTypes: gpu - SelectType: select/cons_tres - SelectTypeParameters: CR_Core_Memory - SlurmdParameters: l3cache_as_socket innodb_buffer_pool_size: 4G innodb_lock_wait_timeout: 900 conf_server: "--conf-server {{ ctld_list | join(',') }}" @@ -131,6 
+126,7 @@ partition_params: MaxTime: "INFINITE" State: "UP" Default: "YES" +busy_nodes: {} openldap_dir_name: "openldap/" software_config_file: "{{ input_project_dir }}/software_config.json" omnia_run_tags: "{{ hostvars['localhost']['omnia_run_tags'] }}" diff --git a/examples/pxe_mapping_file.csv b/examples/pxe_mapping_file.csv index 4d1c4775ed..f4c50517ac 100644 --- a/examples/pxe_mapping_file.csv +++ b/examples/pxe_mapping_file.csv @@ -8,4 +8,4 @@ service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54 service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55 service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56 -service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 \ No newline at end of file +service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57 diff --git a/input/config/aarch64/rhel/10.0/slurm_custom.json b/input/config/aarch64/rhel/10.0/slurm_custom.json index db95f2f5fb..b0477f2a95 100644 --- a/input/config/aarch64/rhel/10.0/slurm_custom.json +++ b/input/config/aarch64/rhel/10.0/slurm_custom.json @@ -9,7 +9,7 @@ {"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { diff --git a/input/config/x86_64/rhel/10.0/slurm_custom.json b/input/config/x86_64/rhel/10.0/slurm_custom.json index 
2b33b0de90..e7962f4723 100644 --- a/input/config/x86_64/rhel/10.0/slurm_custom.json +++ b/input/config/x86_64/rhel/10.0/slurm_custom.json @@ -7,7 +7,7 @@ {"package": "pmix", "type": "rpm", "repo_name": "x86_64_appstream"}, {"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"}, {"package": "apptainer", "type": "rpm", "repo_name": "epel" }, - {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } + {"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" } ] }, "slurm_control_node": { diff --git a/utils/roles/idrac_pxe_boot/tasks/main.yml b/utils/roles/idrac_pxe_boot/tasks/main.yml index 70fb4dde2c..79f1ec3957 100644 --- a/utils/roles/idrac_pxe_boot/tasks/main.yml +++ b/utils/roles/idrac_pxe_boot/tasks/main.yml @@ -19,18 +19,21 @@ idrac_password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}" validate_certs: false register: lc_check_status - ignore_errors: true - -- name: Set reboot type - ansible.builtin.set_fact: - reboot_type: "{{ 'ForceRestart' if force_restart else 'GracefulRestart' }}" + until: + - lc_check_status.lc_status_info.LCReady is defined + - lc_check_status.lc_status_info.LCReady + retries: 3 + delay: 5 + # ignore_errors: true + # free strategy should be used to avoid blocking on failed hosts + # but issue will be summarizing/syncing summary report of all at the end - name: IDRAC ops when ready when: - lc_check_status is success - lc_check_status.lc_status_info.LCReady block: - - name: Set boot from pxe + - name: Set boot option from pxe dellemc.openmanage.idrac_boot: idrac_ip: "{{ inventory_hostname }}" idrac_user: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}" @@ -50,7 +53,7 @@ username: "{{ bmc_username | default(hostvars['localhost']['bmc_username']) }}" password: "{{ bmc_password | default(hostvars['localhost']['bmc_password']) }}" validate_certs: false - reset_type: ForceRestart + reset_type: "{{ 'ForceRestart' if force_restart else 'GracefulRestart' }}" 
when: restart_host register: restart_op failed_when: false @@ -69,37 +72,46 @@ - not (restart_op is changed) - name: Check LC availibility - ansible.builtin.fail: - msg: "{{ lc_check_fail_msg }}" - when: lc_check_status is failed or not (lc_check_status.lc_status_info.LCReady | default(false)) + ansible.builtin.set_fact: + reboot_failed: true + reboot_status: "{{ lc_check_fail_msg }}" + when: lc_check_status is unreachable or lc_check_status is failed or not (lc_check_status.lc_status_info.LCReady | default(false)) - name: Fail if PXE provisioning failed - ansible.builtin.fail: - msg: "{{ pxe_provisioning_fail_msg }}" + ansible.builtin.set_fact: + reboot_failed: true + reboot_status: "{{ pxe_provisioning_fail_msg }}" when: + - not reboot_failed - pxe_provisioning is defined - pxe_provisioning is failed - name: Fail if PXE provisioning target is unreachable - ansible.builtin.fail: - msg: "{{ unreachable_idrac_msg }}" + ansible.builtin.set_fact: + reboot_failed: true + reboot_status: "{{ unreachable_idrac_msg }}" when: + - not reboot_failed - pxe_provisioning is defined - pxe_provisioning is unreachable - name: Fail if power operation failed - ansible.builtin.fail: - msg: "Power operation failed on {{ inventory_hostname }}. Failed to restart server." + ansible.builtin.set_fact: + reboot_failed: true + reboot_status: "Power operation failed on {{ inventory_hostname }}. Failed to restart server." 
when: + - not reboot_failed - restart_host - not (restart_op is defined and restart_op is changed) - not (power_on_op is defined and power_on_op is changed) - name: Summarize PXE boot and power operation results - ansible.builtin.debug: - msg: >- + ansible.builtin.set_fact: + reboot_failed: false + reboot_status: >- PXE Boot: {{ 'OK' if pxe_provisioning is success else ('UNREACHABLE' if pxe_provisioning is unreachable else 'FAILED') }} | Power: {{ 'Restart OK' if (restart_op is defined and restart_op is changed) else ('On OK' if (power_on_op is defined and power_on_op is changed) else ('Skipped (no restart)' if not restart_host else 'FAILED')) }} + when: not reboot_failed diff --git a/utils/roles/idrac_pxe_boot/vars/main.yml b/utils/roles/idrac_pxe_boot/vars/main.yml index 53de8aa0e9..4bf99d7bfe 100644 --- a/utils/roles/idrac_pxe_boot/vars/main.yml +++ b/utils/roles/idrac_pxe_boot/vars/main.yml @@ -18,6 +18,9 @@ restart_host: true # Change to true for forceful reboot. by default graceful will happen force_restart: true +reboot_status: "PXE boot initiated but not completed." +reboot_failed: false + # Set boot source override mode. Valid values are once, continuous, or disabled boot_source_override_enabled: continuous diff --git a/utils/set_pxe_boot.yml b/utils/set_pxe_boot.yml index a46ebd7091..16e11534b0 100644 --- a/utils/set_pxe_boot.yml +++ b/utils/set_pxe_boot.yml @@ -54,13 +54,27 @@ # This configures Dell iDRAC BMCs to boot a host from PXE (network) and optionally reboots the server. # This will set the boot mode to pxe -# Note: Restart will not happen if the server is powered off, only pxe mode will be set. 
- name: Reboot Host via PXE hosts: bmc connection: local + strategy: host_pinned gather_facts: false roles: - role: idrac_pxe_boot # vars: # restart_host: false # By default restart will be true, set to false not to restart # force_restart: true # By default graceful_restart will happen, set to true to force restart + +- name: Synchronized Reporting + hosts: bmc + connection: local + gather_facts: false + tasks: + - name: Fail if reboot function failed + ansible.builtin.fail: + msg: "{{ reboot_status }}" + when: reboot_failed + + - name: Show passed iDRACs + ansible.builtin.debug: + msg: "{{ inventory_hostname }}: {{ reboot_status }}"