Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,38 @@
- firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp
- firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp
- firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp

# Add PXE network to trusted zone for ORTE communication
- echo "[INFO] Adding PXE network to trusted zone for ORTE communication"
- |
bash -c '
# Jinja placeholders: replaced with the admin NIC IP and netmask prefix length
# from the localhost hostvars when this template is rendered.
# NOTE: this script is inside a single-quoted bash -c string, so nothing added
# here may contain a single-quote character.
ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}"
NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}"

# Convert IP to integer and calculate network address
ip_to_int() {
  # Convert the dotted-quad IPv4 address in $1 to its 32-bit integer value,
  # printed on stdout. Returns 1 (with a message on stderr) when $1 is not
  # four dot-separated decimal octets in the range 0-255.
  local quad_re="^([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})$"
  local a b c d
  if [[ ! $1 =~ $quad_re ]]; then
    echo "[ERROR] ip_to_int: invalid IPv4 address: $1" >&2
    return 1
  fi
  # Force base 10 so zero-padded octets such as 08 or 010 are not read as octal.
  a=$(( 10#${BASH_REMATCH[1]} ))
  b=$(( 10#${BASH_REMATCH[2]} ))
  c=$(( 10#${BASH_REMATCH[3]} ))
  d=$(( 10#${BASH_REMATCH[4]} ))
  if (( a > 255 || b > 255 || c > 255 || d > 255 )); then
    echo "[ERROR] ip_to_int: octet out of range in: $1" >&2
    return 1
  fi
  echo $(( (a << 24) + (b << 16) + (c << 8) + d ))
}

int_to_ip() {
  # Print the 32-bit integer in $1 as a dotted-quad IPv4 string.
  # Octets are extracted from the most significant byte down to the least.
  local value=$1 bits dotted=""
  for bits in 24 16 8 0; do
    dotted+="${dotted:+.}$(( (value >> bits) & 255 ))"
  done
  echo "$dotted"
}

# Derive the network address of the subnet that contains ADMIN_IP.
# Integer form of the admin IP, used for the bitwise math below.
ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP")
# Number of host bits left over after the network prefix.
HOST_BITS=$(( 32 - NETMASK_BITS ))
# All host bits set to 1, e.g. 0xFF for a /24.
HOST_MASK=$(( (1 << HOST_BITS) - 1 ))
# Invert the host mask and clamp the result to 32 bits to get the network mask.
NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF ))
# Clear the host bits of the admin IP to get the network address.
NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK ))
NETWORK_IP=$(int_to_ip "$NETWORK_INT")

# CIDR form (network address / prefix length) of that subnet.
PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS"
echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET"
# Stored as a permanent rule; the firewall-cmd --reload step that follows
# this script is what loads it into the running firewall.
firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent
'

- firewall-cmd --reload
- systemctl enable sshd
- systemctl start sshd
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,38 @@
- firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp
- firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp
- firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp

# Add PXE network to trusted zone for ORTE communication
- echo "[INFO] Adding PXE network to trusted zone for ORTE communication"
- |
bash -c '
# Jinja placeholders: replaced with the admin NIC IP and netmask prefix length
# from the localhost hostvars when this template is rendered.
# NOTE: this script is inside a single-quoted bash -c string, so nothing added
# here may contain a single-quote character.
ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}"
NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}"

# Convert IP to integer and calculate network address
ip_to_int() {
  # Convert the dotted-quad IPv4 address in $1 to its 32-bit integer value,
  # printed on stdout. Returns 1 (with a message on stderr) when $1 is not
  # four dot-separated decimal octets in the range 0-255.
  local quad_re="^([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})$"
  local a b c d
  if [[ ! $1 =~ $quad_re ]]; then
    echo "[ERROR] ip_to_int: invalid IPv4 address: $1" >&2
    return 1
  fi
  # Force base 10 so zero-padded octets such as 08 or 010 are not read as octal.
  a=$(( 10#${BASH_REMATCH[1]} ))
  b=$(( 10#${BASH_REMATCH[2]} ))
  c=$(( 10#${BASH_REMATCH[3]} ))
  d=$(( 10#${BASH_REMATCH[4]} ))
  if (( a > 255 || b > 255 || c > 255 || d > 255 )); then
    echo "[ERROR] ip_to_int: octet out of range in: $1" >&2
    return 1
  fi
  echo $(( (a << 24) + (b << 16) + (c << 8) + d ))
}

int_to_ip() {
  # Print the 32-bit integer in $1 as a dotted-quad IPv4 string.
  # Octets are extracted from the most significant byte down to the least.
  local value=$1 bits dotted=""
  for bits in 24 16 8 0; do
    dotted+="${dotted:+.}$(( (value >> bits) & 255 ))"
  done
  echo "$dotted"
}

# Derive the network address of the subnet that contains ADMIN_IP.
# Integer form of the admin IP, used for the bitwise math below.
ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP")
# Number of host bits left over after the network prefix.
HOST_BITS=$(( 32 - NETMASK_BITS ))
# All host bits set to 1, e.g. 0xFF for a /24.
HOST_MASK=$(( (1 << HOST_BITS) - 1 ))
# Invert the host mask and clamp the result to 32 bits to get the network mask.
NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF ))
# Clear the host bits of the admin IP to get the network address.
NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK ))
NETWORK_IP=$(int_to_ip "$NETWORK_INT")

# CIDR form (network address / prefix length) of that subnet.
PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS"
echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET"
# Stored as a permanent rule; the firewall-cmd --reload step that follows
# this script is what loads it into the running firewall.
firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent
'

- firewall-cmd --reload
- systemctl enable sshd
- systemctl start sshd
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,38 @@
- firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp
- firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp
- firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp

# Add PXE network to trusted zone for ORTE communication
- echo "[INFO] Adding PXE network to trusted zone for ORTE communication"
- |
bash -c '
# Jinja placeholders: replaced with the admin NIC IP and netmask prefix length
# from the localhost hostvars when this template is rendered.
# NOTE: this script is inside a single-quoted bash -c string, so nothing added
# here may contain a single-quote character.
ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}"
NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}"

# Convert IP to integer and calculate network address
ip_to_int() {
  # Convert the dotted-quad IPv4 address in $1 to its 32-bit integer value,
  # printed on stdout. Returns 1 (with a message on stderr) when $1 is not
  # four dot-separated decimal octets in the range 0-255.
  local quad_re="^([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})$"
  local a b c d
  if [[ ! $1 =~ $quad_re ]]; then
    echo "[ERROR] ip_to_int: invalid IPv4 address: $1" >&2
    return 1
  fi
  # Force base 10 so zero-padded octets such as 08 or 010 are not read as octal.
  a=$(( 10#${BASH_REMATCH[1]} ))
  b=$(( 10#${BASH_REMATCH[2]} ))
  c=$(( 10#${BASH_REMATCH[3]} ))
  d=$(( 10#${BASH_REMATCH[4]} ))
  if (( a > 255 || b > 255 || c > 255 || d > 255 )); then
    echo "[ERROR] ip_to_int: octet out of range in: $1" >&2
    return 1
  fi
  echo $(( (a << 24) + (b << 16) + (c << 8) + d ))
}

int_to_ip() {
  # Print the 32-bit integer in $1 as a dotted-quad IPv4 string.
  # Octets are extracted from the most significant byte down to the least.
  local value=$1 bits dotted=""
  for bits in 24 16 8 0; do
    dotted+="${dotted:+.}$(( (value >> bits) & 255 ))"
  done
  echo "$dotted"
}

# Derive the network address of the subnet that contains ADMIN_IP.
# Integer form of the admin IP, used for the bitwise math below.
ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP")
# Number of host bits left over after the network prefix.
HOST_BITS=$(( 32 - NETMASK_BITS ))
# All host bits set to 1, e.g. 0xFF for a /24.
HOST_MASK=$(( (1 << HOST_BITS) - 1 ))
# Invert the host mask and clamp the result to 32 bits to get the network mask.
NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF ))
# Clear the host bits of the admin IP to get the network address.
NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK ))
NETWORK_IP=$(int_to_ip "$NETWORK_INT")

# CIDR form (network address / prefix length) of that subnet.
PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS"
echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET"
# Stored as a permanent rule; the firewall-cmd --reload step that follows
# this script is what loads it into the running firewall.
firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent
'

- firewall-cmd --reload
- systemctl enable sshd
- systemctl start sshd
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,38 @@
- firewall-cmd --permanent --add-port={{ slurm_conf_dict.SrunPortRange }}/udp
- firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/tcp
- firewall-cmd --permanent --add-port={{ slurm_conf_dict.SlurmdPort }}/udp

# Add PXE network to trusted zone for ORTE communication
- echo "[INFO] Adding PXE network to trusted zone for ORTE communication"
- |
bash -c '
# Jinja placeholders: replaced with the admin NIC IP and netmask prefix length
# from the localhost hostvars when this template is rendered.
# NOTE: this script is inside a single-quoted bash -c string, so nothing added
# here may contain a single-quote character.
ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}"
NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}"

# Convert IP to integer and calculate network address
ip_to_int() {
  # Convert the dotted-quad IPv4 address in $1 to its 32-bit integer value,
  # printed on stdout. Returns 1 (with a message on stderr) when $1 is not
  # four dot-separated decimal octets in the range 0-255.
  local quad_re="^([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})$"
  local a b c d
  if [[ ! $1 =~ $quad_re ]]; then
    echo "[ERROR] ip_to_int: invalid IPv4 address: $1" >&2
    return 1
  fi
  # Force base 10 so zero-padded octets such as 08 or 010 are not read as octal.
  a=$(( 10#${BASH_REMATCH[1]} ))
  b=$(( 10#${BASH_REMATCH[2]} ))
  c=$(( 10#${BASH_REMATCH[3]} ))
  d=$(( 10#${BASH_REMATCH[4]} ))
  if (( a > 255 || b > 255 || c > 255 || d > 255 )); then
    echo "[ERROR] ip_to_int: octet out of range in: $1" >&2
    return 1
  fi
  echo $(( (a << 24) + (b << 16) + (c << 8) + d ))
}

int_to_ip() {
  # Print the 32-bit integer in $1 as a dotted-quad IPv4 string.
  # Octets are extracted from the most significant byte down to the least.
  local value=$1 bits dotted=""
  for bits in 24 16 8 0; do
    dotted+="${dotted:+.}$(( (value >> bits) & 255 ))"
  done
  echo "$dotted"
}

# Derive the network address of the subnet that contains ADMIN_IP.
# Integer form of the admin IP, used for the bitwise math below.
ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP")
# Number of host bits left over after the network prefix.
HOST_BITS=$(( 32 - NETMASK_BITS ))
# All host bits set to 1, e.g. 0xFF for a /24.
HOST_MASK=$(( (1 << HOST_BITS) - 1 ))
# Invert the host mask and clamp the result to 32 bits to get the network mask.
NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF ))
# Clear the host bits of the admin IP to get the network address.
NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK ))
NETWORK_IP=$(int_to_ip "$NETWORK_INT")

# CIDR form (network address / prefix length) of that subnet.
PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS"
echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET"
# Stored as a permanent rule; the firewall-cmd --reload step that follows
# this script is what loads it into the running firewall.
firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent
'

- firewall-cmd --reload
- systemctl enable sshd
- systemctl start sshd
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,36 @@
firewall-cmd --permanent --add-service=ssh
firewall-cmd --permanent --add-port="${SRUN_RANGE}"/tcp
firewall-cmd --permanent --add-port="${SLURMD_PORT}"/tcp

# Add PXE network to trusted zone for ORTE communication
echo "[INFO] Adding PXE network to trusted zone for ORTE communication"
# Calculate PXE subnet using admin IP and netmask bits
# Jinja placeholders: replaced with the admin NIC IP and netmask prefix length
# from the localhost hostvars when this template is rendered.
ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}"
NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}"

# Convert IP to integer and calculate network address
ip_to_int() {
  # Convert the dotted-quad IPv4 address in $1 to its 32-bit integer value,
  # printed on stdout. Returns 1 (with a message on stderr) when $1 is not
  # four dot-separated decimal octets in the range 0-255.
  local quad_re="^([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})$"
  local a b c d
  if [[ ! $1 =~ $quad_re ]]; then
    echo "[ERROR] ip_to_int: invalid IPv4 address: $1" >&2
    return 1
  fi
  # Force base 10 so zero-padded octets such as 08 or 010 are not read as octal.
  a=$(( 10#${BASH_REMATCH[1]} ))
  b=$(( 10#${BASH_REMATCH[2]} ))
  c=$(( 10#${BASH_REMATCH[3]} ))
  d=$(( 10#${BASH_REMATCH[4]} ))
  if (( a > 255 || b > 255 || c > 255 || d > 255 )); then
    echo "[ERROR] ip_to_int: octet out of range in: $1" >&2
    return 1
  fi
  echo $(( (a << 24) + (b << 16) + (c << 8) + d ))
}

int_to_ip() {
  # Print the 32-bit integer in $1 as a dotted-quad IPv4 string.
  # Octets are extracted from the most significant byte down to the least.
  local value=$1 bits dotted=""
  for bits in 24 16 8 0; do
    dotted+="${dotted:+.}$(( (value >> bits) & 255 ))"
  done
  echo "$dotted"
}

# Derive the network address of the subnet that contains ADMIN_IP.
# Integer form of the admin IP, used for the bitwise math below.
ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP")
# Number of host bits left over after the network prefix.
HOST_BITS=$(( 32 - NETMASK_BITS ))
# All host bits set to 1, e.g. 0xFF for a /24.
HOST_MASK=$(( (1 << HOST_BITS) - 1 ))
# Invert the host mask and clamp the result to 32 bits to get the network mask.
NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF ))
# Clear the host bits of the admin IP to get the network address.
NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK ))
NETWORK_IP=$(int_to_ip "$NETWORK_INT")

# CIDR form (network address / prefix length) of that subnet.
PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS"
echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET"
# Stored as a permanent rule; the firewall-cmd --reload that follows is what
# loads it into the running firewall.
firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent

firewall-cmd --reload

echo "[INFO] Unmounting controller slurm.conf directory from $CTLD_SLURM_DIR_MNT"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,36 @@
firewall-cmd --permanent --add-service=ssh
firewall-cmd --permanent --add-port="${SRUN_RANGE}"/tcp
firewall-cmd --permanent --add-port="${SLURMD_PORT}"/tcp

# Add PXE network to trusted zone for ORTE communication
echo "[INFO] Adding PXE network to trusted zone for ORTE communication"
# Calculate PXE subnet using admin IP and netmask bits
# Jinja placeholders: replaced with the admin NIC IP and netmask prefix length
# from the localhost hostvars when this template is rendered.
ADMIN_IP="{{ hostvars['localhost']['admin_nic_ip'] }}"
NETMASK_BITS="{{ hostvars['localhost']['admin_netmask_bits'] }}"

# Convert IP to integer and calculate network address
ip_to_int() {
  # Convert the dotted-quad IPv4 address in $1 to its 32-bit integer value,
  # printed on stdout. Returns 1 (with a message on stderr) when $1 is not
  # four dot-separated decimal octets in the range 0-255.
  local quad_re="^([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})[.]([0-9]{1,3})$"
  local a b c d
  if [[ ! $1 =~ $quad_re ]]; then
    echo "[ERROR] ip_to_int: invalid IPv4 address: $1" >&2
    return 1
  fi
  # Force base 10 so zero-padded octets such as 08 or 010 are not read as octal.
  a=$(( 10#${BASH_REMATCH[1]} ))
  b=$(( 10#${BASH_REMATCH[2]} ))
  c=$(( 10#${BASH_REMATCH[3]} ))
  d=$(( 10#${BASH_REMATCH[4]} ))
  if (( a > 255 || b > 255 || c > 255 || d > 255 )); then
    echo "[ERROR] ip_to_int: octet out of range in: $1" >&2
    return 1
  fi
  echo $(( (a << 24) + (b << 16) + (c << 8) + d ))
}

int_to_ip() {
  # Print the 32-bit integer in $1 as a dotted-quad IPv4 string.
  # Octets are extracted from the most significant byte down to the least.
  local value=$1 bits dotted=""
  for bits in 24 16 8 0; do
    dotted+="${dotted:+.}$(( (value >> bits) & 255 ))"
  done
  echo "$dotted"
}

# Derive the network address of the subnet that contains ADMIN_IP.
# Integer form of the admin IP, used for the bitwise math below.
ADMIN_IP_INT=$(ip_to_int "$ADMIN_IP")
# Number of host bits left over after the network prefix.
HOST_BITS=$(( 32 - NETMASK_BITS ))
# All host bits set to 1, e.g. 0xFF for a /24.
HOST_MASK=$(( (1 << HOST_BITS) - 1 ))
# Invert the host mask and clamp the result to 32 bits to get the network mask.
NETWORK_MASK=$(( ~HOST_MASK & 0xFFFFFFFF ))
# Clear the host bits of the admin IP to get the network address.
NETWORK_INT=$(( ADMIN_IP_INT & NETWORK_MASK ))
NETWORK_IP=$(int_to_ip "$NETWORK_INT")

# CIDR form (network address / prefix length) of that subnet.
PXE_SUBNET="$NETWORK_IP/$NETMASK_BITS"
echo "[INFO] Admin IP: $ADMIN_IP, Netmask: /$NETMASK_BITS, PXE Subnet: $PXE_SUBNET"
# Stored as a permanent rule; the firewall-cmd --reload that follows is what
# loads it into the running firewall.
firewall-cmd --zone=trusted --add-source="$PXE_SUBNET" --permanent

firewall-cmd --reload

echo "[INFO] Unmounting controller slurm.conf directory from $CTLD_SLURM_DIR_MNT"
Expand Down
11 changes: 6 additions & 5 deletions discovery/roles/slurm_config/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ slurmctld_service_default_path: '/usr/lib/systemd/system/slurmctld.service'
slurmd_service_default_path: '/usr/lib/systemd/system/slurmd.service'
slurmdbd_service_default_path: '/usr/lib/systemd/system/slurmdbd.service'
sys_env_path: '/etc/environment'
default_real_memory: 884736
default_real_memory: 864
default_threadspercore: 1
default_corespersocket: 72
default_sockets: 2
Expand Down Expand Up @@ -51,7 +51,10 @@ __default_config:
PrologFlags: contain
JobAcctGatherType: jobacct_gather/linux
JobAcctGatherFrequency: 30
SelectType: select/linear
SelectType: select/cons_tres
GresTypes: gpu
SelectTypeParameters: CR_Core_Memory
SlurmdParameters: l3cache_as_socket # Requires hwloc v2.
SlurmctldLogFile: "/var/log/slurm/slurmctld.log"
SlurmdLogFile: "/var/log/slurm/slurmd.log"
SlurmctldPidFile: /var/run/slurmctld.pid
Expand All @@ -62,6 +65,7 @@ __default_config:
SlurmdTimeout: 300
Epilog: "/etc/slurm/epilog.d/logout_user.sh"
PluginDir: "{{ plugin_slurm_dir }}"
MaxNodeCount: 65000
NodeSet:
- NodeSet: "{{ slurm_partition_name }}"
Feature: "{{ slurm_partition_name }}"
Expand All @@ -73,9 +77,6 @@ __default_config:
Nodes: ALL
MaxTime: INFINITE
State: UP
# S_P_ARRAY type parameter to be provided this way
# Epilog:
# - Epilog: "/etc/slurm/epilog.d/logout_user.sh"
slurmdbd:
AuthType: auth/munge
LogFile: "/var/log/slurm/slurmdbd.log"
Expand Down
6 changes: 0 additions & 6 deletions discovery/roles/slurm_config/tasks/build_slurm_conf.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,6 @@
when: node_params is defined and node_params
no_log: "{{ _no_log }}"

- name: Add gpu parameters to slurm conf
ansible.builtin.set_fact:
apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(gpu_slurm_conf))}) }}"
when: gpu_params is defined and gpu_params
no_log: "{{ _no_log }}"

- name: Add dbd parameters to slurm conf
ansible.builtin.set_fact:
apply_config: "{{ apply_config | default({}) | combine({'slurm': (apply_config['slurm'] | combine(dbd_slurm_conf))}) }}"
Expand Down
39 changes: 19 additions & 20 deletions discovery/roles/slurm_config/tasks/confs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,14 @@
- busy_nodes | length > 0
- user_input.user_input | default('') | trim | upper == 'A'

- name: Empty Busy nodes for Force removal
ansible.builtin.set_fact:
busy_nodes: {}
when:
- busy_nodes is defined
- busy_nodes | length > 0
- user_input.user_input | default('') | trim | upper == 'F'

- name: Remove nodes
ansible.builtin.include_tasks: remove_node.yml
when:
Expand Down Expand Up @@ -384,26 +392,17 @@

- name: Handle user choice - ABORT
ansible.builtin.fail:
msg: |
================================================================================
PLAYBOOK ABORTED BY USER
================================================================================

You chose to abort the playbook (Option A).

Next Steps:
1. Cancel running jobs manually:
{% for node in busy_nodes.keys() %}
scancel -w {{ node }}
{% endfor %}

2. Or wait for jobs to complete naturally

3. Re-run this playbook to remove the nodes

Idle nodes (if any) have already been removed from the cluster.

================================================================================
msg:
- "==============================================================================="
- "PLAYBOOK ABORTED BY USER"
- "==============================================================================="
- "You chose to abort the playbook (Option A)."
- "Next Steps:"
- "1. Cancel running jobs manually: 'scancel -w <node_name>'"
- "2. Or wait for jobs to complete naturally"
- "3. Re-run this playbook to remove the nodes"
- "Idle nodes (if any) have already been removed from the cluster."
- "==============================================================================="
when:
- busy_nodes is defined
- busy_nodes | length > 0
Expand Down
12 changes: 10 additions & 2 deletions discovery/roles/slurm_config/tasks/remove_node.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,19 @@
- "'slurm' in conf_merge_dict"
- filtered_nodenames is defined

- name: Set partition nodes exactly as cmpt_list minus removed nodes
ansible.builtin.set_fact:
partition_nodes: "{{ cmpt_list | difference(nodes_in_normal_not_in_cmpt) | union(busy_nodes.keys() | default([]) | list) }}"
when:
- "'slurm' in conf_merge_dict"
- nodes_in_normal_not_in_cmpt is defined
- nodes_in_normal_not_in_cmpt | length > 0

- name: Update normal partition Nodes to match cmpt_list
ansible.builtin.set_fact:
updated_partitions: "{{ updated_partitions | default([])
+ [item | combine({'Nodes': ((cmpt_list | difference(nodes_in_normal_not_in_cmpt)) | join(','))
if (cmpt_list | difference(nodes_in_normal_not_in_cmpt)) | length > 0 else slurm_partition_name})
+ [item | combine({'Nodes': (partition_nodes | join(','))
if partition_nodes | length > 0 else slurm_partition_name})
if item.PartitionName == slurm_partition_name else item] }}"
loop: "{{ slurm_conf_dict.PartitionName | default([]) }}"
when:
Expand Down
6 changes: 1 addition & 5 deletions discovery/roles/slurm_config/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,6 @@ db_dir:
cmpt_dir:
- /etc/slurm/epilog.d

gpu_slurm_conf:
GresTypes: gpu
SelectType: select/cons_tres
SelectTypeParameters: CR_Core_Memory
SlurmdParameters: l3cache_as_socket
innodb_buffer_pool_size: 4G
innodb_lock_wait_timeout: 900
conf_server: "--conf-server {{ ctld_list | join(',') }}"
Expand Down Expand Up @@ -131,6 +126,7 @@ partition_params:
MaxTime: "INFINITE"
State: "UP"
Default: "YES"
busy_nodes: {}
openldap_dir_name: "openldap/"
software_config_file: "{{ input_project_dir }}/software_config.json"
omnia_run_tags: "{{ hostvars['localhost']['omnia_run_tags'] }}"
Expand Down
2 changes: 1 addition & 1 deletion examples/pxe_mapping_file.csv
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ service_kube_control_plane_x86_64,grp3,ABFG79,,service-kube-control-plane1,aa:bb
service_kube_control_plane_x86_64,grp4,ABFH78,,service-kube-control-plane2,aa:bb:cc:dd:ee:hh,172.16.107.54,xx:yy:zz:aa:bb:hh,172.17.107.54
service_kube_control_plane_x86_64,grp4,ABFH80,,service-kube-control-plane3,aa:bb:cc:dd:ee:ii,172.16.107.55,xx:yy:zz:aa:bb:ii,172.17.107.55
service_kube_node_x86_64,grp5,ABFL82,,service-kube-node1,aa:bb:cc:dd:ee:jj,172.16.107.56,xx:yy:zz:aa:bb:jj,172.17.107.56
service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57
service_kube_node_x86_64,grp5,ABKD88,,service-kube-node2,aa:bb:cc:dd:ee:kk,172.16.107.57,xx:yy:zz:aa:bb:ff,172.17.107.57
2 changes: 1 addition & 1 deletion input/config/aarch64/rhel/10.0/slurm_custom.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
{"package": "pmix-devel", "type": "rpm", "repo_name": "aarch64_appstream"},
{"package": "nvcr.io/nvidia/hpc-benchmarks", "tag": "25.09", "type": "image"},
{"package": "apptainer", "type": "rpm", "repo_name": "epel" },
{"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" }
{"package": "doca-ofed", "type": "rpm_repo", "repo_name": "doca" }
]
},
"slurm_control_node": {
Expand Down
Loading