From c19a5073313875a6536839c6837834e79e442779 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Tue, 20 May 2025 11:47:22 -0400 Subject: [PATCH 01/50] WIP hub deployment --- bicep/hub/anf_params.json | 14 +++++++ bicep/hub/create_hub.sh | 79 ++++++++++++++++++++++++++++++++++++++ bicep/hub/db_params.json | 14 +++++++ bicep/hub/hub-vnet.bicep | 69 +++++++++++++++++++++++++++++++++ bicep/hub/vnet_params.json | 5 +++ 5 files changed, 181 insertions(+) create mode 100644 bicep/hub/anf_params.json create mode 100644 bicep/hub/create_hub.sh create mode 100644 bicep/hub/db_params.json create mode 100644 bicep/hub/hub-vnet.bicep create mode 100644 bicep/hub/vnet_params.json diff --git a/bicep/hub/anf_params.json b/bicep/hub/anf_params.json new file mode 100644 index 00000000..de191c4f --- /dev/null +++ b/bicep/hub/anf_params.json @@ -0,0 +1,14 @@ +{ + "name": { + "value": "gb200-hub" + }, + "subnetId": { + "value": "/subscriptions/820a6cdf-49cb-4a89-9113-ccc4d94feb32/resourceGroups/gb200-hub/providers/Microsoft.Network/virtualNetworks/vnet/subnets/netapp" + }, + "serviceLevel": { + "value": "Premium" + }, + "sizeTiB": { + "value": 4 + } +} \ No newline at end of file diff --git a/bicep/hub/create_hub.sh b/bicep/hub/create_hub.sh new file mode 100644 index 00000000..f1cd5243 --- /dev/null +++ b/bicep/hub/create_hub.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +# Initialize variables +RESOURCE_GROUP="" +LOCATION="" + +# Parse arguments +while [ "$#" -gt 0 ]; do + case "$1" in + -rg|--resource-group) + RESOURCE_GROUP="$2" + shift 2 + ;; + -l|--location) + LOCATION="$2" + shift 2 + ;; + -h|--help) + echo "Usage: $0 --resource-group --location " + echo " or: $0 -rg -l " + exit 0 + ;; + *) + echo "Unknown parameter: $1" + echo "Use --help for usage information." + exit 1 + ;; + esac +done + +# Validate inputs +if [ -z "$RESOURCE_GROUP" ] || [ -z "$LOCATION" ]; then + echo "Error: Both --resource-group and --location are required." + echo "Use --help for usage information." 
+ exit 1 +fi + +cd "$(dirname "$0")" + +# Check if the resource group exists and create it if it doesn't +RG_EXISTS=$(az group exists -n "$RESOURCE_GROUP" | tr -d '\r\n') +if [ "$RG_EXISTS" = "false" ]; then + echo "Resource group '$RESOURCE_GROUP' does not exist. Creating it in location '$LOCATION'." + az group create -n "$RESOURCE_GROUP" -l "$LOCATION" + + while RG_CREATED=$(az group exists -n "$RESOURCE_GROUP" | tr -d '\r\n'); [ "$RG_CREATED" = "false" ]; do + echo "Waiting for resource group '$RESOURCE_GROUP' to be created..." + sleep 1 + done +fi + +# Deploy vnet +az deployment group create \ + --resource-group "$RESOURCE_GROUP" \ + --template-file "$(pwd)/hub-vnet.bicep" \ + --parameters "$(pwd)/vnet_params.json" \ + --parameters location="$LOCATION" \ + --name hub-vnet + +exit 0 + +# Deploy MySQL server +az deployment group create \ + --resource-group "$RESOURCE_GROUP" \ + --template-file ../mysql.bicep \ + --parameters db_params.json \ + --name hub-db + +# Deploy Azure NetApp Files +az deployment group create \ + --resource-group "$RESOURCE_GROUP" \ + --template-file ../anfs.bicep \ + --parameters '{ \"location\": { \"value\": \"${LOCATION}\" } }' \ + --name hub-anf-account +az deployment group create \ + --resource-group "$RESOURCE_GROUP" \ + --template-file ../anf.bicep \ + --parameters anf_params.json \ + --name hub-anf-volume \ No newline at end of file diff --git a/bicep/hub/db_params.json b/bicep/hub/db_params.json new file mode 100644 index 00000000..8139cec8 --- /dev/null +++ b/bicep/hub/db_params.json @@ -0,0 +1,14 @@ +{ + "Name": { + "value": "gb200-hub" + }, + "adminUser": { + "value": "hpcadmin" + }, + "adminPassword": { + "value": "royGbiv!1" + }, + "subnetId": { + "value": "/subscriptions/820a6cdf-49cb-4a89-9113-ccc4d94feb32/resourceGroups/gb200-hub/providers/Microsoft.Network/virtualNetworks/vnet/subnets/database" + } +} \ No newline at end of file diff --git a/bicep/hub/hub-vnet.bicep b/bicep/hub/hub-vnet.bicep new file mode 100644 index 
00000000..3c4a0e27 --- /dev/null +++ b/bicep/hub/hub-vnet.bicep @@ -0,0 +1,69 @@ +targetScope = 'resourceGroup' +import { tags_t } from '../types.bicep' +import {subnet_config} from '../network-new.bicep' + + +param location string +param address string +param tags tags_t = {} + +var subnet_cidr = subnet_config(address) + +var vnet = { + name: 'hub-vnet-${resourceGroup().name}' + cidr: address + subnets: { + netapp: { + name: 'netapp' + cidr: subnet_cidr.netapp + nat_gateway : false + service_endpoints: [] + delegations: [ + 'Microsoft.Netapp/volumes' + ] + } + database: { + name: 'database' + cidr: subnet_cidr.database + nat_gateway : false + service_endpoints: [] + delegations: [ + 'Microsoft.DBforMySQL/flexibleServers' + ] + } + } +} + +resource virtualNetwork 'Microsoft.Network/virtualNetworks@2024-05-01' = { + name: vnet.name + location: location + tags: tags + properties: { + addressSpace: { + addressPrefixes: [vnet.cidr] + } + subnets: [ + for subnet in items(vnet.subnets): { + name: subnet.value.name + properties: { + addressPrefixes: [subnet.value.cidr] + // natGateway: (natGatewayId != '' && subnet.value.nat_gateway) ? 
{ + // id: natGatewayId + // } : null + // networkSecurityGroup: { + // id: ccwCommonNsg.id + // } + delegations: map(subnet.value.delegations, delegation => { + name: subnet.value.name + properties: { + serviceName: delegation + } + }) + serviceEndpoints: map(subnet.value.service_endpoints, endpoint => { + service: endpoint + }) + } + } + ] + } +} diff --git a/bicep/hub/vnet_params.json b/bicep/hub/vnet_params.json new file mode 100644 index 00000000..1934218a --- /dev/null +++ b/bicep/hub/vnet_params.json @@ -0,0 +1,5 @@ +{ + "address": { + "value": "10.0.0.0/16" + } +} \ No newline at end of file From e3b0e7dcbee183c7a74a1f63225becb2e777d304 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Fri, 23 May 2025 15:17:46 -0400 Subject: [PATCH 02/50] Further refinement of deploy hub script and creation of deploy spoke script --- bicep/anf.bicep | 4 +- bicep/ccw.bicep | 16 ++-- bicep/hub/anf_params.json | 9 +-- bicep/hub/bastion_params.json | 5 ++ bicep/hub/create_hub.sh | 55 +++++++++---- bicep/hub/db_params.json | 7 +- bicep/hub/deploy_spoke.sh | 72 +++++++++++++++++ bicep/hub/hub-vnet.bicep | 9 +++ bicep/hub/original_spoke_params.json | 116 +++++++++++++++++++++++++++ bicep/hub/vnet_params.json | 2 +- bicep/mainTemplate.bicep | 4 +- bicep/mysql.bicep | 14 +++- bicep/network-new.bicep | 9 ++- 13 files changed, 279 insertions(+), 43 deletions(-) create mode 100644 bicep/hub/bastion_params.json create mode 100644 bicep/hub/deploy_spoke.sh create mode 100644 bicep/hub/original_spoke_params.json diff --git a/bicep/anf.bicep b/bicep/anf.bicep index 8c41f37b..84495682 100644 --- a/bicep/anf.bicep +++ b/bicep/anf.bicep @@ -4,12 +4,12 @@ import {tags_t, availabilityZone_t} from './types.bicep' param name string param location string param tags tags_t -param availabilityZone availabilityZone_t[] +param availabilityZone availabilityZone_t[] = [] param resourcePostfix string = uniqueString(resourceGroup().id) param subnetId string param serviceLevel string param sizeTiB int 
-param defaultMountOptions string +param defaultMountOptions string = 'rw,hard,rsize=262144,wsize=262144,vers=3,tcp,_netdev,nconnect=8' param infrastructureOnly bool = false var capacity = sizeTiB * 1024 * 1024 * 1024 * 1024 diff --git a/bicep/ccw.bicep b/bicep/ccw.bicep index 10732d39..417dd0dd 100644 --- a/bicep/ccw.bicep +++ b/bicep/ccw.bicep @@ -7,7 +7,7 @@ param insidersBuild bool param branch string param projectVersion string -param pyxisProjectVersion string +param monitoringProjectVersion string param adminUsername string @secure() @@ -202,7 +202,7 @@ module mySQLccw './mysql.bicep' = if (create_database) { params: { location: location tags: getTags('Microsoft.DBforMySQL/flexibleServers', tags) - Name: db_name + // Name: db_name adminUser: adminUsername adminPassword: databaseAdminPassword subnetId: subnets.database.id @@ -316,7 +316,7 @@ output cyclecloudPrincipalId string = infrastructureOnly ? '' : ccwVM.outputs.pr output managedIdentityId string = infrastructureOnly ? '' : ccwManagedIdentity.outputs.managedIdentityId -// Automatically inject the ccw and pyxis cluster init specs +// Automatically inject the ccw and monitoring cluster init specs var ccwClusterInitSpec = { type: 'gitHubReleaseURL' @@ -325,15 +325,15 @@ var ccwClusterInitSpec = { target: ['login', 'scheduler', 'htc', 'hpc', 'gpu', 'dynamic'] } -var pyxisClusterInitSpec = { +var monitoringClusterInitSpec = { type: 'gitHubReleaseURL' - gitHubReleaseURL: uri('https://github.com/Azure/cyclecloud-pyxis/releases/tag/', pyxisProjectVersion) + gitHubReleaseURL: uri('https://github.com/Azure/cyclecloud-monitoring/releases/tag/', monitoringProjectVersion) spec: 'default' target: ['login', 'scheduler', 'htc', 'hpc', 'gpu', 'dynamic'] } -// Projects <= 2025.02.06 have the pyxis logic embedded in the ccw cluster init spec -var requiredClusterInitSpecs = [ccwClusterInitSpec, pyxisClusterInitSpec] +// Use of azslurm 4.0 does not require pyxis +var requiredClusterInitSpecs = [ccwClusterInitSpec, 
monitoringClusterInitSpec] output clusterInitSpecs types.cluster_init_param_t = union(requiredClusterInitSpecs, clusterInitSpecs) @@ -389,7 +389,7 @@ output manualInstall bool = manualInstall output acceptMarketplaceTerms bool = acceptMarketplaceTerms output ood object = union(ood, { - version: '1.0.1' + version: '1.1.0' nic: deployOOD ? oodNIC.outputs.NICId : '' managedIdentity: deployOOD ? createOODMI ? oodNewManagedIdentity.id : ood.?appManagedIdentityId : '' clientId: deployOOD ? registerOODApp ? oodApp.outputs.oodClientAppId : ood.?appId : '' diff --git a/bicep/hub/anf_params.json b/bicep/hub/anf_params.json index de191c4f..d2c2d805 100644 --- a/bicep/hub/anf_params.json +++ b/bicep/hub/anf_params.json @@ -1,14 +1,11 @@ { - "name": { - "value": "gb200-hub" - }, - "subnetId": { - "value": "/subscriptions/820a6cdf-49cb-4a89-9113-ccc4d94feb32/resourceGroups/gb200-hub/providers/Microsoft.Network/virtualNetworks/vnet/subnets/netapp" - }, "serviceLevel": { "value": "Premium" }, "sizeTiB": { "value": 4 + }, + "tags": { + "value": {} } } \ No newline at end of file diff --git a/bicep/hub/bastion_params.json b/bicep/hub/bastion_params.json new file mode 100644 index 00000000..8c5a06cf --- /dev/null +++ b/bicep/hub/bastion_params.json @@ -0,0 +1,5 @@ +{ + "tags": { + "value": {} + } +} \ No newline at end of file diff --git a/bicep/hub/create_hub.sh b/bicep/hub/create_hub.sh index f1cd5243..f1a397c1 100644 --- a/bicep/hub/create_hub.sh +++ b/bicep/hub/create_hub.sh @@ -38,6 +38,7 @@ fi cd "$(dirname "$0")" # Check if the resource group exists and create it if it doesn't +echo Checking if resource group "${RESOURCE_GROUP}" exists... RG_EXISTS=$(az group exists -n "$RESOURCE_GROUP" | tr -d '\r\n') if [ "$RG_EXISTS" = "false" ]; then echo "Resource group '$RESOURCE_GROUP' does not exist. Creating it in location '$LOCATION'." 
@@ -51,29 +52,55 @@ fi # Deploy vnet az deployment group create \ - --resource-group "$RESOURCE_GROUP" \ + --resource-group "${RESOURCE_GROUP}" \ --template-file "$(pwd)/hub-vnet.bicep" \ --parameters "$(pwd)/vnet_params.json" \ --parameters location="$LOCATION" \ - --name hub-vnet + --name "hub-vnet-${RESOURCE_GROUP}" + +echo "Virtual network deployment is complete. Please enter the Azure Portal to create a VPN Gateway while the remainder of this script runs." -exit 0 +# Deploy Bastion +bastion_subnet_id=$(az network vnet subnet show -g "${RESOURCE_GROUP}" -n AzureBastionSubnet --vnet-name "hub-vnet-${RESOURCE_GROUP}" | jq '.id' | tr -d '"') +az deployment group create \ + --resource-group "${RESOURCE_GROUP}" \ + --template-file "$(pwd)/../bastion.bicep" \ + --parameters "$(pwd)/bastion_params.json" \ + --parameters location="${LOCATION}" \ + --parameters subnetId="${bastion_subnet_id}" \ + --name "hub-bastion-${RESOURCE_GROUP}" # Deploy MySQL server +db_subnet_id=$(az network vnet subnet show -g "${RESOURCE_GROUP}" -n database --vnet-name "hub-vnet-${RESOURCE_GROUP}" | jq '.id' | tr -d '"') az deployment group create \ - --resource-group "$RESOURCE_GROUP" \ - --template-file ../mysql.bicep \ - --parameters db_params.json \ - --name hub-db + --resource-group "${RESOURCE_GROUP}" \ + --template-file "$(pwd)/../mysql.bicep" \ + --parameters "$(pwd)/db_params.json" \ + --parameters location="${LOCATION}" \ + --parameters subnetId="${db_subnet_id}" \ + --name "hub-db-${RESOURCE_GROUP}" # Deploy Azure NetApp Files +netapp_subnet_id=$(az network vnet subnet show -g "${RESOURCE_GROUP}" -n netapp --vnet-name "hub-vnet-${RESOURCE_GROUP}" | jq '.id' | tr -d '"') az deployment group create \ - --resource-group "$RESOURCE_GROUP" \ - --template-file ../anfs.bicep \ - --parameters '{ \"location\": { \"value\": \"${LOCATION}\" } }' \ - --name hub-anf-account + --resource-group "${RESOURCE_GROUP}" \ + --template-file "$(pwd)/../anf-account.bicep" \ + --parameters 
location="${LOCATION}" \ + --name "hub-anf-account-${RESOURCE_GROUP}" az deployment group create \ --resource-group "$RESOURCE_GROUP" \ - --template-file ../anf.bicep \ - --parameters anf_params.json \ - --name hub-anf-volume \ No newline at end of file + --template-file "$(pwd)/../anf.bicep"\ + --parameters "$(pwd)/anf_params.json" \ + --parameters subnetId="${netapp_subnet_id}" \ + --parameters location="${LOCATION}" \ + --parameters name="shared" \ + --name "hub-anf-resources-${RESOURCE_GROUP}" + +# Deploy monitoring +MONITORING_PROJECT_VERSION="1.0.0" +rm -rf cyclecloud-monitoring +git clone --branch "${MONITORING_PROJECT_VERSION}" https://github.com/Azure/cyclecloud-monitoring.git + +cd cyclecloud-monitoring/infra +sh $(pwd)/deploy.sh "$RESOURCE_GROUP" +cd ../.. diff --git a/bicep/hub/db_params.json b/bicep/hub/db_params.json index 8139cec8..acfd9bae 100644 --- a/bicep/hub/db_params.json +++ b/bicep/hub/db_params.json @@ -1,14 +1,11 @@ { - "Name": { - "value": "gb200-hub" - }, "adminUser": { "value": "hpcadmin" }, "adminPassword": { "value": "royGbiv!1" }, - "subnetId": { - "value": "/subscriptions/820a6cdf-49cb-4a89-9113-ccc4d94feb32/resourceGroups/gb200-hub/providers/Microsoft.Network/virtualNetworks/vnet/subnets/database" + "tags": { + "value": {} } } \ No newline at end of file diff --git a/bicep/hub/deploy_spoke.sh b/bicep/hub/deploy_spoke.sh new file mode 100644 index 00000000..153b0a0d --- /dev/null +++ b/bicep/hub/deploy_spoke.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +cd "$(dirname "$0")" + +RG="gb200-hub-westus2" +SUFFIX="-${RG}" +SPOKE_NUMBER="1" + +fetch_outputs() { +az deployment group show -g "$RG" -n "hub-vnet${SUFFIX}" --query properties.outputs > hub-vnet-outputs.json +az deployment group show -g "$RG" -n "hub-anf-resources${SUFFIX}" --query properties.outputs > hub-anf-outputs.json +az deployment group show -g "$RG" -n "hub-db${SUFFIX}" --query properties.outputs > hub-db-outputs.json + +# Monitoring NTS: Copy cc-monitoring outputs to directory 
containing hub outputs +# cp /path/to/outputs.json hub-monitoring-outputs.json + +# TODO: blob storage +} + +fetch_outputs + +cp original_spoke_params.json spoke_params.json + +replace_fields() { + jq "$1" spoke_params.json > tmp_spoke_params.json && mv tmp_spoke_params.json spoke_params.json +} + +# shared FS +IP_ADDRESS=$(jq -r '.ipAddress.value' hub-anf-outputs.json) +EXPORT_PATH=$(jq -r '.exportPath.value' hub-anf-outputs.json) +MOUNT_OPTIONS=$(jq -r '.mountOptions.value' hub-anf-outputs.json) +replace_fields ".sharedFilesystem={ value: { type: \"nfs-existing\", ipAddress: \"$IP_ADDRESS\", exportPath: \"$EXPORT_PATH\", mountOptions: \"$MOUNT_OPTIONS\" } }" + +# new vnet +ADDRESS_SPACE="10.${SPOKE_NUMBER}.0.0/24" +replace_fields ".network.value.addressSpace=\"$ADDRESS_SPACE\"" + +# vnet to peer +PEERED_VNET_ID=$(jq -r '.vnetId.value' hub-vnet-outputs.json) +PEERED_VNET_NAME=$(echo $PEERED_VNET_ID | cut -d '/' -f9) +PEERED_VNET_LOCATION=$(az network vnet show -g "$RG" -n "$PEERED_VNET_NAME" --query location -o tsv | tr -d '\r\n') +replace_fields ".network.value.vnetToPeer={ name: \"$PEERED_VNET_NAME\", id: \"$PEERED_VNET_ID\", location: \"$PEERED_VNET_LOCATION\", subscriptionName: \"\"}" + +# database config +DB_IP="10.0.0.228" +DB_USERNAME=$(jq -r '.adminUser.value' db_params.json) +DB_PASSWORD=$(jq -r '.adminPassword.value' db_params.json) +replace_fields ".databaseConfig={ value: { type: \"privateIp\", databaseUser: \"$DB_USERNAME\", privateIp: \"$DB_IP\" }}" +replace_fields ".databaseAdminPassword={ value: \"$DB_PASSWORD\" }" + +LOCATION='westus2' +SPOKE_RG_NAME="gb200-ccw-${LOCATION}-0${SPOKE_NUMBER}-rg" +SPOKE_DEPLOYMENT_NAME="spoke-ccw-0${SPOKE_NUMBER}" + +if az deployment sub show -g -n "${SPOKE_DEPLOYMENT_NAME}" > /dev/null 2>&1; then + RG_EXISTS=$(az group exists -n "$SPOKE_RG_NAME" | tr -d '\r\n') + if [ "$RG_EXISTS" = "false" ]; then + echo "Spoke #${SPOKE_NUMBER} already exists. Please set a new spoke number. Exiting." 
+ exit 0 + fi +fi + +az deployment sub create \ + --location "$LOCATION" \ + --template-file "$(pwd)/../mainTemplate.bicep" \ + --parameters "$(pwd)/spoke_params.json" \ + --parameters location="$LOCATION" \ + --parameters resourceGroup="${SPOKE_RG_NAME}" \ + --parameters ccVMName="ccw-${SPOKE_NUMBER}-cyclecloud-vm" \ + --parameters clusterName="ccw-${SPOKE_NUMBER}" \ + --name "spoke-ccw-0${SPOKE_DEPLOYMENT_NAME}" + \ No newline at end of file diff --git a/bicep/hub/hub-vnet.bicep b/bicep/hub/hub-vnet.bicep index 3c4a0e27..6edb9b5d 100644 --- a/bicep/hub/hub-vnet.bicep +++ b/bicep/hub/hub-vnet.bicep @@ -31,6 +31,13 @@ var vnet = { 'Microsoft.DBforMySQL/flexibleServers' ] } + bastion: { + name: 'AzureBastionSubnet' + cidr: subnet_cidr.bastion + nat_gateway : true + service_endpoints: [] + delegations: [] + } } } @@ -67,3 +74,5 @@ resource virtualNetwork 'Microsoft.Network/virtualNetworks@2024-05-01' = { ] } } + +output vnetId string = virtualNetwork.id diff --git a/bicep/hub/original_spoke_params.json b/bicep/hub/original_spoke_params.json new file mode 100644 index 00000000..3b1033b3 --- /dev/null +++ b/bicep/hub/original_spoke_params.json @@ -0,0 +1,116 @@ +{ + "adminUsername": { + "value": "hpcadmin" + }, + "adminPassword": { + "value": "royGbiv!1" + }, + "adminSshPublicKey": { + "value": "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCxuk0+FlxDyOsDtY/XTbLW8xcQFGsCuz+SayUUfRIqV1+QR9WNppujIwR5Sf30PmoJWGu0fNAz1Gd7rpGbNIfustZW7lTAWZkZb9UavswiRCRog1EdYPugEuqTPJtULFUdND4pX7G92DCPTtScPnrAKK6j9iVIDq81hiV2n/xtieidz3UhTXztjFpiNdbNGTWVV94IbqSB/PGwbrphs5ApkBxli8W2EIVhINUCcxXB7O256fMsTg//I/4+Vi5pTGcC+3XM8KGTfQhNopDRBFDT4C58aT4xGb63D2wQJaPDJXMqAns3H+1YRs4G/JWGe1S5FIYb1qScO5D1IGC3pDizteQ5bL5EbzMw4cgVtboq04bVA07L/9rsQqO50+IG+wA/iYUvKSEtpwQej++80Hsli0wJ+mb9cCc0VUa0TO1tS7jiQqKEonslDij2YMZEkXEj961DB1H4MlJeJT7cNCSeZ9Ms5iZiVnGz7gwA6zsWjQgWQcx0G70QEWGrHEjgNAs= redmond\\abatallas@LAPTOP-G2QDGDV1" + }, + "ccVMSize": { + "value": "Standard_D4as_v5" + }, + "sharedFilesystem": { + "value": { + 
"type": "nfs-existing", + "ipAddress": "0.0.0.0", + "exportPath": "/home-path", + "mountOptions": "rw,hard,rsize=262144,wsize=262144,vers=3,tcp,_netdev,nconnect=8" + } + }, + "additionalFilesystem": { + "value": { + "type": "disabled" + } + }, + "network": { + "value": { + "type": "new", + "addressSpace": "10.1.0.0/24", + "bastion": false, + "createNatGateway": true, + "vnetToPeer": { + "name": "t-abatallas-vnet", + "id": "/subscriptions/1dc1b726-3fdf-40a5-b356-3b8bd6227e52/resourceGroups/t-abatallas-rg/providers/Microsoft.Network/virtualNetworks/t-abatallas-vnet", + "location": "eastus", + "subscriptionName": "" + }, + "peeringAllowGatewayTransit": true + } + }, + "storagePrivateDnsZone": { + "value": { + "type": "new" + } + }, + "databaseAdminPassword": { + "value": "royGbiv!1" + }, + "databaseConfig": { + "value": { + "type": "privateIp", + "databaseUser": "hpcadmin", + "privateIp": "10.0.0.228" + } + }, + "acceptMarketplaceTerms": { + "value": false + }, + "slurmSettings": { + "value": { + "startCluster": false, + "version": "24.11.3-1", + "healthCheckEnabled": false + } + }, + "schedulerNode": { + "value": { + "sku": "Standard_D4as_v5", + "osImage": "cycle.image.ubuntu22" + } + }, + "loginNodes": { + "value": { + "sku": "Standard_F4s_v2", + "osImage": "cycle.image.ubuntu24", + "initialNodes": 1, + "maxNodes": 1 + } + }, + "htc": { + "value": { + "sku": "Standard_F2s_v2", + "maxNodes": 100, + "osImage": "cycle.image.ubuntu24", + "useSpot": false + } + }, + "hpc": { + "value": { + "sku": "Standard_D2plds_v6", + "maxNodes": 16, + "osImage": "cycle.image.ubuntu24" + } + }, + "gpu": { + "value": { + "sku": "Standard_NC24ads_A100_v4", + "maxNodes": 8, + "osImage": "cycle.image.ubuntu24" + } + }, + "ood": { + "value": { + "type": "enabled", + "startCluster": false, + "sku": "Standard_D4as_v5", + "osImage": "cycle.image.ubuntu24", + "userDomain": "microsoft.com", + "registerEntraIDApp": true + } + }, + "tags": { + "value": {} + } +} \ No newline at end of file diff 
--git a/bicep/hub/vnet_params.json b/bicep/hub/vnet_params.json index 1934218a..4253c0d6 100644 --- a/bicep/hub/vnet_params.json +++ b/bicep/hub/vnet_params.json @@ -1,5 +1,5 @@ { "address": { - "value": "10.0.0.0/16" + "value": "10.0.0.0/22" } } \ No newline at end of file diff --git a/bicep/mainTemplate.bicep b/bicep/mainTemplate.bicep index eca31478..bb6b82c0 100644 --- a/bicep/mainTemplate.bicep +++ b/bicep/mainTemplate.bicep @@ -40,7 +40,7 @@ param insidersBuild bool = false param branch string = 'main' // This needs to be updated on each release. Our Cloud.Project records require a release tag param projectVersion string = '2025.04.24' -param pyxisProjectVersion string = '1.0.0' +param monitoringProjectVersion string = '1.0.0' //Internal developer use only: set true use custom CycleCloud release build param manualInstall bool = false @@ -84,7 +84,7 @@ module makeCCWresources 'ccw.bicep' = { clusterName: clusterName branch: branch projectVersion: projectVersion - pyxisProjectVersion: pyxisProjectVersion + monitoringProjectVersion: monitoringProjectVersion manualInstall: manualInstall acceptMarketplaceTerms: acceptMarketplaceTerms ood: ood diff --git a/bicep/mysql.bicep b/bicep/mysql.bicep index a835aca7..10405af7 100644 --- a/bicep/mysql.bicep +++ b/bicep/mysql.bicep @@ -3,7 +3,6 @@ import {tags_t} from './types.bicep' param location string param tags tags_t -param Name string param adminUser string @secure() param adminPassword string @@ -19,10 +18,10 @@ param serverEdition string = 'Burstable' param skuName string = 'Standard_B2ms' // Create a MySQL Flexible Server -resource server 'Microsoft.DBforMySQL/flexibleServers@2023-10-01-preview' = { +resource server 'Microsoft.DBforMySQL/flexibleServers@2024-12-30' = { location: location tags: tags - name: Name + name: 'hub-mysql-${resourceGroup().name}' sku: { name: skuName tier: serverEdition @@ -51,5 +50,14 @@ resource server 'Microsoft.DBforMySQL/flexibleServers@2023-10-01-preview' = { } } +resource 
require_secure_transport 'Microsoft.DBforMySQL/flexibleServers/configurations@2024-12-30' = { + name: 'require_secure_transport' + parent: server + properties: { + value: 'OFF' + source: 'user-override' + } +} + //output fqdn string = reference(server.id, server.apiVersion, 'full').properties.fullyQualifiedDomainName output fqdn string = server.properties.fullyQualifiedDomainName diff --git a/bicep/network-new.bicep b/bicep/network-new.bicep index 5b2e1f32..e6ee50dc 100644 --- a/bicep/network-new.bicep +++ b/bicep/network-new.bicep @@ -80,6 +80,7 @@ func subnet_ranges(decomp_ip object, subnet object) object => { compute: '${decomp_ip.o1}.${decomp_ip.o2}.${decomp_ip.o3+subnet.compute.o3}.${decomp_ip.o4+subnet.compute.o4}/${subnet.compute.cidr}' } +@export() func subnet_config(ip string) object => subnet_ranges(decompose_ip(ip),subnet_octets(get_cidr(ip))) var subnet_cidr = subnet_config(address) @@ -165,7 +166,9 @@ var nsg_rules = { AllowCycleClientComputeIn: ['460', 'Inbound', 'Allow', 'Tcp', 'CycleCloud', 'subnet', 'compute', 'subnet', 'cyclecloud'] // Deny all remaining traffic - DenyVnetInbound: ['3100', 'Inbound', 'Deny', '*', 'All', 'tag', 'VirtualNetwork', 'tag', 'VirtualNetwork'] + // TODO: Parameterize this rule or condition on peering choice + AllowVnetInbound: ['3100', 'Inbound', 'Allow', '*', 'All', 'tag', 'VirtualNetwork', 'tag', 'VirtualNetwork'] + // DenyVnetInbound: ['3100', 'Inbound', 'Deny', '*', 'All', 'tag', 'VirtualNetwork', 'tag', 'VirtualNetwork'] // // OUTBOUND RULES @@ -183,7 +186,9 @@ var nsg_rules = { // Deny all remaining traffic and allow Internet access AllowInternetOutBound: ['3000', 'Outbound', 'Allow', 'Tcp', 'All', 'tag', 'VirtualNetwork', 'tag', 'Internet'] - DenyVnetOutbound: ['3100', 'Outbound', 'Deny', '*', 'All', 'tag', 'VirtualNetwork', 'tag', 'VirtualNetwork'] + // TODO: Parameterize this rule or condition on peering choice + AllowVnetOutbound: ['3100', 'Outbound', 'Allow', '*', 'All', 'tag', 'VirtualNetwork', 'tag', 
'VirtualNetwork'] + //DenyVnetOutbound: ['3100', 'Outbound', 'Deny', '*', 'All', 'tag', 'VirtualNetwork', 'tag', 'VirtualNetwork'] } // TODO : Need to be validated mysql: { From 1dc0929d68ac89183e9c45a8b1fa2e0dd0887eb9 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Fri, 23 May 2025 16:20:58 -0400 Subject: [PATCH 03/50] Pass along monitoring ingestion endpoint to bicep --- bicep/ccw.bicep | 2 ++ bicep/hub/deploy_spoke.sh | 32 ++++++++++++++++---------------- bicep/mainTemplate.bicep | 3 +++ 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/bicep/ccw.bicep b/bicep/ccw.bicep index 417dd0dd..5a2fa73b 100644 --- a/bicep/ccw.bicep +++ b/bicep/ccw.bicep @@ -8,6 +8,7 @@ param insidersBuild bool param branch string param projectVersion string param monitoringProjectVersion string +param monitoringIngestionEndpoint string param adminUsername string @secure() @@ -315,6 +316,7 @@ output filerInfoFinal types.filerInfo_t = { output cyclecloudPrincipalId string = infrastructureOnly ? '' : ccwVM.outputs.principalId output managedIdentityId string = infrastructureOnly ? '' : ccwManagedIdentity.outputs.managedIdentityId +output monitoringIngestionEndpoint string = monitoringIngestionEndpoint // Automatically inject the ccw and monitoring cluster init specs diff --git a/bicep/hub/deploy_spoke.sh b/bicep/hub/deploy_spoke.sh index 153b0a0d..e8ad85e0 100644 --- a/bicep/hub/deploy_spoke.sh +++ b/bicep/hub/deploy_spoke.sh @@ -6,15 +6,23 @@ RG="gb200-hub-westus2" SUFFIX="-${RG}" SPOKE_NUMBER="1" +LOCATION='westus2' +SPOKE_RG_NAME="gb200-ccw-${LOCATION}-0${SPOKE_NUMBER}-rg" +SPOKE_DEPLOYMENT_NAME="spoke-ccw-0${SPOKE_NUMBER}" + +if az deployment sub show -g -n "${SPOKE_DEPLOYMENT_NAME}" > /dev/null 2>&1; then + RG_EXISTS=$(az group exists -n "$SPOKE_RG_NAME" | tr -d '\r\n') + if [ "$RG_EXISTS" = "false" ]; then + echo "Spoke #${SPOKE_NUMBER} already exists. Please set a new spoke number. Exiting." 
+ exit 0 + fi +fi + fetch_outputs() { az deployment group show -g "$RG" -n "hub-vnet${SUFFIX}" --query properties.outputs > hub-vnet-outputs.json az deployment group show -g "$RG" -n "hub-anf-resources${SUFFIX}" --query properties.outputs > hub-anf-outputs.json az deployment group show -g "$RG" -n "hub-db${SUFFIX}" --query properties.outputs > hub-db-outputs.json - -# Monitoring NTS: Copy cc-monitoring outputs to directory containing hub outputs -# cp /path/to/outputs.json hub-monitoring-outputs.json - -# TODO: blob storage +cp cyclecloud-monitoring/infra/outputs.json hub-monitoring-outputs.json } fetch_outputs @@ -48,17 +56,8 @@ DB_PASSWORD=$(jq -r '.adminPassword.value' db_params.json) replace_fields ".databaseConfig={ value: { type: \"privateIp\", databaseUser: \"$DB_USERNAME\", privateIp: \"$DB_IP\" }}" replace_fields ".databaseAdminPassword={ value: \"$DB_PASSWORD\" }" -LOCATION='westus2' -SPOKE_RG_NAME="gb200-ccw-${LOCATION}-0${SPOKE_NUMBER}-rg" -SPOKE_DEPLOYMENT_NAME="spoke-ccw-0${SPOKE_NUMBER}" - -if az deployment sub show -g -n "${SPOKE_DEPLOYMENT_NAME}" > /dev/null 2>&1; then - RG_EXISTS=$(az group exists -n "$SPOKE_RG_NAME" | tr -d '\r\n') - if [ "$RG_EXISTS" = "false" ]; then - echo "Spoke #${SPOKE_NUMBER} already exists. Please set a new spoke number. Exiting." 
- exit 0 - fi -fi +# monitoring +MONITORING_INGESTION_ENDPOINT=$(jq -r '.properties.outputs.ingestionEndpoint.value' hub-monitoring-outputs.json) az deployment sub create \ --location "$LOCATION" \ @@ -68,5 +67,6 @@ az deployment sub create \ --parameters resourceGroup="${SPOKE_RG_NAME}" \ --parameters ccVMName="ccw-${SPOKE_NUMBER}-cyclecloud-vm" \ --parameters clusterName="ccw-${SPOKE_NUMBER}" \ + --parameters monitoringIngestionEndpoint="${MONITORING_INGESTION_ENDPOINT}" \ --name "spoke-ccw-0${SPOKE_DEPLOYMENT_NAME}" \ No newline at end of file diff --git a/bicep/mainTemplate.bicep b/bicep/mainTemplate.bicep index bb6b82c0..85cdc1f0 100644 --- a/bicep/mainTemplate.bicep +++ b/bicep/mainTemplate.bicep @@ -44,6 +44,8 @@ param monitoringProjectVersion string = '1.0.0' //Internal developer use only: set true use custom CycleCloud release build param manualInstall bool = false +param monitoringIngestionEndpoint string + resource ccwResourceGroup 'Microsoft.Resources/resourceGroups@2024-03-01' = { name: resourceGroup location: location @@ -85,6 +87,7 @@ module makeCCWresources 'ccw.bicep' = { branch: branch projectVersion: projectVersion monitoringProjectVersion: monitoringProjectVersion + monitoringIngestionEndpoint: monitoringIngestionEndpoint manualInstall: manualInstall acceptMarketplaceTerms: acceptMarketplaceTerms ood: ood From bd26bdf6cf5452961b01a8541490ca6bbb277a00 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Fri, 23 May 2025 16:38:26 -0400 Subject: [PATCH 04/50] Accept command line input in spoke deployment script --- bicep/hub/deploy_spoke.sh | 57 +++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 8 deletions(-) diff --git a/bicep/hub/deploy_spoke.sh b/bicep/hub/deploy_spoke.sh index e8ad85e0..6279ec43 100644 --- a/bicep/hub/deploy_spoke.sh +++ b/bicep/hub/deploy_spoke.sh @@ -2,11 +2,52 @@ cd "$(dirname "$0")" -RG="gb200-hub-westus2" -SUFFIX="-${RG}" -SPOKE_NUMBER="1" +# hub params +HUB_RG_NAME="" -LOCATION='westus2' +# spoke 
params +LOCATION="" +SPOKE_NUMBER="" + +# Parse arguments +while [ "$#" -gt 0 ]; do + case "$1" in + -rg|--hub-resource-group) + HUB_RG_NAME="$2" + shift 2 + ;; + -s|--spoke-number) + SPOKE_NUMBER="$2" + shift 2 + ;; + -l|--location) + LOCATION="$2" + shift 2 + ;; + -h|--help) + echo "Usage: $0 --hub-resource-group --location --spoke-number " + echo " or: $0 -rg -l -s " + exit 0 + ;; + *) + echo "Unknown parameter: $1" + echo "Use --help for usage information." + exit 1 + ;; + esac +done + +# Validate inputs +if [ -z "$RESOURCE_GROUP" ] || [ -z "$LOCATION" ] || [ -z "$SPOKE_NUMBER" ]; then + echo "Error: --resource-group, --location, --spoke-number are required." + echo "Use --help for usage information." + exit 1 +fi + +# hub +SUFFIX="-${HUB_RG_NAME}" + +# spoke SPOKE_RG_NAME="gb200-ccw-${LOCATION}-0${SPOKE_NUMBER}-rg" SPOKE_DEPLOYMENT_NAME="spoke-ccw-0${SPOKE_NUMBER}" @@ -19,9 +60,9 @@ if az deployment sub show -g -n "${SPOKE_DEPLOYMENT_NAME}" > /dev/null 2>&1; the fi fetch_outputs() { -az deployment group show -g "$RG" -n "hub-vnet${SUFFIX}" --query properties.outputs > hub-vnet-outputs.json -az deployment group show -g "$RG" -n "hub-anf-resources${SUFFIX}" --query properties.outputs > hub-anf-outputs.json -az deployment group show -g "$RG" -n "hub-db${SUFFIX}" --query properties.outputs > hub-db-outputs.json +az deployment group show -g "$HUB_RG_NAME" -n "hub-vnet${SUFFIX}" --query properties.outputs > hub-vnet-outputs.json +az deployment group show -g "$HUB_RG_NAME" -n "hub-anf-resources${SUFFIX}" --query properties.outputs > hub-anf-outputs.json +az deployment group show -g "$HUB_RG_NAME" -n "hub-db${SUFFIX}" --query properties.outputs > hub-db-outputs.json cp cyclecloud-monitoring/infra/outputs.json hub-monitoring-outputs.json } @@ -45,7 +86,7 @@ replace_fields ".network.value.addressSpace=\"$ADDRESS_SPACE\"" # vnet to peer PEERED_VNET_ID=$(jq -r '.vnetId.value' hub-vnet-outputs.json) -PEERED_VNET_NAME=$(echo $PEERED_VNET_ID | cut -d '/' -f9) 
+PEERED_VNET_NAME=$(echo "${PEERED_VNET_ID}" | cut -d '/' -f9) PEERED_VNET_LOCATION=$(az network vnet show -g "$RG" -n "$PEERED_VNET_NAME" --query location -o tsv | tr -d '\r\n') replace_fields ".network.value.vnetToPeer={ name: \"$PEERED_VNET_NAME\", id: \"$PEERED_VNET_ID\", location: \"$PEERED_VNET_LOCATION\", subscriptionName: \"\"}" From 71cc3981638cf5e1bf427ddd36156124a63fdce5 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Fri, 23 May 2025 16:49:30 -0400 Subject: [PATCH 05/50] Add RA to MI for monitoring in Bicep --- bicep/exports.bicep | 1 + bicep/mi.bicep | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/bicep/exports.bicep b/bicep/exports.bicep index 7aad9a61..2dbfef59 100644 --- a/bicep/exports.bicep +++ b/bicep/exports.bicep @@ -5,4 +5,5 @@ var role_lookup = { 'Storage Account Contributor': resourceId('microsoft.authorization/roleDefinitions', '17d1049b-9a84-46fb-8f53-869881c3d3ab') 'Storage Blob Data Contributor': resourceId('microsoft.authorization/roleDefinitions', 'ba92f5b4-2d11-453d-a403-e96b0029c9fe') 'Storage Blob Data Reader': resourceId('microsoft.authorization/roleDefinitions', '2a2b9908-6ea1-4ae2-8e65-a410df84e7d1') + 'Monitoring Metrics Publisher': resourceId('microsoft.authorization/roleDefinitions', '3913510d-42f4-4e42-8a64-420c390055eb') } diff --git a/bicep/mi.bicep b/bicep/mi.bicep index 36185743..a5a5e5b9 100644 --- a/bicep/mi.bicep +++ b/bicep/mi.bicep @@ -17,7 +17,10 @@ module ccwMIRoleAssignments './miRoleAssignments.bicep' = { name: 'ccwRoleForLockerManagedIdentity' params: { principalId: managedIdentity.properties.principalId - roles: ['Storage Blob Data Reader'] + roles: [ + 'Storage Blob Data Reader' + 'Monitoring Metrics Publisher' + ] storageAccountName: storageAccountName } } From b971d5c7da983965203b7b13132169c26ae56efa Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Fri, 23 May 2025 17:12:08 -0400 Subject: [PATCH 06/50] Ingest monitoring parameters in create_cc_param.py --- 
bicep/files-to-load/create_cc_param.py | 5 +++++ bicep/hub/deploy_spoke.sh | 5 +++-- bicep/mainTemplate.bicep | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/bicep/files-to-load/create_cc_param.py b/bicep/files-to-load/create_cc_param.py index a20e827a..439e9d1d 100644 --- a/bicep/files-to-load/create_cc_param.py +++ b/bicep/files-to-load/create_cc_param.py @@ -96,6 +96,11 @@ def set_slurm_params(params, dbPassword, outputs): params['AdditionalNFSMountOptions'] = outputs['filerInfoFinal']['value']['additional']['mountOptions'] params['AdditionalNFSAddress'] = outputs['filerInfoFinal']['value']['additional']['ipAddress'] + # Monitoring + params['MonitoringEnabled'] = outputs['monitoringIngestionEndpoint']['value'] != '' + params['MonitoringIngestionEndpoint'] = outputs['monitoringIngestionEndpoint']['value'] + params['MonitoringIdentityClientId'] = outputs['managedIdentityId']['value'] + def set_ood_params(params, outputs): slurm_params = get_json_dict('initial_params.json') diff --git a/bicep/hub/deploy_spoke.sh b/bicep/hub/deploy_spoke.sh index 6279ec43..51d943e6 100644 --- a/bicep/hub/deploy_spoke.sh +++ b/bicep/hub/deploy_spoke.sh @@ -63,7 +63,8 @@ fetch_outputs() { az deployment group show -g "$HUB_RG_NAME" -n "hub-vnet${SUFFIX}" --query properties.outputs > hub-vnet-outputs.json az deployment group show -g "$HUB_RG_NAME" -n "hub-anf-resources${SUFFIX}" --query properties.outputs > hub-anf-outputs.json az deployment group show -g "$HUB_RG_NAME" -n "hub-db${SUFFIX}" --query properties.outputs > hub-db-outputs.json -cp cyclecloud-monitoring/infra/outputs.json hub-monitoring-outputs.json +[ -f cyclecloud-monitoring/infra/outputs.json ] && cp cyclecloud-monitoring/infra/outputs.json hub-monitoring-outputs.json +# az deployment group show -g "$HUB_RG_NAME" -n ingestionEndpoint --query properties.outputs > hub-monitoring-outputs.json } fetch_outputs @@ -98,7 +99,7 @@ replace_fields ".databaseConfig={ value: { type: \"privateIp\", databaseUser: \" 
replace_fields ".databaseAdminPassword={ value: \"$DB_PASSWORD\" }" # monitoring -MONITORING_INGESTION_ENDPOINT=$(jq -r '.properties.outputs.ingestionEndpoint.value' hub-monitoring-outputs.json) +MONITORING_INGESTION_ENDPOINT=$([ -f hub-monitoring-outputs.json ] && jq -r '.properties.outputs.ingestionEndpoint.value' hub-monitoring-outputs.json || echo "") az deployment sub create \ --location "$LOCATION" \ diff --git a/bicep/mainTemplate.bicep b/bicep/mainTemplate.bicep index 85cdc1f0..9a239488 100644 --- a/bicep/mainTemplate.bicep +++ b/bicep/mainTemplate.bicep @@ -44,7 +44,7 @@ param monitoringProjectVersion string = '1.0.0' //Internal developer use only: set true use custom CycleCloud release build param manualInstall bool = false -param monitoringIngestionEndpoint string +param monitoringIngestionEndpoint string = '' resource ccwResourceGroup 'Microsoft.Resources/resourceGroups@2024-03-01' = { name: resourceGroup From 5887a1186332cf9c3302288e982c9d4501186c53 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Mon, 26 May 2025 11:43:36 -0400 Subject: [PATCH 07/50] Incident: passwords stored in git All resources removed that used this. They only were ever used in locked down environments accessible by a unique VPN. 
--- bicep/hub/db_params.json | 3 +-- bicep/hub/original_spoke_params.json | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/bicep/hub/db_params.json b/bicep/hub/db_params.json index acfd9bae..d13aeb5f 100644 --- a/bicep/hub/db_params.json +++ b/bicep/hub/db_params.json @@ -3,9 +3,8 @@ "value": "hpcadmin" }, "adminPassword": { - "value": "royGbiv!1" }, "tags": { "value": {} } -} \ No newline at end of file +} diff --git a/bicep/hub/original_spoke_params.json b/bicep/hub/original_spoke_params.json index 3b1033b3..194cfb98 100644 --- a/bicep/hub/original_spoke_params.json +++ b/bicep/hub/original_spoke_params.json @@ -3,10 +3,8 @@ "value": "hpcadmin" }, "adminPassword": { - "value": "royGbiv!1" }, "adminSshPublicKey": { - "value": "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCxuk0+FlxDyOsDtY/XTbLW8xcQFGsCuz+SayUUfRIqV1+QR9WNppujIwR5Sf30PmoJWGu0fNAz1Gd7rpGbNIfustZW7lTAWZkZb9UavswiRCRog1EdYPugEuqTPJtULFUdND4pX7G92DCPTtScPnrAKK6j9iVIDq81hiV2n/xtieidz3UhTXztjFpiNdbNGTWVV94IbqSB/PGwbrphs5ApkBxli8W2EIVhINUCcxXB7O256fMsTg//I/4+Vi5pTGcC+3XM8KGTfQhNopDRBFDT4C58aT4xGb63D2wQJaPDJXMqAns3H+1YRs4G/JWGe1S5FIYb1qScO5D1IGC3pDizteQ5bL5EbzMw4cgVtboq04bVA07L/9rsQqO50+IG+wA/iYUvKSEtpwQej++80Hsli0wJ+mb9cCc0VUa0TO1tS7jiQqKEonslDij2YMZEkXEj961DB1H4MlJeJT7cNCSeZ9Ms5iZiVnGz7gwA6zsWjQgWQcx0G70QEWGrHEjgNAs= redmond\\abatallas@LAPTOP-G2QDGDV1" }, "ccVMSize": { "value": "Standard_D4as_v5" @@ -45,7 +43,6 @@ } }, "databaseAdminPassword": { - "value": "royGbiv!1" }, "databaseConfig": { "value": { @@ -113,4 +110,4 @@ "tags": { "value": {} } -} \ No newline at end of file +} From fefd0bfff2c8e1b2e309ce44b6356cda44b377e3 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Mon, 26 May 2025 11:58:54 -0400 Subject: [PATCH 08/50] WIP: tweak spoke to use outputs/ dir --- bicep/hub/deploy_spoke.sh | 51 ++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/bicep/hub/deploy_spoke.sh b/bicep/hub/deploy_spoke.sh index 51d943e6..9cdfd523 100644 --- 
a/bicep/hub/deploy_spoke.sh +++ b/bicep/hub/deploy_spoke.sh @@ -60,25 +60,48 @@ if az deployment sub show -g -n "${SPOKE_DEPLOYMENT_NAME}" > /dev/null 2>&1; the fi fetch_outputs() { -az deployment group show -g "$HUB_RG_NAME" -n "hub-vnet${SUFFIX}" --query properties.outputs > hub-vnet-outputs.json -az deployment group show -g "$HUB_RG_NAME" -n "hub-anf-resources${SUFFIX}" --query properties.outputs > hub-anf-outputs.json -az deployment group show -g "$HUB_RG_NAME" -n "hub-db${SUFFIX}" --query properties.outputs > hub-db-outputs.json -[ -f cyclecloud-monitoring/infra/outputs.json ] && cp cyclecloud-monitoring/infra/outputs.json hub-monitoring-outputs.json -# az deployment group show -g "$HUB_RG_NAME" -n ingestionEndpoint --query properties.outputs > hub-monitoring-outputs.json + mkdir -p outputs + if [ -f outputs/hub-vnet-outputs.json ]; then + echo "outputs/hub-vnet-outputs.json already fetched. Skipping." + else + echo "Fetching outputs for hub vnet..." + az deployment group show -g "$HUB_RG_NAME" -n "hub-vnet${SUFFIX}" --query properties.outputs > outputs/hub-vnet-outputs.json + fi + + if [ -f outputs/hub-anf-outputs.json ]; then + echo "outputs/hub-anf-outputs.json already fetched. Skipping." + else + echo "Fetching outputs for hub ANF..." + az deployment group show -g "$HUB_RG_NAME" -n "hub-anf-resources${SUFFIX}" --query properties.outputs > outputs/hub-anf-outputs.json + fi + if [ -f outputs/hub-db-outputs.json ]; then + echo "outputs/hub-db-outputs.json already fetched. Skipping." + else + echo "Fetching outputs for hub MySQL database..." + az deployment group show -g "$HUB_RG_NAME" -n "hub-db${SUFFIX}" --query properties.outputs > outputs/hub-db-outputs.json + fi + if [ -f outputs/hub-monitoring-outputs.json ]; then + echo "outputs/hub-monitoring-outputs.json already fetched. Skipping." + else + echo "Fetching outputs for hub monitoring..." 
+ [ -f cyclecloud-monitoring/infra/outputs.json ] && cp cyclecloud-monitoring/infra/outputs.json outputs/hub-monitoring-outputs.json + # az deployment group show -g "$HUB_RG_NAME" -n ingestionEndpoint --query properties.outputs > outputs/hub-monitoring-outputs.json + fi + echo "Done fetching outputs." } fetch_outputs -cp original_spoke_params.json spoke_params.json +cp original_spoke_params.json params/spoke_params.json replace_fields() { - jq "$1" spoke_params.json > tmp_spoke_params.json && mv tmp_spoke_params.json spoke_params.json + jq "$1" params/spoke_params.json > tmp_spoke_params.json && mv tmp_spoke_params.json params/spoke_params.json } # shared FS -IP_ADDRESS=$(jq -r '.ipAddress.value' hub-anf-outputs.json) -EXPORT_PATH=$(jq -r '.exportPath.value' hub-anf-outputs.json) -MOUNT_OPTIONS=$(jq -r '.mountOptions.value' hub-anf-outputs.json) +IP_ADDRESS=$(jq -r '.ipAddress.value' outputs/hub-anf-outputs.json) +EXPORT_PATH=$(jq -r '.exportPath.value' outputs/hub-anf-outputs.json) +MOUNT_OPTIONS=$(jq -r '.mountOptions.value' outputs/hub-anf-outputs.json) replace_fields ".sharedFilesystem={ value: { type: \"nfs-existing\", ipAddress: \"$IP_ADDRESS\", exportPath: \"$EXPORT_PATH\", mountOptions: \"$MOUNT_OPTIONS\" } }" # new vnet @@ -86,20 +109,20 @@ ADDRESS_SPACE="10.${SPOKE_NUMBER}.0.0/24" replace_fields ".network.value.addressSpace=\"$ADDRESS_SPACE\"" # vnet to peer -PEERED_VNET_ID=$(jq -r '.vnetId.value' hub-vnet-outputs.json) +PEERED_VNET_ID=$(jq -r '.vnetId.value' outputs/hub-vnet-outputs.json) PEERED_VNET_NAME=$(echo "${PEERED_VNET_ID}" | cut -d '/' -f9) PEERED_VNET_LOCATION=$(az network vnet show -g "$RG" -n "$PEERED_VNET_NAME" --query location -o tsv | tr -d '\r\n') replace_fields ".network.value.vnetToPeer={ name: \"$PEERED_VNET_NAME\", id: \"$PEERED_VNET_ID\", location: \"$PEERED_VNET_LOCATION\", subscriptionName: \"\"}" # database config DB_IP="10.0.0.228" -DB_USERNAME=$(jq -r '.adminUser.value' db_params.json) -DB_PASSWORD=$(jq -r '.adminPassword.value' 
db_params.json) +DB_USERNAME=$(jq -r '.adminUser.value' params/db_params.json) +DB_PASSWORD=$(jq -r '.adminPassword.value' params/db_params.json) replace_fields ".databaseConfig={ value: { type: \"privateIp\", databaseUser: \"$DB_USERNAME\", privateIp: \"$DB_IP\" }}" replace_fields ".databaseAdminPassword={ value: \"$DB_PASSWORD\" }" # monitoring -MONITORING_INGESTION_ENDPOINT=$([ -f hub-monitoring-outputs.json ] && jq -r '.properties.outputs.ingestionEndpoint.value' hub-monitoring-outputs.json || echo "") +MONITORING_INGESTION_ENDPOINT=$([ -f outputs/hub-monitoring-outputs.json ] && jq -r '.properties.outputs.ingestionEndpoint.value' outputs/hub-monitoring-outputs.json || echo "") az deployment sub create \ --location "$LOCATION" \ From d41a7950ecb043fb4469ab8af2a29fc2f0b68fcb Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Tue, 27 May 2025 17:01:49 -0400 Subject: [PATCH 09/50] WIP add hub-mi bicep and creation script --- .gitignore | 1 + bicep/hub/create_hub_mi.sh | 9 ++++++ bicep/hub/hub-mi.bicep | 32 +++++++++++++++++++ .../hub/{ => params/template}/anf_params.json | 0 .../{ => params/template}/bastion_params.json | 0 .../hub/{ => params/template}/db_params.json | 0 .../template}/original_spoke_params.json | 2 ++ .../{ => params/template}/vnet_params.json | 0 8 files changed, 44 insertions(+) create mode 100644 bicep/hub/create_hub_mi.sh create mode 100644 bicep/hub/hub-mi.bicep rename bicep/hub/{ => params/template}/anf_params.json (100%) rename bicep/hub/{ => params/template}/bastion_params.json (100%) rename bicep/hub/{ => params/template}/db_params.json (100%) rename bicep/hub/{ => params/template}/original_spoke_params.json (98%) rename bicep/hub/{ => params/template}/vnet_params.json (100%) diff --git a/.gitignore b/.gitignore index 65cd7465..71a254a6 100644 --- a/.gitignore +++ b/.gitignore @@ -407,3 +407,4 @@ arm-ttk/ # delete_roles.sh files util/.role_assignment_cleanup* +bicep/hub/params/*.json diff --git a/bicep/hub/create_hub_mi.sh 
b/bicep/hub/create_hub_mi.sh new file mode 100644 index 00000000..6b0fb10e --- /dev/null +++ b/bicep/hub/create_hub_mi.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -e +RG=$1 +LOCATION=$2 + +az deployment group create \ + --name "$RG-hub-mi" \ + --resource-group "$RG" \ + --template-file ./hub-mi.bicep \ \ No newline at end of file diff --git a/bicep/hub/hub-mi.bicep b/bicep/hub/hub-mi.bicep new file mode 100644 index 00000000..58871217 --- /dev/null +++ b/bicep/hub/hub-mi.bicep @@ -0,0 +1,32 @@ +targetScope = 'resourceGroup' +import {tags_t} from '.././types.bicep' +import * as exports from './exports.bicep' + +param name string = '{resourceGroup().name}-mi' +param location string = resourceGroup().location +param tags tags_t = {} + +//create managed identity for VMSSs +resource managedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' = { + name: name + location: location + tags: tags +} + +var roles = [ + 'Storage Blob Data Reader' + 'Storage Blob Data Constributor' + 'Monitoring Metrics Publisher' + ] + +resource roleAssignments 'Microsoft.Authorization/roleAssignments@2022-04-01' = [ for role in roles: { + name: guid(subscription().id, principalId, exports.role_lookup[role]) + scope: storageAccount + properties: { + roleDefinitionId: exports.role_lookup[role] + + principalType: 'ResourceGroup' + } +}] + +output hubMI string = managedIdentity.id diff --git a/bicep/hub/anf_params.json b/bicep/hub/params/template/anf_params.json similarity index 100% rename from bicep/hub/anf_params.json rename to bicep/hub/params/template/anf_params.json diff --git a/bicep/hub/bastion_params.json b/bicep/hub/params/template/bastion_params.json similarity index 100% rename from bicep/hub/bastion_params.json rename to bicep/hub/params/template/bastion_params.json diff --git a/bicep/hub/db_params.json b/bicep/hub/params/template/db_params.json similarity index 100% rename from bicep/hub/db_params.json rename to bicep/hub/params/template/db_params.json diff --git 
a/bicep/hub/original_spoke_params.json b/bicep/hub/params/template/original_spoke_params.json similarity index 98% rename from bicep/hub/original_spoke_params.json rename to bicep/hub/params/template/original_spoke_params.json index 194cfb98..13816b24 100644 --- a/bicep/hub/original_spoke_params.json +++ b/bicep/hub/params/template/original_spoke_params.json @@ -3,6 +3,7 @@ "value": "hpcadmin" }, "adminPassword": { + "value": "" }, "adminSshPublicKey": { }, @@ -43,6 +44,7 @@ } }, "databaseAdminPassword": { + "value": "" }, "databaseConfig": { "value": { diff --git a/bicep/hub/vnet_params.json b/bicep/hub/params/template/vnet_params.json similarity index 100% rename from bicep/hub/vnet_params.json rename to bicep/hub/params/template/vnet_params.json From 0062bbf3eb69425ddc81cad1d81250b50ee47224 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Tue, 27 May 2025 17:02:10 -0400 Subject: [PATCH 10/50] WIP add custom slurm template --- bicep/files-to-load/slurm.txt | 1033 +++++++++++++++++++++++++++++++++ 1 file changed, 1033 insertions(+) create mode 100644 bicep/files-to-load/slurm.txt diff --git a/bicep/files-to-load/slurm.txt b/bicep/files-to-load/slurm.txt new file mode 100644 index 00000000..b670c7cd --- /dev/null +++ b/bicep/files-to-load/slurm.txt @@ -0,0 +1,1033 @@ + +################################ +## Cluster Configuration File ## +################################ + + +[cluster Slurm] +FormLayout = selectionpanel +Category = Schedulers + +Autoscale = $Autoscale + + [[node defaults]] + UsePublicNetwork = $UsePublicNetwork + Credentials = $Credentials + SubnetId = $SubnetId + Region = $Region + KeyPairLocation = ~/.ssh/cyclecloud.pem + Azure.Identities = $ManagedIdentity + Tags = $NodeTags + + # Slurm autoscaling supports both Terminate and Deallocate shutdown policies + ShutdownPolicy = $configuration_slurm_shutdown_policy + + # Lustre mounts require termination notifications to unmount + EnableTerminateNotification = ${NFSType == "lustre" || NFSSchedType == 
"lustre" || AdditionalNFSType == "lustre" || EnableTerminateNotification} + TerminateNotificationTimeout = 10m + + [[[configuration]]] + + slurm.version = $configuration_slurm_version + slurm.user.uid = 11100 + slurm.user.gid = 11100 + munge.user.uid = 11101 + munge.user.gid = 11101 + slurm.accounting.enabled = $configuration_slurm_accounting_enabled + slurm.accounting.url = $configuration_slurm_accounting_url + slurm.accounting.user = $configuration_slurm_accounting_user + slurm.accounting.password = $configuration_slurm_accounting_password + slurm.accounting.certificate_url = $configuration_slurm_accounting_certificate_url + slurm.accounting.storageloc = $configuration_slurm_accounting_storageloc + slurm.additional.config = $additional_slurm_config + slurm.ha_enabled = $configuration_slurm_ha_enabled + slurm.launch_parameters = $configuration_slurm_launch_parameters + + # Disable ip-XXXXXXXX hostname generation + cyclecloud.hosts.standalone_dns.enabled = ${NodeNameIsHostname==false} + cyclecloud.hosts.simple_vpc_dns.enabled = ${NodeNameIsHostname==false} + + # For fast spin-up after Deallocate, force an immediate re-converge on boot + cyclecloud.converge_on_boot = true + + # Disable normal NFS exports and mounts + cyclecloud.mounts.sched.disabled = true + cyclecloud.mounts.shared.disabled = true + cyclecloud.exports.sched.disabled = true + cyclecloud.exports.shared.disabled = true + cyclecloud.exports.sched.samba.enabled = false + cyclecloud.exports.shared.samba.enabled = false + cyclecloud.exports.defaults.samba.enabled = false + cshared.server.legacy_links_disabled = true + + # May be used to identify the ID in cluster-init scripts + cluster.identities.default = $ManagedIdentity + + monitoring.ingestion_endpoint = $MonitoringIngestionEndpoint + monitoring.identity_client_id = $MonitoringIdentityClientId + monitoring.enabled = $MonitoringEnabled + cyclecloud.enable_chef = false + + [[[cluster-init slurm:default:4.0.0]]] + Optional = true + + [[[volume boot]]] + 
Size = ${ifThenElse(BootDiskSize > 0, BootDiskSize, undefined)} + SSD = True + + [[[configuration cyclecloud.mounts.nfs_shared]]] + type = $NFSType + mountpoint = /shared + export_path = ${ifThenElse(NFSType == "lustre", strcat("tcp:/lustrefs", NFSSharedExportPath), NFSSharedExportPath)} + address = $NFSAddress + options = $NFSSharedMountOptions + + [[[configuration cyclecloud.mounts.nfs_sched]]] + type = $NFSSchedType + mountpoint = /sched + export_path = ${ifThenElse(NFSSchedType == "lustre", strcat("tcp:/lustrefs", NFSSchedExportPath), NFSSchedExportPath)} + address = ${ifThenElse(UseBuiltinSched && !configuration_slurm_ha_enabled, undefined, NFSSchedAddress)} + options = $NFSSchedMountOptions + + [[[configuration cyclecloud.mounts.additional_nfs]]] + disabled = ${AdditionalNFS isnt true} + type = $AdditionalNFSType + address = $AdditionalNFSAddress + mountpoint = $AdditionalNFSMountPoint + export_path = ${ifThenElse(AdditionalNFSType == "lustre", strcat("tcp:/lustrefs", AdditionalNFSExportPath), AdditionalNFSExportPath)} + options = $AdditionalNFSMountOptions + + [[node scheduler]] + MachineType = $SchedulerMachineType + ImageName = $SchedulerImageName + IsReturnProxy = $ReturnProxy + AdditionalClusterInitSpecs = $SchedulerClusterInitSpecs + ComputerName = ${toLower(regexps("([^a-zA-Z0-9-])", ifThenElse(SchedulerHostName=="Cluster Prefix", StrJoin("-", ClusterName, "scheduler"), ifThenElse(Size(Trim(SchedulerHostName)) == 0 || SchedulerHostName == "Generated", undefined, SchedulerHostName)), "-"))} + # indented version, for clarity. 
+ # ${toLower( + # regexps("([^a-zA-Z0-9-])", + # ifThenElse(SchedulerHostName=="Cluster Prefix", + # StrJoin("-", ClusterName, "scheduler"), + # ifThenElse(Size(Trim(SchedulerHostName)) == 0 || SchedulerHostName == "Generated", + # undefined, + # SchedulerHostName)), + # "-"))} + Zone = ${ifThenElse(configuration_slurm_ha_enabled, SchedulerZone, undefined)} + + [[[configuration]]] + slurm.role = scheduler + # Disable NFS mount of built-in /sched since it is a local volume mount: cyclecloud.mounts.builtinsched + cyclecloud.mounts.nfs_sched.disabled = ${UseBuiltinSched && !configuration_slurm_ha_enabled} + cyclecloud.mounts.nfs_shared.disabled = ${UseBuiltinShared && !configuration_slurm_ha_enabled} + slurm.secondary_scheduler_name = ${ifThenElse(configuration_slurm_ha_enabled, "scheduler-ha-1", undefined)} + + + [[[cluster-init slurm:scheduler:4.0.0]]] + + [[[network-interface eth0]]] + AssociatePublicIpAddress = $UsePublicNetwork + + [[[volume sched]]] + Size = $SchedFilesystemSize + SSD = True + Mount = builtinsched + Persistent = True + Disabled = ${!UseBuiltinSched || configuration_slurm_ha_enabled} + + [[[volume shared]]] + Size = $FilesystemSize + SSD = True + Mount = builtinshared + Persistent = True + Disabled = ${!UseBuiltinShared || configuration_slurm_ha_enabled} + + [[[configuration cyclecloud.mounts.builtinsched]]] + disabled = ${!UseBuiltinSched || configuration_slurm_ha_enabled} + mountpoint = /sched + fs_type = xfs + + [[[configuration cyclecloud.mounts.builtinshared]]] + disabled = ${!UseBuiltinShared || configuration_slurm_ha_enabled} + mountpoint = /shared + fs_type = xfs + + [[[configuration cyclecloud.exports.builtinsched]]] + disabled = ${!UseBuiltinSched || configuration_slurm_ha_enabled} + export_path = /sched + options = no_root_squash + samba.enabled = false + type = nfs + + [[[configuration cyclecloud.exports.builtinshared]]] + disabled = ${!UseBuiltinShared || configuration_slurm_ha_enabled} + export_path = /shared + samba.enabled = 
false + type = nfs + + [[nodearray scheduler-ha]] + Extends = scheduler + IsReturnProxy = false + InitialCount = $configuration_slurm_ha_enabled + Zone = $SchedulerHAZone + # Do not inherit property from node that is not used in nodearray + # The equivalent is ComputerNamePrefix for nodearray, however Cluster-init will handle renaming of all hosts in a VMSS + ComputerName := undefined + [[[configuration]]] + autoscale.enabled = false + slurm.node_prefix = ${ifThenElse(NodeNamePrefix=="Cluster Prefix", StrJoin("-", ClusterName, ""), NodeNamePrefix)} + slurm.use_nodename_as_hostname = $NodeNameIsHostname + slurm.is_primary_scheduler = false + + [[nodearray login]] + InitialCount = $NumberLoginNodes + MachineType = $loginMachineType + ImageName = $LoginImageName + AdditionalClusterInitSpecs = $LoginClusterInitSpecs + + [[[cluster-init slurm:login:4.0.0]]] + [[[configuration]]] + slurm.role = login + autoscale.enabled = false + slurm.node_prefix = ${ifThenElse(NodeNamePrefix=="Cluster Prefix", StrJoin("-", ClusterName, ""), NodeNamePrefix)} + slurm.use_nodename_as_hostname = $NodeNameIsHostname + + [[node nodearraybase]] + Abstract = true + [[[configuration]]] + slurm.role = execute + slurm.autoscale = true + + slurm.node_prefix = ${ifThenElse(NodeNamePrefix=="Cluster Prefix", StrJoin("-", ClusterName, ""), NodeNamePrefix)} + slurm.use_nodename_as_hostname = $NodeNameIsHostname + + [[[cluster-init slurm:execute:4.0.0]]] + + [[[network-interface eth0]]] + AssociatePublicIpAddress = $ExecuteNodesPublic + + [[nodearray hpc]] + CloudInit="""#!/bin/bash + echo DSHELL=/bin/bash >> /etc/adduser.conf + """ + Extends = nodearraybase + MachineType = $HPCMachineType + ImageName = $HPCImageName + MaxCount = $MaxHPCExecuteNodeCount + Azure.MaxScalesetSize = $HPCMaxScalesetSize + AdditionalClusterInitSpecs = $HPCClusterInitSpecs + EnableNodeHealthChecks = $EnableNodeHealthChecks + + + [[[configuration]]] + slurm.default_partition = true + slurm.hpc = true + slurm.partition = hpc + + 
[[nodearray htc]] + Extends = nodearraybase + MachineType = $HTCMachineType + ImageName = $HTCImageName + MaxCount = $MaxHTCExecuteNodeCount + + Interruptible = $HTCUseLowPrio + MaxPrice = $HTCSpotMaxPrice + AdditionalClusterInitSpecs = $HTCClusterInitSpecs + + [[[configuration]]] + slurm.hpc = false + slurm.partition = htc + # set pcpu = false for all hyperthreaded VMs + slurm.use_pcpu = false + + [[nodearray htc2]] + Extends = nodearraybase + MachineType = $HTC2MachineType + ImageName = $HTC2ImageName + MaxCount = $MaxHTC2ExecuteNodeCount + + Interruptible = $HTC2UseLowPrio + MaxPrice = $HTC2SpotMaxPrice + AdditionalClusterInitSpecs = $HTC2ClusterInitSpecs + + [[[configuration]]] + slurm.hpc = false + slurm.partition = htc2 + # set pcpu = false for all hyperthreaded VMs + slurm.use_pcpu = false + + [[nodearray gpu]] + Extends = nodearraybase + MachineType = $GPUMachineType + ImageName = $GPUImageName + MaxCount = $MaxGPUExecuteNodeCount + Azure.MaxScalesetSize = $HPCMaxScalesetSize + EnableNodeHealthChecks = $EnableNodeHealthChecks + + Interruptible = $GPUUseLowPrio + MaxPrice = $GPUSpotMaxPrice + AdditionalClusterInitSpecs = $GPUClusterInitSpecs + + [[[configuration]]] + slurm.default_partition = true + slurm.hpc = true + slurm.partition = gpu + #Parameter to enable or disable IMEX service on a per-job basis + #IMEX Support is enabled by default for GB200 but can be disabled by setting param to False + #slurm.imex.enabled=True + + [[nodearray dynamic]] + Extends = nodearraybase + MachineType = $DynamicMachineType + ImageName = $DynamicImageName + MaxCoreCount = $MaxDynamicExecuteCoreCount + + Interruptible = $DynamicUseLowPrio + MaxPrice = $DynamicSpotMaxPrice + AdditionalClusterInitSpecs = $DynamicClusterInitSpecs + [[[configuration]]] + slurm.hpc = false + # Slurm only allows a single feature to be defined in a Nodeset. If multiple features are defined here, only first value will be used for the nodeset. 
+ slurm.dynamic_feature := "dyn" + # If this option is used, slurmd is started with this configuration for dynamic nodes. + #slurm.dynamic_config := "-Z --conf \"Feature=dyn\"" + # set pcpu = false for all hyperthreaded VMs + slurm.use_pcpu = false + slurm.autoscale = $EnableDynamicPartition + +[parameters About] +Order = 1 + + [[parameters About Slurm]] + + [[[parameter slurm]]] + HideLabel = true + Config.Plugin = pico.widget.HtmlTemplateWidget + Config.Template = '''
Slurm icon

Follow the instructions in the README for details on extending and configuring the Project for your environment.


Slurm is the most widely used workload manager in HPC, as the scheduler of choice for six of the top ten systems in the TOP500 and with market penetration of more than 70%. Slurm is an advanced, open-source scheduler designed to satisfy the demanding needs of high-performance computing (HPC), high-throughput computing (HTC), and artificial intelligence (AI).

Commercial Support provided by SchedMD

Get more from your HPC investment! SchedMD, the company behind Slurm development, can answer your Slurm questions and explain our options for consultation, training, support, and migration.

Contact SchedMD

View more details about Slurm?

Slurm at a glance

Slurm provides massive scalability and can easily manage performance requirements for small cluster, large cluster, and supercomputer needs. Slurm outperforms competitive schedulers with compute rates at:

  • 100K+ nodes/GPU
  • 17M+ jobs per day
  • 120M+ jobs per week

Slurm’s plug-in based architecture enables optimization and control in scheduling operations to meet organizational priorities. With first class resource management for GPUs, Slurm allows users to request GPU resources alongside CPUs. This flexibility ensures that jobs are executed quickly and efficiently, while maximizing resource utilization.


Other Slurm features include:

  • NVIDIA and AMD GPU support for AI, LLM, and ML environments
  • Advanced scheduling policies
  • Unique HPC, HTC, AI/ML workload expertise
  • Cloud bursting capabilities
  • Power saving capabilities, accounting, and reporting
  • Provided REST API daemon
  • Native support of containers
  • Tailored Slurm consulting and training available through SchedMD
''' + +[parameters Required Settings] +Order = 10 + + + [[parameters Virtual Machines]] + Description = "The cluster, in this case, has two roles: the scheduler node with shared filer and the execute hosts. Configure which VM types to use based on the requirements of your application." + Order = 20 + + [[[parameter Region]]] + Label = Region + Description = Deployment Location + ParameterType = Cloud.Region + + [[[parameter SchedulerMachineType]]] + Label = Scheduler VM Type + Description = The VM type for scheduler node + ParameterType = Cloud.MachineType + DefaultValue = Standard_D4ads_v5 + + [[[parameter loginMachineType]]] + Label = Login node VM Type + Description = The VM type for login nodes. + ParameterType = Cloud.MachineType + DefaultValue = Standard_D8as_v4 + + [[[parameter HPCMachineType]]] + Label = HPC VM Type + Description = The VM type for HPC execute nodes + ParameterType = Cloud.MachineType + DefaultValue = Standard_F2s_v2 + + [[[parameter HTCMachineType]]] + Label = HTC VM Type + Description = The VM type for HTC execute nodes + ParameterType = Cloud.MachineType + DefaultValue = Standard_F2s_v2 + + [[[parameter HTC2MachineType]]] + Label = HTC2 VM Type + Description = The VM type for HTC2 execute nodes + ParameterType = Cloud.MachineType + DefaultValue = Standard_F2s_v2 + + [[[parameter GPUMachineType]]] + Label = GPU VM Type + Description = The VM type for HPC execute nodes + ParameterType = Cloud.MachineType + DefaultValue = Standard_NC24rs_v3 + + [[[parameter DynamicMachineType]]] + Label = Dyn VM Type + Description = The VM type for Dynamic execute nodes + ParameterType = Cloud.MachineType + DefaultValue = Standard_F2s_v2 + Config.MultiSelect = true + + [[parameters Auto-Scaling]] + Description = "The cluster can autoscale to the workload, adding execute hosts as jobs are queued. To enable this check the box below and choose the initial and maximum core counts for the cluster." 
+ Order = 30 + + [[[parameter Autoscale]]] + Label = Autoscale + DefaultValue = true + Widget.Plugin = pico.form.BooleanCheckBox + Widget.Label = Start and stop execute instances automatically + + [[[parameter MaxHTCExecuteNodeCount]]] + Label = Max HTC Nodes + Description = The total number of HTC execute nodes to start + DefaultValue = 100 + Config.Plugin = pico.form.NumberTextBox + Config.MinValue = 0 + Config.IntegerOnly = true + + [[[parameter Max2HTCExecuteNodeCount]]] + Label = Max HTC2 Nodes + Description = The total number of HTC2 execute nodes to start + DefaultValue = 100 + Config.Plugin = pico.form.NumberTextBox + Config.MinValue = 0 + Config.IntegerOnly = true + + [[[parameter MaxHPCExecuteNodeCount]]] + Label = Max HPC Nodes + Description = The total number of HPC execute nodes to start + DefaultValue = 16 + Config.Plugin = pico.form.NumberTextBox + Config.MinValue = 0 + Config.IntegerOnly = true + + [[[parameter MaxGPUExecuteNodeCount]]] + Label = Max GPU Nodes + Description = The total number of GPU execute nodes to start + DefaultValue = 8 + Config.Plugin = pico.form.NumberTextBox + Config.MinValue = 0 + Config.IntegerOnly = true + + [[[parameter MaxDynamicExecuteCoreCount]]] + Label = Max Dyn Cores + Description = The total number of Dynamic execute cores to start + DefaultValue = 100 + Config.Plugin = pico.form.NumberTextBox + Config.MinValue = 1 + Config.IntegerOnly = true + + [[[parameter HPCMaxScalesetSize]]] + Label = Max VMs per VMSS + Description = The maximum number of VMs created per VM Scaleset e.g. switch in Slurm. 
+ DefaultValue = 100 + Config.Plugin = pico.form.NumberTextBox + Config.MinValue = 1 + Config.IntegerOnly = true + + + [[[parameter HTCUseLowPrio]]] + Label = HTC Spot + DefaultValue = false + Widget.Plugin = pico.form.BooleanCheckBox + Widget.Label = Use Spot VMs for HTC execute hosts + + [[[parameter HTC2UseLowPrio]]] + Label = HTC2 Spot + DefaultValue = false + Widget.Plugin = pico.form.BooleanCheckBox + Widget.Label = Use Spot VMs for HTC execute hosts + + [[[parameter HTCSpotMaxPrice]]] + Label = Max Price HTC + DefaultValue = -1 + Description = Max price for Spot VMs in USD (value of -1 will not evict based on price) + Config.Plugin = pico.form.NumberTextBox + Conditions.Excluded := HTCUseLowPrio isnt true + Config.MinValue = -1 + + [[[parameter HTC2SpotMaxPrice]]] + Label = Max Price HTC2 + DefaultValue = -1 + Description = Max price for Spot VMs in USD (value of -1 will not evict based on price) + Config.Plugin = pico.form.NumberTextBox + Conditions.Excluded := HTC2UseLowPrio isnt true + Config.MinValue = -1 + + + [[[parameter GPUUseLowPrio]]] + Label = GPU Spot + DefaultValue = false + Widget.Plugin = pico.form.BooleanCheckBox + Widget.Label = Use Spot VMs for GPU execute hosts + + [[[parameter GPUSpotMaxPrice]]] + Label = Max Price GPU + DefaultValue = -1 + Description = Max price for Spot VMs in USD (value of -1 will not evict based on price) + Config.Plugin = pico.form.NumberTextBox + Conditions.Excluded := GPUUseLowPrio isnt true + Config.MinValue = -1 + + [[[parameter DynamicUseLowPrio]]] + Label = DynSpot + DefaultValue = false + Widget.Plugin = pico.form.BooleanCheckBox + Widget.Label = Use Spot VMs for Dynamic execute hosts + + [[[parameter DynamicSpotMaxPrice]]] + Label = Max Price Dyn + DefaultValue = -1 + Description = Max price for Spot VMs in USD (value of -1 will not evict based on price) + Config.Plugin = pico.form.NumberTextBox + Conditions.Excluded := DynamicUseLowPrio isnt true + Config.MinValue = -1 + + [[[parameter NumberLoginNodes]]] + 
Label = Num Login Nodes + DefaultValue = 0 + Description = Number of optional login nodes to create. + Config.Plugin = pico.form.NumberTextBox + Config.MinValue = 0 + Config.MaxValue = 10000 + Config.IntegerOnly = true + + [[parameters Networking]] + Order = 40 + + [[[parameter SubnetId]]] + Label = Subnet ID + Description = Subnet Resource Path (ResourceGroup/VirtualNetwork/Subnet) + ParameterType = Azure.Subnet + Required = True + + [[parameters High Availability]] + Order = 50 + Description = "Slurm can be setup in HA mode - where slurmctld is running on two nodes with failover. Note that checking this box will require an external NFS, so any reference to the 'builtin' NFS will be hidden." + [[[parameter configuration_slurm_ha_enabled]]] + Label = Slurm HA Node + Description = Deploy with an additional HA node + DefaultValue = false + ParameterType = Boolean + + +[parameters Network Attached Storage] +Order = 15 + + [[parameters Shared Storage]] + Order = 10 + + [[[parameter About Shared Storage]]] + HideLabel = true + Config.Plugin = pico.widget.HtmlTemplateWidget + Config.Template = '''

The directories /sched and /shared are network attached mounts and exist on all nodes of the cluster.
+
+ Options for providing these mounts:
+ [Builtin]: The scheduler node is an NFS server that provides the mountpoint to the other nodes of the cluster (not supported for HA configurations).
+ [External NFS]: Network-attached storage such as Azure NetApp Files, HPC Cache, or another VM running an NFS server provides the mountpoint.
+ [Azure Managed Lustre]: An Azure Managed Lustre deployment provides the mountpoint.
+

+

+ Note: the cluster must be terminated for changes to filesystem mounts to take effect. +

''' + Conditions.Hidden := false + + [[parameters Scheduler Mount]] + Order = 20 + Label = File-system Mount for /sched + + [[[parameter About sched]]] + HideLabel = true + Config.Plugin = pico.widget.HtmlTemplateWidget + Config.Template = '''

Slurm's configuration is linked in from the /sched directory. It is managed by the scheduler node.

''' + Order = 6 + + [[[parameter About sched part 2]]] + HideLabel = true + Config.Plugin = pico.widget.HtmlTemplateWidget + Config.Template = '''

Uncheck the box below to disable the built-in NFS export of the /sched directory and use an external file-system.

''' + Order = 7 + Conditions.Hidden := configuration_slurm_ha_enabled + + [[[parameter UseBuiltinSched]]] + Label = Use Builtin NFS + Description = Use the builtin NFS for /sched + DefaultValue = true + ParameterType = Boolean + Conditions.Hidden := configuration_slurm_ha_enabled + Disabled = configuration_slurm_ha_enabled + + [[[parameter NFSSchedDiskWarning]]] + HideLabel = true + Config.Plugin = pico.widget.HtmlTemplateWidget + Config.Template := "

Warning: switching an active cluster over to NFS or Lustre from Builtin will delete the shared disk.

" + Conditions.Hidden := UseBuiltinSched || configuration_slurm_ha_enabled + + [[[parameter NFSSchedType]]] + Label = FS Type + ParameterType = StringList + Config.Label = Type of shared filesystem to use for this cluster + Config.Plugin = pico.form.Dropdown + Config.Entries := {[Label="External NFS"; Value="nfs"], [Label="Azure Managed Lustre"; Value="lustre"]} + DefaultValue = nfs + Conditions.Hidden := UseBuiltinSched && !configuration_slurm_ha_enabled + + [[[parameter NFSSchedAddress]]] + Label = IP Address + Description = The IP address or hostname of the NFS server or Lustre FS. Also accepts a list comma-separated addresses, for example, to mount a frontend load-balanced Azure HPC Cache. + Config.ParameterType = String + Conditions.Hidden := UseBuiltinSched && !configuration_slurm_ha_enabled + + [[[parameter NFSSchedExportPath]]] + Label = Export Path + Description = The path exported by the file system + DefaultValue = /sched + Conditions.Hidden := UseBuiltinSched && !configuration_slurm_ha_enabled + + [[[parameter NFSSchedMountOptions]]] + Label = Mount Options + Description = NFS Client Mount Options + Conditions.Hidden := UseBuiltinSched && !configuration_slurm_ha_enabled + + + [[[parameter SchedFilesystemSize]]] + Label = Size (GB) + Description = The filesystem size (cannot be changed after initial start) + DefaultValue = 30 + Config.Plugin = pico.form.NumberTextBox + Config.MinValue = 10 + Config.MaxValue = 10240 + Config.IntegerOnly = true + Conditions.Excluded := !UseBuiltinSched || configuration_slurm_ha_enabled + + + + [[parameters Default NFS Share]] + Order = 30 + Label = File-system Mount for /shared + + [[[parameter About shared]]] + HideLabel = true + Config.Plugin = pico.widget.HtmlTemplateWidget + Config.Template = '''

Users' home directories reside within the /shared mountpoint with the base homedir /shared/home.

''' + Order = 6 + + [[[parameter About shared part 2]]] + HideLabel = true + Config.Plugin = pico.widget.HtmlTemplateWidget + Config.Template = '''

Uncheck the box below to disable the built-in NFS export of the /shared directory and use an external file-system.

''' + Order = 7 + Conditions.Hidden := configuration_slurm_ha_enabled + + [[[parameter UseBuiltinShared]]] + Label = Use Builtin NFS + Description = Use the builtin NFS for /shared + DefaultValue = true + ParameterType = Boolean + Conditions.Hidden := configuration_slurm_ha_enabled + Disabled = configuration_slurm_ha_enabled + + [[[parameter NFSDiskWarning]]] + HideLabel = true + Config.Plugin = pico.widget.HtmlTemplateWidget + Config.Template := "

Warning: switching an active cluster over to NFS or Lustre from Builtin will delete the shared disk.

" + Conditions.Hidden := UseBuiltinShared || configuration_slurm_ha_enabled + + [[[parameter NFSType]]] + Label = FS Type + ParameterType = StringList + Config.Label = Type of shared filesystem to use for this cluster + Config.Plugin = pico.form.Dropdown + Config.Entries := {[Label="External NFS"; Value="nfs"], [Label="Azure Managed Lustre"; Value="lustre"]} + DefaultValue = nfs + Conditions.Hidden := UseBuiltinShared && !configuration_slurm_ha_enabled + + [[[parameter NFSAddress]]] + Label = IP Address + Description = The IP address or hostname of the NFS server or Lustre FS. Also accepts a list comma-separated addresses, for example, to mount a frontend load-balanced Azure HPC Cache. + Config.ParameterType = String + Conditions.Hidden := UseBuiltinShared && !configuration_slurm_ha_enabled + + [[[parameter NFSSharedExportPath]]] + Label = Export Path + Description = The path exported by the file system + DefaultValue = /shared + Conditions.Hidden := UseBuiltinShared && !configuration_slurm_ha_enabled + + [[[parameter NFSSharedMountOptions]]] + Label = Mount Options + Description = NFS Client Mount Options + Conditions.Hidden := UseBuiltinShared && !configuration_slurm_ha_enabled + + + [[[parameter FilesystemSize]]] + Label = Size (GB) + Description = The filesystem size (cannot be changed after initial start) + DefaultValue = 100 + Config.Plugin = pico.form.NumberTextBox + Config.MinValue = 10 + Config.MaxValue = 10240 + Config.IntegerOnly = true + Conditions.Excluded := !UseBuiltinShared || configuration_slurm_ha_enabled + + [[parameters Additional NFS Mount]] + Order = 40 + Label = Additional Filesystem Mount + [[[parameter Additional Shared FS Mount Readme]]] + HideLabel = true + Config.Plugin = pico.widget.HtmlTemplateWidget + Config.Template := "

Mount another shared file-system endpoint on the cluster nodes.

" + Order = 20 + + [[[parameter AdditionalNFS]]] + HideLabel = true + DefaultValue = false + Widget.Plugin = pico.form.BooleanCheckBox + Widget.Label = Add Shared Filesystem mount + + [[[parameter AdditionalNFSType]]] + Label = FS Type + ParameterType = StringList + Config.Label = Shared filesystem type of the additional mount + Config.Plugin = pico.form.Dropdown + Config.Entries := {[Label="External NFS"; Value="nfs"], [Label="Azure Managed Lustre"; Value="lustre"]} + DefaultValue = nfs + Conditions.Excluded := AdditionalNFS isnt true + + [[[parameter AdditionalNFSAddress]]] + Label = IP Address + Description = The IP address or hostname of the additional mount. Also accepts a list comma-separated addresses, for example, to mount a frontend load-balanced Azure HPC Cache. + Config.ParameterType = String + Conditions.Excluded := AdditionalNFS isnt true + + [[[parameter AdditionalNFSMountPoint]]] + Label = Mount Point + Description = The path at which to mount the Filesystem + DefaultValue = /data + Conditions.Excluded := AdditionalNFS isnt true + + [[[parameter AdditionalNFSExportPath]]] + Label = Export Path + Description = The path exported by the file system + DefaultValue = /data + Conditions.Excluded := AdditionalNFS isnt true + + [[[parameter AdditionalNFSMountOptions]]] + Label = Mount Options + Description = Filesystem Client Mount Options + Conditions.Excluded := AdditionalNFS isnt true + + +[parameters Advanced Settings] +Order = 20 + + [[parameters Azure Settings]] + Order = 10 + + [[[parameter Credentials]]] + Description = The credentials for the cloud provider + ParameterType = Cloud.Credentials + + [[[parameter ManagedIdentity]]] + Label = Managed Id + Description = Optionally assign an Azure user assigned managed identity to all nodes to access Azure resources using assigned roles. 
+ ParameterType = Azure.ManagedIdentity + DefaultValue = =undefined + + [[[parameter BootDiskSize]]] + Description = Optional: Size of the OS/boot disk in GB for all nodes in the cluster (leave at 0 to use Image size) + ParameterType = Integer + Config.Plugin = pico.form.NumberTextBox + Config.MinValue = 0 + Config.MaxValue = 32,000 + Config.IntegerOnly = true + Config.Increment = 64 + DefaultValue = 0 + + [[[parameter EnableDynamicPartition]]] + Label = Enable Dynamic Partition + DefaultValue = true + Widget.Plugin = pico.form.BooleanCheckBox + Widget.Label = Create a partition for the dynamic nodearray + + [[[parameter NodeTags]]] + Label = VM Tags + Description = Tags applied to all nodes + ParameterType = Record + DefaultValue := [] + Config.MultiSelect = false + + [[parameters Slurm Settings ]] + + Order = 5 + + [[[parameter slurm_version_warning]]] + HideLabel = true + Config.Plugin = pico.widget.HtmlTemplateWidget + Config.Template := "
Note: For SLES HPC, we can only install the version supported by SLES HPC's zypper repos. At the time of this release, that is 23.02.7.
" + + + [[[parameter configuration_slurm_version]]] + Required = True + Label = Slurm Version + Description = Version of Slurm to install on the cluster + ParameterType = StringList + Config.Plugin = pico.form.Dropdown + Config.FreeForm = true + Config.Entries := {[Value="24.05.6-1"], [Value="24.11.3-1"]} + DefaultValue = 24.11.3-1 + + [[[parameter configuration_slurm_accounting_enabled]]] + Label = Job Accounting + DefaultValue = false + Widget.Plugin = pico.form.BooleanCheckBox + Widget.Label = Configure Slurm job accounting + + [[[parameter slurm_database_warning]]] + HideLabel = true + Config.Plugin = pico.widget.HtmlTemplateWidget + Conditions.Excluded := configuration_slurm_accounting_enabled isnt true + Config.Template := "
Note: Checking this box will create persistent databases and tables in the SQL database provided. Deleting this cluster will not automatically delete those databases. Users are responsible for periodically purging/archiving their Slurm databases to control costs.
" + + [[[parameter configuration_slurm_accounting_url]]] + Label = Slurm DBD URL + Description = URL of the database to use for Slurm job accounting + Conditions.Excluded := configuration_slurm_accounting_enabled isnt true + + [[[parameter configuration_slurm_accounting_storageloc]]] + Label = Database name + Description = Database name to store slurm accounting records + Conditions.Excluded := configuration_slurm_accounting_enabled isnt true + + [[[parameter configuration_slurm_accounting_user]]] + Label = Slurm DBD User + Description = User for Slurm DBD admin + Conditions.Excluded := configuration_slurm_accounting_enabled isnt true + + [[[parameter configuration_slurm_accounting_password]]] + Label = Slurm DBD Password + Description = Password for Slurm DBD admin + ParameterType = Password + Conditions.Excluded := configuration_slurm_accounting_enabled isnt true + + [[[parameter configuration_slurm_accounting_certificate_url]]] + Label = SSL Certificate URL + Description = URL to fetch SSL certificate for authentication to DB. Use AzureCA.pem (embedded) for use with deprecated MariaDB instances. + Conditions.Excluded := configuration_slurm_accounting_enabled isnt true + ParameterType = StringList + Config.Plugin = pico.form.Dropdown + Config.FreeForm = true + Config.Entries := {[Value=""], [Value="AzureCA.pem"]} + DefaultValue = "" + + [[[parameter configuration_slurm_shutdown_policy]]] + Label = Shutdown Policy + description = By default, autostop will Delete stopped VMS for lowest cost. Optionally, Stop/Deallocate the VMs for faster restart instead. 
+ DefaultValue = Terminate + config.plugin = pico.control.AutoCompleteDropdown + [[[[list Config.Entries]]]] + Name = Terminate + Label = Terminate + [[[[list Config.Entries]]]] + Name = Deallocate + Label = Deallocate + + [[[parameter EnableTerminateNotification]]] + Label = Enable Termination notifications + DefaultValue = False + + [[[parameter additional_slurm_config]]] + Label = Slurm Configuration + Description = Any additional lines to add to slurm.conf + ParameterType = Text + + [[[parameter configuration_slurm_launch_parameters]]] + Label = Launch Parameters + Description = Deploy Slurm with Launch Parameters (comma delimited) + DefaultValue = '' + ParameterType = String + + + + [[parameters Software]] + Description = "Specify the scheduling software, and base OS installed on all nodes, and optionally the cluster-init and chef versions from your locker." + Order = 10 + + [[[parameter NodeNameIsHostname]]] + Label = Name As Hostname + Description = Should the hostname match the nodename for execute nodes? + ParameterType = Boolean + DefaultValue = true + + [[[parameter NodeNamePrefix]]] + Label = Node Prefix + Description = Prefix for generated node names, i.e. "prefix-" generates prefix-nodearray-1. Use 'Cluster Prefix' to get $ClusterName-nodearray-1 + ParameterType = StringList + Config.Plugin = pico.form.Dropdown + Config.FreeForm = true + DefaultValue = "Cluster Prefix" + Config.Entries := {[Value=""], [Value="Cluster Prefix"]} + Conditions.Hidden := NodeNameIsHostname != true + + [[[parameter SchedulerHostName]]] + Label = Scheduler Hostname + Description = Hostname of scheduler. 'Generated' uses the default generated hostname. 'Cluster Prefix' will generate $ClusterName-scheduler. 
+ ParameterType = StringList + Config.Plugin = pico.form.Dropdown + Config.FreeForm = true + DefaultValue = "Cluster Prefix" + Config.Entries := {[Value="Generated"], [Value="Cluster Prefix"]} + Conditions.Hidden := NodeNameIsHostname != true + + [[[parameter SchedulerImageName]]] + Label = Scheduler OS + ParameterType = Cloud.Image + Config.OS = linux + DefaultValue = almalinux8 + Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8"} + + [[[parameter LoginImageName]]] + Label = Login Node OS + ParameterType = Cloud.Image + Config.OS = linux + DefaultValue = almalinux8 + Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8"} + + [[[parameter HPCImageName]]] + Label = HPC OS + ParameterType = Cloud.Image + Config.OS = linux + DefaultValue = almalinux8 + Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8"} + + [[[parameter HTCImageName]]] + Label = HTC OS + ParameterType = Cloud.Image + Config.OS = linux + DefaultValue = almalinux8 + Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8"} + + [[[parameter HTC2ImageName]]] + Label = HTC2 OS + ParameterType = Cloud.Image + Config.OS = linux + DefaultValue = almalinux8 + Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8"} + + + [[[parameter GPUImageName]]] + Label = GPU OS + ParameterType = Cloud.Image + Config.OS = linux + DefaultValue = almalinux8 + Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8"} + + [[[parameter DynamicImageName]]] + Label = Dynamic OS + ParameterType = Cloud.Image + Config.OS = linux + DefaultValue = almalinux8 + Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", 
"almalinux8"} + + [[[parameter SchedulerClusterInitSpecs]]] + Label = Scheduler Cluster-Init + DefaultValue = =undefined + Description = Cluster init specs to apply to the scheduler node + ParameterType = Cloud.ClusterInitSpecs + + [[[parameter LoginClusterInitSpecs]]] + Label = Login Cluster-Init + DefaultValue = =undefined + Description = Cluster init specs to apply to Login nodes + ParameterType = Cloud.ClusterInitSpecs + + [[[parameter HTCClusterInitSpecs]]] + Label = HTC Cluster-Init + DefaultValue = =undefined + Description = Cluster init specs to apply to HTC execute nodes + ParameterType = Cloud.ClusterInitSpecs + + [[[parameter HTC2ClusterInitSpecs]]] + Label = HTC2 Cluster-Init + DefaultValue = =undefined + Description = Cluster init specs to apply to HTC2 execute nodes + ParameterType = Cloud.ClusterInitSpecs + + [[[parameter HPCClusterInitSpecs]]] + Label = HPC Cluster-Init + DefaultValue = =undefined + Description = Cluster init specs to apply to HPC execute nodes + ParameterType = Cloud.ClusterInitSpecs + + [[[parameter GPUClusterInitSpecs]]] + Label = GPU Cluster-Init + DefaultValue = =undefined + Description = Cluster init specs to apply to GPU execute nodes + ParameterType = Cloud.ClusterInitSpecs + + [[[parameter DynamicClusterInitSpecs]]] + Label = Dyn Cluster-Init + DefaultValue = =undefined + Description = Cluster init specs to apply to Dynamic execute nodes + ParameterType = Cloud.ClusterInitSpecs + + + + [[parameters Advanced Networking]] + + [[[parameter ReturnProxy]]] + Label = Return Proxy + DefaultValue = true + ParameterType = Boolean + Config.Label = Use SSH tunnel to connect to CycleCloud (required if direct access is blocked) + + [[[parameter UsePublicNetwork]]] + Label = Public Head Node + DefaultValue = true + ParameterType = Boolean + Config.Label = Access scheduler node from the Internet + + [[[parameter ExecuteNodesPublic]]] + Label = Public Execute + DefaultValue = false + ParameterType = Boolean + Config.Label = Access execute 
nodes from the Internet + Conditions.Excluded := UsePublicNetwork isnt true + + [[[parameter SchedulerZone]]] + Label = Scheduler Zone + Description = The availability zone in which to deploy the scheduler node. + DefaultValue = =undefined + Config.Plugin = pico.form.Dropdown + Config.Entries := {[Value=1], [Value=2], [Value=3], [Value=undefined; Label="Any"]} + + [[[parameter SchedulerHAZone]]] + Label = Scheduler HA Zone + Description = The availability zone in which to deploy the scheduler-ha node. + DefaultValue = =undefined + Config.Plugin = pico.form.Dropdown + Config.Entries := {[Value=1], [Value=2], [Value=3], [Value=undefined; Label="Any"]} + Conditions.Hidden := configuration_slurm_ha_enabled isnt true + + [[parameters Node Health Checks]] + Description = "Section for configuring Node Health Checks" + Order = 12 + + [[[parameter EnableNodeHealthChecks]]] + Label = Enable NHC tests + DefaultValue = false + Widget.Plugin = pico.form.BooleanCheckBox + Widget.Label = Run Node Health Checks on startup + +[parameters Monitoring] +Order = 100 + [[parameter MonitoringEnabled]] + Label = Enable Monitoring + DefaultValue = false + Widget.Plugin = pico.form.BooleanCheckBox + Widget.Label = Enable Grafana monitoring + + [[parameter MonitoringIngestionEndpoint]] + Label = Ingestion Endpoint + Description = URL of the ingestion endpoint for Grafana + Conditions.Excluded := MonitoringEnabled isnt true + + [[parameter MonitoringIdentityClientId]] + Label = Managed Id + Description = Optionally assign an Azure user assigned managed identity to all nodes to access Azure resources using assigned roles. 
+ # ParameterType = Azure.ManagedIdentity + DefaultValue = =undefined + Conditions.Excluded := MonitoringEnabled isnt true \ No newline at end of file From 07ab3c761184232d9bba1135dc19872fd3693c2c Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Tue, 27 May 2025 17:03:55 -0400 Subject: [PATCH 11/50] ignore bicep/hub/build dir --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 71a254a6..437c5a4b 100644 --- a/.gitignore +++ b/.gitignore @@ -407,4 +407,5 @@ arm-ttk/ # delete_roles.sh files util/.role_assignment_cleanup* +bicep/hub/build/ bicep/hub/params/*.json From 50981452dc15ef71da0924e7ac5b7fa062e011ca Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Wed, 28 May 2025 10:07:44 -0400 Subject: [PATCH 12/50] make init.sh executable --- init.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 init.sh diff --git a/init.sh b/init.sh old mode 100644 new mode 100755 From 4196f3d81c202f3b691432eb729134ca0c071225 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Wed, 28 May 2025 10:08:38 -0400 Subject: [PATCH 13/50] Use params folder and add --what-if to create_hub.sh --- bicep/hub/create_hub.sh | 66 +++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 16 deletions(-) mode change 100644 => 100755 bicep/hub/create_hub.sh diff --git a/bicep/hub/create_hub.sh b/bicep/hub/create_hub.sh old mode 100644 new mode 100755 index f1a397c1..c7c7cded --- a/bicep/hub/create_hub.sh +++ b/bicep/hub/create_hub.sh @@ -1,8 +1,11 @@ #!/bin/bash +set -e # Initialize variables RESOURCE_GROUP="" LOCATION="" +WHATIF=false +FORCE=false # Parse arguments while [ "$#" -gt 0 ]; do @@ -15,9 +18,19 @@ while [ "$#" -gt 0 ]; do LOCATION="$2" shift 2 ;; + --what-if) + WHATIF=true + shift + ;; + --force) + FORCE=true + shift + ;; -h|--help) - echo "Usage: $0 --resource-group --location " - echo " or: $0 -rg -l " + echo "Usage: $0 --resource-group --location [--what-if] [--force]" + echo " or: $0 -rg -l [--what-if] 
[--force]" + echo " --what-if: Perform a what-if deployment without making changes." + echo " --force: Force the creation of resources even if they already exist." exit 0 ;; *) @@ -28,6 +41,11 @@ while [ "$#" -gt 0 ]; do esac done +WHATIF_FLAG="" +if [ "$WHATIF" = true ]; then + WHATIF_FLAG="--what-if" +fi + # Validate inputs if [ -z "$RESOURCE_GROUP" ] || [ -z "$LOCATION" ]; then echo "Error: Both --resource-group and --location are required." @@ -35,7 +53,7 @@ if [ -z "$RESOURCE_GROUP" ] || [ -z "$LOCATION" ]; then exit 1 fi -cd "$(dirname "$0")" +pushd "$(dirname "$0")" # Check if the resource group exists and create it if it doesn't echo Checking if resource group "${RESOURCE_GROUP}" exists... @@ -50,13 +68,15 @@ if [ "$RG_EXISTS" = "false" ]; then done fi +echo Deploying vnet... # Deploy vnet az deployment group create \ --resource-group "${RESOURCE_GROUP}" \ --template-file "$(pwd)/hub-vnet.bicep" \ - --parameters "$(pwd)/vnet_params.json" \ + --parameters "$(pwd)/params/vnet_params.json" \ --parameters location="$LOCATION" \ - --name "hub-vnet-${RESOURCE_GROUP}" + --name "hub-vnet-${RESOURCE_GROUP}" \ + $WHATIF_FLAG echo "Virtual network deployment is complete. Please enter the Azure Portal to create a VPN Gateway while the remainder of this script runs." 
@@ -65,20 +85,22 @@ bastion_subnet_id=$(az network vnet subnet show -g "${RESOURCE_GROUP}" -n AzureB az deployment group create \ --resource-group "${RESOURCE_GROUP}" \ --template-file "$(pwd)/../bastion.bicep" \ - --parameters "$(pwd)/bastion_params.json" \ + --parameters "$(pwd)/params/bastion_params.json" \ --parameters location="${LOCATION}" \ --parameters subnetId="${bastion_subnet_id}" \ - --name "hub-bastion-${RESOURCE_GROUP}" + --name "hub-bastion-${RESOURCE_GROUP}" \ + $WHATIF_FLAG # Deploy MySQL server db_subnet_id=$(az network vnet subnet show -g "${RESOURCE_GROUP}" -n database --vnet-name "hub-vnet-${RESOURCE_GROUP}" | jq '.id' | tr -d '"') az deployment group create \ --resource-group "${RESOURCE_GROUP}" \ --template-file "$(pwd)/../mysql.bicep" \ - --parameters "$(pwd)/db_params.json" \ + --parameters "$(pwd)/params/db_params.json" \ --parameters location="${LOCATION}" \ --parameters subnetId="${db_subnet_id}" \ - --name "hub-db-${RESOURCE_GROUP}" + --name "hub-db-${RESOURCE_GROUP}" \ + $WHATIF_FLAG # Deploy Azure NetApp Files netapp_subnet_id=$(az network vnet subnet show -g "${RESOURCE_GROUP}" -n netapp --vnet-name "hub-vnet-${RESOURCE_GROUP}" | jq '.id' | tr -d '"') @@ -86,21 +108,33 @@ az deployment group create \ --resource-group "${RESOURCE_GROUP}" \ --template-file "$(pwd)/../anf-account.bicep" \ --parameters location="${LOCATION}" \ - --name "hub-anf-account-${RESOURCE_GROUP}" + --name "hub-anf-account-${RESOURCE_GROUP}" \ + $WHATIF_FLAG + az deployment group create \ --resource-group "$RESOURCE_GROUP" \ --template-file "$(pwd)/../anf.bicep"\ - --parameters "$(pwd)/anf_params.json" \ + --parameters "$(pwd)/params/anf_params.json" \ --parameters subnetId="${netapp_subnet_id}" \ --parameters location="${LOCATION}" \ --parameters name="shared" \ - --name "hub-anf-resources-${RESOURCE_GROUP}" + --name "hub-anf-resources-${RESOURCE_GROUP}" \ + $WHATIF_FLAG # Deploy monitoring MONITORING_PROJECT_VERSION="1.0.0" -rm -rf cyclecloud-monitoring + +mkdir 
build/ +pushd build git clone --branch "${MONITORING_PROJECT_VERSION}" https://github.com/Azure/cyclecloud-monitoring.git -cd cyclecloud-monitoring/infra -sh $(pwd)/deploy.sh "$RESOURCE_GROUP" -cd ../.. +pushd cyclecloud-monitoring/infra +if [ $WHATIF = true ]; then + echo "monitoring does not support what-if mode, skipping deployment" +else + bash $(pwd)/deploy.sh "$RESOURCE_GROUP" +fi + +popd +popd +popd \ No newline at end of file From d5037d7b8f0245d1ccb6d64d0a5610d7b5c0415c Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Wed, 28 May 2025 10:09:58 -0400 Subject: [PATCH 14/50] add ENTER default values, until this is a template --- bicep/hub/params/template/db_params.json | 1 + bicep/hub/params/template/original_spoke_params.json | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/bicep/hub/params/template/db_params.json b/bicep/hub/params/template/db_params.json index d13aeb5f..40c90b42 100644 --- a/bicep/hub/params/template/db_params.json +++ b/bicep/hub/params/template/db_params.json @@ -3,6 +3,7 @@ "value": "hpcadmin" }, "adminPassword": { + "value": "ENTER_YOUR_PASSWORD_HERE" }, "tags": { "value": {} diff --git a/bicep/hub/params/template/original_spoke_params.json b/bicep/hub/params/template/original_spoke_params.json index 13816b24..99a9c390 100644 --- a/bicep/hub/params/template/original_spoke_params.json +++ b/bicep/hub/params/template/original_spoke_params.json @@ -3,9 +3,10 @@ "value": "hpcadmin" }, "adminPassword": { - "value": "" + "value": "ENTER_YOUR_PASSWORD_HERE" }, "adminSshPublicKey": { + "value": "ENTER_YOUR_PUBLIC_SSH_KEY_HERE" }, "ccVMSize": { "value": "Standard_D4as_v5" @@ -85,6 +86,14 @@ "useSpot": false } }, + "htc2": { + "value": { + "sku": "Standard_F2s_v2", + "maxNodes": 100, + "osImage": "cycle.image.ubuntu24", + "useSpot": false + } + }, "hpc": { "value": { "sku": "Standard_D2plds_v6", From a782b040ac1c57aae07efac354a6ae803150ea9a Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Wed, 28 May 2025 10:13:35 
-0400 Subject: [PATCH 15/50] add what-if, monitoring params, minor fixes --- bicep/hub/deploy_spoke.sh | 54 +++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 10 deletions(-) mode change 100644 => 100755 bicep/hub/deploy_spoke.sh diff --git a/bicep/hub/deploy_spoke.sh b/bicep/hub/deploy_spoke.sh old mode 100644 new mode 100755 index 9cdfd523..b4f1b311 --- a/bicep/hub/deploy_spoke.sh +++ b/bicep/hub/deploy_spoke.sh @@ -1,7 +1,13 @@ #!/bin/bash - +set -e cd "$(dirname "$0")" +# convert base64 files +pushd "$(dirname "$0")/../../" +./init.sh +popd + + # hub params HUB_RG_NAME="" @@ -9,6 +15,8 @@ HUB_RG_NAME="" LOCATION="" SPOKE_NUMBER="" +WHATIF=false + # Parse arguments while [ "$#" -gt 0 ]; do case "$1" in @@ -24,9 +32,14 @@ while [ "$#" -gt 0 ]; do LOCATION="$2" shift 2 ;; + --what-if) + WHATIF=true + shift + ;; -h|--help) - echo "Usage: $0 --hub-resource-group --location --spoke-number " - echo " or: $0 -rg -l -s " + echo "Usage: $0 --hub-resource-group --location --spoke-number [--what-if]" + echo " or: $0 -rg -l -s [--what-if]" + echo " --what-if: Perform a what-if deployment without making changes." exit 0 ;; *) @@ -38,12 +51,17 @@ while [ "$#" -gt 0 ]; do done # Validate inputs -if [ -z "$RESOURCE_GROUP" ] || [ -z "$LOCATION" ] || [ -z "$SPOKE_NUMBER" ]; then +if [ -z "$HUB_RG_NAME" ] || [ -z "$LOCATION" ] || [ -z "$SPOKE_NUMBER" ]; then echo "Error: --resource-group, --location, --spoke-number are required." echo "Use --help for usage information." exit 1 fi +WHATIF_FLAG="" +if [ "$WHATIF" = true ]; then + WHATIF_FLAG="--what-if" +fi + # hub SUFFIX="-${HUB_RG_NAME}" @@ -84,7 +102,7 @@ fetch_outputs() { echo "outputs/hub-monitoring-outputs.json already fetched. Skipping." else echo "Fetching outputs for hub monitoring..." 
- [ -f cyclecloud-monitoring/infra/outputs.json ] && cp cyclecloud-monitoring/infra/outputs.json outputs/hub-monitoring-outputs.json + [ -f build/cyclecloud-monitoring/infra/outputs.json ] && cp build/cyclecloud-monitoring/infra/outputs.json outputs/hub-monitoring-outputs.json # az deployment group show -g "$HUB_RG_NAME" -n ingestionEndpoint --query properties.outputs > outputs/hub-monitoring-outputs.json fi echo "Done fetching outputs." @@ -92,10 +110,10 @@ fetch_outputs() { fetch_outputs -cp original_spoke_params.json params/spoke_params.json +cp params/original_spoke_params.json spoke_params.json replace_fields() { - jq "$1" params/spoke_params.json > tmp_spoke_params.json && mv tmp_spoke_params.json params/spoke_params.json + jq "$1" spoke_params.json > tmp_spoke_params.json && mv tmp_spoke_params.json spoke_params.json } # shared FS @@ -111,7 +129,8 @@ replace_fields ".network.value.addressSpace=\"$ADDRESS_SPACE\"" # vnet to peer PEERED_VNET_ID=$(jq -r '.vnetId.value' outputs/hub-vnet-outputs.json) PEERED_VNET_NAME=$(echo "${PEERED_VNET_ID}" | cut -d '/' -f9) -PEERED_VNET_LOCATION=$(az network vnet show -g "$RG" -n "$PEERED_VNET_NAME" --query location -o tsv | tr -d '\r\n') + +PEERED_VNET_LOCATION=$(az network vnet show -g "$HUB_RG_NAME" -n "$PEERED_VNET_NAME" --query location -o tsv | tr -d '\r\n') replace_fields ".network.value.vnetToPeer={ name: \"$PEERED_VNET_NAME\", id: \"$PEERED_VNET_ID\", location: \"$PEERED_VNET_LOCATION\", subscriptionName: \"\"}" # database config @@ -122,8 +141,21 @@ replace_fields ".databaseConfig={ value: { type: \"privateIp\", databaseUser: \" replace_fields ".databaseAdminPassword={ value: \"$DB_PASSWORD\" }" # monitoring -MONITORING_INGESTION_ENDPOINT=$([ -f outputs/hub-monitoring-outputs.json ] && jq -r '.properties.outputs.ingestionEndpoint.value' outputs/hub-monitoring-outputs.json || echo "") +# TODO these are standin values - we need to implement the mi for the hub, +# in testing I have just used a manually create MI. 
+# MONITORING_INGESTION_ENDPOINT= +# MONITORING_CLIENT_ID= +# HUB_MI_NAME= +if [ -z "$MONITORING_INGESTION_ENDPOINT" ] || [ -z "$MONITORING_CLIENT_ID" ]; then + echo "Monitoring ingestion endpoint or client ID not set. Please edit the script to set them directly until hub MI automation is implemented." + exit 1 +fi + +replace_fields ".monitoringIngestionEndpoint.value=\"$MONITORING_INGESTION_ENDPOINT\"" +replace_fields ".monitoringIdentityClientId.value=\"$MONITORING_CLIENT_ID\"" +replace_fields ".hubMI.value=\"$HUB_MI_NAME\"" +echo "Deploying spoke #${SPOKE_NUMBER} in resource group ${SPOKE_RG_NAME} at location ${LOCATION}... ${WHATIF_FLAG}" az deployment sub create \ --location "$LOCATION" \ --template-file "$(pwd)/../mainTemplate.bicep" \ @@ -133,5 +165,7 @@ az deployment sub create \ --parameters ccVMName="ccw-${SPOKE_NUMBER}-cyclecloud-vm" \ --parameters clusterName="ccw-${SPOKE_NUMBER}" \ --parameters monitoringIngestionEndpoint="${MONITORING_INGESTION_ENDPOINT}" \ - --name "spoke-ccw-0${SPOKE_DEPLOYMENT_NAME}" + --name "spoke-ccw-0${SPOKE_DEPLOYMENT_NAME}" \ + $WHATIF_FLAG + \ No newline at end of file From 214c34e252bc18bdf35ae8cd5a66ebc96f37931c Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Wed, 28 May 2025 10:15:54 -0400 Subject: [PATCH 16/50] Add htc2 nodearray, custom template, monitoring params --- bicep/ccw.bicep | 23 +++++++++++++++++++---- bicep/files-to-load/create_cc_param.py | 17 ++++++++++++++--- bicep/install.sh | 18 ++++++++++++++++-- bicep/mainTemplate.bicep | 7 +++++++ bicep/types.bicep | 2 +- 5 files changed, 57 insertions(+), 10 deletions(-) diff --git a/bicep/ccw.bicep b/bicep/ccw.bicep index 5a2fa73b..2ccf590a 100644 --- a/bicep/ccw.bicep +++ b/bicep/ccw.bicep @@ -9,6 +9,8 @@ param branch string param projectVersion string param monitoringProjectVersion string param monitoringIngestionEndpoint string +param monitoringIdentityClientId string +param hubMI string param adminUsername string @secure() @@ -27,6 +29,7 @@ param slurmSettings 
types.slurmSettings_t param schedulerNode types.scheduler_t param loginNodes types.login_t param htc types.htc_t +param htc2 types.htc_t param hpc types.hpc_t param gpu types.hpc_t param tags types.resource_tags_t @@ -316,22 +319,20 @@ output filerInfoFinal types.filerInfo_t = { output cyclecloudPrincipalId string = infrastructureOnly ? '' : ccwVM.outputs.principalId output managedIdentityId string = infrastructureOnly ? '' : ccwManagedIdentity.outputs.managedIdentityId -output monitoringIngestionEndpoint string = monitoringIngestionEndpoint - // Automatically inject the ccw and monitoring cluster init specs var ccwClusterInitSpec = { type: 'gitHubReleaseURL' gitHubReleaseURL: uri('https://github.com/Azure/cyclecloud-slurm-workspace/releases/tag/', projectVersion) spec: 'default' - target: ['login', 'scheduler', 'htc', 'hpc', 'gpu', 'dynamic'] + target: ['login', 'scheduler', 'htc', 'htc2', 'hpc', 'gpu', 'dynamic'] } var monitoringClusterInitSpec = { type: 'gitHubReleaseURL' gitHubReleaseURL: uri('https://github.com/Azure/cyclecloud-monitoring/releases/tag/', monitoringProjectVersion) spec: 'default' - target: ['login', 'scheduler', 'htc', 'hpc', 'gpu', 'dynamic'] + target: ['login', 'scheduler', 'htc', 'htc2', 'hpc', 'gpu', 'dynamic'] } // Use of azslurm 4.0 does not require pyxis @@ -352,6 +353,13 @@ output partitions types.partitions_t = { osImage: htc.osImage useSpot: htc.?useSpot ?? false }, contains(htc,'availabilityZone') ? { availabilityZone: htc.?availabilityZone } : {}) + htc2: union({ + sku: htc2.sku + maxNodes: htc2.maxNodes + osImage: htc2.osImage + useSpot: htc2.?useSpot ?? false + // we aren't using availability zones for now, ignore this for htc2 TODO + }, contains(htc,'availabilityZone') ? { availabilityZone: htc.?availabilityZone } : {}) hpc: hpc gpu: gpu } @@ -404,9 +412,16 @@ output oodManualRegistration object = { fqdn: deployOOD ? 
oodNIC.outputs.privateIp : '' } +output monitoring object = { + ingestionEndpoint: monitoringIngestionEndpoint + identityClientId: monitoringIdentityClientId +} +output hubMI string = hubMI + output files object = { availability_zones_json: loadTextContent('./files-to-load/encoded/availability_zones.json.base64') create_cc_param_py: loadTextContent('./files-to-load/encoded/create_cc_param.py.base64') cyclecloud_install_py: loadTextContent('./files-to-load/encoded/cyclecloud_install.py.base64') initial_params_json: loadTextContent('./files-to-load/encoded/initial_params.json.base64') + slurm_txt: loadTextContent('./files-to-load/encoded/slurm.txt.base64') } diff --git a/bicep/files-to-load/create_cc_param.py b/bicep/files-to-load/create_cc_param.py index 439e9d1d..877cc865 100644 --- a/bicep/files-to-load/create_cc_param.py +++ b/bicep/files-to-load/create_cc_param.py @@ -39,6 +39,13 @@ def set_slurm_params(params, dbPassword, outputs): params['HTCUseLowPrio'] = outputs['partitions']['value']['htc']['useSpot'] params['HTCAvailabilityZone'] = outputs['partitions']['value']['htc']['availabilityZone'] if params['DefineNodesAvailabilityZone'] and 'availabilityZone' in outputs['partitions']['value']['htc'] else None + #HTC2 + params['HTC2MachineType'] = outputs['partitions']['value']['htc2']['sku'] + params['MaxHTC2ExecuteNodeCount'] = int(outputs['partitions']['value']['htc2']['maxNodes']) + params['HTC2ImageName'] = outputs['partitions']['value']['htc2']['osImage'] + params['HTC2UseLowPrio'] = outputs['partitions']['value']['htc2']['useSpot'] + params['HTC2AvailabilityZone'] = outputs['partitions']['value']['htc2']['availabilityZone'] if params['DefineNodesAvailabilityZone'] and 'availabilityZone' in outputs['partitions']['value']['htc'] else None + #HPC params['HPCMachineType'] = outputs['partitions']['value']['hpc']['sku'] params['MaxHPCExecuteNodeCount'] = int(outputs['partitions']['value']['hpc']['maxNodes']) @@ -97,9 +104,11 @@ def set_slurm_params(params, 
dbPassword, outputs): params['AdditionalNFSAddress'] = outputs['filerInfoFinal']['value']['additional']['ipAddress'] # Monitoring - params['MonitoringEnabled'] = outputs['monitoringIngestionEndpoint']['value'] != '' - params['MonitoringIngestionEndpoint'] = outputs['monitoringIngestionEndpoint']['value'] - params['MonitoringIdentityClientId'] = outputs['managedIdentityId']['value'] + params['MonitoringEnabled'] = outputs['monitoring']["value"]['ingestionEndpoint'] != '' + params['MonitoringIngestionEndpoint'] = outputs['monitoring']['value']['ingestionEndpoint'] + params['MonitoringIdentityClientId'] = outputs['monitoring']['value']['identityClientId'] + + params['ManagedIdentity'] = outputs['hubMI']['value'] def set_ood_params(params, outputs): @@ -124,6 +133,7 @@ def set_ood_params(params, outputs): params['ood_entra_tenant_id'] = outputs['ood']['value'].get('tenantId') params['ood_nic'] = outputs['ood']['value'].get('nic') + class ClusterInitSpec: def __init__(self, project: str, version: str, spec: str, targets: typing.List[str]): self.project = project @@ -203,6 +213,7 @@ def main(): "gpu": "GPUClusterInitSpecs", "hpc": "HPCClusterInitSpecs", "htc": "HTCClusterInitSpecs", + "htc2": "HTC2ClusterInitSpecs", "scheduler": "SchedulerClusterInitSpecs", "dynamic": "DynamicClusterInitSpecs", "ood": "ClusterInitSpecs" diff --git a/bicep/install.sh b/bicep/install.sh index 4f09504f..f97884f2 100644 --- a/bicep/install.sh +++ b/bicep/install.sh @@ -165,7 +165,12 @@ for key in $keys; do # Print the file name echo "Processing $filename.$extension" # Create the file with the value decoded from base 64 - echo $filecontent | base64 --decode > "$filename.$extension" + + if [ ! -e "$filename.$extension" ]; then + echo $filecontent | base64 --decode > "$filename.$extension".tmp + mv "$filename.$extension".tmp "$filename.$extension" + fi + done while [ ! -f "$SECRETS_FILE_PATH" ]; do echo "Waiting for VM to create secrets file..." 
@@ -298,7 +303,16 @@ fi # copying template parameters file to admin user's home directory cp slurm_params.json "${ADMIN_USER_HOME_DIR}/${SLURM_CLUSTER_NAME}/slurm_params.json" -SLURM_PROJ_VERSION=$(cycle_server execute --format json 'SELECT Version FROM Cloud.Project WHERE Name=="Slurm"' | jq -r '.[0].Version') +# custom slurm template +if [ -f "slurm.txt" ]; then + echo "Found slurm.txt, using it as the template" + SLURM_PROJ_VERSION="CUSTOM" + cyclecloud import_template -c Slurm -f slurm.txt slurm_template_${SLURM_PROJ_VERSION} --force +else + echo "No slurm.txt found, using default template" + SLURM_PROJ_VERSION=$(cycle_server execute --format json 'SELECT Version FROM Cloud.Project WHERE Name=="Slurm"' | jq -r '.[0].Version') +fi + cyclecloud create_cluster slurm_template_${SLURM_PROJ_VERSION} $SLURM_CLUSTER_NAME -p slurm_params.json echo "CC create_cluster successful" diff --git a/bicep/mainTemplate.bicep b/bicep/mainTemplate.bicep index 9a239488..61e46cae 100644 --- a/bicep/mainTemplate.bicep +++ b/bicep/mainTemplate.bicep @@ -21,6 +21,7 @@ param slurmSettings types.slurmSettings_t = { startCluster: true, version: '23.1 param schedulerNode types.scheduler_t param loginNodes types.login_t param htc types.htc_t +param htc2 types.htc_t param hpc types.hpc_t param gpu types.hpc_t param tags types.resource_tags_t @@ -32,6 +33,8 @@ param databaseConfig types.databaseConfig_t = { type: 'disabled' } param clusterName string = 'ccw' param acceptMarketplaceTerms bool = false param ood types.oodConfig_t = { type: 'disabled' } +// this should have rw access to the Hub blob storage and metrics collection to grafana +param hubMI string param infrastructureOnly bool = false param insidersBuild bool = false @@ -45,6 +48,7 @@ param monitoringProjectVersion string = '1.0.0' param manualInstall bool = false param monitoringIngestionEndpoint string = '' +param monitoringIdentityClientId string = '' resource ccwResourceGroup 'Microsoft.Resources/resourceGroups@2024-03-01' = { 
name: resourceGroup @@ -74,6 +78,7 @@ module makeCCWresources 'ccw.bicep' = { schedulerNode: schedulerNode loginNodes: loginNodes htc: htc + htc2: htc2 hpc: hpc gpu: gpu storedKey: storedKey @@ -87,7 +92,9 @@ module makeCCWresources 'ccw.bicep' = { branch: branch projectVersion: projectVersion monitoringProjectVersion: monitoringProjectVersion + hubMI: hubMI monitoringIngestionEndpoint: monitoringIngestionEndpoint + monitoringIdentityClientId: monitoringIdentityClientId manualInstall: manualInstall acceptMarketplaceTerms: acceptMarketplaceTerms ood: ood diff --git a/bicep/types.bicep b/bicep/types.bicep index 2bb36037..bdb4609a 100644 --- a/bicep/types.bicep +++ b/bicep/types.bicep @@ -292,7 +292,7 @@ type ood_enabled_t = { @export() @discriminator('type') type oodConfig_t = ood_none_t | ood_enabled_t -type cluster_init_target_t = 'login' | 'scheduler' | 'htc' | 'hpc' | 'gpu' | 'dynamic' | 'ood' +type cluster_init_target_t = 'login' | 'scheduler' | 'htc' | 'htc2' | 'hpc' | 'gpu' | 'dynamic' | 'ood' type github_cluster_init_t = { From 28d48dc61e2e909f3c4cb7711293af73246d4218 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Wed, 28 May 2025 10:16:20 -0400 Subject: [PATCH 17/50] Remove metrics role from jetpack MI --- bicep/mi.bicep | 1 - 1 file changed, 1 deletion(-) diff --git a/bicep/mi.bicep b/bicep/mi.bicep index a5a5e5b9..766691f2 100644 --- a/bicep/mi.bicep +++ b/bicep/mi.bicep @@ -19,7 +19,6 @@ module ccwMIRoleAssignments './miRoleAssignments.bicep' = { principalId: managedIdentity.properties.principalId roles: [ 'Storage Blob Data Reader' - 'Monitoring Metrics Publisher' ] storageAccountName: storageAccountName } From 81c06963d68a1a3a8aff4f2d3dbc8c9148327c49 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Wed, 28 May 2025 10:52:13 -0400 Subject: [PATCH 18/50] Add hub-mi to hub and spoke --- bicep/hub/create_hub.sh | 15 ++++++++-- bicep/hub/create_hub_mi.sh | 0 bicep/hub/deploy_spoke.sh | 30 ++++++++++++------- bicep/hub/hub-mi.bicep | 15 +++++----- 
...oke_params.json => base_spoke_params.json} | 9 ++++-- 5 files changed, 46 insertions(+), 23 deletions(-) mode change 100644 => 100755 bicep/hub/create_hub_mi.sh rename bicep/hub/params/template/{original_spoke_params.json => base_spoke_params.json} (91%) diff --git a/bicep/hub/create_hub.sh b/bicep/hub/create_hub.sh index c7c7cded..42da070a 100755 --- a/bicep/hub/create_hub.sh +++ b/bicep/hub/create_hub.sh @@ -80,6 +80,10 @@ az deployment group create \ echo "Virtual network deployment is complete. Please enter the Azure Portal to create a VPN Gateway while the remainder of this script runs." +echo "Deploying hub managed identity..." +./create_hub_mi.sh "${RESOURCE_GROUP}" "${LOCATION}" + +echo "Deploying Bastion" # Deploy Bastion bastion_subnet_id=$(az network vnet subnet show -g "${RESOURCE_GROUP}" -n AzureBastionSubnet --vnet-name "hub-vnet-${RESOURCE_GROUP}" | jq '.id' | tr -d '"') az deployment group create \ @@ -92,6 +96,7 @@ az deployment group create \ $WHATIF_FLAG # Deploy MySQL server +echo "Deploying MySQL server" db_subnet_id=$(az network vnet subnet show -g "${RESOURCE_GROUP}" -n database --vnet-name "hub-vnet-${RESOURCE_GROUP}" | jq '.id' | tr -d '"') az deployment group create \ --resource-group "${RESOURCE_GROUP}" \ @@ -103,6 +108,7 @@ az deployment group create \ $WHATIF_FLAG # Deploy Azure NetApp Files +echo "Deploying Azure NetApp Files" netapp_subnet_id=$(az network vnet subnet show -g "${RESOURCE_GROUP}" -n netapp --vnet-name "hub-vnet-${RESOURCE_GROUP}" | jq '.id' | tr -d '"') az deployment group create \ --resource-group "${RESOURCE_GROUP}" \ @@ -110,7 +116,8 @@ az deployment group create \ --parameters location="${LOCATION}" \ --name "hub-anf-account-${RESOURCE_GROUP}" \ $WHATIF_FLAG - + +echo "Deploying Azure NetApp Files volumes" az deployment group create \ --resource-group "$RESOURCE_GROUP" \ --template-file "$(pwd)/../anf.bicep"\ @@ -123,7 +130,7 @@ az deployment group create \ # Deploy monitoring MONITORING_PROJECT_VERSION="1.0.0" - 
+echo "Deploying monitoring" mkdir build/ pushd build git clone --branch "${MONITORING_PROJECT_VERSION}" https://github.com/Azure/cyclecloud-monitoring.git @@ -137,4 +144,6 @@ fi popd popd -popd \ No newline at end of file +popd + +echo "Done!" \ No newline at end of file diff --git a/bicep/hub/create_hub_mi.sh b/bicep/hub/create_hub_mi.sh old mode 100644 new mode 100755 diff --git a/bicep/hub/deploy_spoke.sh b/bicep/hub/deploy_spoke.sh index b4f1b311..ccb76385 100755 --- a/bicep/hub/deploy_spoke.sh +++ b/bicep/hub/deploy_spoke.sh @@ -83,20 +83,31 @@ fetch_outputs() { echo "outputs/hub-vnet-outputs.json already fetched. Skipping." else echo "Fetching outputs for hub vnet..." - az deployment group show -g "$HUB_RG_NAME" -n "hub-vnet${SUFFIX}" --query properties.outputs > outputs/hub-vnet-outputs.json + az deployment group show -g "$HUB_RG_NAME" -n "hub-vnet${SUFFIX}" --query properties.outputs > outputs/hub-vnet-outputs.json.tmp + mv outputs/hub-vnet-outputs.json.tmp outputs/hub-vnet-outputs.json + fi + + if [ -f outputs/hub-mi-outputs.json ]; then + echo "outputs/hub-mi-outputs.json already fetched. Skipping." + else + echo "Fetching outputs for hub managed identity..." + az deployment group show -g "$HUB_RG_NAME" -n "${HUB_RG_NAME}-hub-mi" --query properties.outputs > outputs/hub-mi-outputs.json.tmp + mv outputs/hub-mi-outputs.json.tmp outputs/hub-mi-outputs.json fi if [ -f outputs/hub-anf-outputs.json ]; then echo "outputs/hub-anf-outputs.json already fetched. Skipping." else echo "Fetching outputs for hub ANF..." - az deployment group show -g "$HUB_RG_NAME" -n "hub-anf-resources${SUFFIX}" --query properties.outputs > outputs/hub-anf-outputs.json + az deployment group show -g "$HUB_RG_NAME" -n "hub-anf-resources${SUFFIX}" --query properties.outputs > outputs/hub-anf-outputs.json.tmp + mv outputs/hub-anf-outputs.json.tmp outputs/hub-anf-outputs.json fi if [ -f outputs/hub-db-outputs.json ]; then echo "outputs/hub-db-outputs.json already fetched. Skipping." 
else echo "Fetching outputs for hub MySQL database..." - az deployment group show -g "$HUB_RG_NAME" -n "hub-db${SUFFIX}" --query properties.outputs > outputs/hub-db-outputs.json + az deployment group show -g "$HUB_RG_NAME" -n "hub-db${SUFFIX}" --query properties.outputs > outputs/hub-db-outputs.json.tmp + mv outputs/hub-db-outputs.json.tmp outputs/hub-db-outputs.json fi if [ -f outputs/hub-monitoring-outputs.json ]; then echo "outputs/hub-monitoring-outputs.json already fetched. Skipping." @@ -110,7 +121,8 @@ fetch_outputs() { fetch_outputs -cp params/original_spoke_params.json spoke_params.json +# copy original spoke params, as a working copy +cp params/base_spoke_params.json spoke_params.json replace_fields() { jq "$1" spoke_params.json > tmp_spoke_params.json && mv tmp_spoke_params.json spoke_params.json @@ -141,11 +153,9 @@ replace_fields ".databaseConfig={ value: { type: \"privateIp\", databaseUser: \" replace_fields ".databaseAdminPassword={ value: \"$DB_PASSWORD\" }" # monitoring -# TODO these are standin values - we need to implement the mi for the hub, -# in testing I have just used a manually create MI. -# MONITORING_INGESTION_ENDPOINT= -# MONITORING_CLIENT_ID= -# HUB_MI_NAME= +MONITORING_INGESTION_ENDPOINT=$(jq -r '.ingestionEndpoint.value' outputs/hub-monitoring-outputs.json) +MONITORING_CLIENT_ID=$(jq -r '.hubMIClientId.value' outputs/hub-mi-outputs.json) +HUB_MI=$(jq -r '.hubMI.value' outputs/hub-mi-outputs.json) if [ -z "$MONITORING_INGESTION_ENDPOINT" ] || [ -z "$MONITORING_CLIENT_ID" ]; then echo "Monitoring ingestion endpoint or client ID not set. Please edit the script to set them directly until hub MI automation is implemented." 
exit 1 @@ -153,7 +163,7 @@ fi replace_fields ".monitoringIngestionEndpoint.value=\"$MONITORING_INGESTION_ENDPOINT\"" replace_fields ".monitoringIdentityClientId.value=\"$MONITORING_CLIENT_ID\"" -replace_fields ".hubMI.value=\"$HUB_MI_NAME\"" +replace_fields ".hubMI.value=\"$HUB_MI\"" echo "Deploying spoke #${SPOKE_NUMBER} in resource group ${SPOKE_RG_NAME} at location ${LOCATION}... ${WHATIF_FLAG}" az deployment sub create \ diff --git a/bicep/hub/hub-mi.bicep b/bicep/hub/hub-mi.bicep index 58871217..fbfb5d4c 100644 --- a/bicep/hub/hub-mi.bicep +++ b/bicep/hub/hub-mi.bicep @@ -1,8 +1,8 @@ targetScope = 'resourceGroup' import {tags_t} from '.././types.bicep' -import * as exports from './exports.bicep' +import * as exports from '.././exports.bicep' -param name string = '{resourceGroup().name}-mi' +param name string = '${resourceGroup().name}-mi' param location string = resourceGroup().location param tags tags_t = {} @@ -15,18 +15,19 @@ resource managedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023- var roles = [ 'Storage Blob Data Reader' - 'Storage Blob Data Constributor' + 'Storage Blob Data Contributor' 'Monitoring Metrics Publisher' ] resource roleAssignments 'Microsoft.Authorization/roleAssignments@2022-04-01' = [ for role in roles: { - name: guid(subscription().id, principalId, exports.role_lookup[role]) - scope: storageAccount + name: guid(subscription().id, managedIdentity.id, exports.role_lookup[role]) + scope: resourceGroup() properties: { roleDefinitionId: exports.role_lookup[role] - - principalType: 'ResourceGroup' + principalId: managedIdentity.properties.principalId + principalType: 'ServicePrincipal' } }] output hubMI string = managedIdentity.id +output hubMIClientId string = managedIdentity.properties.clientId diff --git a/bicep/hub/params/template/original_spoke_params.json b/bicep/hub/params/template/base_spoke_params.json similarity index 91% rename from bicep/hub/params/template/original_spoke_params.json rename to 
bicep/hub/params/template/base_spoke_params.json index 99a9c390..3ad70ff4 100644 --- a/bicep/hub/params/template/original_spoke_params.json +++ b/bicep/hub/params/template/base_spoke_params.json @@ -1,4 +1,7 @@ { + "manualInstall": { + "value": true + }, "adminUsername": { "value": "hpcadmin" }, @@ -31,9 +34,9 @@ "bastion": false, "createNatGateway": true, "vnetToPeer": { - "name": "t-abatallas-vnet", - "id": "/subscriptions/1dc1b726-3fdf-40a5-b356-3b8bd6227e52/resourceGroups/t-abatallas-rg/providers/Microsoft.Network/virtualNetworks/t-abatallas-vnet", - "location": "eastus", + "name": "AUTOMATED", + "id": "AUTOMATED", + "location": "AUTOMATED", "subscriptionName": "" }, "peeringAllowGatewayTransit": true From 505c8b151c08dcf426bbb9a40843db4f9c4f4cd3 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Wed, 28 May 2025 10:59:37 -0400 Subject: [PATCH 19/50] add basic README.md --- bicep/hub/README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 bicep/hub/README.md diff --git a/bicep/hub/README.md b/bicep/hub/README.md new file mode 100644 index 00000000..2ad9d737 --- /dev/null +++ b/bicep/hub/README.md @@ -0,0 +1,16 @@ + +1. cp params/template/*.json params/ +2. edit parameter files in params/ + * Note that we can convert these templates to .j2 files, I left that out for now. + * all json files are ignored in params/ by .gitignore +3. Before deploying a hub: + * bicep/hub/params/db_params.json adminPassword - the password for the mysql DB. + * Optional: anf_params.json - we have a default of 4 TB right now. + * Only base_spoke_params.json is outside the scope of this deployment. + * `create_hub.sh --resource-group HUB_RG_NAME --location HUB_LOCATION` + * See `create_hub.sh --help for more` +4. 
Before deploying a spoke: + * bicep/hub/params/base_spoke_params.json adminPassword - CycleCloud hpcadmin password + * bicep/hub/params/base_spoke_params.json adminSshPublicKey - hpcadmin public ssh key + * `deploy_spoke.sh --hub-resource-group HUB_RG_NAME --spoke-number 1` + * see `deploy_spoke.sh --help for more` From 625a6fc0b57c4a8dff02c09789cacecdf5f78d2a Mon Sep 17 00:00:00 2001 From: Aditi Gaur Date: Wed, 28 May 2025 10:27:42 -0700 Subject: [PATCH 20/50] Fix the slurm template to load projects from cyclecloud Add healthagent cluster init. --- bicep/files-to-load/slurm.txt | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/bicep/files-to-load/slurm.txt b/bicep/files-to-load/slurm.txt index b670c7cd..535a12bd 100644 --- a/bicep/files-to-load/slurm.txt +++ b/bicep/files-to-load/slurm.txt @@ -68,7 +68,8 @@ Autoscale = $Autoscale monitoring.enabled = $MonitoringEnabled cyclecloud.enable_chef = false - [[[cluster-init slurm:default:4.0.0]]] + [[[cluster-init cyclecloud/healthagent:default]]] + [[[cluster-init cyclecloud/slurm:default:4.0.0]]] Optional = true [[[volume boot]]] @@ -121,8 +122,8 @@ Autoscale = $Autoscale cyclecloud.mounts.nfs_shared.disabled = ${UseBuiltinShared && !configuration_slurm_ha_enabled} slurm.secondary_scheduler_name = ${ifThenElse(configuration_slurm_ha_enabled, "scheduler-ha-1", undefined)} - - [[[cluster-init slurm:scheduler:4.0.0]]] + [[[cluster-init cyclecloud/healthagent:default]]] + [[[cluster-init cyclecloud/slurm:scheduler:4.0.0]]] [[[network-interface eth0]]] AssociatePublicIpAddress = $UsePublicNetwork @@ -184,7 +185,8 @@ Autoscale = $Autoscale ImageName = $LoginImageName AdditionalClusterInitSpecs = $LoginClusterInitSpecs - [[[cluster-init slurm:login:4.0.0]]] + [[[cluster-init cyclecloud/healthagent:default]]] + [[[cluster-init cyclecloud/slurm:login:4.0.0]]] [[[configuration]]] slurm.role = login autoscale.enabled = false @@ -200,7 +202,8 @@ Autoscale = $Autoscale slurm.node_prefix = 
${ifThenElse(NodeNamePrefix=="Cluster Prefix", StrJoin("-", ClusterName, ""), NodeNamePrefix)} slurm.use_nodename_as_hostname = $NodeNameIsHostname - [[[cluster-init slurm:execute:4.0.0]]] + [[[cluster-init cyclecloud/healthagent:default]]] + [[[cluster-init cyclecloud/slurm:execute:4.0.0]]] [[[network-interface eth0]]] AssociatePublicIpAddress = $ExecuteNodesPublic From f433d7001ca345abf770f44bd846d74484f7832b Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Fri, 30 May 2025 13:58:34 -0400 Subject: [PATCH 21/50] Add htc2 to partitions output --- bicep/types.bicep | 1 + 1 file changed, 1 insertion(+) diff --git a/bicep/types.bicep b/bicep/types.bicep index bdb4609a..3971bd67 100644 --- a/bicep/types.bicep +++ b/bicep/types.bicep @@ -226,6 +226,7 @@ type hpc_t = { @export() type partitions_t = { htc: htc_output_t + htc2: htc_output_t hpc: hpc_t //if any property becomes optional, create a *_output_t type gpu: hpc_t //if any property becomes optional, create a *_output_t type } From 9780987767a07d69ad637c2ee788dbb42465b51c Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Fri, 30 May 2025 14:06:13 -0400 Subject: [PATCH 22/50] add pyxis cluster-init --- bicep/files-to-load/slurm.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bicep/files-to-load/slurm.txt b/bicep/files-to-load/slurm.txt index 535a12bd..64092f4d 100644 --- a/bicep/files-to-load/slurm.txt +++ b/bicep/files-to-load/slurm.txt @@ -16,6 +16,7 @@ Autoscale = $Autoscale SubnetId = $SubnetId Region = $Region KeyPairLocation = ~/.ssh/cyclecloud.pem + # hubMI is assigned here Azure.Identities = $ManagedIdentity Tags = $NodeTags @@ -71,6 +72,7 @@ Autoscale = $Autoscale [[[cluster-init cyclecloud/healthagent:default]]] [[[cluster-init cyclecloud/slurm:default:4.0.0]]] Optional = true + [[[cluster-init cyclecloud/slurm:pyxis:4.0.0]]] [[[volume boot]]] Size = ${ifThenElse(BootDiskSize > 0, BootDiskSize, undefined)} @@ -124,6 +126,7 @@ Autoscale = $Autoscale [[[cluster-init 
cyclecloud/healthagent:default]]] [[[cluster-init cyclecloud/slurm:scheduler:4.0.0]]] + [[[cluster-init cyclecloud/slurm:pyxis:4.0.0]]] [[[network-interface eth0]]] AssociatePublicIpAddress = $UsePublicNetwork @@ -184,9 +187,11 @@ Autoscale = $Autoscale MachineType = $loginMachineType ImageName = $LoginImageName AdditionalClusterInitSpecs = $LoginClusterInitSpecs - + [[[cluster-init cyclecloud/healthagent:default]]] [[[cluster-init cyclecloud/slurm:login:4.0.0]]] + [[[cluster-init cyclecloud/slurm:pyxis:4.0.0]]] + [[[configuration]]] slurm.role = login autoscale.enabled = false @@ -204,6 +209,7 @@ Autoscale = $Autoscale [[[cluster-init cyclecloud/healthagent:default]]] [[[cluster-init cyclecloud/slurm:execute:4.0.0]]] + [[[cluster-init cyclecloud/slurm:pyxis:4.0.0]]] [[[network-interface eth0]]] AssociatePublicIpAddress = $ExecuteNodesPublic From 2f9e826ebd95ef818523c7a7a1279c660fbc0583 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Fri, 30 May 2025 14:27:22 -0400 Subject: [PATCH 23/50] Copy custom slurm.txt instead of prod slurm template to user home dir --- bicep/install.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bicep/install.sh b/bicep/install.sh index f97884f2..d30b0d34 100644 --- a/bicep/install.sh +++ b/bicep/install.sh @@ -188,6 +188,10 @@ SLURM_CLUSTER_NAME=$(jq -r .clusterName.value ccwOutputs.json) ADMIN_USER_HOME_DIR="/home/${CYCLECLOUD_USERNAME}" SLURM_TEMPLATE_PATH=$(find /opt/cycle_server/system/work/.plugins_expanded/.expanded/cloud*/plugins/cloud/initial_data/templates/slurm/slurm_template_*.txt) mkdir -p "${ADMIN_USER_HOME_DIR}/${SLURM_CLUSTER_NAME}" +if [ -e /opt/ccw/slurm.txt ]; then + echo "Found custom slurm.txt, using that as the template in the user's home dir." 
+ SLURM_TEMPLATE_PATH="slurm.txt" +fi cp "${SLURM_TEMPLATE_PATH}" "${ADMIN_USER_HOME_DIR}/${SLURM_CLUSTER_NAME}/slurm_template.txt" cp ccwOutputs.json "${ADMIN_USER_HOME_DIR}/${SLURM_CLUSTER_NAME}/deployment.json" From 63defb35ddd2aeb59431bdccce544cf4a2e29668 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Fri, 30 May 2025 15:14:18 -0400 Subject: [PATCH 24/50] Fix deploy_spoke.sh support for grafana inputs --- bicep/hub/deploy_spoke.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bicep/hub/deploy_spoke.sh b/bicep/hub/deploy_spoke.sh index ccb76385..401d90ef 100755 --- a/bicep/hub/deploy_spoke.sh +++ b/bicep/hub/deploy_spoke.sh @@ -113,8 +113,9 @@ fetch_outputs() { echo "outputs/hub-monitoring-outputs.json already fetched. Skipping." else echo "Fetching outputs for hub monitoring..." - [ -f build/cyclecloud-monitoring/infra/outputs.json ] && cp build/cyclecloud-monitoring/infra/outputs.json outputs/hub-monitoring-outputs.json - # az deployment group show -g "$HUB_RG_NAME" -n ingestionEndpoint --query properties.outputs > outputs/hub-monitoring-outputs.json + #[ -f build/cyclecloud-monitoring/infra/outputs.json ] && cp build/cyclecloud-monitoring/infra/outputs.json outputs/hub-monitoring-outputs.json + az deployment group show -g "$HUB_RG_NAME" -n ingestionEndpoint --query properties.outputs > outputs/hub-monitoring-outputs.json.tmp + mv outputs/hub-monitoring-outputs.json.tmp outputs/hub-monitoring-outputs.json fi echo "Done fetching outputs." 
} @@ -153,7 +154,7 @@ replace_fields ".databaseConfig={ value: { type: \"privateIp\", databaseUser: \" replace_fields ".databaseAdminPassword={ value: \"$DB_PASSWORD\" }" # monitoring -MONITORING_INGESTION_ENDPOINT=$(jq -r '.ingestionEndpoint.value' outputs/hub-monitoring-outputs.json) +MONITORING_INGESTION_ENDPOINT=$(jq -r '.metricsIngestionEndpoint.value' outputs/hub-monitoring-outputs.json) MONITORING_CLIENT_ID=$(jq -r '.hubMIClientId.value' outputs/hub-mi-outputs.json) HUB_MI=$(jq -r '.hubMI.value' outputs/hub-mi-outputs.json) if [ -z "$MONITORING_INGESTION_ENDPOINT" ] || [ -z "$MONITORING_CLIENT_ID" ]; then From 17330ff576bc1bcacd8ad0b7b06f0ceb5321ec63 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Fri, 30 May 2025 17:06:49 -0400 Subject: [PATCH 25/50] update slurm.txt --- bicep/files-to-load/slurm.txt | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/bicep/files-to-load/slurm.txt b/bicep/files-to-load/slurm.txt index 64092f4d..79aff3fa 100644 --- a/bicep/files-to-load/slurm.txt +++ b/bicep/files-to-load/slurm.txt @@ -215,9 +215,6 @@ Autoscale = $Autoscale AssociatePublicIpAddress = $ExecuteNodesPublic [[nodearray hpc]] - CloudInit="""#!/bin/bash - echo DSHELL=/bin/bash >> /etc/adduser.conf - """ Extends = nodearraybase MachineType = $HPCMachineType ImageName = $HPCImageName @@ -265,6 +262,12 @@ Autoscale = $Autoscale slurm.use_pcpu = false [[nodearray gpu]] + CloudInit="""#!/bin/bash +sed -i 's/^SHELL.*/SHELL=\/bin\/bash/g' /etc/default/useradd + """ + Azure.Overprovision = false + Azure.MaxScaleSetSize = 1000 + Extends = nodearraybase MachineType = $GPUMachineType ImageName = $GPUImageName @@ -278,7 +281,7 @@ Autoscale = $Autoscale [[[configuration]]] slurm.default_partition = true - slurm.hpc = true + slurm.hpc = false slurm.partition = gpu #Parameter to enable or disable IMEX service on a per-job basis #IMEX Support is enabled by default for GB200 but can be disabled by setting param to False @@ -846,7 +849,7 @@ Order = 20 
[[[parameter configuration_slurm_launch_parameters]]] Label = Launch Parameters Description = Deploy Slurm with Launch Parameters (comma delimited) - DefaultValue = '' + DefaultValue = 'use_interactive_step' ParameterType = String From 51a1b840025bc34174541423646a9b5d654a6527 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Fri, 30 May 2025 17:07:10 -0400 Subject: [PATCH 26/50] WIP README updates --- bicep/hub/README.md | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/bicep/hub/README.md b/bicep/hub/README.md index 2ad9d737..ffe05a52 100644 --- a/bicep/hub/README.md +++ b/bicep/hub/README.md @@ -1,16 +1,24 @@ -1. cp params/template/*.json params/ -2. edit parameter files in params/ - * Note that we can convert these templates to .j2 files, I left that out for now. - * all json files are ignored in params/ by .gitignore -3. Before deploying a hub: - * bicep/hub/params/db_params.json adminPassword - the password for the mysql DB. - * Optional: anf_params.json - we have a default of 4 TB right now. - * Only base_spoke_params.json is outside the scope of this deployment. +0. Login to azure via the az cli, as well as enabling the graph api. + * az login + * Make sure the correct subscription is selected. + * az login --scope https://graph.microsoft.com//.default +0. Create the cyclecloud-slurm-workspace directory for deploying the hub and spoke + ```bash + git clone -b abatallas/gb200_hub_spoke https://github.com/Azure/cyclecloud-slurm-workspace.git + cd cyclecloud-slurm-workspace/bicep/hub + cp params/template/*.json params/ + ``` +2. Edit hub parameter json files found within cyclecloud-slurm-workspace/bicep/hub/params/ + * In `params/db_params.json` update `adminPassword` - the password for the mysql DB. + * Optional: `params/anf_params.json` - we have a default `sizeTiB` of 4 TB right now. + * The rest of the parameter files likely do not need to be changed. 
+ * **Note**: `base_spoke_params.json` is only used when deploying a spoke, it goes unused by create_hub.sh +3. Create the hub deployments + * Pick a resource group name and location, then run the following: **Note** we will create the resource group if it does not exist. * `create_hub.sh --resource-group HUB_RG_NAME --location HUB_LOCATION` - * See `create_hub.sh --help for more` -4. Before deploying a spoke: - * bicep/hub/params/base_spoke_params.json adminPassword - CycleCloud hpcadmin password - * bicep/hub/params/base_spoke_params.json adminSshPublicKey - hpcadmin public ssh key - * `deploy_spoke.sh --hub-resource-group HUB_RG_NAME --spoke-number 1` - * see `deploy_spoke.sh --help for more` +4. Add a VPN Gateway to the hub resource group using the Azure Portal. +5. Create a spoke: i.e. a CycleCloud + Slurm cluster deployment: + * `bicep/hub/params/base_spoke_params.json` Update `adminPassword` - CycleCloud hpcadmin password + * `bicep/hub/params/base_spoke_params.json` Update `adminSshPublicKey` - hpcadmin public ssh key + * `deploy_spoke.sh --hub-resource-group HUB_RG_NAME --spoke-number 1` \ No newline at end of file From c1821aa95093c869ab93891adfcb37f7347eecdd Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Mon, 2 Jun 2025 15:45:21 -0400 Subject: [PATCH 27/50] Add 3 new nodearrays and synchronize to latest standard spoke settings --- bicep/ccw.bicep | 40 +++-- bicep/files-to-load/create_cc_param.py | 26 ++- bicep/files-to-load/slurm.txt | 151 ++++++++++-------- bicep/hub/deploy_spoke.sh | 7 +- .../params/template/base_spoke_params.json | 50 +++--- bicep/mainTemplate.bicep | 10 +- bicep/types.bicep | 7 +- 7 files changed, 153 insertions(+), 138 deletions(-) diff --git a/bicep/ccw.bicep b/bicep/ccw.bicep index 2ccf590a..ec737a53 100644 --- a/bicep/ccw.bicep +++ b/bicep/ccw.bicep @@ -28,8 +28,9 @@ param clusterInitSpecs types.cluster_init_param_t param slurmSettings types.slurmSettings_t param schedulerNode types.scheduler_t param loginNodes types.login_t 
-param htc types.htc_t -param htc2 types.htc_t +param d64d types.htc_t +param d16d types.htc_t +param m64 types.htc_t param hpc types.hpc_t param gpu types.hpc_t param tags types.resource_tags_t @@ -325,14 +326,14 @@ var ccwClusterInitSpec = { type: 'gitHubReleaseURL' gitHubReleaseURL: uri('https://github.com/Azure/cyclecloud-slurm-workspace/releases/tag/', projectVersion) spec: 'default' - target: ['login', 'scheduler', 'htc', 'htc2', 'hpc', 'gpu', 'dynamic'] + target: ['login', 'scheduler', 'd64d', 'd16d', 'm64', 'hpc', 'gpu', 'dynamic'] } var monitoringClusterInitSpec = { type: 'gitHubReleaseURL' gitHubReleaseURL: uri('https://github.com/Azure/cyclecloud-monitoring/releases/tag/', monitoringProjectVersion) spec: 'default' - target: ['login', 'scheduler', 'htc', 'htc2', 'hpc', 'gpu', 'dynamic'] + target: ['login', 'scheduler', 'd64d', 'd16d', 'm64', 'hpc', 'gpu', 'dynamic'] } // Use of azslurm 4.0 does not require pyxis @@ -347,19 +348,24 @@ output schedulerNode types.scheduler_t = schedulerNode output loginNodes types.login_t = loginNodes output partitions types.partitions_t = { - htc: union({ - sku: htc.sku - maxNodes: htc.maxNodes - osImage: htc.osImage - useSpot: htc.?useSpot ?? false - }, contains(htc,'availabilityZone') ? { availabilityZone: htc.?availabilityZone } : {}) - htc2: union({ - sku: htc2.sku - maxNodes: htc2.maxNodes - osImage: htc2.osImage - useSpot: htc2.?useSpot ?? false - // we aren't using availability zones for now, ignore this for htc2 TODO - }, contains(htc,'availabilityZone') ? { availabilityZone: htc.?availabilityZone } : {}) + d64d: union({ + sku: d64d.sku + maxNodes: d64d.maxNodes + osImage: d64d.osImage + useSpot: d64d.?useSpot ?? false + }, contains(d64d,'availabilityZone') ? { availabilityZone: d64d.?availabilityZone } : {}) + d16d: union({ + sku: d16d.sku + maxNodes: d16d.maxNodes + osImage: d16d.osImage + useSpot: d16d.?useSpot ?? false + }, contains(d16d,'availabilityZone') ? 
{ availabilityZone: d16d.?availabilityZone } : {}) + m64: union({ + sku: m64.sku + maxNodes: m64.maxNodes + osImage: m64.osImage + useSpot: m64.?useSpot ?? false + }, contains(m64,'availabilityZone') ? { availabilityZone: m64.?availabilityZone } : {}) hpc: hpc gpu: gpu } diff --git a/bicep/files-to-load/create_cc_param.py b/bicep/files-to-load/create_cc_param.py index 877cc865..f07f8957 100644 --- a/bicep/files-to-load/create_cc_param.py +++ b/bicep/files-to-load/create_cc_param.py @@ -30,22 +30,13 @@ def set_slurm_params(params, dbPassword, outputs): params['SubnetId'] = '/'.join([outputs['vnet']['value']['rg'], outputs['vnet']['value']['name'], outputs['vnet']['value']['computeSubnetName']]) # Define Availability Zone - params['DefineNodesAvailabilityZone'] = any('availabilityZone' in zoneList for zoneList in [outputs['partitions']['value']['htc'], outputs['partitions']['value']['hpc'], outputs['partitions']['value']['gpu']]) - - #HTC - params['HTCMachineType'] = outputs['partitions']['value']['htc']['sku'] - params['MaxHTCExecuteNodeCount'] = int(outputs['partitions']['value']['htc']['maxNodes']) - params['HTCImageName'] = outputs['partitions']['value']['htc']['osImage'] - params['HTCUseLowPrio'] = outputs['partitions']['value']['htc']['useSpot'] - params['HTCAvailabilityZone'] = outputs['partitions']['value']['htc']['availabilityZone'] if params['DefineNodesAvailabilityZone'] and 'availabilityZone' in outputs['partitions']['value']['htc'] else None - - #HTC2 - params['HTC2MachineType'] = outputs['partitions']['value']['htc2']['sku'] - params['MaxHTC2ExecuteNodeCount'] = int(outputs['partitions']['value']['htc2']['maxNodes']) - params['HTC2ImageName'] = outputs['partitions']['value']['htc2']['osImage'] - params['HTC2UseLowPrio'] = outputs['partitions']['value']['htc2']['useSpot'] - params['HTC2AvailabilityZone'] = outputs['partitions']['value']['htc2']['availabilityZone'] if params['DefineNodesAvailabilityZone'] and 'availabilityZone' in 
outputs['partitions']['value']['htc'] else None + params['DefineNodesAvailabilityZone'] = any('availabilityZone' in zoneList for zoneList in [outputs['partitions']['value']['hpc'], outputs['partitions']['value']['gpu']]) + for na in ['D64D', 'D16D', 'M64']: + params[f'{na}MachineType'] = outputs['partitions']['value'][na.lower()]['sku'] + params[f'Max{na}NodeCount'] = int(outputs['partitions']['value'][na.lower()]['maxNodes']) + params[f'{na}ImageName'] = outputs['partitions']['value'][na.lower()]['osImage'] + #HPC params['HPCMachineType'] = outputs['partitions']['value']['hpc']['sku'] params['MaxHPCExecuteNodeCount'] = int(outputs['partitions']['value']['hpc']['maxNodes']) @@ -212,8 +203,9 @@ def main(): "login": "LoginClusterInitSpecs", "gpu": "GPUClusterInitSpecs", "hpc": "HPCClusterInitSpecs", - "htc": "HTCClusterInitSpecs", - "htc2": "HTC2ClusterInitSpecs", + "d64d": "D64DClusterInitSpecs", + "d16d": "D16DClusterInitSpecs", + "m64": "M64ClusterInitSpecs", "scheduler": "SchedulerClusterInitSpecs", "dynamic": "DynamicClusterInitSpecs", "ood": "ClusterInitSpecs" diff --git a/bicep/files-to-load/slurm.txt b/bicep/files-to-load/slurm.txt index 79aff3fa..138d20ae 100644 --- a/bicep/files-to-load/slurm.txt +++ b/bicep/files-to-load/slurm.txt @@ -229,35 +229,45 @@ Autoscale = $Autoscale slurm.hpc = true slurm.partition = hpc - [[nodearray htc]] + [[nodearray d64d]] Extends = nodearraybase - MachineType = $HTCMachineType - ImageName = $HTCImageName - MaxCount = $MaxHTCExecuteNodeCount + MachineType = $D64DMachineType + ImageName = $D64DImageName + MaxCount = $MaxD64DNodeCount - Interruptible = $HTCUseLowPrio - MaxPrice = $HTCSpotMaxPrice - AdditionalClusterInitSpecs = $HTCClusterInitSpecs + AdditionalClusterInitSpecs = $D64DClusterInitSpecs [[[configuration]]] slurm.hpc = false - slurm.partition = htc + slurm.partition = d64d # set pcpu = false for all hyperthreaded VMs slurm.use_pcpu = false - [[nodearray htc2]] + [[nodearray d16d]] Extends = nodearraybase - 
MachineType = $HTC2MachineType - ImageName = $HTC2ImageName - MaxCount = $MaxHTC2ExecuteNodeCount + MachineType = $D16DMachineType + ImageName = $D16DImageName + MaxCount = $MaxD16DNodeCount - Interruptible = $HTC2UseLowPrio - MaxPrice = $HTC2SpotMaxPrice - AdditionalClusterInitSpecs = $HTC2ClusterInitSpecs + AdditionalClusterInitSpecs = $D16DClusterInitSpecs [[[configuration]]] slurm.hpc = false - slurm.partition = htc2 + slurm.partition = d16d + # set pcpu = false for all hyperthreaded VMs + slurm.use_pcpu = false + + [[nodearray m64]] + Extends = nodearraybase + MachineType = $M64MachineType + ImageName = $M64ImageName + MaxCount = $MaxM64NodeCount + + AdditionalClusterInitSpecs = $M64ClusterInitSpecs + + [[[configuration]]] + slurm.hpc = false + slurm.partition = m64 # set pcpu = false for all hyperthreaded VMs slurm.use_pcpu = false @@ -265,8 +275,12 @@ Autoscale = $Autoscale CloudInit="""#!/bin/bash sed -i 's/^SHELL.*/SHELL=\/bin\/bash/g' /etc/default/useradd """ + + # GB200: Peregrine disallows SinglePlacementGroup, but pkey requires SingleScaleset Azure.Overprovision = false Azure.MaxScaleSetSize = 1000 + Azure.SingleScaleset = true + Azure.SinglePlacementGroup = false Extends = nodearraybase MachineType = $GPUMachineType @@ -347,17 +361,23 @@ Order = 10 ParameterType = Cloud.MachineType DefaultValue = Standard_F2s_v2 - [[[parameter HTCMachineType]]] - Label = HTC VM Type - Description = The VM type for HTC execute nodes + [[[parameter D64DMachineType]]] + Label = D64D VM Type + Description = The VM type for D64D execute nodes ParameterType = Cloud.MachineType - DefaultValue = Standard_F2s_v2 + DefaultValue = Standard_D64ds_v5 - [[[parameter HTC2MachineType]]] - Label = HTC2 VM Type - Description = The VM type for HTC2 execute nodes + [[[parameter D16DMachineType]]] + Label = D16D VM Type + Description = The VM type for D16D execute nodes ParameterType = Cloud.MachineType - DefaultValue = Standard_F2s_v2 + DefaultValue = Standard_D16ds_v5 + + [[[parameter 
M64MachineType]]] + Label = M64 VM Type + Description = The VM type for M64 execute nodes + ParameterType = Cloud.MachineType + DefaultValue = Standard_M64s_v2 [[[parameter GPUMachineType]]] Label = GPU VM Type @@ -382,17 +402,25 @@ Order = 10 Widget.Plugin = pico.form.BooleanCheckBox Widget.Label = Start and stop execute instances automatically - [[[parameter MaxHTCExecuteNodeCount]]] - Label = Max HTC Nodes - Description = The total number of HTC execute nodes to start + [[[parameter MaxD64DNodeCount]]] + Label = Max D64D Nodes + Description = The total number of D64D execute nodes to start DefaultValue = 100 Config.Plugin = pico.form.NumberTextBox Config.MinValue = 0 Config.IntegerOnly = true - [[[parameter Max2HTCExecuteNodeCount]]] - Label = Max HTC2 Nodes - Description = The total number of HTC2 execute nodes to start + [[[parameter MaxD16DNodeCount]]] + Label = Max D16D Nodes + Description = The total number of D16D execute nodes to start + DefaultValue = 100 + Config.Plugin = pico.form.NumberTextBox + Config.MinValue = 0 + Config.IntegerOnly = true + + [[[parameter MaxM64NodeCount]]] + Label = Max M64 Nodes + Description = The total number of M64 execute nodes to start DefaultValue = 100 Config.Plugin = pico.form.NumberTextBox Config.MinValue = 0 @@ -430,36 +458,6 @@ Order = 10 Config.MinValue = 1 Config.IntegerOnly = true - - [[[parameter HTCUseLowPrio]]] - Label = HTC Spot - DefaultValue = false - Widget.Plugin = pico.form.BooleanCheckBox - Widget.Label = Use Spot VMs for HTC execute hosts - - [[[parameter HTC2UseLowPrio]]] - Label = HTC2 Spot - DefaultValue = false - Widget.Plugin = pico.form.BooleanCheckBox - Widget.Label = Use Spot VMs for HTC execute hosts - - [[[parameter HTCSpotMaxPrice]]] - Label = Max Price HTC - DefaultValue = -1 - Description = Max price for Spot VMs in USD (value of -1 will not evict based on price) - Config.Plugin = pico.form.NumberTextBox - Conditions.Excluded := HTCUseLowPrio isnt true - Config.MinValue = -1 - - [[[parameter 
HTC2SpotMaxPrice]]] - Label = Max Price HTC2 - DefaultValue = -1 - Description = Max price for Spot VMs in USD (value of -1 will not evict based on price) - Config.Plugin = pico.form.NumberTextBox - Conditions.Excluded := HTC2UseLowPrio isnt true - Config.MinValue = -1 - - [[[parameter GPUUseLowPrio]]] Label = GPU Spot DefaultValue = false @@ -845,6 +843,7 @@ Order = 20 Label = Slurm Configuration Description = Any additional lines to add to slurm.conf ParameterType = Text + DefaultValue = "SuspendExcParts=gpu" [[[parameter configuration_slurm_launch_parameters]]] Label = Launch Parameters @@ -905,20 +904,26 @@ Order = 20 DefaultValue = almalinux8 Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8"} - [[[parameter HTCImageName]]] - Label = HTC OS + [[[parameter D64DImageName]]] + Label = D64D OS ParameterType = Cloud.Image Config.OS = linux DefaultValue = almalinux8 Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8"} - [[[parameter HTC2ImageName]]] - Label = HTC2 OS + [[[parameter D16DImageName]]] + Label = D16D OS ParameterType = Cloud.Image Config.OS = linux DefaultValue = almalinux8 Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8"} + [[[parameter M64ImageName]]] + Label = M64 OS + ParameterType = Cloud.Image + Config.OS = linux + DefaultValue = almalinux8 + Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8"} [[[parameter GPUImageName]]] Label = GPU OS @@ -946,16 +951,22 @@ Order = 20 Description = Cluster init specs to apply to Login nodes ParameterType = Cloud.ClusterInitSpecs - [[[parameter HTCClusterInitSpecs]]] - Label = HTC Cluster-Init + [[[parameter D64DClusterInitSpecs]]] + Label = D64D Cluster-Init + DefaultValue = =undefined + Description = Cluster init specs to apply to D64D execute nodes + 
ParameterType = Cloud.ClusterInitSpecs + + [[[parameter D16DClusterInitSpecs]]] + Label = D16D Cluster-Init DefaultValue = =undefined - Description = Cluster init specs to apply to HTC execute nodes + Description = Cluster init specs to apply to D16D execute nodes ParameterType = Cloud.ClusterInitSpecs - [[[parameter HTC2ClusterInitSpecs]]] - Label = HTC2 Cluster-Init + [[[parameter M64ClusterInitSpecs]]] + Label = M64 Cluster-Init DefaultValue = =undefined - Description = Cluster init specs to apply to HTC2 execute nodes + Description = Cluster init specs to apply to M64 execute nodes ParameterType = Cloud.ClusterInitSpecs [[[parameter HPCClusterInitSpecs]]] diff --git a/bicep/hub/deploy_spoke.sh b/bicep/hub/deploy_spoke.sh index 401d90ef..e149e567 100755 --- a/bicep/hub/deploy_spoke.sh +++ b/bicep/hub/deploy_spoke.sh @@ -136,7 +136,7 @@ MOUNT_OPTIONS=$(jq -r '.mountOptions.value' outputs/hub-anf-outputs.json) replace_fields ".sharedFilesystem={ value: { type: \"nfs-existing\", ipAddress: \"$IP_ADDRESS\", exportPath: \"$EXPORT_PATH\", mountOptions: \"$MOUNT_OPTIONS\" } }" # new vnet -ADDRESS_SPACE="10.${SPOKE_NUMBER}.0.0/24" +ADDRESS_SPACE="10.${SPOKE_NUMBER}.0.0/20" replace_fields ".network.value.addressSpace=\"$ADDRESS_SPACE\"" # vnet to peer @@ -173,10 +173,9 @@ az deployment sub create \ --parameters "$(pwd)/spoke_params.json" \ --parameters location="$LOCATION" \ --parameters resourceGroup="${SPOKE_RG_NAME}" \ - --parameters ccVMName="ccw-${SPOKE_NUMBER}-cyclecloud-vm" \ - --parameters clusterName="ccw-${SPOKE_NUMBER}" \ + --parameters ccVMName="ccw${SPOKE_NUMBER}-cyclecloud-vm" \ + --parameters clusterName="ccw${SPOKE_NUMBER}" \ --parameters monitoringIngestionEndpoint="${MONITORING_INGESTION_ENDPOINT}" \ --name "spoke-ccw-0${SPOKE_DEPLOYMENT_NAME}" \ $WHATIF_FLAG - \ No newline at end of file diff --git a/bicep/hub/params/template/base_spoke_params.json b/bicep/hub/params/template/base_spoke_params.json index 3ad70ff4..a50f1f10 100644 --- 
a/bicep/hub/params/template/base_spoke_params.json +++ b/bicep/hub/params/template/base_spoke_params.json @@ -12,7 +12,7 @@ "value": "ENTER_YOUR_PUBLIC_SSH_KEY_HERE" }, "ccVMSize": { - "value": "Standard_D4as_v5" + "value": "Standard_D4ds_v5" }, "sharedFilesystem": { "value": { @@ -30,7 +30,7 @@ "network": { "value": { "type": "new", - "addressSpace": "10.1.0.0/24", + "addressSpace": "AUTOMATED", "bastion": false, "createNatGateway": true, "vnetToPeer": { @@ -69,59 +69,63 @@ }, "schedulerNode": { "value": { - "sku": "Standard_D4as_v5", + "sku": "Standard_D4ds_v5", "osImage": "cycle.image.ubuntu22" } }, "loginNodes": { "value": { "sku": "Standard_F4s_v2", - "osImage": "cycle.image.ubuntu24", + "osImage": "cycle.image.ubuntu22", "initialNodes": 1, "maxNodes": 1 } }, - "htc": { + "d64d": { "value": { - "sku": "Standard_F2s_v2", - "maxNodes": 100, - "osImage": "cycle.image.ubuntu24", + "sku": "Standard_D64ds_v5", + "maxNodes": 32, + "osImage": "cycle.image.ubuntu22", "useSpot": false } }, - "htc2": { + "d16d": { "value": { - "sku": "Standard_F2s_v2", - "maxNodes": 100, - "osImage": "cycle.image.ubuntu24", + "sku": "Standard_D16ds_v5", + "maxNodes": 192, + "osImage": "cycle.image.ubuntu22", + "useSpot": false + } + }, + "m64": { + "value": { + "sku": "Standard_M64s_v2", + "maxNodes": 10, + "osImage": "cycle.image.ubuntu22", "useSpot": false } }, "hpc": { "value": { - "sku": "Standard_D2plds_v6", - "maxNodes": 16, - "osImage": "cycle.image.ubuntu24" + "sku": "Standard_F2s_v2", + "maxNodes": 200, + "osImage": "cycle.image.ubuntu22" } }, "gpu": { "value": { "sku": "Standard_NC24ads_A100_v4", - "maxNodes": 8, - "osImage": "cycle.image.ubuntu24" + "maxNodes": 500, + "osImage": "/subscriptions/e2275dba-53d4-41b0-8945-c89aa11d72a5/resourceGroups/azhpcai-images-rg/providers/Microsoft.Compute/galleries/AzHpcAiVmImageReleaseCandidates/images/Ubuntu-24.04-gen2-ARM64/versions/2025.0423.0" } }, "ood": { "value": { - "type": "enabled", - "startCluster": false, - "sku": 
"Standard_D4as_v5", - "osImage": "cycle.image.ubuntu24", - "userDomain": "microsoft.com", - "registerEntraIDApp": true + "type": "disabled" } }, "tags": { "value": {} } } + diff --git a/bicep/mainTemplate.bicep b/bicep/mainTemplate.bicep index 61e46cae..2a98dfc5 100644 --- a/bicep/mainTemplate.bicep +++ b/bicep/mainTemplate.bicep @@ -20,8 +20,9 @@ param clusterInitSpecs types.cluster_init_param_t = [] param slurmSettings types.slurmSettings_t = { startCluster: true, version: '23.11.7-1', healthCheckEnabled: false } param schedulerNode types.scheduler_t param loginNodes types.login_t -param htc types.htc_t -param htc2 types.htc_t +param d64d types.htc_t +param d16d types.htc_t +param m64 types.htc_t param hpc types.hpc_t param gpu types.hpc_t param tags types.resource_tags_t @@ -77,8 +78,9 @@ module makeCCWresources 'ccw.bicep' = { slurmSettings: slurmSettings schedulerNode: schedulerNode loginNodes: loginNodes - htc: htc - htc2: htc2 + d64d: d64d + d16d: d16d + m64: m64 hpc: hpc gpu: gpu storedKey: storedKey diff --git a/bicep/types.bicep b/bicep/types.bicep index 3971bd67..0df8f863 100644 --- a/bicep/types.bicep +++ b/bicep/types.bicep @@ -225,8 +225,9 @@ type hpc_t = { @export() type partitions_t = { - htc: htc_output_t - htc2: htc_output_t + d16d: htc_output_t + d64d: htc_output_t + m64: htc_output_t hpc: hpc_t //if any property becomes optional, create a *_output_t type gpu: hpc_t //if any property becomes optional, create a *_output_t type } @@ -293,7 +294,7 @@ type ood_enabled_t = { @export() @discriminator('type') type oodConfig_t = ood_none_t | ood_enabled_t -type cluster_init_target_t = 'login' | 'scheduler' | 'htc' | 'htc2' | 'hpc' | 'gpu' | 'dynamic' | 'ood' +type cluster_init_target_t = 'login' | 'scheduler' | 'd64d' | 'd16d' | 'm64' | 'hpc' | 'gpu' | 'dynamic' | 'ood' type github_cluster_init_t = { From c9e351adf803bc588bc9025a2da799cc35396f5e Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Mon, 2 Jun 2025 16:24:01 -0400 Subject: [PATCH 28/50] README: 
Add steps for installing cyclecloud8 via /opt/ccw/install.sh --- bicep/hub/README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/bicep/hub/README.md b/bicep/hub/README.md index ffe05a52..0c196b6c 100644 --- a/bicep/hub/README.md +++ b/bicep/hub/README.md @@ -21,4 +21,12 @@ 5. Create a spoke: i.e. a CycleCloud + Slurm cluster deployment: * `bicep/hub/params/base_spoke_params.json` Update `adminPassword` - CycleCloud hpcadmin password * `bicep/hub/params/base_spoke_params.json` Update `adminSshPublicKey` - hpcadmin public ssh key - * `deploy_spoke.sh --hub-resource-group HUB_RG_NAME --spoke-number 1` \ No newline at end of file + * `deploy_spoke.sh --hub-resource-group HUB_RG_NAME --spoke-number 1` +6. Once the spoke finishes, perform the following to install the latest version of CycleCloud8. **Assuming the CC vm is at 10.1.0.4** + ```bash + scp cyclecloud8.rpm hpcadmin@10.1.0.4:~/ + ssh hpcadmin@10.1.0.4 + sudo -i + cd /opt/ccw + bash install.sh --local-package ~hpcadmin/cyclecloud8.rpm + ``` \ No newline at end of file From 6285122ea924691daaa8cab64fc0e8b1ff67fcc8 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Mon, 2 Jun 2025 17:48:09 -0400 Subject: [PATCH 29/50] Update hub readme with private endpoint instructions and create a default subnet in hub vnet --- bicep/hub/README.md | 23 +++++++++++++++++++++-- bicep/hub/hub-vnet.bicep | 7 +++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/bicep/hub/README.md b/bicep/hub/README.md index 0c196b6c..d1fdc5af 100644 --- a/bicep/hub/README.md +++ b/bicep/hub/README.md @@ -3,7 +3,7 @@ * az login * Make sure the correct subscription is selected. * az login --scope https://graph.microsoft.com//.default -0. Create the cyclecloud-slurm-workspace directory for deploying the hub and spoke +1. 
Create the cyclecloud-slurm-workspace directory for deploying the hub and spoke ```bash git clone -b abatallas/gb200_hub_spoke https://github.com/Azure/cyclecloud-slurm-workspace.git cd cyclecloud-slurm-workspace/bicep/hub @@ -29,4 +29,23 @@ sudo -i cd /opt/ccw bash install.sh --local-package ~hpcadmin/cyclecloud8.rpm - ``` \ No newline at end of file + ``` + +## How to create a private endpoint for storage account resources +1. Create a new private endpoint resource in the hub resource group via the Azure Portal. Complete this once for each storage account. Set the following under the named menu tab: + * Resource + * Connection Method: Connect to an Azure resource in my directory + * Resource type: `Microsoft.Storage/storageAccounts` + * Resource: *Name of storage account* + * Target sub-resource: `blob` + * Virtual Network: + * Virtual network: *Name of hub virtual network* + * Subnet: `default` + * DNS: + * Integrate with private DNS zone: `Yes` + * Subscription: *Subscription in which the hub is deployed* + * Resource group: *Resource group in which the hub is deployed* +2. Navigate to the private DNS zone resource named `privatelink.blob.core.windows.net` in the hub resource group. + * Expand the **DNS Management** sub-menu in the left-hand side menu and select *Virtual Network Links* + * Click **Add** + * Choose an arbitrary name for the link, select the hub vnet in the relevant dropdown menu, and click *Create*. No modifications to the *Configuration* section are required. 
\ No newline at end of file diff --git a/bicep/hub/hub-vnet.bicep b/bicep/hub/hub-vnet.bicep index 6edb9b5d..13114988 100644 --- a/bicep/hub/hub-vnet.bicep +++ b/bicep/hub/hub-vnet.bicep @@ -13,6 +13,13 @@ var vnet = { name: 'hub-vnet-${resourceGroup().name}' cidr: address subnets: { + default: { + name: 'default' + cidr: '10.0.0.0/29' + nat_gateway : false + service_endpoints: [] + delegations: [] + } netapp: { name: 'netapp' cidr: subnet_cidr.netapp From 3bbf825067116956c391db5d8696803d3470775a Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Tue, 3 Jun 2025 08:19:21 -0400 Subject: [PATCH 30/50] Larger scheduler, ANF, and use existing dns for storage --- bicep/hub/README.md | 6 ++++-- bicep/hub/params/template/anf_params.json | 2 +- bicep/hub/params/template/base_spoke_params.json | 6 ++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/bicep/hub/README.md b/bicep/hub/README.md index d1fdc5af..36c71bf1 100644 --- a/bicep/hub/README.md +++ b/bicep/hub/README.md @@ -18,11 +18,13 @@ * Pick a resource group name and location, then run the following: **Note** we will create the resource group if it does not exist. * `create_hub.sh --resource-group HUB_RG_NAME --location HUB_LOCATION` 4. Add a VPN Gateway to the hub resource group using the Azure Portal. -5. Create a spoke: i.e. a CycleCloud + Slurm cluster deployment: +5. Follow steps below for "How to create a private endpoint for storage account resources" +6. Create a spoke: i.e. a CycleCloud + Slurm cluster deployment: * `bicep/hub/params/base_spoke_params.json` Update `adminPassword` - CycleCloud hpcadmin password * `bicep/hub/params/base_spoke_params.json` Update `adminSshPublicKey` - hpcadmin public ssh key + * `bicep/hub/params/base_spoke_params.json` Update storagePrivateDnsZone.id with the private link id we created in step 5. * `deploy_spoke.sh --hub-resource-group HUB_RG_NAME --spoke-number 1` -6. Once the spoke finishes, perform the following to install the latest version of CycleCloud8. 
**Assuming the CC vm is at 10.1.0.4** +7. Once the spoke finishes, perform the following to install the latest version of CycleCloud8. **Assuming the CC vm is at 10.1.0.4** ```bash scp cyclecloud8.rpm hpcadmin@10.1.0.4:~/ ssh hpcadmin@10.1.0.4 diff --git a/bicep/hub/params/template/anf_params.json b/bicep/hub/params/template/anf_params.json index d2c2d805..055fb712 100644 --- a/bicep/hub/params/template/anf_params.json +++ b/bicep/hub/params/template/anf_params.json @@ -3,7 +3,7 @@ "value": "Premium" }, "sizeTiB": { - "value": 4 + "value": 25 }, "tags": { "value": {} diff --git a/bicep/hub/params/template/base_spoke_params.json b/bicep/hub/params/template/base_spoke_params.json index a50f1f10..8a196254 100644 --- a/bicep/hub/params/template/base_spoke_params.json +++ b/bicep/hub/params/template/base_spoke_params.json @@ -44,7 +44,9 @@ }, "storagePrivateDnsZone": { "value": { - "type": "new" + "type": "existing", + "id": "ENTER_PRIVATE_DNS_ZONE_ID_HERE", + "vnetLink": false } }, "databaseAdminPassword": { @@ -69,7 +71,7 @@ }, "schedulerNode": { "value": { - "sku": "Standard_D4ds_v5", + "sku": "Standard_D16ds_v5", "osImage": "cycle.image.ubuntu22" } }, From cd1bf2e0d7bfa664a0249ce19ecd74c59d226e6b Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Tue, 3 Jun 2025 10:27:47 -0400 Subject: [PATCH 31/50] Blob: add build 3408 --- bicep/hub/blobs/cyclecloud8-8.8.0-3408.x86_64.rpm | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 bicep/hub/blobs/cyclecloud8-8.8.0-3408.x86_64.rpm diff --git a/bicep/hub/blobs/cyclecloud8-8.8.0-3408.x86_64.rpm b/bicep/hub/blobs/cyclecloud8-8.8.0-3408.x86_64.rpm new file mode 100644 index 00000000..652bdca2 --- /dev/null +++ b/bicep/hub/blobs/cyclecloud8-8.8.0-3408.x86_64.rpm @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c0819a4c5551897d521588535786f30483cbd366a474d8439062d9d380b02b0 +size 455425758 From 1160b6a3290b7a86c350e2ff26bf4a1fd7d81e53 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Tue, 3 Jun 2025 
10:50:03 -0400 Subject: [PATCH 32/50] Use two gpu partitions --- bicep/files-to-load/slurm.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/bicep/files-to-load/slurm.txt b/bicep/files-to-load/slurm.txt index 138d20ae..561bd9e3 100644 --- a/bicep/files-to-load/slurm.txt +++ b/bicep/files-to-load/slurm.txt @@ -301,6 +301,17 @@ sed -i 's/^SHELL.*/SHELL=\/bin\/bash/g' /etc/default/useradd #IMEX Support is enabled by default for GB200 but can be disabled by setting param to False #slurm.imex.enabled=True + [[nodearray gpu2]] + extends = gpu + MaxCount = $MaxGPU2ExecuteNodeCount + [[[configuration]]] + slurm.default_partition = false + slurm.hpc = false + slurm.partition = gpu2 + #Parameter to enable or disable IMEX service on a per-job basis + #IMEX Support is enabled by default for GB200 but can be disabled by setting param to False + #slurm.imex.enabled=True + [[nodearray dynamic]] Extends = nodearraybase MachineType = $DynamicMachineType @@ -442,6 +453,14 @@ Order = 10 Config.MinValue = 0 Config.IntegerOnly = true + [[[parameter MaxGPU2ExecuteNodeCount]]] + Label = Max GPU2 Nodes + Description = The total number of GPU2 execute nodes to start + DefaultValue = 120 + Config.Plugin = pico.form.NumberTextBox + Config.MinValue = 0 + Config.IntegerOnly = true + [[[parameter MaxDynamicExecuteCoreCount]]] Label = Max Dyn Cores Description = The total number of Dynamic execute cores to start From 94766e9660f84c9539c51ed51a054f7a5e4f0659 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Tue, 3 Jun 2025 12:02:25 -0400 Subject: [PATCH 33/50] Enable SPG on hpc/hpc2 --- bicep/files-to-load/slurm.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bicep/files-to-load/slurm.txt b/bicep/files-to-load/slurm.txt index 561bd9e3..c2651573 100644 --- a/bicep/files-to-load/slurm.txt +++ b/bicep/files-to-load/slurm.txt @@ -280,7 +280,7 @@ sed -i 's/^SHELL.*/SHELL=\/bin\/bash/g' /etc/default/useradd Azure.Overprovision = false 
Azure.MaxScaleSetSize = 1000 Azure.SingleScaleset = true - Azure.SinglePlacementGroup = false + Azure.SinglePlacementGroup = true Extends = nodearraybase MachineType = $GPUMachineType @@ -295,7 +295,7 @@ sed -i 's/^SHELL.*/SHELL=\/bin\/bash/g' /etc/default/useradd [[[configuration]]] slurm.default_partition = true - slurm.hpc = false + slurm.hpc = true slurm.partition = gpu #Parameter to enable or disable IMEX service on a per-job basis #IMEX Support is enabled by default for GB200 but can be disabled by setting param to False @@ -306,7 +306,7 @@ sed -i 's/^SHELL.*/SHELL=\/bin\/bash/g' /etc/default/useradd MaxCount = $MaxGPU2ExecuteNodeCount [[[configuration]]] slurm.default_partition = false - slurm.hpc = false + slurm.hpc = true slurm.partition = gpu2 #Parameter to enable or disable IMEX service on a per-job basis #IMEX Support is enabled by default for GB200 but can be disabled by setting param to False From 1087e6a399c111d76fbd2bc1fe37428e10a22572 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Tue, 3 Jun 2025 12:51:35 -0400 Subject: [PATCH 34/50] Fix error in ccwBastion module scope that blocks deployment --- bicep/ccw.bicep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bicep/ccw.bicep b/bicep/ccw.bicep index ec737a53..de0bcad2 100644 --- a/bicep/ccw.bicep +++ b/bicep/ccw.bicep @@ -108,7 +108,7 @@ output vnet types.networkOutput_t = union( var deploy_bastion = network.?bastion ?? false module ccwBastion './bastion.bicep' = if (deploy_bastion) { name: 'ccwBastion' - scope: create_new_vnet ? 
az.resourceGroup() : az.resourceGroup(split(network.?existing_vnet_id, '/')[4]) + scope: az.resourceGroup() params: { location: location tags: getTags('Microsoft.Network/bastionHosts', tags) From a27aedbc187772d3fa3b1252a6ac2bafe91ef0c5 Mon Sep 17 00:00:00 2001 From: abatallas <167922471+abatallas@users.noreply.github.com> Date: Tue, 3 Jun 2025 16:44:36 -0400 Subject: [PATCH 35/50] Delete NVMe-related scripts for gb200 clusters (#279) * Update version number from 2025.04.24 to 2025.06.03 * Fix error in ccwBastion module scope that blocks deployment * Remove NVMe-related scripts --- bicep/mainTemplate.bicep | 4 +- build.sh | 2 +- project.ini | 2 +- .../files/nvme_persistent_mount.sh | 178 ------------------ specs/default/cluster-init/scripts/08-nvme.sh | 8 - uidefinitions/createUiDefinition.json | 2 +- 6 files changed, 5 insertions(+), 191 deletions(-) delete mode 100644 specs/default/cluster-init/files/nvme_persistent_mount.sh delete mode 100755 specs/default/cluster-init/scripts/08-nvme.sh diff --git a/bicep/mainTemplate.bicep b/bicep/mainTemplate.bicep index 2a98dfc5..bb08ae01 100644 --- a/bicep/mainTemplate.bicep +++ b/bicep/mainTemplate.bicep @@ -43,8 +43,8 @@ param insidersBuild bool = false // build.sh will override this, but for development please set this yourself as a parameter param branch string = 'main' // This needs to be updated on each release. 
Our Cloud.Project records require a release tag -param projectVersion string = '2025.04.24' -param monitoringProjectVersion string = '1.0.0' +param projectVersion string = '2025.06.03' +param pyxisProjectVersion string = '1.0.0' //Internal developer use only: set true use custom CycleCloud release build param manualInstall bool = false diff --git a/build.sh b/build.sh index 40f5e567..bb7ed6f3 100755 --- a/build.sh +++ b/build.sh @@ -2,7 +2,7 @@ set -e # This script builds the ARM template and UI definition for the marketplace solution cd $(dirname $0)/ -VERSION="2025.04.24" +VERSION="2025.06.03" THIS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" diff --git a/project.ini b/project.ini index 3dc8d1ed..14a8384b 100644 --- a/project.ini +++ b/project.ini @@ -1,5 +1,5 @@ [project] name = ccw label = Workspace for Slurm -version = 2025.04.24 +version = 2025.06.03 type = scheduler diff --git a/specs/default/cluster-init/files/nvme_persistent_mount.sh b/specs/default/cluster-init/files/nvme_persistent_mount.sh deleted file mode 100644 index 4b068ad5..00000000 --- a/specs/default/cluster-init/files/nvme_persistent_mount.sh +++ /dev/null @@ -1,178 +0,0 @@ -#!/bin/bash - -# Script requirements: -# nvme-cli -# mdadm -# gdisk -set -x -readonly USAGE="Usage: $(basename "$0") " - -# Label used to identify the NVMe array file system and associated disks -# Can't exceed 16 characters -readonly RAID0_FILESYSTEM_LABEL="azure_temp" -# Device path used for the RAID 0 NVMe array -# Choose any unoccupied device path of format /dev/mdX (X = 0 to 99) -readonly RAID0_DEVICE_PATH="/dev/md0" -# Formatted RAID 0 partition is mounted here -readonly DEFAULT_MOUNT_POINT="/mnt/${RAID0_FILESYSTEM_LABEL}" - -filesystem="$1" -if [ ! "$filesystem" ]; then - printf "No filesystem specified. Usage: $USAGE\n" - exit 1 -fi -if ! [ -x "$(command -v mkfs.$filesystem)" ]; then - printf "Filesystem \"$filesystem\" not supported by mkfs\n$USAGE\n" - exit 1 -fi - -mount_point="$2" -if [ ! 
"$mount_point" ]; then - printf "No mount point specified. Using default: $DEFAULT_MOUNT_POINT\n" - mount_point=$DEFAULT_MOUNT_POINT -fi - -# Make sure mdadm.conf is present -mdadm_conf_path="" -if [ -e "/etc/mdadm/mdadm.conf" ]; then - mdadm_conf_path="/etc/mdadm/mdadm.conf" -elif [ -e "/etc/mdadm.conf" ]; then - mdadm_conf_path="/etc/mdadm.conf" -else - print "Couldn't find mdadm.conf file" - exit 1 -fi - -# Enumerate unmounted NVMe direct disks -devices=$(lsblk -p -o NAME,TYPE,MOUNTPOINT | grep "nvme" | awk '$2 == "disk" && $3 == "" {print $1}') -nvme_direct_disks=() -for device in $devices -do - if nvme id-ctrl "$device" | grep -q "Microsoft NVMe Direct Disk"; then - nvme_direct_disks+=("$device") - fi -done -nvme_direct_disk_count=${#nvme_direct_disks[@]} -printf "Found $nvme_direct_disk_count NVMe Direct Disks\n" - -# CCW MODIFICATION: Early exit if there are no nvme devices -# TODO this is added since we run this on every node. Ideally we would only -# run this on compute nodes that actually have nvme -if [ "$nvme_direct_disk_count" -eq 0 ]; then - printf "No NVMe Direct Disks found\n" - exit 0 -fi - -# Check if there's already an NVMe Direct Disk RAID 0 disk (or remnant data) -if grep "$RAID0_FILESYSTEM_LABEL" /etc/fstab > /dev/null; then - fstab_entry_present=true -fi -if grep "$RAID0_FILESYSTEM_LABEL" $mdadm_conf_path > /dev/null; then - mdadm_conf_entry_present=true -fi -if [ -e $RAID0_DEVICE_PATH ]; then - nvme_raid0_present=true -fi -if [ "$fstab_entry_present" = true ] || [ "$mdadm_conf_entry_present" = true ] || [ "$nvme_raid0_present" = true ]; then - # Check if the RAID 0 volume and associated configurations are still intact or need to be reinitialized - # - # If reinitialization is needed, clear the old RAID 0 information and associated files - - reinit_raid0=false - if [ "$fstab_entry_present" = true ] && [ "$mdadm_conf_entry_present" = true ] && [ "$nvme_raid0_present" = true ]; then - # Check RAID 0 device status - if ! 
mdadm --detail --test $RAID0_DEVICE_PATH &> /dev/null; then - reinit_raid0=true - # Test the NVMe direct disks for valid mdadm superblocks - else - for device in "${nvme_direct_disks[@]}" - do - if ! mdadm --examine $device &> /dev/null; then - reinit_raid0=true - break - fi - done - fi - else - reinit_raid0=true - fi - - if [ "$reinit_raid0" = true ]; then - echo "Errors found in NVMe RAID 0 temp array device or configuration. Reinitializing." - - # Remove the file system and partition table, and stop the RAID 0 array - if [ "$nvme_raid0_present" = true ]; then - if [ -e ${RAID0_DEVICE_PATH}p1 ]; then - umount ${RAID0_DEVICE_PATH}p1 - wipefs -a -f ${RAID0_DEVICE_PATH}p1 - fi - sgdisk -o $RAID0_DEVICE_PATH &> /dev/null - mdadm --stop $RAID0_DEVICE_PATH - fi - - # Remove any mdadm metadata from all NVMe Direct Disks - for device in "${nvme_direct_disks[@]}" - do - printf "Clearing mdadm superblock from $device\n" - mdadm --zero-superblock $device &> /dev/null - done - - # Remove any associated entries in fstab and mdadm.conf - sed -i.bak "/$RAID0_FILESYSTEM_LABEL/d" /etc/fstab - sed -i.bak "/$RAID0_FILESYSTEM_LABEL/d" $mdadm_conf_path - else - printf "Valid NVMe RAID 0 array present and no additional Direct Disks found. Skipping\n" - exit 0 - fi -fi - -if [ "$nvme_direct_disk_count" -eq 0 ]; then - printf "No NVMe Direct Disks found\n" - exit 1 -elif [ "$nvme_direct_disk_count" -eq 1 ]; then - additional_mdadm_params="--force" -fi - -# Initialize enumerated disks as RAID 0 -printf "Creating RAID 0 array from:\n" -printf "${nvme_direct_disks[*]}\n\n" -if ! 
mdadm --create $RAID0_DEVICE_PATH --verbose $additional_mdadm_params --name=$RAID0_FILESYSTEM_LABEL --level=0 --raid-devices=$nvme_direct_disk_count ${nvme_direct_disks[*]}; then - printf "Failed to create RAID 0 array\n" - exit 1 -fi - -# Create a GPT partition entry -readonly GPT_PARTITION_TYPE_GUID="0FC63DAF-8483-4772-8E79-3D69D8477DE4" -printf "\nCreating GPT on $RAID0_DEVICE_PATH..\n" -sgdisk -o $RAID0_DEVICE_PATH &> /dev/null -if ! sgdisk --new 1::0 --typecode 1:$GPT_PARTITION_TYPE_GUID $RAID0_DEVICE_PATH &> /dev/null; then - printf "Failed to create partition on $RAID0_DEVICE_PATH\n" - exit 1 -fi - -# Format the partition -partition_path="${RAID0_DEVICE_PATH}p1" -printf "\nCreating $filesystem filesystem..\n" -if ! mkfs.$filesystem -q -L $RAID0_FILESYSTEM_LABEL $partition_path; then - printf "Failed to create $filesystem filesystem\n" - exit 1 -fi -printf "The operation has completed successfully.\n" - -# Add the partition to /etc/fstab -echo "LABEL=$RAID0_FILESYSTEM_LABEL $mount_point $filesystem defaults,nofail 0 0" >> /etc/fstab - -# Add RAID 0 array to mdadm.conf -mdadm --detail --scan >> $mdadm_conf_path -update-initramfs -u - -# Mount the partition -printf "\nMounting filesystem to $mount_point..\n" -mkdir $mount_point &> /dev/null -if ! 
mount -a; then - printf "Failed to automount partition\n" - exit 1 -fi -printf "The operation has completed successfully.\n" - -exit 0 \ No newline at end of file diff --git a/specs/default/cluster-init/scripts/08-nvme.sh b/specs/default/cluster-init/scripts/08-nvme.sh deleted file mode 100755 index 8f41d6cc..00000000 --- a/specs/default/cluster-init/scripts/08-nvme.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -source "$script_dir/../files/common.sh" - -if is_compute; then -chmod +x $script_dir/../files/nvme_persistent_mount.sh - $script_dir/../files/nvme_persistent_mount.sh xfs /mnt/nvme -fi diff --git a/uidefinitions/createUiDefinition.json b/uidefinitions/createUiDefinition.json index e99a6127..71c66fae 100644 --- a/uidefinitions/createUiDefinition.json +++ b/uidefinitions/createUiDefinition.json @@ -5,7 +5,7 @@ "parameters": { "config": { "basics": { - "description": "Version **2025.04.24**: [Release Notes](https://learn.microsoft.com/azure/cyclecloud/release-notes/ccws/release-notes)", + "description": "Version **2025.06.03**: [Release Notes](https://learn.microsoft.com/azure/cyclecloud/release-notes/ccws/release-notes)", "resourceGroup": { "visible": false }, From 7f953755571b75ca798a0501974fc17dda264754 Mon Sep 17 00:00:00 2001 From: Ryan Hamel Date: Tue, 3 Jun 2025 17:18:14 -0400 Subject: [PATCH 36/50] Blobs: use build 3433 --- bicep/hub/blobs/cyclecloud8-8.8.0-3408.x86_64.rpm | 3 --- bicep/hub/blobs/cyclecloud8-8.8.0-3433.x86_64.rpm | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 bicep/hub/blobs/cyclecloud8-8.8.0-3408.x86_64.rpm create mode 100644 bicep/hub/blobs/cyclecloud8-8.8.0-3433.x86_64.rpm diff --git a/bicep/hub/blobs/cyclecloud8-8.8.0-3408.x86_64.rpm b/bicep/hub/blobs/cyclecloud8-8.8.0-3408.x86_64.rpm deleted file mode 100644 index 652bdca2..00000000 --- a/bicep/hub/blobs/cyclecloud8-8.8.0-3408.x86_64.rpm +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:9c0819a4c5551897d521588535786f30483cbd366a474d8439062d9d380b02b0 -size 455425758 diff --git a/bicep/hub/blobs/cyclecloud8-8.8.0-3433.x86_64.rpm b/bicep/hub/blobs/cyclecloud8-8.8.0-3433.x86_64.rpm new file mode 100644 index 00000000..bf5c4c5a --- /dev/null +++ b/bicep/hub/blobs/cyclecloud8-8.8.0-3433.x86_64.rpm @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70c34fddf6873be3d61796f0082ec33bfd44020531d0206c05cd1b8c7deca5df +size 455369336 From 481835d5615673c97b934f0e919bef79e17c2ca7 Mon Sep 17 00:00:00 2001 From: Ben Watrous Date: Thu, 19 Jun 2025 12:34:47 -0700 Subject: [PATCH 37/50] Blobs: use build 3438 --- bicep/hub/blobs/.gitattributes | 1 + bicep/hub/blobs/cyclecloud8-8.8.0-3433.x86_64.rpm | 3 --- bicep/hub/blobs/cyclecloud8-8.8.0-3438.x86_64.rpm | 3 +++ 3 files changed, 4 insertions(+), 3 deletions(-) create mode 100644 bicep/hub/blobs/.gitattributes delete mode 100644 bicep/hub/blobs/cyclecloud8-8.8.0-3433.x86_64.rpm create mode 100644 bicep/hub/blobs/cyclecloud8-8.8.0-3438.x86_64.rpm diff --git a/bicep/hub/blobs/.gitattributes b/bicep/hub/blobs/.gitattributes new file mode 100644 index 00000000..adc2bfdf --- /dev/null +++ b/bicep/hub/blobs/.gitattributes @@ -0,0 +1 @@ +*.rpm filter=lfs diff=lfs merge=lfs -text diff --git a/bicep/hub/blobs/cyclecloud8-8.8.0-3433.x86_64.rpm b/bicep/hub/blobs/cyclecloud8-8.8.0-3433.x86_64.rpm deleted file mode 100644 index bf5c4c5a..00000000 --- a/bicep/hub/blobs/cyclecloud8-8.8.0-3433.x86_64.rpm +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70c34fddf6873be3d61796f0082ec33bfd44020531d0206c05cd1b8c7deca5df -size 455369336 diff --git a/bicep/hub/blobs/cyclecloud8-8.8.0-3438.x86_64.rpm b/bicep/hub/blobs/cyclecloud8-8.8.0-3438.x86_64.rpm new file mode 100644 index 00000000..e5a83d5e --- /dev/null +++ b/bicep/hub/blobs/cyclecloud8-8.8.0-3438.x86_64.rpm @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:617912459fa119baf3716ca6e054f1e0170acc30f569dd71d723761aa81c1eab +size 455462875 From 5f1ade7ab15a14d625dabd634775b420916f64e5 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Thu, 10 Jul 2025 14:54:51 -0400 Subject: [PATCH 38/50] Create vnet link to hub private DNS zone and correct README step 5 instructions --- bicep/hub/README.md | 2 +- bicep/hub/params/template/base_spoke_params.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bicep/hub/README.md b/bicep/hub/README.md index 36c71bf1..b0acc6f4 100644 --- a/bicep/hub/README.md +++ b/bicep/hub/README.md @@ -22,7 +22,7 @@ 6. Create a spoke: i.e. a CycleCloud + Slurm cluster deployment: * `bicep/hub/params/base_spoke_params.json` Update `adminPassword` - CycleCloud hpcadmin password * `bicep/hub/params/base_spoke_params.json` Update `adminSshPublicKey` - hpcadmin public ssh key - * `bicep/hub/params/base_spoke_params.json` Update storagePrivateDnsZone.id with the private link id we created in step 5. + * `bicep/hub/params/base_spoke_params.json` Update storagePrivateDnsZone.id with the resource ID of the private DNS zone created in step 5. * `deploy_spoke.sh --hub-resource-group HUB_RG_NAME --spoke-number 1` 7. Once the spoke finishes, perform the following to install the latest version of CycleCloud8. 
**Assuming the CC vm is at 10.1.0.4** ```bash diff --git a/bicep/hub/params/template/base_spoke_params.json b/bicep/hub/params/template/base_spoke_params.json index 8a196254..a1de3595 100644 --- a/bicep/hub/params/template/base_spoke_params.json +++ b/bicep/hub/params/template/base_spoke_params.json @@ -45,8 +45,8 @@ "storagePrivateDnsZone": { "value": { "type": "existing", - "id": "ENTER_PRIVATE_DNS_ZONE_ID_HERE", - "vnetLink": false + "id": "ENTER_PRIVATE_DNS_ZONE_RESOURCE_ID_HERE", + "vnetLink": true } }, "databaseAdminPassword": { From 2a9ebba396ec53311d09da0d95df4e2f6d1376ac Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Thu, 10 Jul 2025 14:55:22 -0400 Subject: [PATCH 39/50] Add monitoring project version as parameter in mainTemplate.bicep --- bicep/mainTemplate.bicep | 1 + 1 file changed, 1 insertion(+) diff --git a/bicep/mainTemplate.bicep b/bicep/mainTemplate.bicep index bb08ae01..02124d23 100644 --- a/bicep/mainTemplate.bicep +++ b/bicep/mainTemplate.bicep @@ -50,6 +50,7 @@ param manualInstall bool = false param monitoringIngestionEndpoint string = '' param monitoringIdentityClientId string = '' +param monitoringProjectVersion string = '1.0.0' resource ccwResourceGroup 'Microsoft.Resources/resourceGroups@2024-03-01' = { name: resourceGroup From dcc8874696dbfc0dbf54040021ad512654ff0d97 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Thu, 10 Jul 2025 14:56:17 -0400 Subject: [PATCH 40/50] Fix file path in create_hub_mi.sh --- bicep/hub/create_hub_mi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bicep/hub/create_hub_mi.sh b/bicep/hub/create_hub_mi.sh index 6b0fb10e..3ebaa724 100755 --- a/bicep/hub/create_hub_mi.sh +++ b/bicep/hub/create_hub_mi.sh @@ -6,4 +6,4 @@ LOCATION=$2 az deployment group create \ --name "$RG-hub-mi" \ --resource-group "$RG" \ - --template-file ./hub-mi.bicep \ \ No newline at end of file + --template-file $(pwd)/hub-mi.bicep \ \ No newline at end of file From 97913b49f8cc8c90b279c936d285ff320d2f2ad2 Mon 
Sep 17 00:00:00 2001 From: abatallas <167922471+abatallas@users.noreply.github.com> Date: Tue, 15 Jul 2025 13:01:41 -0400 Subject: [PATCH 41/50] Remove accelerated networking auto-enable patch from install.sh (#295) --- bicep/install.sh | 9 --------- 1 file changed, 9 deletions(-) diff --git a/bicep/install.sh b/bicep/install.sh index d30b0d34..08ad7db7 100644 --- a/bicep/install.sh +++ b/bicep/install.sh @@ -362,15 +362,6 @@ while [ $(/opt/cycle_server/./cycle_server execute --format json " done echo All Azure.MachineType records are loaded. -# Enable accel networking on any nodearray that has a VM Size that supports it. -/opt/cycle_server/./cycle_server execute \ -"SELECT AdType, ClusterName, Name, M.AcceleratedNetworkingEnabled AS EnableAcceleratedNetworking - FROM Cloud.Node - INNER JOIN Azure.MachineType M - ON M.Name===MachineType && M.Location===Region - WHERE ClusterName==\"$SLURM_CLUSTER_NAME\"" > /tmp/accel_network.txt - mv /tmp/accel_network.txt /opt/cycle_server/config/data - # it usually takes less than 2 seconds, so before starting the longer timeouts, optimistically sleep. 
sleep 2 echo Waiting for accelerated network records to be imported From 113adb706f90dd51a0c78458c95add2d30484587 Mon Sep 17 00:00:00 2001 From: Ben Watrous Date: Tue, 15 Jul 2025 18:15:28 -0700 Subject: [PATCH 42/50] updated preview CycleCloud 8.8.0 release to build 3455 --- bicep/hub/blobs/cyclecloud8-8.8.0-3438.x86_64.rpm | 3 --- bicep/hub/blobs/cyclecloud8-8.8.0-3455.x86_64.rpm | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 bicep/hub/blobs/cyclecloud8-8.8.0-3438.x86_64.rpm create mode 100644 bicep/hub/blobs/cyclecloud8-8.8.0-3455.x86_64.rpm diff --git a/bicep/hub/blobs/cyclecloud8-8.8.0-3438.x86_64.rpm b/bicep/hub/blobs/cyclecloud8-8.8.0-3438.x86_64.rpm deleted file mode 100644 index e5a83d5e..00000000 --- a/bicep/hub/blobs/cyclecloud8-8.8.0-3438.x86_64.rpm +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:617912459fa119baf3716ca6e054f1e0170acc30f569dd71d723761aa81c1eab -size 455462875 diff --git a/bicep/hub/blobs/cyclecloud8-8.8.0-3455.x86_64.rpm b/bicep/hub/blobs/cyclecloud8-8.8.0-3455.x86_64.rpm new file mode 100644 index 00000000..5284247f --- /dev/null +++ b/bicep/hub/blobs/cyclecloud8-8.8.0-3455.x86_64.rpm @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3498a5c77469092f13eb56aaed4e2f3e3012eb45ca51421553557555b0d1e9dd +size 454815619 From 40ebba152f59d2fd433cbf162ea867551c107d8d Mon Sep 17 00:00:00 2001 From: Ben Watrous Date: Tue, 15 Jul 2025 22:40:54 -0700 Subject: [PATCH 43/50] updated preview CycleCloud 8.8.0 release to build 3455 --- bicep/hub/blobs/cyclecloud8-8.8.0-3455.x86_64.rpm | 3 --- bicep/hub/blobs/cyclecloud8-8.8.0-3456.x86_64.rpm | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 bicep/hub/blobs/cyclecloud8-8.8.0-3455.x86_64.rpm create mode 100644 bicep/hub/blobs/cyclecloud8-8.8.0-3456.x86_64.rpm diff --git a/bicep/hub/blobs/cyclecloud8-8.8.0-3455.x86_64.rpm b/bicep/hub/blobs/cyclecloud8-8.8.0-3455.x86_64.rpm deleted file mode 
100644 index 5284247f..00000000 --- a/bicep/hub/blobs/cyclecloud8-8.8.0-3455.x86_64.rpm +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3498a5c77469092f13eb56aaed4e2f3e3012eb45ca51421553557555b0d1e9dd -size 454815619 diff --git a/bicep/hub/blobs/cyclecloud8-8.8.0-3456.x86_64.rpm b/bicep/hub/blobs/cyclecloud8-8.8.0-3456.x86_64.rpm new file mode 100644 index 00000000..9d9621fc --- /dev/null +++ b/bicep/hub/blobs/cyclecloud8-8.8.0-3456.x86_64.rpm @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4757e90db172f82ad32832ec40db8aa2da3f53f2987bdb066dbca9b02211e5dc +size 455067063 From c1ad11fc765e65a91674fb4efecefa9a49cef75b Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Thu, 17 Jul 2025 12:12:47 -0400 Subject: [PATCH 44/50] Update spoke deployment name for uniqueness across regions --- bicep/hub/deploy_spoke.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bicep/hub/deploy_spoke.sh b/bicep/hub/deploy_spoke.sh index e149e567..97a9c9ab 100755 --- a/bicep/hub/deploy_spoke.sh +++ b/bicep/hub/deploy_spoke.sh @@ -176,6 +176,6 @@ az deployment sub create \ --parameters ccVMName="ccw${SPOKE_NUMBER}-cyclecloud-vm" \ --parameters clusterName="ccw${SPOKE_NUMBER}" \ --parameters monitoringIngestionEndpoint="${MONITORING_INGESTION_ENDPOINT}" \ - --name "spoke-ccw-0${SPOKE_DEPLOYMENT_NAME}" \ + --name "spoke-ccw-0${SPOKE_DEPLOYMENT_NAME}-${LOCATION}" \ $WHATIF_FLAG \ No newline at end of file From fe80e0c40214d469759b085cbb7542ee878e3f91 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Thu, 17 Jul 2025 12:21:53 -0400 Subject: [PATCH 45/50] Update custom Slurm template --- bicep/files-to-load/slurm.txt | 80 +++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 23 deletions(-) diff --git a/bicep/files-to-load/slurm.txt b/bicep/files-to-load/slurm.txt index c2651573..bb92be06 100644 --- a/bicep/files-to-load/slurm.txt +++ b/bicep/files-to-load/slurm.txt @@ -26,6 +26,15 @@ Autoscale = 
$Autoscale # Lustre mounts require termination notifications to unmount EnableTerminateNotification = ${NFSType == "lustre" || NFSSchedType == "lustre" || AdditionalNFSType == "lustre" || EnableTerminateNotification} TerminateNotificationTimeout = 10m + CloudInit="""#!/bin/bash +sed -i 's/^SHELL.*/SHELL=\/bin\/bash/g' /etc/default/useradd + +CLUSTER_NAME=$( jetpack config cyclecloud.cluster.name ) +mkdir -p /etc/slurm/ +ln -s /sched/${CLUSTER_NAME}/topology.conf /etc/slurm/ + + """ + [[[configuration]]] @@ -34,6 +43,7 @@ Autoscale = $Autoscale slurm.user.gid = 11100 munge.user.uid = 11101 munge.user.gid = 11101 + slurm.enable_healthchecks = true slurm.accounting.enabled = $configuration_slurm_accounting_enabled slurm.accounting.url = $configuration_slurm_accounting_url slurm.accounting.user = $configuration_slurm_accounting_user @@ -69,10 +79,10 @@ Autoscale = $Autoscale monitoring.enabled = $MonitoringEnabled cyclecloud.enable_chef = false - [[[cluster-init cyclecloud/healthagent:default]]] - [[[cluster-init cyclecloud/slurm:default:4.0.0]]] + [[[cluster-init cyclecloud/healthagent:default:1.0.2]]] + [[[cluster-init cyclecloud/slurm:default:4.0.1]]] Optional = true - [[[cluster-init cyclecloud/slurm:pyxis:4.0.0]]] + [[[cluster-init cyclecloud/slurm:pyxis:4.0.1]]] [[[volume boot]]] Size = ${ifThenElse(BootDiskSize > 0, BootDiskSize, undefined)} @@ -124,9 +134,9 @@ Autoscale = $Autoscale cyclecloud.mounts.nfs_shared.disabled = ${UseBuiltinShared && !configuration_slurm_ha_enabled} slurm.secondary_scheduler_name = ${ifThenElse(configuration_slurm_ha_enabled, "scheduler-ha-1", undefined)} - [[[cluster-init cyclecloud/healthagent:default]]] - [[[cluster-init cyclecloud/slurm:scheduler:4.0.0]]] - [[[cluster-init cyclecloud/slurm:pyxis:4.0.0]]] + [[[cluster-init cyclecloud/healthagent:default:1.0.2]]] + [[[cluster-init cyclecloud/slurm:scheduler:4.0.1]]] + [[[cluster-init cyclecloud/slurm:pyxis:4.0.1]]] [[[network-interface eth0]]] AssociatePublicIpAddress = 
$UsePublicNetwork @@ -188,9 +198,9 @@ Autoscale = $Autoscale ImageName = $LoginImageName AdditionalClusterInitSpecs = $LoginClusterInitSpecs - [[[cluster-init cyclecloud/healthagent:default]]] - [[[cluster-init cyclecloud/slurm:login:4.0.0]]] - [[[cluster-init cyclecloud/slurm:pyxis:4.0.0]]] + [[[cluster-init cyclecloud/healthagent:default:1.0.2]]] + [[[cluster-init cyclecloud/slurm:login:4.0.1]]] + [[[cluster-init cyclecloud/slurm:pyxis:4.0.1]]] [[[configuration]]] slurm.role = login @@ -198,6 +208,15 @@ Autoscale = $Autoscale slurm.node_prefix = ${ifThenElse(NodeNamePrefix=="Cluster Prefix", StrJoin("-", ClusterName, ""), NodeNamePrefix)} slurm.use_nodename_as_hostname = $NodeNameIsHostname + + [[nodearray login-arm64]] + Extends = login + MachineType = $loginArm64MachineType + ImageName = $LoginArm64ImageName + + + + [[node nodearraybase]] Abstract = true [[[configuration]]] @@ -207,9 +226,9 @@ Autoscale = $Autoscale slurm.node_prefix = ${ifThenElse(NodeNamePrefix=="Cluster Prefix", StrJoin("-", ClusterName, ""), NodeNamePrefix)} slurm.use_nodename_as_hostname = $NodeNameIsHostname - [[[cluster-init cyclecloud/healthagent:default]]] - [[[cluster-init cyclecloud/slurm:execute:4.0.0]]] - [[[cluster-init cyclecloud/slurm:pyxis:4.0.0]]] + [[[cluster-init cyclecloud/healthagent:default:1.0.2]]] + [[[cluster-init cyclecloud/slurm:execute:4.0.1]]] + [[[cluster-init cyclecloud/slurm:pyxis:4.0.1]]] [[[network-interface eth0]]] AssociatePublicIpAddress = $ExecuteNodesPublic @@ -272,21 +291,20 @@ Autoscale = $Autoscale slurm.use_pcpu = false [[nodearray gpu]] - CloudInit="""#!/bin/bash -sed -i 's/^SHELL.*/SHELL=\/bin\/bash/g' /etc/default/useradd - """ + + EnableAcceleratedNetworking=true + # GB200: Peregrine disallows SinglePlacementGroup, but pkey requires SingleScaleset Azure.Overprovision = false Azure.MaxScaleSetSize = 1000 Azure.SingleScaleset = true - Azure.SinglePlacementGroup = true + Azure.SinglePlacementGroup = false Extends = nodearraybase MachineType = 
$GPUMachineType ImageName = $GPUImageName MaxCount = $MaxGPUExecuteNodeCount - Azure.MaxScalesetSize = $HPCMaxScalesetSize EnableNodeHealthChecks = $EnableNodeHealthChecks Interruptible = $GPUUseLowPrio @@ -295,22 +313,24 @@ sed -i 's/^SHELL.*/SHELL=\/bin\/bash/g' /etc/default/useradd [[[configuration]]] slurm.default_partition = true - slurm.hpc = true + slurm.hpc = false slurm.partition = gpu #Parameter to enable or disable IMEX service on a per-job basis #IMEX Support is enabled by default for GB200 but can be disabled by setting param to False - #slurm.imex.enabled=True + slurm.imex.enabled=True [[nodearray gpu2]] extends = gpu MaxCount = $MaxGPU2ExecuteNodeCount + Azure.SinglePlacementGroup = false + Azure.MaxScalesetSize = $HPCMaxScalesetSize [[[configuration]]] slurm.default_partition = false - slurm.hpc = true + slurm.hpc = false slurm.partition = gpu2 #Parameter to enable or disable IMEX service on a per-job basis #IMEX Support is enabled by default for GB200 but can be disabled by setting param to False - #slurm.imex.enabled=True + slurm.imex.enabled=True [[nodearray dynamic]] Extends = nodearraybase @@ -366,6 +386,12 @@ Order = 10 ParameterType = Cloud.MachineType DefaultValue = Standard_D8as_v4 + [[[parameter loginArm64MachineType]]] + Label = ARM64 Login node VM Type + Description = The VM type for ARM64 login nodes. 
+ ParameterType = Cloud.MachineType + DefaultValue = Standard_D8plds_v5 + [[[parameter HPCMachineType]]] Label = HPC VM Type Description = The VM type for HPC execute nodes @@ -394,7 +420,7 @@ Order = 10 Label = GPU VM Type Description = The VM type for HPC execute nodes ParameterType = Cloud.MachineType - DefaultValue = Standard_NC24rs_v3 + DefaultValue = Standard_ND128isr_NDR_GB200_v6 [[[parameter DynamicMachineType]]] Label = Dyn VM Type @@ -916,6 +942,14 @@ Order = 20 DefaultValue = almalinux8 Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8"} + [[[parameter LoginArm64ImageName]]] + Label = ARM64 Login Node OS + ParameterType = Cloud.Image + Config.OS = linux + DefaultValue = cycle.image.ubuntu24 + Config.Filter := Package in {"cycle.image.ubuntu20", "cycle.image.ubuntu22", "cycle.image.sles15-hpc", "almalinux8"} + + [[[parameter HPCImageName]]] Label = HPC OS ParameterType = Cloud.Image @@ -1072,4 +1106,4 @@ Order = 100 Description = Optionally assign an Azure user assigned managed identity to all nodes to access Azure resources using assigned roles. 
# ParameterType = Azure.ManagedIdentity DefaultValue = =undefined - Conditions.Excluded := MonitoringEnabled isnt true \ No newline at end of file + Conditions.Excluded := MonitoringEnabled isnt true From d21ddffd8da5a0d918b6651c3b5dfc3bbde255fc Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Thu, 17 Jul 2025 12:23:29 -0400 Subject: [PATCH 46/50] Update b64-encoded create_cc_param.py --- bicep/files-to-load/encoded/create_cc_param.py.base64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bicep/files-to-load/encoded/create_cc_param.py.base64 b/bicep/files-to-load/encoded/create_cc_param.py.base64 index 8b89731e..d4dbafe7 100644 --- a/bicep/files-to-load/encoded/create_cc_param.py.base64 +++ b/bicep/files-to-load/encoded/create_cc_param.py.base64 @@ -1 +1 @@ -IyEvdXNyL2Jpbi9lbnYgcHl0aG9uDQoNCmltcG9ydCBhcmdwYXJzZQ0KaW1wb3J0IGhhc2hsaWINCmltcG9ydCBqc29uDQppbXBvcnQgb3MNCmltcG9ydCBzaHV0aWwNCmZyb20gc3VicHJvY2VzcyBpbXBvcnQgY2hlY2tfb3V0cHV0DQppbXBvcnQgc3lzDQppbXBvcnQgdHlwaW5nDQoNCg0KZGVmIGdldF9qc29uX2RpY3QoZmlsZV9uYW1lKToNCiAgICBhYnNfcGF0aCA9IG9zLnBhdGguYWJzcGF0aChmaWxlX25hbWUpDQogICAgd2l0aCBvcGVuKGFic19wYXRoKSBhcyBmcjoNCiAgICAgICAgcmV0dXJuIGpzb24ubG9hZChmcikNCg0KDQpkZWYgc2V0X3NsdXJtX3BhcmFtcyhwYXJhbXMsIGRiUGFzc3dvcmQsIG91dHB1dHMpOg0KICAgIHBhcmFtc1snUmVnaW9uJ10gPSBvdXRwdXRzWydsb2NhdGlvbiddWyd2YWx1ZSddDQogICAgaWYgb3V0cHV0c1sndm5ldCddWyd2YWx1ZSddWyd0eXBlJ10gPT0gJ25ldyc6DQogICAgICAgIHN1Ym5ldElEID0gb3V0cHV0c1sndm5ldCddWyd2YWx1ZSddWydjb21wdXRlU3VibmV0SWQnXQ0KICAgICAgICBzdWJuZXRfdG9rcyA9IHN1Ym5ldElELnNwbGl0KCIvIikNCiAgICAgICAgaWYgbGVuKHN1Ym5ldF90b2tzKSA+PSAxMToNCiAgICAgICAgICAgIHBhcmFtc1snU3VibmV0SWQnXSA9ICIvIi5qb2luKFtzdWJuZXRfdG9rc1s0XSwgc3VibmV0X3Rva3NbOF0sIHN1Ym5ldF90b2tzWzEwXV0pDQogICAgICAgIGVsc2U6DQogICAgICAgICAgICBwcmludChmIlVuZXhwZWN0ZWQgc3VibmV0IGlkIHtzdWJuZXRJRH0gLSBwYXNzaW5nIGFzIFN1Ym5ldElkIGRpcmVjdGx5IGluc3RlYWQgb2YgcmVzb3VyY2VfZ3JvdXAvdm5ldF9uYW1lL3N1Ym5ldF9uYW1lIiwgZmlsZT1zeXMuc3RkZXJyKQ0KICAgICAgICAgICAgcGFyYW1zWydTdWJuZXRJZCddID0gc3VibmV0SUQNC
iAgICBlbHNlOg0KICAgICAgICBwYXJhbXNbJ1N1Ym5ldElkJ10gPSAnLycuam9pbihbb3V0cHV0c1sndm5ldCddWyd2YWx1ZSddWydyZyddLCBvdXRwdXRzWyd2bmV0J11bJ3ZhbHVlJ11bJ25hbWUnXSwgb3V0cHV0c1sndm5ldCddWyd2YWx1ZSddWydjb21wdXRlU3VibmV0TmFtZSddXSkNCiAgICAgICAgDQogICAgIyBEZWZpbmUgQXZhaWxhYmlsaXR5IFpvbmUNCiAgICBwYXJhbXNbJ0RlZmluZU5vZGVzQXZhaWxhYmlsaXR5Wm9uZSddID0gYW55KCdhdmFpbGFiaWxpdHlab25lJyBpbiB6b25lTGlzdCBmb3Igem9uZUxpc3QgaW4gW291dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnaHRjJ10sIG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnaHBjJ10sIG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnZ3B1J11dKQ0KICAgIA0KICAgICNIVEMNCiAgICBwYXJhbXNbJ0hUQ01hY2hpbmVUeXBlJ10gPSBvdXRwdXRzWydwYXJ0aXRpb25zJ11bJ3ZhbHVlJ11bJ2h0YyddWydza3UnXQ0KICAgIHBhcmFtc1snTWF4SFRDRXhlY3V0ZU5vZGVDb3VudCddID0gaW50KG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnaHRjJ11bJ21heE5vZGVzJ10pDQogICAgcGFyYW1zWydIVENJbWFnZU5hbWUnXSA9IG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnaHRjJ11bJ29zSW1hZ2UnXQ0KICAgIHBhcmFtc1snSFRDVXNlTG93UHJpbyddID0gb3V0cHV0c1sncGFydGl0aW9ucyddWyd2YWx1ZSddWydodGMnXVsndXNlU3BvdCddDQogICAgcGFyYW1zWydIVENBdmFpbGFiaWxpdHlab25lJ10gPSBvdXRwdXRzWydwYXJ0aXRpb25zJ11bJ3ZhbHVlJ11bJ2h0YyddWydhdmFpbGFiaWxpdHlab25lJ10gaWYgcGFyYW1zWydEZWZpbmVOb2Rlc0F2YWlsYWJpbGl0eVpvbmUnXSBhbmQgJ2F2YWlsYWJpbGl0eVpvbmUnIGluIG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnaHRjJ10gZWxzZSBOb25lDQogICAgDQogICAgI0hQQw0KICAgIHBhcmFtc1snSFBDTWFjaGluZVR5cGUnXSA9IG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnaHBjJ11bJ3NrdSddDQogICAgcGFyYW1zWydNYXhIUENFeGVjdXRlTm9kZUNvdW50J10gPSBpbnQob3V0cHV0c1sncGFydGl0aW9ucyddWyd2YWx1ZSddWydocGMnXVsnbWF4Tm9kZXMnXSkNCiAgICBwYXJhbXNbJ0hQQ0ltYWdlTmFtZSddID0gb3V0cHV0c1sncGFydGl0aW9ucyddWyd2YWx1ZSddWydocGMnXVsnb3NJbWFnZSddDQogICAgcGFyYW1zWydIUENBdmFpbGFiaWxpdHlab25lJ10gPSBvdXRwdXRzWydwYXJ0aXRpb25zJ11bJ3ZhbHVlJ11bJ2hwYyddWydhdmFpbGFiaWxpdHlab25lJ10gaWYgcGFyYW1zWydEZWZpbmVOb2Rlc0F2YWlsYWJpbGl0eVpvbmUnXSBhbmQgJ2F2YWlsYWJpbGl0eVpvbmUnIGluIG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnaHBjJ10gZWxzZSBOb25lDQoNCiAgICAjR1BVDQogICAgcGFyYW1zWydHUFVNYWNoaW5lVHlwZSddID0gb
3V0cHV0c1sncGFydGl0aW9ucyddWyd2YWx1ZSddWydncHUnXVsnc2t1J10NCiAgICBwYXJhbXNbJ01heEdQVUV4ZWN1dGVOb2RlQ291bnQnXSA9IGludChvdXRwdXRzWydwYXJ0aXRpb25zJ11bJ3ZhbHVlJ11bJ2dwdSddWydtYXhOb2RlcyddKQ0KICAgIHBhcmFtc1snR1BVSW1hZ2VOYW1lJ10gPSBvdXRwdXRzWydwYXJ0aXRpb25zJ11bJ3ZhbHVlJ11bJ2dwdSddWydvc0ltYWdlJ10NCiAgICBwYXJhbXNbJ0dQVUF2YWlsYWJpbGl0eVpvbmUnXSA9IG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnZ3B1J11bJ2F2YWlsYWJpbGl0eVpvbmUnXSBpZiBwYXJhbXNbJ0RlZmluZU5vZGVzQXZhaWxhYmlsaXR5Wm9uZSddIGFuZCAnYXZhaWxhYmlsaXR5Wm9uZScgaW4gb3V0cHV0c1sncGFydGl0aW9ucyddWyd2YWx1ZSddWydncHUnXSBlbHNlIE5vbmUNCg0KICAgICNzY2hlZHVsZXIgbm9kZQ0KICAgICNwYXJhbXNbJ3NsdXJtJ10gI2lzIHRoaXMgdGhlIHNsdXJtIHZlcnNpb24/Pz8gbm8sIHNvIHdoYXQgaXMgaXQ/DQogICAgcGFyYW1zWydTY2hlZHVsZXJNYWNoaW5lVHlwZSddID0gb3V0cHV0c1snc2NoZWR1bGVyTm9kZSddWyd2YWx1ZSddWydza3UnXQ0KICAgIHBhcmFtc1snU2NoZWR1bGVySW1hZ2VOYW1lJ10gPSBvdXRwdXRzWydzY2hlZHVsZXJOb2RlJ11bJ3ZhbHVlJ11bJ29zSW1hZ2UnXQ0KICAgIHBhcmFtc1snY29uZmlndXJhdGlvbl9zbHVybV92ZXJzaW9uJ10gPSBvdXRwdXRzWydzbHVybVNldHRpbmdzJ11bJ3ZhbHVlJ11bJ3ZlcnNpb24nXQ0KICAgICMgaWYgb3V0cHV0c1snc2x1cm1TZXR0aW5ncyddWyd2YWx1ZSddWydjYW5Vc2VTbHVybUhBJ106DQogICAgIyAgICAgcGFyYW1zWydjb25maWd1cmF0aW9uX3NsdXJtX2hhX2VuYWJsZWQnXSA9IG91dHB1dHNbJ3NsdXJtU2V0dGluZ3MnXVsndmFsdWUnXVsnc2x1cm1IQSddDQogICAgcGFyYW1zWydjb25maWd1cmF0aW9uX3NsdXJtX2FjY291bnRpbmdfZW5hYmxlZCddID0gYm9vbChvdXRwdXRzWydkYXRhYmFzZUluZm8nXVsndmFsdWUnXSkNCiAgICBpZiBwYXJhbXNbJ2NvbmZpZ3VyYXRpb25fc2x1cm1fYWNjb3VudGluZ19lbmFibGVkJ106DQogICAgICAgIHBhcmFtc1snY29uZmlndXJhdGlvbl9zbHVybV9hY2NvdW50aW5nX3VzZXInXSA9IG91dHB1dHNbJ2RhdGFiYXNlSW5mbyddWyd2YWx1ZSddWydkYXRhYmFzZVVzZXInXQ0KICAgIGlmIHBhcmFtc1snY29uZmlndXJhdGlvbl9zbHVybV9hY2NvdW50aW5nX2VuYWJsZWQnXToNCiAgICAgICAgcGFyYW1zWydjb25maWd1cmF0aW9uX3NsdXJtX2FjY291bnRpbmdfcGFzc3dvcmQnXSA9IGRiUGFzc3dvcmQNCiAgICBpZiBwYXJhbXNbJ2NvbmZpZ3VyYXRpb25fc2x1cm1fYWNjb3VudGluZ19lbmFibGVkJ106DQogICAgICAgIHBhcmFtc1snY29uZmlndXJhdGlvbl9zbHVybV9hY2NvdW50aW5nX3VybCddID0gb3V0cHV0c1snZGF0YWJhc2VJbmZvJ11bJ3ZhbHVlJ11bJ3VybCddDQogICAgI3BhcmFtc1snY29uZmlndXJhd
Glvbl9zbHVybV9hY2NvdW50aW5nX2NlcnRpZmljYXRlX3VybCddDQoNCiAgICAjbG9naW4gbm9kZShzKQ0KICAgIHBhcmFtc1snbG9naW5NYWNoaW5lVHlwZSddID0gKG91dHB1dHNbJ2xvZ2luTm9kZXMnXVsndmFsdWUnXVsnc2t1J10pLnN0cmlwKCkNCiAgICBwYXJhbXNbJ051bWJlckxvZ2luTm9kZXMnXSA9IGludChvdXRwdXRzWydsb2dpbk5vZGVzJ11bJ3ZhbHVlJ11bJ2luaXRpYWxOb2RlcyddKQ0KICAgIHBhcmFtc1snTG9naW5JbWFnZU5hbWUnXSA9IG91dHB1dHNbJ2xvZ2luTm9kZXMnXVsndmFsdWUnXVsnb3NJbWFnZSddDQogICAgcGFyYW1zWydFbmFibGVOb2RlSGVhbHRoQ2hlY2tzJ10gPSBvdXRwdXRzWydzbHVybVNldHRpbmdzJ11bJ3ZhbHVlJ11bJ2hlYWx0aENoZWNrRW5hYmxlZCddDQoNCiAgICAjRXhlY3V0ZSBub2RlIHRhZ3MNCiAgICBwYXJhbXNbJ05vZGVUYWdzJ10gPSBvdXRwdXRzWydub2RlQXJyYXlUYWdzJ11bJ3ZhbHVlJ10NCg0KICAgICNOZXR3b3JrIEF0dGFjaGVkIFN0b3JhZ2UNCiAgICBwYXJhbXNbJ1VzZUJ1aWx0aW5TaGFyZWQnXSA9IG91dHB1dHNbJ2ZpbGVySW5mb0ZpbmFsJ11bJ3ZhbHVlJ11bJ2hvbWUnXVsndHlwZSddID09ICduZnMtbmV3JyANCiAgICBpZiBwYXJhbXNbJ1VzZUJ1aWx0aW5TaGFyZWQnXToNCiAgICAgICAgcGFyYW1zWydGaWxlc3lzdGVtU2l6ZSddID0gb3V0cHV0c1snZmlsZXJJbmZvRmluYWwnXVsndmFsdWUnXVsnaG9tZSddWyduZnNDYXBhY2l0eUluR2InXQ0KICAgIGVsc2U6DQogICAgICAgIHBhcmFtc1snTkZTVHlwZSddID0gJ25mcycgaWYgb3V0cHV0c1snZmlsZXJJbmZvRmluYWwnXVsndmFsdWUnXVsnaG9tZSddWyd0eXBlJ10gaW4gWyduZnMtZXhpc3RpbmcnLCdhbmYtbmV3J10gZWxzZSAnbHVzdHJlJw0KICAgICAgICAjIFdlIG5vIGxvbmdlciBuZWVkIHRvIGhhbmRsZSB0aGVzZSBkaWZmZXJlbnRseSBiYXNlZCBvbiB0aGUgZnMgdHlwZSwgYXMgZWFjaA0KICAgICAgICAjIGZzIG1vZHVsZSdzIGNvbW1vbiBvdXRwdXRzIG1hcCB0byB0aGVzZS4NCiAgICAgICAgcGFyYW1zWydORlNTaGFyZWRFeHBvcnRQYXRoJ10gPSBvdXRwdXRzWydmaWxlckluZm9GaW5hbCddWyd2YWx1ZSddWydob21lJ11bJ2V4cG9ydFBhdGgnXQ0KICAgICAgICBwYXJhbXNbJ05GU1NoYXJlZE1vdW50T3B0aW9ucyddID0gb3V0cHV0c1snZmlsZXJJbmZvRmluYWwnXVsndmFsdWUnXVsnaG9tZSddWydtb3VudE9wdGlvbnMnXQ0KICAgICAgICBwYXJhbXNbJ05GU0FkZHJlc3MnXSA9IG91dHB1dHNbJ2ZpbGVySW5mb0ZpbmFsJ11bJ3ZhbHVlJ11bJ2hvbWUnXVsnaXBBZGRyZXNzJ10NCg0KICAgIHBhcmFtc1snQWRkaXRpb25hbE5GUyddID0gb3V0cHV0c1snZmlsZXJJbmZvRmluYWwnXVsndmFsdWUnXVsnYWRkaXRpb25hbCddWyd0eXBlJ10gIT0gJ2Rpc2FibGVkJw0KICAgIGlmIHBhcmFtc1snQWRkaXRpb25hbE5GUyddOg0KICAgICAgICBwYXJhbXNbJ0FkZGl0aW9uYWxORlNUeXBlJ10gPSAnb
mZzJyBpZiBvdXRwdXRzWydmaWxlckluZm9GaW5hbCddWyd2YWx1ZSddWydhZGRpdGlvbmFsJ11bJ3R5cGUnXSBpbiBbJ25mcy1leGlzdGluZycsJ2FuZi1uZXcnXSBlbHNlICdsdXN0cmUnDQogICAgICAgIHBhcmFtc1snQWRkaXRpb25hbE5GU01vdW50UG9pbnQnXSA9IG91dHB1dHNbJ2ZpbGVySW5mb0ZpbmFsJ11bJ3ZhbHVlJ11bJ2FkZGl0aW9uYWwnXVsnbW91bnRQYXRoJ10NCiAgICAgICAgcGFyYW1zWydBZGRpdGlvbmFsTkZTRXhwb3J0UGF0aCddID0gb3V0cHV0c1snZmlsZXJJbmZvRmluYWwnXVsndmFsdWUnXVsnYWRkaXRpb25hbCddWydleHBvcnRQYXRoJ10NCiAgICAgICAgcGFyYW1zWydBZGRpdGlvbmFsTkZTTW91bnRPcHRpb25zJ10gPSBvdXRwdXRzWydmaWxlckluZm9GaW5hbCddWyd2YWx1ZSddWydhZGRpdGlvbmFsJ11bJ21vdW50T3B0aW9ucyddDQogICAgICAgIHBhcmFtc1snQWRkaXRpb25hbE5GU0FkZHJlc3MnXSA9IG91dHB1dHNbJ2ZpbGVySW5mb0ZpbmFsJ11bJ3ZhbHVlJ11bJ2FkZGl0aW9uYWwnXVsnaXBBZGRyZXNzJ10NCg0KDQpkZWYgc2V0X29vZF9wYXJhbXMocGFyYW1zLCBvdXRwdXRzKToNCiAgICBzbHVybV9wYXJhbXMgPSBnZXRfanNvbl9kaWN0KCdpbml0aWFsX3BhcmFtcy5qc29uJykNCiAgICAjIFdlIHdhbnQgdG8gZXNzZW50aWFsbHkgaW5oZXJpdCBjZXJ0YWluIHNldHRpbmdzIGZyb20gdGhlIHNsdXJtIGNsdXN0ZXIuDQogICAgc2V0X3NsdXJtX3BhcmFtcyhzbHVybV9wYXJhbXMsICIiLCBvdXRwdXRzKQ0KICAgIHBhcmFtc1snTkZTQWRkcmVzcyddID0gc2x1cm1fcGFyYW1zLmdldCgnTkZTQWRkcmVzcycpIG9yICdjY3ctc2NoZWR1bGVyJw0KICAgIHBhcmFtc1snTkZTU2hhcmVkRXhwb3J0UGF0aCddID0gc2x1cm1fcGFyYW1zLmdldCgnTkZTU2hhcmVkRXhwb3J0UGF0aCcpIG9yICcvc2hhcmVkJw0KICAgIHBhcmFtc1snTkZTU2hhcmVkTW91bnRPcHRpb25zJ10gPSBzbHVybV9wYXJhbXMuZ2V0KCdORlNTaGFyZWRNb3VudE9wdGlvbnMnKQ0KICAgIHBhcmFtc1snU3VibmV0SWQnXSA9IHNsdXJtX3BhcmFtc1siU3VibmV0SWQiXQ0KICAgIHBhcmFtc1snUmVnaW9uJ10gPSBzbHVybV9wYXJhbXNbJ1JlZ2lvbiddDQogICAgcGFyYW1zWydDcmVkZW50aWFscyddID0gc2x1cm1fcGFyYW1zWydDcmVkZW50aWFscyddDQoNCiAgICBwYXJhbXNbJ01hY2hpbmVUeXBlJ10gPSBvdXRwdXRzWydvb2QnXVsndmFsdWUnXS5nZXQoJ3NrdScpDQogICAgcGFyYW1zWydNYW5hZ2VkSWRlbnRpdHknXSA9IG91dHB1dHNbJ29vZCddWyd2YWx1ZSddLmdldCgnbWFuYWdlZElkZW50aXR5JykNCiAgICBwYXJhbXNbJ0Jvb3REaXNrU2l6ZSddID0gb3V0cHV0c1snb29kJ11bJ3ZhbHVlJ10uZ2V0KCdCb290RGlza1NpemUnKQ0KICAgIHBhcmFtc1snSW1hZ2VOYW1lJ10gPSBvdXRwdXRzWydvb2QnXVsndmFsdWUnXS5nZXQoJ29zSW1hZ2UnKQ0KDQogICAgcGFyYW1zWydvb2Rfc2VydmVyX25hbWUnXSA9IG91dHB1dHNbJ
29vZCddWyd2YWx1ZSddLmdldCgnZnFkbicsJycpDQogICAgcGFyYW1zWydvb2RfZW50cmFfdXNlcl9tYXBfbWF0Y2gnXSA9IG91dHB1dHNbJ29vZCddWyd2YWx1ZSddLmdldCgndXNlckRvbWFpbicpDQogICAgcGFyYW1zWydvb2RfZW50cmFfY2xpZW50X2lkJ10gPSBvdXRwdXRzWydvb2QnXVsndmFsdWUnXS5nZXQoJ2NsaWVudElkJykNCiAgICBwYXJhbXNbJ29vZF9lbnRyYV90ZW5hbnRfaWQnXSA9IG91dHB1dHNbJ29vZCddWyd2YWx1ZSddLmdldCgndGVuYW50SWQnKQ0KICAgIHBhcmFtc1snb29kX25pYyddID0gb3V0cHV0c1snb29kJ11bJ3ZhbHVlJ10uZ2V0KCduaWMnKQ0KDQpjbGFzcyBDbHVzdGVySW5pdFNwZWM6DQogICAgZGVmIF9faW5pdF9fKHNlbGYsIHByb2plY3Q6IHN0ciwgdmVyc2lvbjogc3RyLCBzcGVjOiBzdHIsIHRhcmdldHM6IHR5cGluZy5MaXN0W3N0cl0pOg0KICAgICAgICBzZWxmLnByb2plY3QgPSBwcm9qZWN0DQogICAgICAgIHNlbGYudmVyc2lvbiA9IHZlcnNpb24NCiAgICAgICAgc2VsZi5zcGVjID0gc3BlYw0KICAgICAgICBzZWxmLnRhcmdldHMgPSB0YXJnZXRzDQogICAgICAgIHNlbGYuY2x1c3Rlcl9pbml0X2tleSA9IGYie3NlbGYucHJvamVjdH06e3NlbGYuc3BlY306e3NlbGYudmVyc2lvbn0iDQoNCg0KZGVmIGRvd25sb2FkX2NsdXN0ZXJfaW5pdChvdXRwdXRzLCByb290X2ZvbGRlciwgbG9ja2VyKSAtPiB0eXBpbmcuTGlzdFtDbHVzdGVySW5pdFNwZWNdOg0KICAgIHJldCA9IFtdDQogICAgZm9yIHJlY29yZCBpbiAob3V0cHV0c1snY2x1c3RlckluaXRTcGVjcyddLmdldCgidmFsdWUiKSBvciBbXSk6DQogICAgICAgIHVybCA9IF9zdHJpcF90YWdzX2Zyb21fZ2l0aHViX3VybChyZWNvcmQpDQogICAgICAgIHVybF9oYXNoID0gaGFzaGxpYi5zaGEyNTYodXJsLmVuY29kZSgpKQ0KICAgICAgICANCiAgICAgICAgZm9sZGVyID0gb3MucGF0aC5qb2luKHJvb3RfZm9sZGVyLCB1cmxfaGFzaC5oZXhkaWdlc3QoKSkNCiAgICAgICAgaWYgbm90IG9zLnBhdGguZXhpc3RzKGZvbGRlcik6DQogICAgICAgICAgICAjIGRvd25sb2FkIGFuZCBtb3ZlIHRvIGF2b2lkIHJlcGVhdGVkIGZhaWx1cmVzIHdpdGggcGFydGlhbCBkb3dubG9hZHMvdXBsb2Fkcw0KICAgICAgICAgICAgY2hlY2tfb3V0cHV0KFsiL3Vzci9sb2NhbC9iaW4vY3ljbGVjbG91ZCIsICJwcm9qZWN0IiwgImZldGNoIiwgdXJsLCBmb2xkZXIgKyAiLnRtcCJdKQ0KICAgICAgICAgICAgY2hlY2tfb3V0cHV0KFsiL3Vzci9sb2NhbC9iaW4vY3ljbGVjbG91ZCIsICJwcm9qZWN0IiwgInVwbG9hZCIsIGxvY2tlcl0sIGN3ZD1mb2xkZXIgKyAiLnRtcCIpDQogICAgICAgICAgICBzaHV0aWwubW92ZShmb2xkZXIgKyAiLnRtcCIsIGZvbGRlcikNCiAgICAgICAgICAgIHdpdGggb3Blbihvcy5wYXRoLmpvaW4oZm9sZGVyLCAiZG93bmxvYWQtdXJsIiksICJ3IikgYXMgZnc6DQogICAgICAgICAgICAgICAgZncud3JpdGUodXJsKQ0KICAgICAgICBwc
m9qX2luZm9fcmF3ID0gY2hlY2tfb3V0cHV0KFsiL3Vzci9sb2NhbC9iaW4vY3ljbGVjbG91ZCIsICJwcm9qZWN0IiwgImluZm8iXSwgY3dkPWZvbGRlcikuZGVjb2RlKCkNCiAgICAgICAgcHJval9pbmZvID0ge30NCiAgICAgICAgZm9yIGxpbmUgaW4gcHJval9pbmZvX3Jhdy5zcGxpdGxpbmVzKCk6DQogICAgICAgICAgICBrZXksIHJlc3QgPSBsaW5lLnNwbGl0KCI6IiwgMSkNCiAgICAgICAgICAgIHByb2pfaW5mb1trZXkubG93ZXIoKV0gPSByZXN0LnN0cmlwKCkNCiAgICAgICAgcmV0LmFwcGVuZChDbHVzdGVySW5pdFNwZWMocHJval9pbmZvWyJuYW1lIl0sDQogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHByb2pfaW5mb1sidmVyc2lvbiJdLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICByZWNvcmQuZ2V0KCJzcGVjIikgb3IgImRlZmF1bHQiLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICByZWNvcmRbInRhcmdldCJdKSkNCiAgICByZXR1cm4gcmV0DQoNCg0KZGVmIF9zdHJpcF90YWdzX2Zyb21fZ2l0aHViX3VybChyZWNvcmQpOg0KICAgIHVybCA9IHJlY29yZFsiZ2l0SHViUmVsZWFzZVVSTCJdDQogICAgaWYgIi90YWcvIiBpbiB1cmw6DQogICAgICAgIHJldHVybiB1cmwucmVwbGFjZSgiL3RhZyIsICIiKQ0KICAgIHJldHVybiB1cmwNCg0KDQpkZWYgX3ZlcnNpb25fZnJvbV91cmwocmVjb3JkKToNCiAgICBpZiByZWNvcmQuZ2V0KCJ2ZXJzaW9uIik6DQogICAgICAgIHJldHVybiByZWNvcmRbInZlcnNpb24iXQ0KICAgIHJldHVybiByZWNvcmRbImdpdEh1YlJlbGVhc2VVUkwiXS5zcGxpdCgiLyIpWy0xXQ0KDQoNCmRlZiBzZXRfY2x1c3Rlcl9pbml0X3BhcmFtcyhwYXJhbXM6IGRpY3QsIHNwZWNzOiB0eXBpbmcuTGlzdFtDbHVzdGVySW5pdFNwZWNdLCBjbHVzdGVyX25hbWU6IHN0ciwgdGFyZ2V0X3BhcmFtczogZGljdCkgLT4gTm9uZToNCiAgICBvcmRlciA9IDEwMDAwDQogICAgZm9yIHNwZWMgaW4gc3BlY3M6DQogICAgICAgIGZvciB0YXJnZXQgaW4gc3BlYy50YXJnZXRzOg0KICAgICAgICAgICAgdGFyZ2V0X2tleSA9IGYie3RhcmdldF9wYXJhbXNbdGFyZ2V0Lmxvd2VyKCldfSINCiAgICAgICAgICAgIGlmIG5vdCBwYXJhbXMuZ2V0KHRhcmdldF9rZXkpOg0KICAgICAgICAgICAgICAgIHBhcmFtc1t0YXJnZXRfa2V5XSA9IHt9DQoNCiAgICAgICAgICAgIHBhcmFtc1t0YXJnZXRfa2V5XVtzcGVjLmNsdXN0ZXJfaW5pdF9rZXldID0gew0KICAgICAgICAgICAgICAgICJPcmRlciI6IG9yZGVyLA0KICAgICAgICAgICAgICAgICJTcGVjIjogc3BlYy5zcGVjLA0KICAgICAgICAgICAgICAgICJOYW1lIjogc3BlYy5jbHVzdGVyX2luaXRfa2V5LA0KICAgICAgICAgICAgICAgICJQcm9qZWN0Ijogc3BlYy5wcm9qZWN0LA0KICAgICAgICAgICAgICAgICJMb2NrZXIiOiAiYXp1cmUtc3RvcmFnZSIsDQogICAgICAgICAgICAgICAgIlZlcnNpb24iOiBzcGVjLnZlcnNpb
24NCiAgICAgICAgICAgIH0NCiAgICAgICAgICAgIG9yZGVyICs9IDEwMA0KDQoNCmRlZiBtYWluKCk6DQogICAgcGFyc2VyID0gYXJncGFyc2UuQXJndW1lbnRQYXJzZXIoZGVzY3JpcHRpb249IlRPRE8gUkRIIikNCiAgICBwYXJzZXIuYWRkX2FyZ3VtZW50KCItLWxvY2tlciIsIGRlZmF1bHQ9ImF6dXJlLXN0b3JhZ2UiKQ0KICAgIHBhcnNlci5hZGRfYXJndW1lbnQoIi0tY2x1c3Rlci1pbml0LXdvcmtpbmctZGlyIiwgZGVmYXVsdD0iY2x1c3Rlci1pbml0IikNCiAgICBzdWJwYXJzZXJzID0gcGFyc2VyLmFkZF9zdWJwYXJzZXJzKCkNCiAgICBjY3dfcGFyc2VyID0gc3VicGFyc2Vycy5hZGRfcGFyc2VyKCJzbHVybSIpDQogICAgIyBUT0RPIHRoaXMgbmVlZHMgdG8gYmUgYnkgY2x1c3RlciB0eXBlDQogICAgdGFyZ2V0X3BhcmFtcyA9IHsNCiAgICAgICAgImxvZ2luIjogIkxvZ2luQ2x1c3RlckluaXRTcGVjcyIsDQogICAgICAgICJncHUiOiAiR1BVQ2x1c3RlckluaXRTcGVjcyIsDQogICAgICAgICJocGMiOiAiSFBDQ2x1c3RlckluaXRTcGVjcyIsDQogICAgICAgICJodGMiOiAiSFRDQ2x1c3RlckluaXRTcGVjcyIsDQogICAgICAgICJzY2hlZHVsZXIiOiAiU2NoZWR1bGVyQ2x1c3RlckluaXRTcGVjcyIsDQogICAgICAgICJkeW5hbWljIjogIkR5bmFtaWNDbHVzdGVySW5pdFNwZWNzIiwNCiAgICAgICAgIm9vZCI6ICJDbHVzdGVySW5pdFNwZWNzIg0KICAgIH0NCiAgICBjY3dfcGFyc2VyLnNldF9kZWZhdWx0cyhjbHVzdGVyX3R5cGU9InNsdXJtIiwgdGFyZ2V0X3BhcmFtcz10YXJnZXRfcGFyYW1zKQ0KICAgIGNjd19wYXJzZXIuYWRkX2FyZ3VtZW50KCItLWRiUGFzc3dvcmQiLCBkZXN0PSJkYlBhc3N3b3JkIiwgZGVmYXVsdD0iIiwgaGVscD0iTXlTUUwgZGF0YWJhc2UgcGFzc3dvcmQiKQ0KICAgIA0KICAgIG9vZF9wYXJzZXIgPSBzdWJwYXJzZXJzLmFkZF9wYXJzZXIoIm9vZCIpDQogICAgb29kX3BhcnNlci5zZXRfZGVmYXVsdHMoY2x1c3Rlcl90eXBlPSJvb2QiLCB0YXJnZXRfcGFyYW1zPXRhcmdldF9wYXJhbXMpDQogICAgDQogICAgYXJncyA9IHBhcnNlci5wYXJzZV9hcmdzKCkNCg0KICAgIGlmIGFyZ3MuY2x1c3Rlcl90eXBlID09ICJzbHVybSI6DQogICAgICAgIG91dHB1dF9wYXJhbXMgPSBnZXRfanNvbl9kaWN0KCdpbml0aWFsX3BhcmFtcy5qc29uJykNCiAgICBlbHNlOg0KICAgICAgICBvdXRwdXRfcGFyYW1zID0ge30NCiAgICBjY3dfb3V0cHV0cyA9IGdldF9qc29uX2RpY3QoJ2Njd091dHB1dHMuanNvbicpDQoNCiAgICBzcGVjcyA9IGRvd25sb2FkX2NsdXN0ZXJfaW5pdChjY3dfb3V0cHV0cywgb3MucGF0aC5qb2luKG9zLmdldGN3ZCgpLCBhcmdzLmNsdXN0ZXJfaW5pdF93b3JraW5nX2RpciksIGFyZ3MubG9ja2VyKQ0KICAgIHNldF9jbHVzdGVyX2luaXRfcGFyYW1zKG91dHB1dF9wYXJhbXMsIHNwZWNzLCBhcmdzLmNsdXN0ZXJfdHlwZSwgYXJncy50YXJnZXRfcGFyYW1zKQ0KICAgIGlmIGFyZ3MuY2x1c
3Rlcl90eXBlID09ICJzbHVybSI6DQogICAgICAgIHNldF9zbHVybV9wYXJhbXMob3V0cHV0X3BhcmFtcywgYXJncy5kYlBhc3N3b3JkLCBjY3dfb3V0cHV0cykNCiAgICBlbHNlOg0KICAgICAgICBzZXRfb29kX3BhcmFtcyhvdXRwdXRfcGFyYW1zLCBjY3dfb3V0cHV0cykNCiAgICBwcmludChqc29uLmR1bXBzKG91dHB1dF9wYXJhbXMsIGluZGVudD00KSkNCg0KDQppZiBfX25hbWVfXyA9PSAnX19tYWluX18nOg0KICAgIG1haW4oKQ== \ No newline at end of file +IyEvdXNyL2Jpbi9lbnYgcHl0aG9uDQoNCmltcG9ydCBhcmdwYXJzZQ0KaW1wb3J0IGhhc2hsaWINCmltcG9ydCBqc29uDQppbXBvcnQgb3MNCmltcG9ydCBzaHV0aWwNCmZyb20gc3VicHJvY2VzcyBpbXBvcnQgY2hlY2tfb3V0cHV0DQppbXBvcnQgc3lzDQppbXBvcnQgdHlwaW5nDQoNCg0KZGVmIGdldF9qc29uX2RpY3QoZmlsZV9uYW1lKToNCiAgICBhYnNfcGF0aCA9IG9zLnBhdGguYWJzcGF0aChmaWxlX25hbWUpDQogICAgd2l0aCBvcGVuKGFic19wYXRoKSBhcyBmcjoNCiAgICAgICAgcmV0dXJuIGpzb24ubG9hZChmcikNCg0KDQpkZWYgc2V0X3NsdXJtX3BhcmFtcyhwYXJhbXMsIGRiUGFzc3dvcmQsIG91dHB1dHMpOg0KICAgIHBhcmFtc1snUmVnaW9uJ10gPSBvdXRwdXRzWydsb2NhdGlvbiddWyd2YWx1ZSddDQogICAgaWYgb3V0cHV0c1sndm5ldCddWyd2YWx1ZSddWyd0eXBlJ10gPT0gJ25ldyc6DQogICAgICAgIHN1Ym5ldElEID0gb3V0cHV0c1sndm5ldCddWyd2YWx1ZSddWydjb21wdXRlU3VibmV0SWQnXQ0KICAgICAgICBzdWJuZXRfdG9rcyA9IHN1Ym5ldElELnNwbGl0KCIvIikNCiAgICAgICAgaWYgbGVuKHN1Ym5ldF90b2tzKSA+PSAxMToNCiAgICAgICAgICAgIHBhcmFtc1snU3VibmV0SWQnXSA9ICIvIi5qb2luKFtzdWJuZXRfdG9rc1s0XSwgc3VibmV0X3Rva3NbOF0sIHN1Ym5ldF90b2tzWzEwXV0pDQogICAgICAgIGVsc2U6DQogICAgICAgICAgICBwcmludChmIlVuZXhwZWN0ZWQgc3VibmV0IGlkIHtzdWJuZXRJRH0gLSBwYXNzaW5nIGFzIFN1Ym5ldElkIGRpcmVjdGx5IGluc3RlYWQgb2YgcmVzb3VyY2VfZ3JvdXAvdm5ldF9uYW1lL3N1Ym5ldF9uYW1lIiwgZmlsZT1zeXMuc3RkZXJyKQ0KICAgICAgICAgICAgcGFyYW1zWydTdWJuZXRJZCddID0gc3VibmV0SUQNCiAgICBlbHNlOg0KICAgICAgICBwYXJhbXNbJ1N1Ym5ldElkJ10gPSAnLycuam9pbihbb3V0cHV0c1sndm5ldCddWyd2YWx1ZSddWydyZyddLCBvdXRwdXRzWyd2bmV0J11bJ3ZhbHVlJ11bJ25hbWUnXSwgb3V0cHV0c1sndm5ldCddWyd2YWx1ZSddWydjb21wdXRlU3VibmV0TmFtZSddXSkNCiAgICAgICAgDQogICAgIyBEZWZpbmUgQXZhaWxhYmlsaXR5IFpvbmUNCiAgICBwYXJhbXNbJ0RlZmluZU5vZGVzQXZhaWxhYmlsaXR5Wm9uZSddID0gYW55KCdhdmFpbGFiaWxpdHlab25lJyBpbiB6b25lTGlzdCBmb3Igem9uZUxpc3QgaW4gW291dHB1dHNbJ3BhcnRpdGlvbnM
nXVsndmFsdWUnXVsnaHBjJ10sIG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnZ3B1J11dKQ0KICAgIA0KICAgIGZvciBuYSBpbiBbJ0Q2NEQnLCAnRDE2RCcsICdNNjQnXToNCiAgICAgICAgcGFyYW1zW2Yne25hfU1hY2hpbmVUeXBlJ10gPSBvdXRwdXRzWydwYXJ0aXRpb25zJ11bJ3ZhbHVlJ11bbmEubG93ZXIoKV1bJ3NrdSddDQogICAgICAgIHBhcmFtc1tmJ01heHtuYX1Ob2RlQ291bnQnXSA9IGludChvdXRwdXRzWydwYXJ0aXRpb25zJ11bJ3ZhbHVlJ11bbmEubG93ZXIoKV1bJ21heE5vZGVzJ10pDQogICAgICAgIHBhcmFtc1tmJ3tuYX1JbWFnZU5hbWUnXSA9IG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVtuYS5sb3dlcigpXVsnb3NJbWFnZSddDQoNCiAgICAjSFBDDQogICAgcGFyYW1zWydIUENNYWNoaW5lVHlwZSddID0gb3V0cHV0c1sncGFydGl0aW9ucyddWyd2YWx1ZSddWydocGMnXVsnc2t1J10NCiAgICBwYXJhbXNbJ01heEhQQ0V4ZWN1dGVOb2RlQ291bnQnXSA9IGludChvdXRwdXRzWydwYXJ0aXRpb25zJ11bJ3ZhbHVlJ11bJ2hwYyddWydtYXhOb2RlcyddKQ0KICAgIHBhcmFtc1snSFBDSW1hZ2VOYW1lJ10gPSBvdXRwdXRzWydwYXJ0aXRpb25zJ11bJ3ZhbHVlJ11bJ2hwYyddWydvc0ltYWdlJ10NCiAgICBwYXJhbXNbJ0hQQ0F2YWlsYWJpbGl0eVpvbmUnXSA9IG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnaHBjJ11bJ2F2YWlsYWJpbGl0eVpvbmUnXSBpZiBwYXJhbXNbJ0RlZmluZU5vZGVzQXZhaWxhYmlsaXR5Wm9uZSddIGFuZCAnYXZhaWxhYmlsaXR5Wm9uZScgaW4gb3V0cHV0c1sncGFydGl0aW9ucyddWyd2YWx1ZSddWydocGMnXSBlbHNlIE5vbmUNCg0KICAgICNHUFUNCiAgICBwYXJhbXNbJ0dQVU1hY2hpbmVUeXBlJ10gPSBvdXRwdXRzWydwYXJ0aXRpb25zJ11bJ3ZhbHVlJ11bJ2dwdSddWydza3UnXQ0KICAgIHBhcmFtc1snTWF4R1BVRXhlY3V0ZU5vZGVDb3VudCddID0gaW50KG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnZ3B1J11bJ21heE5vZGVzJ10pDQogICAgcGFyYW1zWydHUFVJbWFnZU5hbWUnXSA9IG91dHB1dHNbJ3BhcnRpdGlvbnMnXVsndmFsdWUnXVsnZ3B1J11bJ29zSW1hZ2UnXQ0KICAgIHBhcmFtc1snR1BVQXZhaWxhYmlsaXR5Wm9uZSddID0gb3V0cHV0c1sncGFydGl0aW9ucyddWyd2YWx1ZSddWydncHUnXVsnYXZhaWxhYmlsaXR5Wm9uZSddIGlmIHBhcmFtc1snRGVmaW5lTm9kZXNBdmFpbGFiaWxpdHlab25lJ10gYW5kICdhdmFpbGFiaWxpdHlab25lJyBpbiBvdXRwdXRzWydwYXJ0aXRpb25zJ11bJ3ZhbHVlJ11bJ2dwdSddIGVsc2UgTm9uZQ0KDQogICAgI3NjaGVkdWxlciBub2RlDQogICAgI3BhcmFtc1snc2x1cm0nXSAjaXMgdGhpcyB0aGUgc2x1cm0gdmVyc2lvbj8/PyBubywgc28gd2hhdCBpcyBpdD8NCiAgICBwYXJhbXNbJ1NjaGVkdWxlck1hY2hpbmVUeXBlJ10gPSBvdXRwdXRzWydzY2hlZHVsZXJOb2RlJ11bJ3ZhbHVlJ11bJ3NrdSd
dDQogICAgcGFyYW1zWydTY2hlZHVsZXJJbWFnZU5hbWUnXSA9IG91dHB1dHNbJ3NjaGVkdWxlck5vZGUnXVsndmFsdWUnXVsnb3NJbWFnZSddDQogICAgcGFyYW1zWydjb25maWd1cmF0aW9uX3NsdXJtX3ZlcnNpb24nXSA9IG91dHB1dHNbJ3NsdXJtU2V0dGluZ3MnXVsndmFsdWUnXVsndmVyc2lvbiddDQogICAgIyBpZiBvdXRwdXRzWydzbHVybVNldHRpbmdzJ11bJ3ZhbHVlJ11bJ2NhblVzZVNsdXJtSEEnXToNCiAgICAjICAgICBwYXJhbXNbJ2NvbmZpZ3VyYXRpb25fc2x1cm1faGFfZW5hYmxlZCddID0gb3V0cHV0c1snc2x1cm1TZXR0aW5ncyddWyd2YWx1ZSddWydzbHVybUhBJ10NCiAgICBwYXJhbXNbJ2NvbmZpZ3VyYXRpb25fc2x1cm1fYWNjb3VudGluZ19lbmFibGVkJ10gPSBib29sKG91dHB1dHNbJ2RhdGFiYXNlSW5mbyddWyd2YWx1ZSddKQ0KICAgIGlmIHBhcmFtc1snY29uZmlndXJhdGlvbl9zbHVybV9hY2NvdW50aW5nX2VuYWJsZWQnXToNCiAgICAgICAgcGFyYW1zWydjb25maWd1cmF0aW9uX3NsdXJtX2FjY291bnRpbmdfdXNlciddID0gb3V0cHV0c1snZGF0YWJhc2VJbmZvJ11bJ3ZhbHVlJ11bJ2RhdGFiYXNlVXNlciddDQogICAgaWYgcGFyYW1zWydjb25maWd1cmF0aW9uX3NsdXJtX2FjY291bnRpbmdfZW5hYmxlZCddOg0KICAgICAgICBwYXJhbXNbJ2NvbmZpZ3VyYXRpb25fc2x1cm1fYWNjb3VudGluZ19wYXNzd29yZCddID0gZGJQYXNzd29yZA0KICAgIGlmIHBhcmFtc1snY29uZmlndXJhdGlvbl9zbHVybV9hY2NvdW50aW5nX2VuYWJsZWQnXToNCiAgICAgICAgcGFyYW1zWydjb25maWd1cmF0aW9uX3NsdXJtX2FjY291bnRpbmdfdXJsJ10gPSBvdXRwdXRzWydkYXRhYmFzZUluZm8nXVsndmFsdWUnXVsndXJsJ10NCiAgICAjcGFyYW1zWydjb25maWd1cmF0aW9uX3NsdXJtX2FjY291bnRpbmdfY2VydGlmaWNhdGVfdXJsJ10NCg0KICAgICNsb2dpbiBub2RlKHMpDQogICAgcGFyYW1zWydsb2dpbk1hY2hpbmVUeXBlJ10gPSAob3V0cHV0c1snbG9naW5Ob2RlcyddWyd2YWx1ZSddWydza3UnXSkuc3RyaXAoKQ0KICAgIHBhcmFtc1snTnVtYmVyTG9naW5Ob2RlcyddID0gaW50KG91dHB1dHNbJ2xvZ2luTm9kZXMnXVsndmFsdWUnXVsnaW5pdGlhbE5vZGVzJ10pDQogICAgcGFyYW1zWydMb2dpbkltYWdlTmFtZSddID0gb3V0cHV0c1snbG9naW5Ob2RlcyddWyd2YWx1ZSddWydvc0ltYWdlJ10NCiAgICBwYXJhbXNbJ0VuYWJsZU5vZGVIZWFsdGhDaGVja3MnXSA9IG91dHB1dHNbJ3NsdXJtU2V0dGluZ3MnXVsndmFsdWUnXVsnaGVhbHRoQ2hlY2tFbmFibGVkJ10NCg0KICAgICNFeGVjdXRlIG5vZGUgdGFncw0KICAgIHBhcmFtc1snTm9kZVRhZ3MnXSA9IG91dHB1dHNbJ25vZGVBcnJheVRhZ3MnXVsndmFsdWUnXQ0KDQogICAgI05ldHdvcmsgQXR0YWNoZWQgU3RvcmFnZQ0KICAgIHBhcmFtc1snVXNlQnVpbHRpblNoYXJlZCddID0gb3V0cHV0c1snZmlsZXJJbmZvRmluYWwnXVsndmFsdWUnXVsnaG9tZSddWyd0eXB
lJ10gPT0gJ25mcy1uZXcnIA0KICAgIGlmIHBhcmFtc1snVXNlQnVpbHRpblNoYXJlZCddOg0KICAgICAgICBwYXJhbXNbJ0ZpbGVzeXN0ZW1TaXplJ10gPSBvdXRwdXRzWydmaWxlckluZm9GaW5hbCddWyd2YWx1ZSddWydob21lJ11bJ25mc0NhcGFjaXR5SW5HYiddDQogICAgZWxzZToNCiAgICAgICAgcGFyYW1zWydORlNUeXBlJ10gPSAnbmZzJyBpZiBvdXRwdXRzWydmaWxlckluZm9GaW5hbCddWyd2YWx1ZSddWydob21lJ11bJ3R5cGUnXSBpbiBbJ25mcy1leGlzdGluZycsJ2FuZi1uZXcnXSBlbHNlICdsdXN0cmUnDQogICAgICAgICMgV2Ugbm8gbG9uZ2VyIG5lZWQgdG8gaGFuZGxlIHRoZXNlIGRpZmZlcmVudGx5IGJhc2VkIG9uIHRoZSBmcyB0eXBlLCBhcyBlYWNoDQogICAgICAgICMgZnMgbW9kdWxlJ3MgY29tbW9uIG91dHB1dHMgbWFwIHRvIHRoZXNlLg0KICAgICAgICBwYXJhbXNbJ05GU1NoYXJlZEV4cG9ydFBhdGgnXSA9IG91dHB1dHNbJ2ZpbGVySW5mb0ZpbmFsJ11bJ3ZhbHVlJ11bJ2hvbWUnXVsnZXhwb3J0UGF0aCddDQogICAgICAgIHBhcmFtc1snTkZTU2hhcmVkTW91bnRPcHRpb25zJ10gPSBvdXRwdXRzWydmaWxlckluZm9GaW5hbCddWyd2YWx1ZSddWydob21lJ11bJ21vdW50T3B0aW9ucyddDQogICAgICAgIHBhcmFtc1snTkZTQWRkcmVzcyddID0gb3V0cHV0c1snZmlsZXJJbmZvRmluYWwnXVsndmFsdWUnXVsnaG9tZSddWydpcEFkZHJlc3MnXQ0KDQogICAgcGFyYW1zWydBZGRpdGlvbmFsTkZTJ10gPSBvdXRwdXRzWydmaWxlckluZm9GaW5hbCddWyd2YWx1ZSddWydhZGRpdGlvbmFsJ11bJ3R5cGUnXSAhPSAnZGlzYWJsZWQnDQogICAgaWYgcGFyYW1zWydBZGRpdGlvbmFsTkZTJ106DQogICAgICAgIHBhcmFtc1snQWRkaXRpb25hbE5GU1R5cGUnXSA9ICduZnMnIGlmIG91dHB1dHNbJ2ZpbGVySW5mb0ZpbmFsJ11bJ3ZhbHVlJ11bJ2FkZGl0aW9uYWwnXVsndHlwZSddIGluIFsnbmZzLWV4aXN0aW5nJywnYW5mLW5ldyddIGVsc2UgJ2x1c3RyZScNCiAgICAgICAgcGFyYW1zWydBZGRpdGlvbmFsTkZTTW91bnRQb2ludCddID0gb3V0cHV0c1snZmlsZXJJbmZvRmluYWwnXVsndmFsdWUnXVsnYWRkaXRpb25hbCddWydtb3VudFBhdGgnXQ0KICAgICAgICBwYXJhbXNbJ0FkZGl0aW9uYWxORlNFeHBvcnRQYXRoJ10gPSBvdXRwdXRzWydmaWxlckluZm9GaW5hbCddWyd2YWx1ZSddWydhZGRpdGlvbmFsJ11bJ2V4cG9ydFBhdGgnXQ0KICAgICAgICBwYXJhbXNbJ0FkZGl0aW9uYWxORlNNb3VudE9wdGlvbnMnXSA9IG91dHB1dHNbJ2ZpbGVySW5mb0ZpbmFsJ11bJ3ZhbHVlJ11bJ2FkZGl0aW9uYWwnXVsnbW91bnRPcHRpb25zJ10NCiAgICAgICAgcGFyYW1zWydBZGRpdGlvbmFsTkZTQWRkcmVzcyddID0gb3V0cHV0c1snZmlsZXJJbmZvRmluYWwnXVsndmFsdWUnXVsnYWRkaXRpb25hbCddWydpcEFkZHJlc3MnXQ0KDQogICAgIyBNb25pdG9yaW5nDQogICAgcGFyYW1zWydNb25pdG9yaW5nRW5hYmxlZCddID0gb3V0cHV
0c1snbW9uaXRvcmluZyddWyJ2YWx1ZSJdWydpbmdlc3Rpb25FbmRwb2ludCddICE9ICcnDQogICAgcGFyYW1zWydNb25pdG9yaW5nSW5nZXN0aW9uRW5kcG9pbnQnXSA9IG91dHB1dHNbJ21vbml0b3JpbmcnXVsndmFsdWUnXVsnaW5nZXN0aW9uRW5kcG9pbnQnXQ0KICAgIHBhcmFtc1snTW9uaXRvcmluZ0lkZW50aXR5Q2xpZW50SWQnXSA9IG91dHB1dHNbJ21vbml0b3JpbmcnXVsndmFsdWUnXVsnaWRlbnRpdHlDbGllbnRJZCddDQoNCiAgICBwYXJhbXNbJ01hbmFnZWRJZGVudGl0eSddID0gb3V0cHV0c1snaHViTUknXVsndmFsdWUnXQ0KDQoNCmRlZiBzZXRfb29kX3BhcmFtcyhwYXJhbXMsIG91dHB1dHMpOg0KICAgIHNsdXJtX3BhcmFtcyA9IGdldF9qc29uX2RpY3QoJ2luaXRpYWxfcGFyYW1zLmpzb24nKQ0KICAgICMgV2Ugd2FudCB0byBlc3NlbnRpYWxseSBpbmhlcml0IGNlcnRhaW4gc2V0dGluZ3MgZnJvbSB0aGUgc2x1cm0gY2x1c3Rlci4NCiAgICBzZXRfc2x1cm1fcGFyYW1zKHNsdXJtX3BhcmFtcywgIiIsIG91dHB1dHMpDQogICAgcGFyYW1zWydORlNBZGRyZXNzJ10gPSBzbHVybV9wYXJhbXMuZ2V0KCdORlNBZGRyZXNzJykgb3IgJ2Njdy1zY2hlZHVsZXInDQogICAgcGFyYW1zWydORlNTaGFyZWRFeHBvcnRQYXRoJ10gPSBzbHVybV9wYXJhbXMuZ2V0KCdORlNTaGFyZWRFeHBvcnRQYXRoJykgb3IgJy9zaGFyZWQnDQogICAgcGFyYW1zWydORlNTaGFyZWRNb3VudE9wdGlvbnMnXSA9IHNsdXJtX3BhcmFtcy5nZXQoJ05GU1NoYXJlZE1vdW50T3B0aW9ucycpDQogICAgcGFyYW1zWydTdWJuZXRJZCddID0gc2x1cm1fcGFyYW1zWyJTdWJuZXRJZCJdDQogICAgcGFyYW1zWydSZWdpb24nXSA9IHNsdXJtX3BhcmFtc1snUmVnaW9uJ10NCiAgICBwYXJhbXNbJ0NyZWRlbnRpYWxzJ10gPSBzbHVybV9wYXJhbXNbJ0NyZWRlbnRpYWxzJ10NCg0KICAgIHBhcmFtc1snTWFjaGluZVR5cGUnXSA9IG91dHB1dHNbJ29vZCddWyd2YWx1ZSddLmdldCgnc2t1JykNCiAgICBwYXJhbXNbJ01hbmFnZWRJZGVudGl0eSddID0gb3V0cHV0c1snb29kJ11bJ3ZhbHVlJ10uZ2V0KCdtYW5hZ2VkSWRlbnRpdHknKQ0KICAgIHBhcmFtc1snQm9vdERpc2tTaXplJ10gPSBvdXRwdXRzWydvb2QnXVsndmFsdWUnXS5nZXQoJ0Jvb3REaXNrU2l6ZScpDQogICAgcGFyYW1zWydJbWFnZU5hbWUnXSA9IG91dHB1dHNbJ29vZCddWyd2YWx1ZSddLmdldCgnb3NJbWFnZScpDQoNCiAgICBwYXJhbXNbJ29vZF9zZXJ2ZXJfbmFtZSddID0gb3V0cHV0c1snb29kJ11bJ3ZhbHVlJ10uZ2V0KCdmcWRuJywnJykNCiAgICBwYXJhbXNbJ29vZF9lbnRyYV91c2VyX21hcF9tYXRjaCddID0gb3V0cHV0c1snb29kJ11bJ3ZhbHVlJ10uZ2V0KCd1c2VyRG9tYWluJykNCiAgICBwYXJhbXNbJ29vZF9lbnRyYV9jbGllbnRfaWQnXSA9IG91dHB1dHNbJ29vZCddWyd2YWx1ZSddLmdldCgnY2xpZW50SWQnKQ0KICAgIHBhcmFtc1snb29kX2VudHJhX3RlbmFudF9pZCddID0gb3V0cHV
0c1snb29kJ11bJ3ZhbHVlJ10uZ2V0KCd0ZW5hbnRJZCcpDQogICAgcGFyYW1zWydvb2RfbmljJ10gPSBvdXRwdXRzWydvb2QnXVsndmFsdWUnXS5nZXQoJ25pYycpDQoNCg0KY2xhc3MgQ2x1c3RlckluaXRTcGVjOg0KICAgIGRlZiBfX2luaXRfXyhzZWxmLCBwcm9qZWN0OiBzdHIsIHZlcnNpb246IHN0ciwgc3BlYzogc3RyLCB0YXJnZXRzOiB0eXBpbmcuTGlzdFtzdHJdKToNCiAgICAgICAgc2VsZi5wcm9qZWN0ID0gcHJvamVjdA0KICAgICAgICBzZWxmLnZlcnNpb24gPSB2ZXJzaW9uDQogICAgICAgIHNlbGYuc3BlYyA9IHNwZWMNCiAgICAgICAgc2VsZi50YXJnZXRzID0gdGFyZ2V0cw0KICAgICAgICBzZWxmLmNsdXN0ZXJfaW5pdF9rZXkgPSBmIntzZWxmLnByb2plY3R9OntzZWxmLnNwZWN9OntzZWxmLnZlcnNpb259Ig0KDQoNCmRlZiBkb3dubG9hZF9jbHVzdGVyX2luaXQob3V0cHV0cywgcm9vdF9mb2xkZXIsIGxvY2tlcikgLT4gdHlwaW5nLkxpc3RbQ2x1c3RlckluaXRTcGVjXToNCiAgICByZXQgPSBbXQ0KICAgIGZvciByZWNvcmQgaW4gKG91dHB1dHNbJ2NsdXN0ZXJJbml0U3BlY3MnXS5nZXQoInZhbHVlIikgb3IgW10pOg0KICAgICAgICB1cmwgPSBfc3RyaXBfdGFnc19mcm9tX2dpdGh1Yl91cmwocmVjb3JkKQ0KICAgICAgICB1cmxfaGFzaCA9IGhhc2hsaWIuc2hhMjU2KHVybC5lbmNvZGUoKSkNCiAgICAgICAgDQogICAgICAgIGZvbGRlciA9IG9zLnBhdGguam9pbihyb290X2ZvbGRlciwgdXJsX2hhc2guaGV4ZGlnZXN0KCkpDQogICAgICAgIGlmIG5vdCBvcy5wYXRoLmV4aXN0cyhmb2xkZXIpOg0KICAgICAgICAgICAgIyBkb3dubG9hZCBhbmQgbW92ZSB0byBhdm9pZCByZXBlYXRlZCBmYWlsdXJlcyB3aXRoIHBhcnRpYWwgZG93bmxvYWRzL3VwbG9hZHMNCiAgICAgICAgICAgIGNoZWNrX291dHB1dChbIi91c3IvbG9jYWwvYmluL2N5Y2xlY2xvdWQiLCAicHJvamVjdCIsICJmZXRjaCIsIHVybCwgZm9sZGVyICsgIi50bXAiXSkNCiAgICAgICAgICAgIGNoZWNrX291dHB1dChbIi91c3IvbG9jYWwvYmluL2N5Y2xlY2xvdWQiLCAicHJvamVjdCIsICJ1cGxvYWQiLCBsb2NrZXJdLCBjd2Q9Zm9sZGVyICsgIi50bXAiKQ0KICAgICAgICAgICAgc2h1dGlsLm1vdmUoZm9sZGVyICsgIi50bXAiLCBmb2xkZXIpDQogICAgICAgICAgICB3aXRoIG9wZW4ob3MucGF0aC5qb2luKGZvbGRlciwgImRvd25sb2FkLXVybCIpLCAidyIpIGFzIGZ3Og0KICAgICAgICAgICAgICAgIGZ3LndyaXRlKHVybCkNCiAgICAgICAgcHJval9pbmZvX3JhdyA9IGNoZWNrX291dHB1dChbIi91c3IvbG9jYWwvYmluL2N5Y2xlY2xvdWQiLCAicHJvamVjdCIsICJpbmZvIl0sIGN3ZD1mb2xkZXIpLmRlY29kZSgpDQogICAgICAgIHByb2pfaW5mbyA9IHt9DQogICAgICAgIGZvciBsaW5lIGluIHByb2pfaW5mb19yYXcuc3BsaXRsaW5lcygpOg0KICAgICAgICAgICAga2V5LCByZXN0ID0gbGluZS5zcGxpdCgiOiIsIDEpDQogICAgICAgICAgICBwcm9
qX2luZm9ba2V5Lmxvd2VyKCldID0gcmVzdC5zdHJpcCgpDQogICAgICAgIHJldC5hcHBlbmQoQ2x1c3RlckluaXRTcGVjKHByb2pfaW5mb1sibmFtZSJdLA0KICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBwcm9qX2luZm9bInZlcnNpb24iXSwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgcmVjb3JkLmdldCgic3BlYyIpIG9yICJkZWZhdWx0IiwNCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgcmVjb3JkWyJ0YXJnZXQiXSkpDQogICAgcmV0dXJuIHJldA0KDQoNCmRlZiBfc3RyaXBfdGFnc19mcm9tX2dpdGh1Yl91cmwocmVjb3JkKToNCiAgICB1cmwgPSByZWNvcmRbImdpdEh1YlJlbGVhc2VVUkwiXQ0KICAgIGlmICIvdGFnLyIgaW4gdXJsOg0KICAgICAgICByZXR1cm4gdXJsLnJlcGxhY2UoIi90YWciLCAiIikNCiAgICByZXR1cm4gdXJsDQoNCg0KZGVmIF92ZXJzaW9uX2Zyb21fdXJsKHJlY29yZCk6DQogICAgaWYgcmVjb3JkLmdldCgidmVyc2lvbiIpOg0KICAgICAgICByZXR1cm4gcmVjb3JkWyJ2ZXJzaW9uIl0NCiAgICByZXR1cm4gcmVjb3JkWyJnaXRIdWJSZWxlYXNlVVJMIl0uc3BsaXQoIi8iKVstMV0NCg0KDQpkZWYgc2V0X2NsdXN0ZXJfaW5pdF9wYXJhbXMocGFyYW1zOiBkaWN0LCBzcGVjczogdHlwaW5nLkxpc3RbQ2x1c3RlckluaXRTcGVjXSwgY2x1c3Rlcl9uYW1lOiBzdHIsIHRhcmdldF9wYXJhbXM6IGRpY3QpIC0+IE5vbmU6DQogICAgb3JkZXIgPSAxMDAwMA0KICAgIGZvciBzcGVjIGluIHNwZWNzOg0KICAgICAgICBmb3IgdGFyZ2V0IGluIHNwZWMudGFyZ2V0czoNCiAgICAgICAgICAgIHRhcmdldF9rZXkgPSBmInt0YXJnZXRfcGFyYW1zW3RhcmdldC5sb3dlcigpXX0iDQogICAgICAgICAgICBpZiBub3QgcGFyYW1zLmdldCh0YXJnZXRfa2V5KToNCiAgICAgICAgICAgICAgICBwYXJhbXNbdGFyZ2V0X2tleV0gPSB7fQ0KDQogICAgICAgICAgICBwYXJhbXNbdGFyZ2V0X2tleV1bc3BlYy5jbHVzdGVyX2luaXRfa2V5XSA9IHsNCiAgICAgICAgICAgICAgICAiT3JkZXIiOiBvcmRlciwNCiAgICAgICAgICAgICAgICAiU3BlYyI6IHNwZWMuc3BlYywNCiAgICAgICAgICAgICAgICAiTmFtZSI6IHNwZWMuY2x1c3Rlcl9pbml0X2tleSwNCiAgICAgICAgICAgICAgICAiUHJvamVjdCI6IHNwZWMucHJvamVjdCwNCiAgICAgICAgICAgICAgICAiTG9ja2VyIjogImF6dXJlLXN0b3JhZ2UiLA0KICAgICAgICAgICAgICAgICJWZXJzaW9uIjogc3BlYy52ZXJzaW9uDQogICAgICAgICAgICB9DQogICAgICAgICAgICBvcmRlciArPSAxMDANCg0KDQpkZWYgbWFpbigpOg0KICAgIHBhcnNlciA9IGFyZ3BhcnNlLkFyZ3VtZW50UGFyc2VyKGRlc2NyaXB0aW9uPSJUT0RPIFJESCIpDQogICAgcGFyc2VyLmFkZF9hcmd1bWVudCgiLS1sb2NrZXIiLCBkZWZhdWx0PSJhenVyZS1zdG9yYWdlIikNCiAgICBwYXJzZXIuYWRkX2FyZ3VtZW50KCItLWNsdXN0ZXItaW5pdC13b3JraW5
nLWRpciIsIGRlZmF1bHQ9ImNsdXN0ZXItaW5pdCIpDQogICAgc3VicGFyc2VycyA9IHBhcnNlci5hZGRfc3VicGFyc2VycygpDQogICAgY2N3X3BhcnNlciA9IHN1YnBhcnNlcnMuYWRkX3BhcnNlcigic2x1cm0iKQ0KICAgICMgVE9ETyB0aGlzIG5lZWRzIHRvIGJlIGJ5IGNsdXN0ZXIgdHlwZQ0KICAgIHRhcmdldF9wYXJhbXMgPSB7DQogICAgICAgICJsb2dpbiI6ICJMb2dpbkNsdXN0ZXJJbml0U3BlY3MiLA0KICAgICAgICAiZ3B1IjogIkdQVUNsdXN0ZXJJbml0U3BlY3MiLA0KICAgICAgICAiaHBjIjogIkhQQ0NsdXN0ZXJJbml0U3BlY3MiLA0KICAgICAgICAiZDY0ZCI6ICJENjREQ2x1c3RlckluaXRTcGVjcyIsDQogICAgICAgICJkMTZkIjogIkQxNkRDbHVzdGVySW5pdFNwZWNzIiwNCiAgICAgICAgIm02NCI6ICJNNjRDbHVzdGVySW5pdFNwZWNzIiwNCiAgICAgICAgInNjaGVkdWxlciI6ICJTY2hlZHVsZXJDbHVzdGVySW5pdFNwZWNzIiwNCiAgICAgICAgImR5bmFtaWMiOiAiRHluYW1pY0NsdXN0ZXJJbml0U3BlY3MiLA0KICAgICAgICAib29kIjogIkNsdXN0ZXJJbml0U3BlY3MiDQogICAgfQ0KICAgIGNjd19wYXJzZXIuc2V0X2RlZmF1bHRzKGNsdXN0ZXJfdHlwZT0ic2x1cm0iLCB0YXJnZXRfcGFyYW1zPXRhcmdldF9wYXJhbXMpDQogICAgY2N3X3BhcnNlci5hZGRfYXJndW1lbnQoIi0tZGJQYXNzd29yZCIsIGRlc3Q9ImRiUGFzc3dvcmQiLCBkZWZhdWx0PSIiLCBoZWxwPSJNeVNRTCBkYXRhYmFzZSBwYXNzd29yZCIpDQogICAgDQogICAgb29kX3BhcnNlciA9IHN1YnBhcnNlcnMuYWRkX3BhcnNlcigib29kIikNCiAgICBvb2RfcGFyc2VyLnNldF9kZWZhdWx0cyhjbHVzdGVyX3R5cGU9Im9vZCIsIHRhcmdldF9wYXJhbXM9dGFyZ2V0X3BhcmFtcykNCiAgICANCiAgICBhcmdzID0gcGFyc2VyLnBhcnNlX2FyZ3MoKQ0KDQogICAgaWYgYXJncy5jbHVzdGVyX3R5cGUgPT0gInNsdXJtIjoNCiAgICAgICAgb3V0cHV0X3BhcmFtcyA9IGdldF9qc29uX2RpY3QoJ2luaXRpYWxfcGFyYW1zLmpzb24nKQ0KICAgIGVsc2U6DQogICAgICAgIG91dHB1dF9wYXJhbXMgPSB7fQ0KICAgIGNjd19vdXRwdXRzID0gZ2V0X2pzb25fZGljdCgnY2N3T3V0cHV0cy5qc29uJykNCg0KICAgIHNwZWNzID0gZG93bmxvYWRfY2x1c3Rlcl9pbml0KGNjd19vdXRwdXRzLCBvcy5wYXRoLmpvaW4ob3MuZ2V0Y3dkKCksIGFyZ3MuY2x1c3Rlcl9pbml0X3dvcmtpbmdfZGlyKSwgYXJncy5sb2NrZXIpDQogICAgc2V0X2NsdXN0ZXJfaW5pdF9wYXJhbXMob3V0cHV0X3BhcmFtcywgc3BlY3MsIGFyZ3MuY2x1c3Rlcl90eXBlLCBhcmdzLnRhcmdldF9wYXJhbXMpDQogICAgaWYgYXJncy5jbHVzdGVyX3R5cGUgPT0gInNsdXJtIjoNCiAgICAgICAgc2V0X3NsdXJtX3BhcmFtcyhvdXRwdXRfcGFyYW1zLCBhcmdzLmRiUGFzc3dvcmQsIGNjd19vdXRwdXRzKQ0KICAgIGVsc2U6DQogICAgICAgIHNldF9vb2RfcGFyYW1zKG91dHB1dF9wYXJhbXMsIGNjd19vdXR
wdXRzKQ0KICAgIHByaW50KGpzb24uZHVtcHMob3V0cHV0X3BhcmFtcywgaW5kZW50PTQpKQ0KDQoNCmlmIF9fbmFtZV9fID09ICdfX21haW5fXyc6DQogICAgbWFpbigp \ No newline at end of file From 0d78761ed459a0dc06a8f771994df261ae28f156 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Thu, 17 Jul 2025 12:25:57 -0400 Subject: [PATCH 47/50] Raise monitoring project version to 1.0.1 --- bicep/mainTemplate.bicep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bicep/mainTemplate.bicep b/bicep/mainTemplate.bicep index 02124d23..a153e461 100644 --- a/bicep/mainTemplate.bicep +++ b/bicep/mainTemplate.bicep @@ -50,7 +50,7 @@ param manualInstall bool = false param monitoringIngestionEndpoint string = '' param monitoringIdentityClientId string = '' -param monitoringProjectVersion string = '1.0.0' +param monitoringProjectVersion string = '1.0.1' resource ccwResourceGroup 'Microsoft.Resources/resourceGroups@2024-03-01' = { name: resourceGroup From e1763e995d4d1f1bfa26fc4c60ae314076f36ea0 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Thu, 17 Jul 2025 15:00:52 -0400 Subject: [PATCH 48/50] Update monitoring project version to 1.0.1 in hub creation script --- bicep/hub/create_hub.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bicep/hub/create_hub.sh b/bicep/hub/create_hub.sh index 42da070a..868c7a59 100755 --- a/bicep/hub/create_hub.sh +++ b/bicep/hub/create_hub.sh @@ -129,7 +129,7 @@ az deployment group create \ $WHATIF_FLAG # Deploy monitoring -MONITORING_PROJECT_VERSION="1.0.0" +MONITORING_PROJECT_VERSION="1.0.1" echo "Deploying monitoring" mkdir build/ pushd build From 59d97ad1ef9166514a5fd76a7a26f0e5c8e3ddb5 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Fri, 18 Jul 2025 10:36:14 -0400 Subject: [PATCH 49/50] Assign Monitoring Metrics Publisher role to hub MI for DCR RG rather than hub RG --- bicep/hub/create_hub.sh | 11 +++++++---- bicep/hub/create_hub_mi.sh | 4 +++- bicep/hub/hub-mi-dcr.bicep | 15 +++++++++++++++ bicep/hub/hub-mi.bicep | 14 
+++++++++++++- 4 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 bicep/hub/hub-mi-dcr.bicep diff --git a/bicep/hub/create_hub.sh b/bicep/hub/create_hub.sh index 868c7a59..abd12ce9 100755 --- a/bicep/hub/create_hub.sh +++ b/bicep/hub/create_hub.sh @@ -80,9 +80,6 @@ az deployment group create \ echo "Virtual network deployment is complete. Please enter the Azure Portal to create a VPN Gateway while the remainder of this script runs." -echo "Deploying hub managed identity..." -./create_hub_mi.sh "${RESOURCE_GROUP}" "${LOCATION}" - echo "Deploying Bastion" # Deploy Bastion bastion_subnet_id=$(az network vnet subnet show -g "${RESOURCE_GROUP}" -n AzureBastionSubnet --vnet-name "hub-vnet-${RESOURCE_GROUP}" | jq '.id' | tr -d '"') @@ -131,7 +128,7 @@ az deployment group create \ # Deploy monitoring MONITORING_PROJECT_VERSION="1.0.1" echo "Deploying monitoring" -mkdir build/ +rm -rf build && mkdir build/ pushd build git clone --branch "${MONITORING_PROJECT_VERSION}" https://github.com/Azure/cyclecloud-monitoring.git @@ -144,6 +141,12 @@ fi popd popd + +DCR_RESOURCE_GROUP=$(az deployment group show -g "${RESOURCE_GROUP}" -n managedMonitoring --query properties.outputs.dcrResourceId.value | tr -d '"' | cut -d '/' -f5) + +echo "Deploying hub managed identity..." +./create_hub_mi.sh "${RESOURCE_GROUP}" "${LOCATION}" "${DCR_RESOURCE_GROUP}" + popd echo "Done!" 
\ No newline at end of file diff --git a/bicep/hub/create_hub_mi.sh b/bicep/hub/create_hub_mi.sh index 3ebaa724..1011e24b 100755 --- a/bicep/hub/create_hub_mi.sh +++ b/bicep/hub/create_hub_mi.sh @@ -2,8 +2,10 @@ set -e RG=$1 LOCATION=$2 +DCR_RG=$3 az deployment group create \ --name "$RG-hub-mi" \ --resource-group "$RG" \ - --template-file $(pwd)/hub-mi.bicep \ \ No newline at end of file + --template-file $(pwd)/hub-mi.bicep \ + --parameters dcrResourceGroup="${DCR_RG}" \ \ No newline at end of file diff --git a/bicep/hub/hub-mi-dcr.bicep b/bicep/hub/hub-mi-dcr.bicep new file mode 100644 index 00000000..c3159593 --- /dev/null +++ b/bicep/hub/hub-mi-dcr.bicep @@ -0,0 +1,15 @@ +targetScope = 'resourceGroup' +import * as exports from '.././exports.bicep' + +param miPrincipalId string +param miId string + +var role = 'Monitoring Metrics Publisher' +resource roleAssignments_dcr 'Microsoft.Authorization/roleAssignments@2022-04-01' = { + name: guid(subscription().id, miId, exports.role_lookup[role]) + properties: { + roleDefinitionId: exports.role_lookup[role] + principalId: miPrincipalId + principalType: 'ServicePrincipal' + } +} diff --git a/bicep/hub/hub-mi.bicep b/bicep/hub/hub-mi.bicep index fbfb5d4c..dda22ef2 100644 --- a/bicep/hub/hub-mi.bicep +++ b/bicep/hub/hub-mi.bicep @@ -4,6 +4,7 @@ import * as exports from '.././exports.bicep' param name string = '${resourceGroup().name}-mi' param location string = resourceGroup().location +param dcrResourceGroup string param tags tags_t = {} //create managed identity for VMSSs @@ -16,7 +17,6 @@ resource managedIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023- var roles = [ 'Storage Blob Data Reader' 'Storage Blob Data Contributor' - 'Monitoring Metrics Publisher' ] resource roleAssignments 'Microsoft.Authorization/roleAssignments@2022-04-01' = [ for role in roles: { @@ -29,5 +29,17 @@ resource roleAssignments 'Microsoft.Authorization/roleAssignments@2022-04-01' = } }] +module dcrMIRoleAssignments 
'./hub-mi-dcr.bicep' = { + name: 'roleForMonitoringDCR' + scope: resourceGroup(dcrResourceGroup) + params: { + miPrincipalId: managedIdentity.properties.principalId + miId: managedIdentity.id + } + dependsOn: [ + roleAssignments + ] +} + output hubMI string = managedIdentity.id output hubMIClientId string = managedIdentity.properties.clientId From 55cd9201ae8adc62864e215c2da1a8b45e25b9e8 Mon Sep 17 00:00:00 2001 From: Andrew Batallas Date: Fri, 8 Aug 2025 14:53:36 -0400 Subject: [PATCH 50/50] Script for in-place update of CC to latest insiders fast build with CC-Slurm 4.0.0 --- bicep/hub/inplace_update.sh | 56 +++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 bicep/hub/inplace_update.sh diff --git a/bicep/hub/inplace_update.sh b/bicep/hub/inplace_update.sh new file mode 100644 index 00000000..e781122d --- /dev/null +++ b/bicep/hub/inplace_update.sh @@ -0,0 +1,56 @@ +#!/bin/bash +set -e +cd "$(dirname "$0")" + +# Identify cyclecloud8 build version +INITIAL_CC_BUILD_VERSION=$(cat /opt/cycle_server/system/version) + +# Update cyclecloud yum repo to insiders-fast +sed -i 's|^\(baseurl=.*\)/cyclecloud$|\1/cyclecloud-insiders-fast|' /etc/yum.repos.d/cyclecloud.repo + +# Clean and rebuild yum cache +yum clean all +yum makecache + +# Update cyclecloud8 to latest version +yum -y update cyclecloud8 + +# Check if the update was successful +UPDATED_CC_BUILD_VERSION=$(cat /opt/cycle_server/system/version) +if [ "$UPDATED_CC_BUILD_VERSION" != "$INITIAL_CC_BUILD_VERSION" ]; then + echo "CycleCloud updated successfully from version $INITIAL_CC_BUILD_VERSION to $UPDATED_CC_BUILD_VERSION." +else + echo "CycleCloud update failed or no new version available." 
+ exit 1 +fi + +# Update cyclecloud-monitoring project to latest release +/usr/local/bin/cyclecloud project fetch https://github.com/Azure/cyclecloud-monitoring/releases/1.0.2 /tmp/cyclecloud-monitoring +pushd /tmp/cyclecloud-monitoring +/usr/local/bin/cyclecloud project upload azure-storage +popd +rm -rf /tmp/cyclecloud-monitoring + +# Insert CC-Slurm 4.0.0 project record +cat >/opt/cycle_server/config/data/slurm400.txt<