diff --git a/README.md b/README.md index bdb9fd9..2365980 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,6 @@ Clone the repository to your local machine. Then make sure you have all the prer 1. An Azure subscription. If you don't have an Azure subscription, you can create a free account [here](https://azure.microsoft.com/free/). 1. The Azure CLI installed on your local machine. You can install the Azure CLI by following the instructions [here](https://docs.microsoft.com/cli/azure/install-azure-cli). -1. The [Azure Kubernetes Service Preview extension](https://learn.microsoft.com/azure/aks/draft#install-the-aks-preview-azure-cli-extension) must be installed. 1. [Helm](https://helm.sh/docs/intro/install/) must be installed. 1. [Terraform client tools](https://developer.hashicorp.com/terraform/install) or [OpenTofu](https://opentofu.org/) must be installed. This guide makes use of Terraform, however the modules used should be compatible with OpenTofu. diff --git a/aks-automatic/deploy.sh b/aks-automatic/deploy.sh index eb266ae..95eb156 100755 --- a/aks-automatic/deploy.sh +++ b/aks-automatic/deploy.sh @@ -1,5 +1,8 @@ #!/bin/bash +# Ray version to use (can be overridden via environment variable) +RAY_VERSION=${RAY_VERSION:-"2.54.1"} + # Check if the user is logged into Azure CLI if ! az account show > /dev/null 2>&1; then echo "Please login to Azure CLI using 'az login' before running this script." 
@@ -86,7 +89,7 @@ helm upgrade \ --timeout 10m0s \ --namespace "$kuberay_namespace" \ --create-namespace kuberay-operator kuberay/kuberay-operator \ ---version 1.1.1 +--version 1.6.0 # Output the pods in the kuberay namespace kubectl get pods -n $kuberay_namespace @@ -163,6 +166,10 @@ EOF # Download the PyTorch MNIST job YAML file curl -LO https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/pytorch-mnist/ray-job.pytorch-mnist.yaml +# Update the Ray version in the downloaded job YAML +sed -i "s/rayVersion: '.*'/rayVersion: '${RAY_VERSION}'/g" ray-job.pytorch-mnist.yaml +sed -i "s|image: rayproject/ray:.*|image: rayproject/ray:${RAY_VERSION}|g" ray-job.pytorch-mnist.yaml + # Train a PyTorch Model on Fashion MNIST kubectl apply -n $kuberay_namespace -f ray-job.pytorch-mnist.yaml diff --git a/aks-automatic/main.tf b/aks-automatic/main.tf index d7ad774..f44fc39 100644 --- a/aks-automatic/main.tf +++ b/aks-automatic/main.tf @@ -19,7 +19,7 @@ resource "tls_private_key" "ssh_key" { } resource "azapi_resource" "aks_auto" { - type = "Microsoft.ContainerService/managedClusters@2024-06-02-preview" + type = "Microsoft.ContainerService/managedClusters@2026-01-01" name = "aks-${var.project_prefix}-${random_string.suffix.result}" parent_id = azurerm_resource_group.rg.id location = var.location @@ -28,7 +28,7 @@ resource "azapi_resource" "aks_auto" { body = jsonencode({ properties = { - kubernetesVersion = "1.31" + kubernetesVersion = "1.33" nodeResourceGroup = "MC-aks-${var.project_prefix}-${random_string.suffix.result}" agentPoolProfiles = [ { @@ -97,7 +97,7 @@ resource "null_resource" "wait_for_aks" { } resource "azapi_update_resource" "k8s-default-node-pool-systempool-taint" { - type = "Microsoft.ContainerService/managedClusters@2024-06-02-preview" + type = "Microsoft.ContainerService/managedClusters@2026-01-01" resource_id = azapi_resource.aks_auto.id body = jsonencode({ properties = { diff --git 
a/aks-automatic/ray-job.pytorch-mnist-persist-logs.yaml b/aks-automatic/ray-job.pytorch-mnist-persist-logs.yaml index 2a73835..a465aa3 100644 --- a/aks-automatic/ray-job.pytorch-mnist-persist-logs.yaml +++ b/aks-automatic/ray-job.pytorch-mnist-persist-logs.yaml @@ -38,7 +38,7 @@ spec: CPUS_PER_WORKER: "2" # rayClusterSpec specifies the RayCluster instance to be created by the RayJob controller. rayClusterSpec: - rayVersion: '2.41.0' + rayVersion: '2.54.1' headGroupSpec: rayStartParams: {} # Pod template @@ -46,7 +46,7 @@ spec: spec: containers: - name: ray-head - image: rayproject/ray:2.41.0 + image: rayproject/ray:2.54.1 ports: - containerPort: 6379 name: gcs-server @@ -104,7 +104,7 @@ spec: spec: containers: - name: ray-worker - image: rayproject/ray:2.41.0 + image: rayproject/ray:2.54.1 resources: limits: cpu: "3" diff --git a/aks-classic/deploy.sh b/aks-classic/deploy.sh index 01113bd..bba5065 100755 --- a/aks-classic/deploy.sh +++ b/aks-classic/deploy.sh @@ -1,5 +1,8 @@ #!/bin/bash +# Ray version to use (can be overridden via environment variable) +RAY_VERSION=${RAY_VERSION:-"2.54.1"} + # Check if the user is logged into Azure CLI if ! az account show > /dev/null 2>&1; then echo "Please login to Azure CLI using 'az login' before running this script." 
@@ -53,7 +56,7 @@ helm upgrade \ --timeout 10m0s \ --namespace "$kuberay_namespace" \ --create-namespace kuberay-operator kuberay/kuberay-operator \ ---version 1.1.1 +--version 1.6.0 # Output the pods in the kuberay namespace kubectl get pods -n $kuberay_namespace @@ -61,6 +64,10 @@ kubectl get pods -n $kuberay_namespace # Download the PyTorch MNIST job YAML file curl -LO https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/pytorch-mnist/ray-job.pytorch-mnist.yaml +# Update the Ray version in the downloaded job YAML +sed -i "s/rayVersion: '.*'/rayVersion: '${RAY_VERSION}'/g" ray-job.pytorch-mnist.yaml +sed -i "s|image: rayproject/ray:.*|image: rayproject/ray:${RAY_VERSION}|g" ray-job.pytorch-mnist.yaml + # Train a PyTorch Model on Fashion MNIST kubectl apply -n $kuberay_namespace -f ray-job.pytorch-mnist.yaml diff --git a/aks-classic/main.tf b/aks-classic/main.tf index e7e6052..c8042fe 100644 --- a/aks-classic/main.tf +++ b/aks-classic/main.tf @@ -27,10 +27,11 @@ resource "azurerm_kubernetes_cluster" "k8s" { } default_node_pool { - name = "systempool" - vm_size = var.system_node_pool_vm_size - node_count = var.system_node_pool_node_count - tags = { owner = var.resource_group_owner } + name = "systempool" + vm_size = var.system_node_pool_vm_size + node_count = var.system_node_pool_node_count + only_critical_addons_enabled = true + tags = { owner = var.resource_group_owner } } linux_profile { @@ -71,7 +72,7 @@ resource "null_resource" "wait_for_aks" { } resource "azapi_update_resource" "k8s-default-node-pool-systempool-taint" { - type = "Microsoft.ContainerService/managedClusters@2024-09-02-preview" + type = "Microsoft.ContainerService/managedClusters@2026-01-01" resource_id = azurerm_kubernetes_cluster.k8s.id body = jsonencode({ properties = { diff --git a/docs/deploy-ray-aks.md b/docs/deploy-ray-aks.md index 4185e57..163a62b 100644 --- a/docs/deploy-ray-aks.md +++ b/docs/deploy-ray-aks.md @@ -12,7 +12,6 @@ This article provides 
two methods to deploy the Ray cluster on AKS: * Review the [Ray cluster on AKS overview](./ray-on-aks.md) to understand the components and deployment process. * An Azure subscription. If you don't have an Azure subscription, you can create a free account [here](https://azure.microsoft.com/free/). * The Azure CLI installed on your local machine. You can install it using the instructions in [How to install the Azure CLI](https://docs.microsoft.com/cli/azure/install-azure-cli). -* The [Azure Kubernetes Service Preview extension](https://learn.microsoft.com/azure/aks/draft#install-the-aks-preview-azure-cli-extension) installed. * [Helm](https://helm.sh/docs/intro/install/) installed. * [Terraform client tools](https://developer.hashicorp.com/terraform/install) or [OpenTofu](https://opentofu.org/) installed. This article uses Terraform, but the modules used should be compatible with OpenTofu. diff --git a/sample-tuning-setup/direct-blob-access/deploy.sh b/sample-tuning-setup/direct-blob-access/deploy.sh index 74a8045..78dc849 100755 --- a/sample-tuning-setup/direct-blob-access/deploy.sh +++ b/sample-tuning-setup/direct-blob-access/deploy.sh @@ -255,7 +255,7 @@ fi echo "Deploying KubeRay Operator in ${KUBERAY_NAMESPACE} namespace" helm repo add kuberay https://ray-project.github.io/kuberay-helm/ -helm install kuberay-operator kuberay/kuberay-operator --version 1.4.2 --namespace ${KUBERAY_NAMESPACE} --create-namespace +helm install kuberay-operator kuberay/kuberay-operator --version 1.6.0 --namespace ${KUBERAY_NAMESPACE} --create-namespace # Output the pods in the kuberay namespace kubectl get pods -n ${KUBERAY_NAMESPACE} diff --git a/sample-tuning-setup/direct-blob-access/rayjob_direct_access.tpl b/sample-tuning-setup/direct-blob-access/rayjob_direct_access.tpl index 05188bf..be3237c 100644 --- a/sample-tuning-setup/direct-blob-access/rayjob_direct_access.tpl +++ b/sample-tuning-setup/direct-blob-access/rayjob_direct_access.tpl @@ -13,7 +13,7 @@ spec: # Uncomment the 
next line to experiment with autoscaling. # enableInTreeAutoscaling: true # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. - rayVersion: '2.43.0' + rayVersion: '2.54.1' headGroupSpec: # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' serviceType: ClusterIP diff --git a/sample-tuning-setup/direct-blob-access/variables.tf b/sample-tuning-setup/direct-blob-access/variables.tf index 58d2d77..2352669 100644 --- a/sample-tuning-setup/direct-blob-access/variables.tf +++ b/sample-tuning-setup/direct-blob-access/variables.tf @@ -22,7 +22,7 @@ variable "project_prefix" { variable "azure_kubernetes_version" { description = "Version of the azure kubernetes" - default = "1.33.0" + default = "1.33" type = string } @@ -96,7 +96,7 @@ variable "azure_storage_profile" { variable "kuberay_version" { description = "Kuberay version that needs to be installed" type = string - default = "1.4.2" + default = "1.6.0" } variable "kuberay_scrape_config_path" { diff --git a/sample-tuning-setup/rayjob.yaml b/sample-tuning-setup/rayjob.yaml index 14f67fa..07746bf 100644 --- a/sample-tuning-setup/rayjob.yaml +++ b/sample-tuning-setup/rayjob.yaml @@ -12,7 +12,7 @@ spec: # Uncomment the next line to experiment with autoscaling. # enableInTreeAutoscaling: true # The version of Ray you are using. Make sure all Ray containers are running this version of Ray. - rayVersion: '2.43.0' + rayVersion: '2.54.1' headGroupSpec: # Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer' serviceType: ClusterIP @@ -24,7 +24,7 @@ spec: containers: # The Ray head container - name: ray-head - image: rayproject/ray-ml:2.43.0.84f276-py310-cpu + image: rayproject/ray:2.54.1 imagePullPolicy: Always # Optimal resource allocation will depend on your Kubernetes infrastructure and might # require some experimentation. 
@@ -67,7 +67,7 @@ spec: spec: containers: - name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc') - image: rayproject/ray-ml:2.43.0.84f276-py310-cpu + image: rayproject/ray:2.54.1 # Optimal resource allocation will depend on your Kubernetes infrastructure and might # require some experimentation. # Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal diff --git a/sample-tuning-setup/terraform/variables.tf b/sample-tuning-setup/terraform/variables.tf index 50f3701..79b3f8b 100644 --- a/sample-tuning-setup/terraform/variables.tf +++ b/sample-tuning-setup/terraform/variables.tf @@ -22,7 +22,7 @@ variable "project_prefix" { variable "azure_kubernetes_version" { description = "Version of the azure kubernetes" - default = "1.32" + default = "1.33" type = string } @@ -96,7 +96,7 @@ variable "azure_storage_profile" { variable "kuberay_version" { description = "Kuberay version that needs to be installed" type = string - default = "1.4.2" + default = "1.6.0" } variable "kuberay_persistent_volume_claim_name" {