Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ Clone the repository to your local machine. Then make sure you have all the prer

1. An Azure subscription. If you don't have an Azure subscription, you can create a free account [here](https://azure.microsoft.com/free/).
1. The Azure CLI installed on your local machine. You can install the Azure CLI by following the instructions [here](https://docs.microsoft.com/cli/azure/install-azure-cli).
1. The [Azure Kubernetes Service Preview extension](https://learn.microsoft.com/azure/aks/draft#install-the-aks-preview-azure-cli-extension) must be installed.
1. [Helm](https://helm.sh/docs/intro/install/) must be installed.
1. [Terraform client tools](https://developer.hashicorp.com/terraform/install) or [OpenTofu](https://opentofu.org/) must be installed. This guide makes use of Terraform; however, the modules used should be compatible with OpenTofu.

Expand Down
9 changes: 8 additions & 1 deletion aks-automatic/deploy.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/bash

# Ray version to use (can be overridden via environment variable)
RAY_VERSION=${RAY_VERSION:-"2.54.1"}

# Check if the user is logged into Azure CLI
if ! az account show > /dev/null 2>&1; then
echo "Please login to Azure CLI using 'az login' before running this script."
Expand Down Expand Up @@ -86,7 +89,7 @@ helm upgrade \
--timeout 10m0s \
--namespace "$kuberay_namespace" \
--create-namespace kuberay-operator kuberay/kuberay-operator \
--version 1.1.1
--version 1.6.0

# Output the pods in the kuberay namespace
kubectl get pods -n $kuberay_namespace
Expand Down Expand Up @@ -163,6 +166,10 @@ EOF
# Download the PyTorch MNIST job YAML file
curl -LO https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/pytorch-mnist/ray-job.pytorch-mnist.yaml

# Update the Ray version in the downloaded job YAML
sed -i "s/rayVersion: '.*'/rayVersion: '${RAY_VERSION}'/g" ray-job.pytorch-mnist.yaml
sed -i "s|image: rayproject/ray:.*|image: rayproject/ray:${RAY_VERSION}|g" ray-job.pytorch-mnist.yaml

# Train a PyTorch Model on Fashion MNIST
kubectl apply -n $kuberay_namespace -f ray-job.pytorch-mnist.yaml

Expand Down
6 changes: 3 additions & 3 deletions aks-automatic/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ resource "tls_private_key" "ssh_key" {
}

resource "azapi_resource" "aks_auto" {
type = "Microsoft.ContainerService/managedClusters@2024-06-02-preview"
type = "Microsoft.ContainerService/managedClusters@2026-01-01"
name = "aks-${var.project_prefix}-${random_string.suffix.result}"
parent_id = azurerm_resource_group.rg.id
location = var.location
Expand All @@ -28,7 +28,7 @@ resource "azapi_resource" "aks_auto" {
body = jsonencode({

properties = {
kubernetesVersion = "1.31"
kubernetesVersion = "1.33"
nodeResourceGroup = "MC-aks-${var.project_prefix}-${random_string.suffix.result}"
agentPoolProfiles = [
{
Expand Down Expand Up @@ -97,7 +97,7 @@ resource "null_resource" "wait_for_aks" {
}

resource "azapi_update_resource" "k8s-default-node-pool-systempool-taint" {
type = "Microsoft.ContainerService/managedClusters@2024-06-02-preview"
type = "Microsoft.ContainerService/managedClusters@2026-01-01"
resource_id = azapi_resource.aks_auto.id
body = jsonencode({
properties = {
Expand Down
6 changes: 3 additions & 3 deletions aks-automatic/ray-job.pytorch-mnist-persist-logs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,15 @@ spec:
CPUS_PER_WORKER: "2"
# rayClusterSpec specifies the RayCluster instance to be created by the RayJob controller.
rayClusterSpec:
rayVersion: '2.41.0'
rayVersion: '2.54.1'
headGroupSpec:
rayStartParams: {}
# Pod template
template:
spec:
containers:
- name: ray-head
image: rayproject/ray:2.41.0
image: rayproject/ray:2.54.1
ports:
- containerPort: 6379
name: gcs-server
Expand Down Expand Up @@ -104,7 +104,7 @@ spec:
spec:
containers:
- name: ray-worker
image: rayproject/ray:2.41.0
image: rayproject/ray:2.54.1
resources:
limits:
cpu: "3"
Expand Down
9 changes: 8 additions & 1 deletion aks-classic/deploy.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
#!/bin/bash

# Ray version to use (can be overridden via environment variable)
RAY_VERSION=${RAY_VERSION:-"2.54.1"}

# Check if the user is logged into Azure CLI
if ! az account show > /dev/null 2>&1; then
echo "Please login to Azure CLI using 'az login' before running this script."
Expand Down Expand Up @@ -53,14 +56,18 @@ helm upgrade \
--timeout 10m0s \
--namespace "$kuberay_namespace" \
--create-namespace kuberay-operator kuberay/kuberay-operator \
--version 1.1.1
--version 1.6.0

# Output the pods in the kuberay namespace
kubectl get pods -n $kuberay_namespace

# Download the PyTorch MNIST job YAML file
curl -LO https://raw.githubusercontent.com/ray-project/kuberay/master/ray-operator/config/samples/pytorch-mnist/ray-job.pytorch-mnist.yaml

# Update the Ray version in the downloaded job YAML
sed -i "s/rayVersion: '.*'/rayVersion: '${RAY_VERSION}'/g" ray-job.pytorch-mnist.yaml
sed -i "s|image: rayproject/ray:.*|image: rayproject/ray:${RAY_VERSION}|g" ray-job.pytorch-mnist.yaml

# Train a PyTorch Model on Fashion MNIST
kubectl apply -n $kuberay_namespace -f ray-job.pytorch-mnist.yaml

Expand Down
11 changes: 6 additions & 5 deletions aks-classic/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,11 @@ resource "azurerm_kubernetes_cluster" "k8s" {
}

default_node_pool {
name = "systempool"
vm_size = var.system_node_pool_vm_size
node_count = var.system_node_pool_node_count
tags = { owner = var.resource_group_owner }
name = "systempool"
vm_size = var.system_node_pool_vm_size
node_count = var.system_node_pool_node_count
only_critical_addons_enabled = true
tags = { owner = var.resource_group_owner }
}

linux_profile {
Expand Down Expand Up @@ -71,7 +72,7 @@ resource "null_resource" "wait_for_aks" {
}

resource "azapi_update_resource" "k8s-default-node-pool-systempool-taint" {
type = "Microsoft.ContainerService/managedClusters@2024-09-02-preview"
type = "Microsoft.ContainerService/managedClusters@2026-01-01"
resource_id = azurerm_kubernetes_cluster.k8s.id
body = jsonencode({
properties = {
Expand Down
1 change: 0 additions & 1 deletion docs/deploy-ray-aks.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ This article provides two methods to deploy the Ray cluster on AKS:
* Review the [Ray cluster on AKS overview](./ray-on-aks.md) to understand the components and deployment process.
* An Azure subscription. If you don't have an Azure subscription, you can create a free account [here](https://azure.microsoft.com/free/).
* The Azure CLI installed on your local machine. You can install it using the instructions in [How to install the Azure CLI](https://docs.microsoft.com/cli/azure/install-azure-cli).
* The [Azure Kubernetes Service Preview extension](https://learn.microsoft.com/azure/aks/draft#install-the-aks-preview-azure-cli-extension) installed.
* [Helm](https://helm.sh/docs/intro/install/) installed.
* [Terraform client tools](https://developer.hashicorp.com/terraform/install) or [OpenTofu](https://opentofu.org/) installed. This article uses Terraform, but the modules used should be compatible with OpenTofu.

Expand Down
2 changes: 1 addition & 1 deletion sample-tuning-setup/direct-blob-access/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ fi

echo "Deploying KubeRay Operator in ${KUBERAY_NAMESPACE} namespace"
helm repo add kuberay https://ray-project.github.io/kuberay-helm/
helm install kuberay-operator kuberay/kuberay-operator --version 1.4.2 --namespace ${KUBERAY_NAMESPACE} --create-namespace
helm install kuberay-operator kuberay/kuberay-operator --version 1.6.0 --namespace ${KUBERAY_NAMESPACE} --create-namespace

# Output the pods in the kuberay namespace
kubectl get pods -n ${KUBERAY_NAMESPACE}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ spec:
# Uncomment the next line to experiment with autoscaling.
# enableInTreeAutoscaling: true
# The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
rayVersion: '2.43.0'
rayVersion: '2.54.1'
headGroupSpec:
# Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
serviceType: ClusterIP
Expand Down
4 changes: 2 additions & 2 deletions sample-tuning-setup/direct-blob-access/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ variable "project_prefix" {

variable "azure_kubernetes_version" {
description = "Version of the azure kubernetes"
default = "1.33.0"
default = "1.33"
type = string
}

Expand Down Expand Up @@ -96,7 +96,7 @@ variable "azure_storage_profile" {
variable "kuberay_version" {
description = "Kuberay version that needs to be installed"
type = string
default = "1.4.2"
default = "1.6.0"
}

variable "kuberay_scrape_config_path" {
Expand Down
6 changes: 3 additions & 3 deletions sample-tuning-setup/rayjob.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ spec:
# Uncomment the next line to experiment with autoscaling.
# enableInTreeAutoscaling: true
# The version of Ray you are using. Make sure all Ray containers are running this version of Ray.
rayVersion: '2.43.0'
rayVersion: '2.54.1'
headGroupSpec:
# Kubernetes Service Type, valid values are 'ClusterIP', 'NodePort' and 'LoadBalancer'
serviceType: ClusterIP
Expand All @@ -24,7 +24,7 @@ spec:
containers:
# The Ray head container
- name: ray-head
image: rayproject/ray-ml:2.43.0.84f276-py310-cpu
image: rayproject/ray:2.54.1
imagePullPolicy: Always
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.
Expand Down Expand Up @@ -67,7 +67,7 @@ spec:
spec:
containers:
- name: machine-learning # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name', or '123-abc')
image: rayproject/ray-ml:2.43.0.84f276-py310-cpu
image: rayproject/ray:2.54.1
# Optimal resource allocation will depend on your Kubernetes infrastructure and might
# require some experimentation.
# Setting requests=limits is recommended with Ray. K8s limits are used for Ray-internal
Expand Down
4 changes: 2 additions & 2 deletions sample-tuning-setup/terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ variable "project_prefix" {

variable "azure_kubernetes_version" {
description = "Version of the azure kubernetes"
default = "1.32"
default = "1.33"
type = string
}

Expand Down Expand Up @@ -96,7 +96,7 @@ variable "azure_storage_profile" {
variable "kuberay_version" {
description = "Kuberay version that needs to be installed"
type = string
default = "1.4.2"
default = "1.6.0"
}

variable "kuberay_persistent_volume_claim_name" {
Expand Down