diff --git a/README.md b/README.md index 08b5cce52..1ccc3b7cc 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,44 @@ kubectl create -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/maste * Extends more granular health detection per GPU using the exporter health service over grpc socket service mounted on /var/lib/amd-metrics-exporter/ -## Notes +# GPU Time-Slicing (Virtual Devices) + +GPU time-slicing allows a single physical AMD GPU to be advertised as multiple virtual devices to Kubernetes, enabling multiple pods to share a GPU via OS-level scheduling. This is a Kubernetes-level overcommit — all virtual slices of the same physical GPU share the same `/dev/kfd` and `/dev/dri/renderD*` devices, so pods compete for VRAM and compute at runtime. + +| Flag / Field | Type | Default | Valid Range | Description | +|--------------|------|---------|-------------|-------------| +| `--replicas` | int | `1` | `≥ 1` | Number of virtual device slices per physical GPU | + +Setting `--replicas=1` (or omitting it) advertises exactly one device per physical GPU, matching the upstream plugin's resource count; note that device IDs still carry a `-slice-0` suffix. + +## Quick Start + +Add the `--replicas` flag to the DaemonSet container args: + +```yaml +containers: +- image: rocm/k8s-device-plugin + name: amdgpu-dp-cntr + args: + - "./k8s-device-plugin" + - "--replicas=4" +``` + +That's it. A node with 2 physical GPUs will now report `8` under `amd.com/gpu`. + +## Verification + +```bash +kubectl get nodes -o custom-columns=NAME:.metadata.name,GPU:"status.capacity.amd\.com/gpu" +``` + +Two pods each requesting `amd.com/gpu: 1` can be scheduled on a node with a single physical GPU when `replicas >= 2`. + +## Caveats + +- **No hardware isolation**: All virtual slices share the same physical GPU. Pods compete for VRAM and compute resources at the OS scheduler level. +- **No MIG equivalent**: Unlike NVIDIA MIG, there is no hardware-level partitioning. Time-slicing provides Kubernetes scheduling flexibility but no performance guarantees. 
+ * This plugin uses [`go modules`][gm] for dependencies management * Please consult the `Dockerfile` on how to build and use this plugin independent of a docker image diff --git a/cmd/k8s-device-plugin/main.go b/cmd/k8s-device-plugin/main.go index 879772a83..8a9df59d0 100644 --- a/cmd/k8s-device-plugin/main.go +++ b/cmd/k8s-device-plugin/main.go @@ -105,10 +105,11 @@ func main() { flag.PrintDefaults() } var pulse int + var replicas int var resourceNamingStrategy string flag.IntVar(&pulse, "pulse", 0, "time between health check polling in seconds. Set to 0 to disable.") flag.StringVar(&resourceNamingStrategy, "resource_naming_strategy", "single", "Resource strategy to be used: single or mixed") - // this is also needed to enable glog usage in dpm + flag.IntVar(&replicas, "replicas", 1, "number of virtual GPU devices per physical GPU for time-slicing. Must be >= 1.") flag.Parse() strategy, err := ParseStrategy(resourceNamingStrategy) if err != nil { @@ -116,6 +117,13 @@ func main() { os.Exit(1) } + + + if replicas < 1 { + glog.Fatalf("invalid --replicas value %d: must be >= 1", replicas) + } + glog.Infof("GPU time-slicing replicas: %d", replicas) + for _, v := range versions { glog.Infof("%s", v) } @@ -123,6 +131,7 @@ func main() { l := plugin.AMDGPULister{ ResUpdateChan: make(chan dpm.PluginNameList), Heartbeat: make(chan bool), + Replicas: replicas, } manager := dpm.NewManager(&l) diff --git a/internal/pkg/exporter/health.go b/internal/pkg/exporter/health.go index 7cdd9ec79..f8e2eec70 100644 --- a/internal/pkg/exporter/health.go +++ b/internal/pkg/exporter/health.go @@ -82,8 +82,12 @@ func getGPUHealth() (hMap map[string]string, err error) { } // PopulatePerGPUDHealth populate the per gpu health status if available, -// else return simple health status -func PopulatePerGPUDHealth(devs []*pluginapi.Device, defaultHealth string) { +// else return simple health status. +// An optional resolveID function can be provided to map virtual device IDs +// (e.g. 
"0000:03:00.0-slice-2") back to their physical device IDs for +// health map lookups. This supports GPU time-slicing where multiple virtual +// devices share a single physical GPU. +func PopulatePerGPUDHealth(devs []*pluginapi.Device, defaultHealth string, resolveID ...func(string) string) { var hasHealthSvc = false hMap, err := getGPUHealth() if err == nil { @@ -97,6 +101,15 @@ func PopulatePerGPUDHealth(devs []*pluginapi.Device, defaultHealth string) { // only use if we have the device id entry if gpuHealth, ok := hMap[devs[i].ID]; ok { devs[i].Health = gpuHealth + } else if len(resolveID) > 0 && resolveID[0] != nil { + // Try resolving virtual ID to physical ID for health lookup + physicalID := resolveID[0](devs[i].ID) + if gpuHealth, ok := hMap[physicalID]; ok { + devs[i].Health = gpuHealth + } else { + // revert to simpleHealthCheck if not found + devs[i].Health = defaultHealth + } } else { // revert to simpleHealthCheck if not found devs[i].Health = defaultHealth @@ -104,3 +117,4 @@ func PopulatePerGPUDHealth(devs []*pluginapi.Device, defaultHealth string) { } } } + diff --git a/internal/pkg/plugin/plugin.go b/internal/pkg/plugin/plugin.go index 307377ab1..0b0382dbf 100644 --- a/internal/pkg/plugin/plugin.go +++ b/internal/pkg/plugin/plugin.go @@ -43,6 +43,7 @@ type AMDGPUPlugin struct { Heartbeat chan bool signal chan os.Signal Resource string + Replicas int devAllocator allocator.Policy allocatorInitError bool } @@ -74,6 +75,12 @@ func WithResource(res string) AMDGPUPluginOption { } } +func WithReplicas(r int) AMDGPUPluginOption { + return func(p *AMDGPUPlugin) { + p.Replicas = r + } +} + // Start is an optional interface that could be implemented by plugin. // If case Start is implemented, it will be executed by Manager after // plugin instantiation and before its registration to kubelet. 
This @@ -230,28 +237,39 @@ func (p *AMDGPUPlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin p.AMDGPUs = amdgpu.GetAMDGPUs() - glog.Infof("Found %d AMDGPUs", len(p.AMDGPUs)) + replicas := p.Replicas + if replicas < 1 { + replicas = 1 + } + + glog.Infof("Found %d AMDGPUs, replicas=%d", len(p.AMDGPUs), replicas) - devs := make([]*pluginapi.Device, len(p.AMDGPUs)) var isHomogeneous bool isHomogeneous = amdgpu.IsHomogeneous() // Initialize a map to store partitionType based device list resourceTypeDevs := make(map[string][]*pluginapi.Device) + // Collect physical IDs and build virtual devices + physicalIDs := make([]string, 0, len(p.AMDGPUs)) + for id := range p.AMDGPUs { + physicalIDs = append(physicalIDs, id) + } + + // Build virtual devices from physical IDs + devs := buildVirtualDevices(physicalIDs, replicas) + + // Build a lookup from virtual device ID to its *pluginapi.Device for topology assignment + devLookup := make(map[string]*pluginapi.Device, len(devs)) + for _, dev := range devs { + devLookup[dev.ID] = dev + } + if isHomogeneous { // limit scope for hwloc func() { - i := 0 for id, device := range p.AMDGPUs { - dev := &pluginapi.Device{ - ID: id, - Health: pluginapi.Healthy, - } - devs[i] = dev - i++ - numas := []int64{int64(device["numaNode"].(int))} - glog.Infof("Watching GPU with bus ID: %s NUMA Node: %+v", id, numas) + glog.Infof("Watching GPU with bus ID: %s NUMA Node: %+v (replicas=%d)", id, numas, replicas) numaNodes := make([]*pluginapi.NUMANode, len(numas)) for j, v := range numas { @@ -260,25 +278,24 @@ func (p *AMDGPUPlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin } } - dev.Topology = &pluginapi.TopologyInfo{ - Nodes: numaNodes, + // Assign topology to all virtual devices belonging to this physical GPU + for i := 0; i < replicas; i++ { + virtualID := fmt.Sprintf("%s%s%d", id, sliceSeparator, i) + if vdev, ok := devLookup[virtualID]; ok { + vdev.Topology = &pluginapi.TopologyInfo{ + Nodes: numaNodes, + } + } } } 
}() + glog.Infof("Sending %d virtual devices to kubelet", len(devs)) s.Send(&pluginapi.ListAndWatchResponse{Devices: devs}) } else { func() { for id, device := range p.AMDGPUs { - dev := &pluginapi.Device{ - ID: id, - Health: pluginapi.Healthy, - } - // Append a device belonging to a certain partition type to its respective list - partitionType := device["computePartitionType"].(string) + "_" + device["memoryPartitionType"].(string) - resourceTypeDevs[partitionType] = append(resourceTypeDevs[partitionType], dev) - numas := []int64{int64(device["numaNode"].(int))} - glog.Infof("Watching GPU with bus ID: %s NUMA Node: %+v", id, numas) + glog.Infof("Watching GPU with bus ID: %s NUMA Node: %+v (replicas=%d)", id, numas, replicas) numaNodes := make([]*pluginapi.NUMANode, len(numas)) for j, v := range numas { @@ -287,13 +304,22 @@ func (p *AMDGPUPlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin } } - dev.Topology = &pluginapi.TopologyInfo{ - Nodes: numaNodes, + // Append virtual devices belonging to a certain partition type to its respective list + partitionType := device["computePartitionType"].(string) + "_" + device["memoryPartitionType"].(string) + for i := 0; i < replicas; i++ { + virtualID := fmt.Sprintf("%s%s%d", id, sliceSeparator, i) + if vdev, ok := devLookup[virtualID]; ok { + vdev.Topology = &pluginapi.TopologyInfo{ + Nodes: numaNodes, + } + resourceTypeDevs[partitionType] = append(resourceTypeDevs[partitionType], vdev) + } } } }() // Send the appropriate list of devices based on the partitionType if devList, exists := resourceTypeDevs[p.Resource]; exists { + glog.Infof("Sending %d virtual devices for resource %s to kubelet", len(devList), p.Resource) s.Send(&pluginapi.ListAndWatchResponse{Devices: devList}) } } @@ -310,11 +336,11 @@ loop: // update with per device GPU health status if isHomogeneous { - exporter.PopulatePerGPUDHealth(devs, health) + exporter.PopulatePerGPUDHealth(devs, health, resolvePhysicalID) 
s.Send(&pluginapi.ListAndWatchResponse{Devices: devs}) } else { if devList, exists := resourceTypeDevs[p.Resource]; exists { - exporter.PopulatePerGPUDHealth(devList, health) + exporter.PopulatePerGPUDHealth(devList, health, resolvePhysicalID) s.Send(&pluginapi.ListAndWatchResponse{Devices: devList}) } } @@ -374,9 +400,11 @@ func (p *AMDGPUPlugin) Allocate(ctx context.Context, r *pluginapi.AllocateReques car.Devices = append(car.Devices, dev) for _, id := range req.DevicesIDs { - glog.Infof("Allocating device ID: %s", id) + // Resolve virtual device ID back to physical GPU ID + physicalID := resolvePhysicalID(id) + glog.Infof("Allocating device ID: %s (physical: %s)", id, physicalID) - for k, v := range p.AMDGPUs[id] { + for k, v := range p.AMDGPUs[physicalID] { // Map struct previously only had 'card' and 'renderD' and only those are paths to be appended as before if k != "card" && k != "renderD" { continue @@ -403,6 +431,7 @@ type AMDGPULister struct { ResUpdateChan chan dpm.PluginNameList Heartbeat chan bool Signal chan os.Signal + Replicas int } // GetResourceNamespace must return namespace (vendor ID) of implemented Lister. e.g. for @@ -437,6 +466,7 @@ func (l *AMDGPULister) NewPlugin(resourceLastName string) dpm.PluginInterface { WithHeartbeat(l.Heartbeat), WithResource(resourceLastName), WithAllocator(allocator.NewBestEffortPolicy()), + WithReplicas(l.Replicas), } return NewAMDGPUPlugin(options...) } diff --git a/internal/pkg/plugin/timeslice.go b/internal/pkg/plugin/timeslice.go new file mode 100644 index 000000000..0df541e73 --- /dev/null +++ b/internal/pkg/plugin/timeslice.go @@ -0,0 +1,56 @@ +/** + * Copyright 2018 Advanced Micro Devices, Inc. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +**/ + +package plugin + +import ( + "fmt" + "strings" + + pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" +) + +const sliceSeparator = "-slice-" + +// buildVirtualDevices expands a list of physical device IDs into +// replicas virtual device IDs per physical device. +// Each virtual device ID follows the format "<physicalID>-slice-<index>". +// All virtual devices are initially marked as Healthy. +func buildVirtualDevices(physicalIDs []string, replicas int) []*pluginapi.Device { + devs := make([]*pluginapi.Device, 0, len(physicalIDs)*replicas) + for _, pid := range physicalIDs { + for i := 0; i < replicas; i++ { + dev := &pluginapi.Device{ + ID: fmt.Sprintf("%s%s%d", pid, sliceSeparator, i), + Health: pluginapi.Healthy, + } + devs = append(devs, dev) + } + } + return devs +} + +// resolvePhysicalID extracts the physical device ID from a virtual one. +// "0000:03:00.0-slice-2" → "0000:03:00.0" +// Returns the input unchanged if it contains no "-slice-" suffix, +// so the function is safe to call on physical IDs directly. +func resolvePhysicalID(virtualID string) string { + idx := strings.LastIndex(virtualID, sliceSeparator) + if idx < 0 { + return virtualID + } + return virtualID[:idx] +} diff --git a/internal/pkg/plugin/timeslice_test.go b/internal/pkg/plugin/timeslice_test.go new file mode 100644 index 000000000..ae3d27776 --- /dev/null +++ b/internal/pkg/plugin/timeslice_test.go @@ -0,0 +1,158 @@ +/** + * Copyright 2018 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +**/ + +package plugin + +import ( + "testing" + + pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1" +) + +func TestBuildVirtualDevices_Replicas1(t *testing.T) { + physicalIDs := []string{"0000:03:00.0", "0000:04:00.0"} + devs := buildVirtualDevices(physicalIDs, 1) + + if len(devs) != 2 { + t.Fatalf("expected 2 devices, got %d", len(devs)) + } + + expected := []string{"0000:03:00.0-slice-0", "0000:04:00.0-slice-0"} + for i, dev := range devs { + if dev.ID != expected[i] { + t.Errorf("device %d: expected ID %q, got %q", i, expected[i], dev.ID) + } + if dev.Health != pluginapi.Healthy { + t.Errorf("device %d: expected health %q, got %q", i, pluginapi.Healthy, dev.Health) + } + } +} + +func TestBuildVirtualDevices_Replicas4(t *testing.T) { + physicalIDs := []string{"0000:03:00.0", "0000:04:00.0"} + devs := buildVirtualDevices(physicalIDs, 4) + + if len(devs) != 8 { + t.Fatalf("expected 8 devices, got %d", len(devs)) + } + + // Verify all IDs are unique + seen := make(map[string]bool) + for _, dev := range devs { + if seen[dev.ID] { + t.Errorf("duplicate device ID: %s", dev.ID) + } + seen[dev.ID] = true + } + + // Verify expected IDs + expectedIDs := map[string]bool{ + "0000:03:00.0-slice-0": true, + "0000:03:00.0-slice-1": true, + "0000:03:00.0-slice-2": true, + "0000:03:00.0-slice-3": true, + "0000:04:00.0-slice-0": true, + "0000:04:00.0-slice-1": true, + "0000:04:00.0-slice-2": true, + 
"0000:04:00.0-slice-3": true, + } + for _, dev := range devs { + if !expectedIDs[dev.ID] { + t.Errorf("unexpected device ID: %s", dev.ID) + } + } +} + +func TestBuildVirtualDevices_AllUnique(t *testing.T) { + physicalIDs := []string{"gpu-a", "gpu-b", "gpu-c"} + devs := buildVirtualDevices(physicalIDs, 3) + + if len(devs) != 9 { + t.Fatalf("expected 9 devices, got %d", len(devs)) + } + + seen := make(map[string]bool) + for _, dev := range devs { + if seen[dev.ID] { + t.Errorf("duplicate device ID: %s", dev.ID) + } + seen[dev.ID] = true + } +} + +func TestBuildVirtualDevices_Empty(t *testing.T) { + devs := buildVirtualDevices([]string{}, 4) + if len(devs) != 0 { + t.Fatalf("expected 0 devices for empty input, got %d", len(devs)) + } +} + +func TestResolvePhysicalID_WithSliceSuffix(t *testing.T) { + tests := []struct { + input string + expected string + }{ + {"0000:03:00.0-slice-0", "0000:03:00.0"}, + {"0000:03:00.0-slice-2", "0000:03:00.0"}, + {"0000:04:00.0-slice-99", "0000:04:00.0"}, + {"amdgpu_xcp_30-slice-1", "amdgpu_xcp_30"}, + } + + for _, tc := range tests { + result := resolvePhysicalID(tc.input) + if result != tc.expected { + t.Errorf("resolvePhysicalID(%q) = %q, want %q", tc.input, result, tc.expected) + } + } +} + +func TestResolvePhysicalID_WithoutSliceSuffix(t *testing.T) { + tests := []struct { + input string + }{ + {"0000:03:00.0"}, + {"amdgpu_xcp_30"}, + {"some-device-id"}, + {""}, + } + + for _, tc := range tests { + result := resolvePhysicalID(tc.input) + if result != tc.input { + t.Errorf("resolvePhysicalID(%q) = %q, want %q (no-op)", tc.input, result, tc.input) + } + } +} + +func TestResolvePhysicalID_RoundTrip(t *testing.T) { + // Verify that resolvePhysicalID correctly undoes buildVirtualDevices naming + physicalIDs := []string{"0000:03:00.0", "0000:04:00.0"} + devs := buildVirtualDevices(physicalIDs, 3) + + for _, dev := range devs { + resolved := resolvePhysicalID(dev.ID) + found := false + for _, pid := range physicalIDs { + if resolved == 
pid { + found = true + break + } + } + if !found { + t.Errorf("resolvePhysicalID(%q) = %q, not in original physical IDs", dev.ID, resolved) + } + } +}