Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 38 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,44 @@ kubectl create -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/maste
* Extends more granular health detection per GPU using the exporter health
service over grpc socket service mounted on /var/lib/amd-metrics-exporter/

## Notes
# GPU Time-Slicing (Virtual Devices)

GPU time-slicing allows a single physical AMD GPU to be advertised as multiple virtual devices to Kubernetes, enabling multiple pods to share a GPU via OS-level scheduling. This is a Kubernetes-level overcommit — all virtual slices of the same physical GPU share the same `/dev/kfd` and `/dev/dri/renderD*` devices, so pods compete for VRAM and compute at runtime.

| Flag / Field | Type | Default | Valid Range | Description |
|--------------|------|---------|-------------|-------------|
| `--replicas` | int | `1` | `≥ 1` | Number of virtual device slices per physical GPU |

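Setting `--replicas=1` (or omitting it) advertises exactly one device per physical GPU, so the reported `amd.com/gpu` capacity matches the upstream plugin. Note that device IDs are still reported in the `<physical-id>-slice-0` form.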

## Quick Start

Add the `--replicas` flag to the DaemonSet container args:

```yaml
containers:
- image: rocm/k8s-device-plugin
name: amdgpu-dp-cntr
args:
- "./k8s-device-plugin"
- "--replicas=4"
```

That's it. A node with 2 physical GPUs will now report `8` under `amd.com/gpu`.

## Verification

```bash
kubectl get nodes -o custom-columns=NAME:.metadata.name,GPU:"status.capacity.amd\.com/gpu"
```

Two pods each requesting `amd.com/gpu: 1` can be scheduled on a node with a single physical GPU when `replicas >= 2`.

## Caveats

- **No hardware isolation**: All virtual slices share the same physical GPU. Pods compete for VRAM and compute resources at the OS scheduler level.
- **No MIG equivalent**: Unlike NVIDIA MIG, there is no hardware-level partitioning. Time-slicing provides Kubernetes scheduling flexibility but no performance guarantees.


* This plugin uses [`go modules`][gm] for dependencies management
* Please consult the `Dockerfile` on how to build and use this plugin independent of a docker image
Expand Down
11 changes: 10 additions & 1 deletion cmd/k8s-device-plugin/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,24 +105,33 @@ func main() {
flag.PrintDefaults()
}
var pulse int
var replicas int
var resourceNamingStrategy string
flag.IntVar(&pulse, "pulse", 0, "time between health check polling in seconds. Set to 0 to disable.")
flag.StringVar(&resourceNamingStrategy, "resource_naming_strategy", "single", "Resource strategy to be used: single or mixed")
// the flag.Parse() call below is also needed to enable glog usage in dpm
flag.IntVar(&replicas, "replicas", 1, "number of virtual GPU devices per physical GPU for time-slicing. Must be >= 1.")
flag.Parse()
strategy, err := ParseStrategy(resourceNamingStrategy)
if err != nil {
glog.Errorf("%v", err)
os.Exit(1)
}



if replicas < 1 {
glog.Fatalf("invalid --replicas value %d: must be >= 1", replicas)
}
glog.Infof("GPU time-slicing replicas: %d", replicas)

for _, v := range versions {
glog.Infof("%s", v)
}

l := plugin.AMDGPULister{
ResUpdateChan: make(chan dpm.PluginNameList),
Heartbeat: make(chan bool),
Replicas: replicas,
}
manager := dpm.NewManager(&l)

Expand Down
18 changes: 16 additions & 2 deletions internal/pkg/exporter/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,12 @@ func getGPUHealth() (hMap map[string]string, err error) {
}

// PopulatePerGPUDHealth populate the per gpu health status if available,
// else return simple health status
func PopulatePerGPUDHealth(devs []*pluginapi.Device, defaultHealth string) {
// else return simple health status.
// An optional resolveID function can be provided to map virtual device IDs
// (e.g. "0000:03:00.0-slice-2") back to their physical device IDs for
// health map lookups. This supports GPU time-slicing where multiple virtual
// devices share a single physical GPU.
func PopulatePerGPUDHealth(devs []*pluginapi.Device, defaultHealth string, resolveID ...func(string) string) {
var hasHealthSvc = false
hMap, err := getGPUHealth()
if err == nil {
Expand All @@ -97,10 +101,20 @@ func PopulatePerGPUDHealth(devs []*pluginapi.Device, defaultHealth string) {
// only use if we have the device id entry
if gpuHealth, ok := hMap[devs[i].ID]; ok {
devs[i].Health = gpuHealth
} else if len(resolveID) > 0 && resolveID[0] != nil {
// Try resolving virtual ID to physical ID for health lookup
physicalID := resolveID[0](devs[i].ID)
if gpuHealth, ok := hMap[physicalID]; ok {
devs[i].Health = gpuHealth
} else {
// revert to simpleHealthCheck if not found
devs[i].Health = defaultHealth
}
} else {
// revert to simpleHealthCheck if not found
devs[i].Health = defaultHealth
}
}
}
}

86 changes: 58 additions & 28 deletions internal/pkg/plugin/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ type AMDGPUPlugin struct {
Heartbeat chan bool
signal chan os.Signal
Resource string
Replicas int
devAllocator allocator.Policy
allocatorInitError bool
}
Expand Down Expand Up @@ -74,6 +75,12 @@ func WithResource(res string) AMDGPUPluginOption {
}
}

// WithReplicas returns an AMDGPUPluginOption that stores r in the plugin's
// Replicas field. The value is recorded exactly as given; no range check is
// performed by this option.
func WithReplicas(r int) AMDGPUPluginOption {
	setReplicas := func(plugin *AMDGPUPlugin) {
		plugin.Replicas = r
	}
	return setReplicas
}

// Start is an optional interface that could be implemented by plugin.
// If case Start is implemented, it will be executed by Manager after
// plugin instantiation and before its registration to kubelet. This
Expand Down Expand Up @@ -230,28 +237,39 @@ func (p *AMDGPUPlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin

p.AMDGPUs = amdgpu.GetAMDGPUs()

glog.Infof("Found %d AMDGPUs", len(p.AMDGPUs))
replicas := p.Replicas
if replicas < 1 {
replicas = 1
}

glog.Infof("Found %d AMDGPUs, replicas=%d", len(p.AMDGPUs), replicas)

devs := make([]*pluginapi.Device, len(p.AMDGPUs))
var isHomogeneous bool
isHomogeneous = amdgpu.IsHomogeneous()
// Initialize a map to store partitionType based device list
resourceTypeDevs := make(map[string][]*pluginapi.Device)

// Collect physical IDs and build virtual devices
physicalIDs := make([]string, 0, len(p.AMDGPUs))
for id := range p.AMDGPUs {
physicalIDs = append(physicalIDs, id)
}

// Build virtual devices from physical IDs
devs := buildVirtualDevices(physicalIDs, replicas)

// Build a lookup from virtual device ID to its *pluginapi.Device for topology assignment
devLookup := make(map[string]*pluginapi.Device, len(devs))
for _, dev := range devs {
devLookup[dev.ID] = dev
}

if isHomogeneous {
// limit scope for hwloc
func() {
i := 0
for id, device := range p.AMDGPUs {
dev := &pluginapi.Device{
ID: id,
Health: pluginapi.Healthy,
}
devs[i] = dev
i++

numas := []int64{int64(device["numaNode"].(int))}
glog.Infof("Watching GPU with bus ID: %s NUMA Node: %+v", id, numas)
glog.Infof("Watching GPU with bus ID: %s NUMA Node: %+v (replicas=%d)", id, numas, replicas)

numaNodes := make([]*pluginapi.NUMANode, len(numas))
for j, v := range numas {
Expand All @@ -260,25 +278,24 @@ func (p *AMDGPUPlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin
}
}

dev.Topology = &pluginapi.TopologyInfo{
Nodes: numaNodes,
// Assign topology to all virtual devices belonging to this physical GPU
for i := 0; i < replicas; i++ {
virtualID := fmt.Sprintf("%s%s%d", id, sliceSeparator, i)
if vdev, ok := devLookup[virtualID]; ok {
vdev.Topology = &pluginapi.TopologyInfo{
Nodes: numaNodes,
}
}
}
}
}()
glog.Infof("Sending %d virtual devices to kubelet", len(devs))
s.Send(&pluginapi.ListAndWatchResponse{Devices: devs})
} else {
func() {
for id, device := range p.AMDGPUs {
dev := &pluginapi.Device{
ID: id,
Health: pluginapi.Healthy,
}
// Append a device belonging to a certain partition type to its respective list
partitionType := device["computePartitionType"].(string) + "_" + device["memoryPartitionType"].(string)
resourceTypeDevs[partitionType] = append(resourceTypeDevs[partitionType], dev)

numas := []int64{int64(device["numaNode"].(int))}
glog.Infof("Watching GPU with bus ID: %s NUMA Node: %+v", id, numas)
glog.Infof("Watching GPU with bus ID: %s NUMA Node: %+v (replicas=%d)", id, numas, replicas)

numaNodes := make([]*pluginapi.NUMANode, len(numas))
for j, v := range numas {
Expand All @@ -287,13 +304,22 @@ func (p *AMDGPUPlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin
}
}

dev.Topology = &pluginapi.TopologyInfo{
Nodes: numaNodes,
// Append virtual devices belonging to a certain partition type to its respective list
partitionType := device["computePartitionType"].(string) + "_" + device["memoryPartitionType"].(string)
for i := 0; i < replicas; i++ {
virtualID := fmt.Sprintf("%s%s%d", id, sliceSeparator, i)
if vdev, ok := devLookup[virtualID]; ok {
vdev.Topology = &pluginapi.TopologyInfo{
Nodes: numaNodes,
}
resourceTypeDevs[partitionType] = append(resourceTypeDevs[partitionType], vdev)
}
}
}
}()
// Send the appropriate list of devices based on the partitionType
if devList, exists := resourceTypeDevs[p.Resource]; exists {
glog.Infof("Sending %d virtual devices for resource %s to kubelet", len(devList), p.Resource)
s.Send(&pluginapi.ListAndWatchResponse{Devices: devList})
}
}
Expand All @@ -310,11 +336,11 @@ loop:

// update with per device GPU health status
if isHomogeneous {
exporter.PopulatePerGPUDHealth(devs, health)
exporter.PopulatePerGPUDHealth(devs, health, resolvePhysicalID)
s.Send(&pluginapi.ListAndWatchResponse{Devices: devs})
} else {
if devList, exists := resourceTypeDevs[p.Resource]; exists {
exporter.PopulatePerGPUDHealth(devList, health)
exporter.PopulatePerGPUDHealth(devList, health, resolvePhysicalID)
s.Send(&pluginapi.ListAndWatchResponse{Devices: devList})
}
}
Expand Down Expand Up @@ -374,9 +400,11 @@ func (p *AMDGPUPlugin) Allocate(ctx context.Context, r *pluginapi.AllocateReques
car.Devices = append(car.Devices, dev)

for _, id := range req.DevicesIDs {
glog.Infof("Allocating device ID: %s", id)
// Resolve virtual device ID back to physical GPU ID
physicalID := resolvePhysicalID(id)
glog.Infof("Allocating device ID: %s (physical: %s)", id, physicalID)

for k, v := range p.AMDGPUs[id] {
for k, v := range p.AMDGPUs[physicalID] {
// Map struct previously only had 'card' and 'renderD' and only those are paths to be appended as before
if k != "card" && k != "renderD" {
continue
Expand All @@ -403,6 +431,7 @@ type AMDGPULister struct {
ResUpdateChan chan dpm.PluginNameList
Heartbeat chan bool
Signal chan os.Signal
Replicas int
}

// GetResourceNamespace must return namespace (vendor ID) of implemented Lister. e.g. for
Expand Down Expand Up @@ -437,6 +466,7 @@ func (l *AMDGPULister) NewPlugin(resourceLastName string) dpm.PluginInterface {
WithHeartbeat(l.Heartbeat),
WithResource(resourceLastName),
WithAllocator(allocator.NewBestEffortPolicy()),
WithReplicas(l.Replicas),
}
return NewAMDGPUPlugin(options...)
}
56 changes: 56 additions & 0 deletions internal/pkg/plugin/timeslice.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/**
* Copyright 2018 Advanced Micro Devices, Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/

package plugin

import (
"fmt"
"strings"

pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

const sliceSeparator = "-slice-"

// buildVirtualDevices expands each physical device ID into replicas virtual
// devices so that a single GPU can be advertised several times.
//
// Every virtual device ID has the form "<physical-id>-slice-<n>", with n
// running from 0 to replicas-1, and every device starts out marked Healthy.
// Devices are emitted in the order of physicalIDs, with the slices of one
// physical device adjacent to each other.
//
// A replicas value below 1 yields an empty, non-nil slice. The guard matters
// for negative values: without it the capacity passed to make would be
// negative and the allocation would panic.
func buildVirtualDevices(physicalIDs []string, replicas int) []*pluginapi.Device {
	if replicas < 1 {
		return []*pluginapi.Device{}
	}

	devs := make([]*pluginapi.Device, 0, len(physicalIDs)*replicas)
	for _, pid := range physicalIDs {
		for i := 0; i < replicas; i++ {
			devs = append(devs, &pluginapi.Device{
				ID:     fmt.Sprintf("%s%s%d", pid, sliceSeparator, i),
				Health: pluginapi.Healthy,
			})
		}
	}
	return devs
}

// resolvePhysicalID maps a virtual device ID to the ID of the physical GPU
// behind it by dropping everything from the last "-slice-" marker onward,
// e.g. "0000:03:00.0-slice-2" becomes "0000:03:00.0".
// An ID that does not contain the marker is returned untouched, so physical
// IDs may be passed in directly.
func resolvePhysicalID(virtualID string) string {
	if cut := strings.LastIndex(virtualID, sliceSeparator); cut >= 0 {
		return virtualID[:cut]
	}
	return virtualID
}
Loading