Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 60 additions & 10 deletions internal/amdgpu/amdgpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) {

renderDevIds := GetDevIdsFromTopology(fs)

// Map to store devices by unique_id to maintain grouping
uniqueIdDevices := make(map[string][]DeviceInfo)
// Map to store devices by parent dev ID to maintain grouping
devIdToDevices := make(map[string][]DeviceInfo)
var uniqueIds []string // To maintain order

Comment thread
shiv-tyagi marked this conversation as resolved.
// Process PCI devices
Expand Down Expand Up @@ -156,17 +156,17 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) {

if len(drmDevs) > 0 && renderMinor > 0 {
if devID, exists := renderDevIds[renderMinor]; exists {
if _, exists := uniqueIdDevices[devID]; !exists {
if _, exists := devIdToDevices[devID]; !exists {
uniqueIds = append(uniqueIds, devID)
}
uniqueIdDevices[devID] = append(uniqueIdDevices[devID], DeviceInfo{DrmDevices: drmDevs, PartitionType: combinedPartitionType})
devIdToDevices[devID] = append(devIdToDevices[devID], DeviceInfo{DrmDevices: drmDevs, PartitionType: combinedPartitionType})
}
}
}

// Sort devices within each unique_id group by render minor number
// Sort devices within each parent dev ID group by render minor number
for _, devID := range uniqueIds {
sort.Slice(uniqueIdDevices[devID], func(i, j int) bool {
sort.Slice(devIdToDevices[devID], func(i, j int) bool {
getRenderID := func(devInfo DeviceInfo) int {
devs := devInfo.DrmDevices
for _, dev := range devs {
Expand All @@ -178,14 +178,14 @@ func GetAMDGPUsWithFS(fs FileSystem) ([]DeviceInfo, error) {
}
return 0
}
return getRenderID(uniqueIdDevices[devID][i]) < getRenderID(uniqueIdDevices[devID][j])
return getRenderID(devIdToDevices[devID][i]) < getRenderID(devIdToDevices[devID][j])
})
}

// Combine all devices maintaining the order in which parent dev IDs were first seen
var devs []DeviceInfo
for _, devID := range uniqueIds {
devs = append(devs, uniqueIdDevices[devID]...)
devs = append(devs, devIdToDevices[devID]...)
}

return devs, nil
Expand Down Expand Up @@ -242,8 +242,10 @@ func GetAMDGPUWithFS(fs FileSystem, dev string) (AMDGPU, error) {

var topoUniqueIdRe = regexp.MustCompile(`unique_id\s(\d+)`)
var renderMinorRe = regexp.MustCompile(`drm_render_minor\s(\d+)`)
var locationIdRe = regexp.MustCompile(`location_id\s(\d+)`)
var domainRe = regexp.MustCompile(`domain\s(\d+)`)

// GetDevIdsFromTopology returns a map of render minor numbers to unique_ids
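// GetDevIdsFromTopology returns a map of render minor numbers to parent device IDs.
// Each ID is built from the node's domain and location_id properties and formatted
// in hex as "domain:bus:device:0" (e.g. "0000:05:00:0"). The low three bits of
// location_id are dropped and the trailing field is always 0, so partitions of the
// same GPU map to the same ID.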
func GetDevIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]string {
topoRoot := "/sys/class/kfd/kfd"
if len(topoRootParam) == 1 {
Expand All @@ -257,6 +259,54 @@ func GetDevIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]strin
return renderDevIds
}

for _, nodeFile := range nodeFiles {
slog.Debug("Parsing topology node file", "file", nodeFile)
renderMinor, err := ParseTopologyProperties(fs, nodeFile, renderMinorRe)
if err != nil {
slog.Debug("Error parsing render minor", "file", nodeFile, "error", err)
continue
}

if renderMinor <= 0 || renderMinor > math.MaxInt32 {
continue
}

locationId, e := ParseTopologyProperties(fs, nodeFile, locationIdRe)
if e != nil {
slog.Debug("Error parsing location_id", "file", nodeFile, "error", e)
continue
}

domain, e := ParseTopologyProperties(fs, nodeFile, domainRe)
if e != nil {
slog.Debug("Error parsing domain", "file", nodeFile, "error", e)
continue
}

dev := (locationId >> 3) & 0x1f
bus := (locationId >> 8) & 0xff
devID := fmt.Sprintf("%04x:%02x:%02x:0", domain, bus, dev)
Comment thread
shiv-tyagi marked this conversation as resolved.

renderDevIds[int(renderMinor)] = devID
}

return renderDevIds
}

// GetUniqueIdsFromTopology returns a map of render minor numbers to unique_ids
func GetUniqueIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]string {
topoRoot := "/sys/class/kfd/kfd"
if len(topoRootParam) == 1 {
topoRoot = topoRootParam[0]
}

renderDevIds := make(map[int]string)
nodeFiles, err := fs.Glob(topoRoot + "/topology/nodes/*/properties")
if err != nil {
slog.Warn("Failed to glob topology nodes", "error", err)
return renderDevIds
}

for _, nodeFile := range nodeFiles {
slog.Debug("Parsing topology node file", "file", nodeFile)
renderMinor, err := ParseTopologyProperties(fs, nodeFile, renderMinorRe)
Expand Down Expand Up @@ -331,7 +381,7 @@ func GetUniqueIdToDeviceIndexMapWithFS(fs FileSystem) (map[string][]int, error)
return nil, fmt.Errorf("getting AMD GPUs: %w", err)
}

renderDevIds := GetDevIdsFromTopology(fs)
renderDevIds := GetUniqueIdsFromTopology(fs)
uniqueIdToIndex := make(map[string][]int)

// Process each device group and assign index
Expand Down
36 changes: 20 additions & 16 deletions internal/amdgpu/amdgpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -527,32 +527,32 @@ func TestGetDevIdsFromTopology(t *testing.T) {
name: "single GPU topology",
testCase: "single_gpu",
expectedResult: map[int]string{
128: "1",
128: "0000:05:00:0",
},
},
{
name: "GPU with partition topology",
testCase: "gpu_with_partition",
expectedResult: map[int]string{
128: "1",
129: "1",
128: "0000:05:00:0",
129: "0000:05:00:0",
},
},
{
name: "multiple GPUs topology",
testCase: "multiple_gpus",
expectedResult: map[int]string{
128: "1",
130: "2",
128: "0000:05:00:0",
130: "0000:48:00:0",
},
},
{
name: "unordered partitions topology",
testCase: "unordered_partitions",
expectedResult: map[int]string{
128: "1",
129: "1",
130: "2",
128: "0000:05:00:0",
129: "0000:05:00:0",
130: "0000:48:00:0",
},
},
}
Expand Down Expand Up @@ -589,8 +589,10 @@ func TestGetUniqueIdToDeviceIndexMapWithFS(t *testing.T) {
name: "GPU with partition UUID mapping",
testCase: "gpu_with_partition",
expectedResult: map[string][]int{
"0x1": {0, 1},
"1": {0, 1},
"0x1": {0},
"0x2": {1},
"1": {0},
"2": {1},
},
expectedError: nil,
},
Expand All @@ -599,20 +601,22 @@ func TestGetUniqueIdToDeviceIndexMapWithFS(t *testing.T) {
testCase: "multiple_gpus",
expectedResult: map[string][]int{
"0x1": {0},
"0x3": {1},
"1": {0},
"0x2": {1},
"2": {1},
"3": {1},
},
expectedError: nil,
},
{
name: "unordered partitions UUID mapping",
testCase: "unordered_partitions",
expectedResult: map[string][]int{
"0x1": {0, 1},
"1": {0, 1},
"0x2": {2},
"2": {2},
"0x1": {0},
"0x2": {1},
"0x3": {2},
"1": {0},
"2": {1},
"3": {2},
},
expectedError: nil,
},
Expand Down
4 changes: 3 additions & 1 deletion tests/amdgpu/topology/nodes/0/properties
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ mem_banks_count 1
caches_count 0
io_links_count 1
cpu_core_id_base 0
simd_id_base 0
simd_id_base 0
location_id 1280
Comment thread
biluriuday marked this conversation as resolved.
domain 0
6 changes: 4 additions & 2 deletions tests/amdgpu/topology/nodes/1/properties
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
drm_render_minor 129
unique_id 1
unique_id 2
gfx_target_version 90402
cpu_cores_count 20
simd_count 0
mem_banks_count 1
caches_count 0
io_links_count 1
cpu_core_id_base 0
simd_id_base 0
simd_id_base 0
location_id 1281
domain 0
6 changes: 4 additions & 2 deletions tests/amdgpu/topology/nodes/2/properties
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
drm_render_minor 130
unique_id 2
unique_id 3
gfx_target_version 90402
cpu_cores_count 20
simd_count 0
mem_banks_count 1
caches_count 0
io_links_count 1
cpu_core_id_base 0
simd_id_base 0
simd_id_base 0
location_id 18432
domain 0
Loading