From 5adfecc80c8b5b850b65abf8ab40a21256da6c0d Mon Sep 17 00:00:00 2001 From: Nitish Bhat Date: Wed, 17 Jun 2026 21:49:34 +0000 Subject: [PATCH 1/6] Add CI workflow to run unit tests on PRs --- .github/workflows/unit-tests.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/unit-tests.yml diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 000000000..50b9488fd --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,22 @@ +name: Unit Tests + +on: + pull_request: + branches: + - master + - release-* + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Run unit tests + run: go test ./... From 3126cf6401a551478ca7632efc491ba8d99e7a9e Mon Sep 17 00:00:00 2001 From: Nitish Bhat Date: Wed, 17 Jun 2026 22:11:13 +0000 Subject: [PATCH 2/6] Fix unit test CI: install CGo deps and fix fmt.Errorf with non-constant string --- .github/workflows/unit-tests.yml | 3 +++ internal/pkg/allocator/device.go | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 50b9488fd..e73d8585e 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -13,6 +13,9 @@ jobs: - name: Checkout uses: actions/checkout@v4 + - name: Install system dependencies + run: sudo apt-get update && sudo apt-get install -y libhwloc-dev libdrm-dev + - name: Set up Go uses: actions/setup-go@v5 with: diff --git a/internal/pkg/allocator/device.go b/internal/pkg/allocator/device.go index 0358b099a..4cdf6b7c9 100644 --- a/internal/pkg/allocator/device.go +++ b/internal/pkg/allocator/device.go @@ -18,6 +18,7 @@ package allocator import ( "bufio" + "errors" "fmt" "os" "path/filepath" @@ -219,9 +220,9 @@ func scanAndPopulatePeerWeights(fromPath string, devices []*Device, lookupNodes func fetchAllPairWeights(devices []*Device, p2pWeights map[int]map[int]int, folderPath string) error { if len(devices) == 0 { - errMsg := fmt.Sprintf("Devices list is empty. Unable to calculate pair wise weights") + errMsg := "Devices list is empty. Unable to calculate pair wise weights" glog.Info(errMsg) - return fmt.Errorf(errMsg) + return errors.New(errMsg) } if folderPath == "" { folderPath = topoRootPath From e286d6e9e19a9b9abb395d1333ef2197f466bb8b Mon Sep 17 00:00:00 2001 From: Nitish Bhat Date: Wed, 17 Jun 2026 22:14:52 +0000 Subject: [PATCH 3/6] Fix GetAMDGPUs to return empty map instead of Fatal when driver unavailable --- internal/pkg/amdgpu/amdgpu.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/pkg/amdgpu/amdgpu.go b/internal/pkg/amdgpu/amdgpu.go index 4947191d6..f697edb76 100644 --- a/internal/pkg/amdgpu/amdgpu.go +++ b/internal/pkg/amdgpu/amdgpu.go @@ -150,7 +150,8 @@ func GetDevIdsFromTopology(topoRootParam ...string) map[int]string { // GetAMDGPUs return a map of AMD GPU on a node identified by the part of the pci address func GetAMDGPUs() map[string]map[string]interface{} { if _, err := os.Stat("/sys/module/amdgpu/drivers/"); err != nil { - glog.Fatalf("amdgpu driver unavailable. exiting with exit code 2. error: %s", err) + glog.Warningf("amdgpu driver unavailable: %s", err) + return map[string]map[string]interface{}{} } //ex: /sys/module/amdgpu/drivers/pci:amdgpu/0000:19:00.0 From 3e4f4bec7df948f0c8b441229f565f6d2f2edc29 Mon Sep 17 00:00:00 2001 From: Nitish Bhat Date: Fri, 26 Jun 2026 18:53:43 +0000 Subject: [PATCH 4/6] Restore Fatal on missing amdgpu driver, use flag to disable in tests The previous commit changed GetAMDGPUs to Warning+return instead of Fatal when the driver is absent. This broke production behavior where we want the process to exit immediately on hardware nodes without the driver. Instead, add a FatalOnDriverUnavailable package variable (defaults to true) that tests disable via TestMain, keeping production Fatal behavior intact while letting CI run tests without GPUs. --- internal/pkg/amdgpu/amdgpu.go | 8 ++++++++ internal/pkg/amdgpu/amdgpu_test.go | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/internal/pkg/amdgpu/amdgpu.go b/internal/pkg/amdgpu/amdgpu.go index f697edb76..874969fea 100644 --- a/internal/pkg/amdgpu/amdgpu.go +++ b/internal/pkg/amdgpu/amdgpu.go @@ -147,9 +147,17 @@ func GetDevIdsFromTopology(topoRootParam ...string) map[int]string { return renderDevIds } +// FatalOnDriverUnavailable controls whether GetAMDGPUs calls glog.Fatalf +// when the amdgpu driver is not present. Tests set this to false so the +// test process isn't killed on machines without AMD GPUs. +var FatalOnDriverUnavailable = true + // GetAMDGPUs return a map of AMD GPU on a node identified by the part of the pci address func GetAMDGPUs() map[string]map[string]interface{} { if _, err := os.Stat("/sys/module/amdgpu/drivers/"); err != nil { + if FatalOnDriverUnavailable { + glog.Fatalf("amdgpu driver unavailable. exiting with exit code 2. error: %s", err) + } glog.Warningf("amdgpu driver unavailable: %s", err) return map[string]map[string]interface{}{} } diff --git a/internal/pkg/amdgpu/amdgpu_test.go b/internal/pkg/amdgpu/amdgpu_test.go index 85f3df96c..3769fea27 100644 --- a/internal/pkg/amdgpu/amdgpu_test.go +++ b/internal/pkg/amdgpu/amdgpu_test.go @@ -20,6 +20,7 @@ import ( "encoding/json" "fmt" "io/ioutil" + "os" "path/filepath" "reflect" "regexp" @@ -27,6 +28,11 @@ import ( "testing" ) +func TestMain(m *testing.M) { + FatalOnDriverUnavailable = false + os.Exit(m.Run()) +} + func hasAMDGPU(t *testing.T) bool { devices := GetAMDGPUs() From 9548344e92e6178b69c1a3aaaafe91f90ed9ef9a Mon Sep 17 00:00:00 2001 From: Nitish Bhat Date: Fri, 26 Jun 2026 18:55:52 +0000 Subject: [PATCH 5/6] ci: add concurrency group and workflow_dispatch to unit tests workflow Add workflow_dispatch trigger to allow manual runs, and a concurrency group with cancel-in-progress to cancel stale runs when new commits are pushed to the same PR. Mirrors ROCm/device-config-manager#32. --- .github/workflows/unit-tests.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index e73d8585e..38b706a86 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -5,10 +5,16 @@ on: branches: - master - release-* + workflow_dispatch: + +concurrency: + group: unit-tests-${{ github.ref }} + cancel-in-progress: true jobs: test: runs-on: ubuntu-latest + timeout-minutes: 15 steps: - name: Checkout uses: actions/checkout@v4 From 659e8572b8dc9a45669ce88e70e3059034ae8313 Mon Sep 17 00:00:00 2001 From: Nitish Bhat Date: Fri, 26 Jun 2026 19:07:09 +0000 Subject: [PATCH 6/6] Fix TestRenderDevIdsFromTopology expected values after domain/location_id change Commit e8120e33 changed GetDevIdsFromTopology from using unique_id (hashed numeric strings) to using domain/location_id (PCI addresses), but the test expectations were not updated to match. --- internal/pkg/amdgpu/amdgpu_test.go | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/internal/pkg/amdgpu/amdgpu_test.go b/internal/pkg/amdgpu/amdgpu_test.go index 3769fea27..28a33bd8c 100644 --- a/internal/pkg/amdgpu/amdgpu_test.go +++ b/internal/pkg/amdgpu/amdgpu_test.go @@ -227,38 +227,38 @@ func TestRenderDevIdsFromTopology(t *testing.T) { renderDevIds := GetDevIdsFromTopology("../../../testdata/topology-parsing-mi308") expDevIds := map[int]string{ - 128: "598046273873802902", - 129: "598046273873802902", - 130: "598046273873802902", - 131: "598046273873802902", - 136: "11803749423592941193", - 137: "11803749423592941193", - 138: "11803749423592941193", - 139: "11803749423592941193", - 144: "10187445671099294242", - 145: "10187445671099294242", - 146: "10187445671099294242", - 147: "10187445671099294242", - 152: "9604994527082705072", - 153: "9604994527082705072", - 154: "9604994527082705072", - 155: "9604994527082705072", - 160: "17466021589395472075", - 161: "17466021589395472075", - 162: "17466021589395472075", - 163: "17466021589395472075", - 168: "1044926823201815193", - 169: "1044926823201815193", - 170: "1044926823201815193", - 171: "1044926823201815193", - 176: "13372828617950681944", - 177: "13372828617950681944", - 178: "13372828617950681944", - 179: "13372828617950681944", - 184: "6576958293045616595", - 185: "6576958293045616595", - 186: "6576958293045616595", - 187: "6576958293045616595"} + 128: "0000:0a:00:0", + 129: "0000:0a:00:0", + 130: "0000:0a:00:0", + 131: "0000:0a:00:0", + 136: "0000:80:00:0", + 137: "0000:80:00:0", + 138: "0000:80:00:0", + 139: "0000:80:00:0", + 144: "0000:a4:00:0", + 145: "0000:a4:00:0", + 146: "0000:a4:00:0", + 147: "0000:a4:00:0", + 152: "0000:c8:00:0", + 153: "0000:c8:00:0", + 154: "0000:c8:00:0", + 155: "0000:c8:00:0", + 160: "0001:0b:00:0", + 161: "0001:0b:00:0", + 162: "0001:0b:00:0", + 163: "0001:0b:00:0", + 168: "0001:81:00:0", + 169: "0001:81:00:0", + 170: "0001:81:00:0", + 171: "0001:81:00:0", + 176: "0001:a5:00:0", + 177: "0001:a5:00:0", + 178: "0001:a5:00:0", + 179: "0001:a5:00:0", + 184: "0001:c9:00:0", + 185: "0001:c9:00:0", + 186: "0001:c9:00:0", + 187: "0001:c9:00:0"} if !reflect.DeepEqual(renderDevIds, expDevIds) { val, _ := json.MarshalIndent(renderDevIds, "", " ") exp, _ := json.MarshalIndent(expDevIds, "", " ")