From fb4ed57adf8826238b6c8598a843c525999d8fbd Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 26 Aug 2025 16:15:28 -0700 Subject: [PATCH 01/26] [GPUHEALTH-383] agent ci/cd pipeline --- .github/workflows/build.yml | 8 +- .github/workflows/goreleaser.yml | 33 ++- .goreleaser_darwin_amd64.yaml | 66 ------ .goreleaser_darwin_arm64.yaml | 66 ------ .goreleaser_gpuhealth_deb_amd64.yaml | 10 +- .goreleaser_gpuhealth_deb_arm64.yaml | 10 +- .goreleaser_gpuhealth_rpm_aarch64.yaml | 10 +- .goreleaser_gpuhealth_rpm_x86_64.yaml | 10 +- .goreleaser_linux_amd64.yaml | 66 ------ .goreleaser_linux_amd64_amzn2.yaml | 66 ------ .goreleaser_linux_amd64_amzn2023.yaml | 66 ------ .goreleaser_linux_amd64_ubuntu22.04.yaml | 66 ------ .goreleaser_linux_amd64_ubuntu24.04.yaml | 66 ------ .goreleaser_linux_arm64_amzn2.yaml | 67 ------- .goreleaser_linux_arm64_amzn2023.yaml | 68 ------- .goreleaser_linux_arm64_ubuntu22.04.yaml | 67 ------- .goreleaser_linux_arm64_ubuntu24.04.yaml | 67 ------- Makefile | 12 +- README.md | 244 ++++++++++++++++------- version/version.go | 2 +- 20 files changed, 217 insertions(+), 853 deletions(-) delete mode 100644 .goreleaser_darwin_amd64.yaml delete mode 100644 .goreleaser_darwin_arm64.yaml delete mode 100644 .goreleaser_linux_amd64.yaml delete mode 100644 .goreleaser_linux_amd64_amzn2.yaml delete mode 100644 .goreleaser_linux_amd64_amzn2023.yaml delete mode 100644 .goreleaser_linux_amd64_ubuntu22.04.yaml delete mode 100644 .goreleaser_linux_amd64_ubuntu24.04.yaml delete mode 100644 .goreleaser_linux_arm64_amzn2.yaml delete mode 100644 .goreleaser_linux_arm64_amzn2023.yaml delete mode 100644 .goreleaser_linux_arm64_ubuntu22.04.yaml delete mode 100644 .goreleaser_linux_arm64_ubuntu24.04.yaml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5fa31df6..b276995b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,4 +1,4 @@ -name: Build and Store gpud binary +name: Build and Store gpuhealth binary 
on: push: @@ -22,10 +22,10 @@ jobs: - name: Build project run: | - make + make gpuhealth - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: gpud - path: bin/gpud \ No newline at end of file + name: gpuhealth + path: bin/gpuhealth \ No newline at end of file diff --git a/.github/workflows/goreleaser.yml b/.github/workflows/goreleaser.yml index 45b99e35..687abc59 100644 --- a/.github/workflows/goreleaser.yml +++ b/.github/workflows/goreleaser.yml @@ -17,31 +17,23 @@ jobs: job: - os: ubuntu-22.04 platform: linux - target: linux_amd64 - goreleaser_suffix: _ubuntu22.04 + target: gpuhealth_deb_amd64 + goreleaser_suffix: "" - os: ubuntu-22.04 platform: linux - target: linux_arm64 - goreleaser_suffix: _ubuntu22.04 + target: gpuhealth_deb_arm64 + goreleaser_suffix: "" - - os: ubuntu-24.04 + - os: ubuntu-22.04 platform: linux - target: linux_amd64 - goreleaser_suffix: _ubuntu24.04 + target: gpuhealth_rpm_x86_64 + goreleaser_suffix: "" - - os: ubuntu-24.04 + - os: ubuntu-22.04 platform: linux - target: linux_arm64 - goreleaser_suffix: _ubuntu24.04 - - - os: macos-latest - platform: darwin - target: darwin_amd64 - - - os: macos-latest - platform: darwin - target: darwin_arm64 + target: gpuhealth_rpm_aarch64 + goreleaser_suffix: "" name: Release ${{ matrix.job.target }} (${{ matrix.job.os }}) runs-on: ${{ matrix.job.os }} @@ -74,7 +66,7 @@ jobs: with: distribution: goreleaser version: latest - args: release --config .goreleaser_${{ matrix.job.target }}${{ matrix.job.goreleaser_suffix }}.yaml + args: release --config .goreleaser_${{ matrix.job.target }}.yaml workdir: . 
env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -94,4 +86,5 @@ jobs: prerelease: false body: Latest builds from the last commit files: | - ./dist/gpud_v${{steps.get_tag_version.outputs.TAG_VERSION}}_${{ matrix.job.target }}${{ matrix.job.goreleaser_suffix }}.tgz + ./dist/gpuhealth_*.deb + ./dist/gpuhealth*.rpm diff --git a/.goreleaser_darwin_amd64.yaml b/.goreleaser_darwin_amd64.yaml deleted file mode 100644 index 562f96d6..00000000 --- a/.goreleaser_darwin_amd64.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - darwin - goarch: - - amd64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }} - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_darwin_arm64.yaml b/.goreleaser_darwin_arm64.yaml deleted file mode 100644 index f017436d..00000000 --- a/.goreleaser_darwin_arm64.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - darwin - goarch: - - arm64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }} - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_gpuhealth_deb_amd64.yaml b/.goreleaser_gpuhealth_deb_amd64.yaml index 1e8b0339..f0601a40 100644 --- a/.goreleaser_gpuhealth_deb_amd64.yaml +++ b/.goreleaser_gpuhealth_deb_amd64.yaml @@ -8,9 +8,9 @@ builds: flags: [-v] ldflags: - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} + - -X github.com/NVIDIA/gpuhealth/version.BuildTimestamp={{.CommitTimestamp}} + - -X github.com/NVIDIA/gpuhealth/version.Revision={{.Commit}} + - -X github.com/NVIDIA/gpuhealth/version.Version=v{{.Version}} goos: [linux] goarch: [amd64] @@ -22,7 +22,7 @@ nfpms: bindir: /usr/bin maintainer: "GPU Health Team " license: Apache-2.0 - homepage: https://github.com/leptonai/gpud + homepage: https://github.com/NVIDIA/gpuhealth description: GPU Health monitoring agent for datacenter environments dependencies: [systemd, curl] recommends: [nvidia-driver] @@ -52,7 +52,7 @@ release: replace_existing_draft: true make_latest: true mode: replace - github: { owner: leptonai, name: gpud } + github: { owner: NVIDIA, name: gpuhealth } name_template: "gpuhealth-v{{.Version}}" checksum: { disable: true } diff --git a/.goreleaser_gpuhealth_deb_arm64.yaml b/.goreleaser_gpuhealth_deb_arm64.yaml index c74d3ae2..6988e931 100644 --- a/.goreleaser_gpuhealth_deb_arm64.yaml +++ b/.goreleaser_gpuhealth_deb_arm64.yaml @@ -8,9 +8,9 @@ builds: flags: [-v] ldflags: - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} + - -X github.com/NVIDIA/gpuhealth/version.BuildTimestamp={{.CommitTimestamp}} + - -X github.com/NVIDIA/gpuhealth/version.Revision={{.Commit}} + - -X 
github.com/NVIDIA/gpuhealth/version.Version=v{{.Version}} goos: [linux] goarch: [arm64] @@ -22,7 +22,7 @@ nfpms: bindir: /usr/bin maintainer: "GPU Health Team " license: Apache-2.0 - homepage: https://github.com/leptonai/gpud + homepage: https://github.com/NVIDIA/gpuhealth description: GPU Health monitoring agent for datacenter environments dependencies: [systemd, curl] recommends: [nvidia-driver] @@ -52,7 +52,7 @@ release: replace_existing_draft: true make_latest: true mode: replace - github: { owner: leptonai, name: gpud } + github: { owner: NVIDIA, name: gpuhealth } name_template: "gpuhealth-v{{.Version}}" checksum: { disable: true } diff --git a/.goreleaser_gpuhealth_rpm_aarch64.yaml b/.goreleaser_gpuhealth_rpm_aarch64.yaml index 8ba3e225..0827b621 100644 --- a/.goreleaser_gpuhealth_rpm_aarch64.yaml +++ b/.goreleaser_gpuhealth_rpm_aarch64.yaml @@ -8,9 +8,9 @@ builds: flags: [-v] ldflags: - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} + - -X github.com/NVIDIA/gpuhealth/version.BuildTimestamp={{.CommitTimestamp}} + - -X github.com/NVIDIA/gpuhealth/version.Revision={{.Commit}} + - -X github.com/NVIDIA/gpuhealth/version.Version=v{{.Version}} goos: [linux] goarch: [arm64] @@ -24,7 +24,7 @@ nfpms: file_name_template: '{{ .PackageName }}-{{ .Version }}-{{ .Release }}.{{ if eq .Arch "amd64" }}x86_64{{ else if eq .Arch "arm64" }}aarch64{{ else }}{{ .Arch }}{{ end }}' maintainer: "GPU Health Team " license: Apache-2.0 - homepage: https://github.com/leptonai/gpud + homepage: https://github.com/NVIDIA/gpuhealth description: GPU Health monitoring agent for datacenter environments dependencies: [systemd, curl] recommends: [nvidia-driver] @@ -56,7 +56,7 @@ release: replace_existing_draft: true make_latest: true mode: replace - github: { owner: leptonai, name: gpud } + github: { owner: NVIDIA, name: gpuhealth } 
name_template: "gpuhealth-v{{.Version}}" checksum: { disable: true } diff --git a/.goreleaser_gpuhealth_rpm_x86_64.yaml b/.goreleaser_gpuhealth_rpm_x86_64.yaml index 68fa06a0..66efd974 100644 --- a/.goreleaser_gpuhealth_rpm_x86_64.yaml +++ b/.goreleaser_gpuhealth_rpm_x86_64.yaml @@ -8,9 +8,9 @@ builds: flags: [-v] ldflags: - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} + - -X github.com/NVIDIA/gpuhealth/version.BuildTimestamp={{.CommitTimestamp}} + - -X github.com/NVIDIA/gpuhealth/version.Revision={{.Commit}} + - -X github.com/NVIDIA/gpuhealth/version.Version=v{{.Version}} goos: [linux] goarch: [amd64] @@ -24,7 +24,7 @@ nfpms: file_name_template: '{{ .PackageName }}-{{ .Version }}-{{ .Release }}.{{ if eq .Arch "amd64" }}x86_64{{ else if eq .Arch "arm64" }}aarch64{{ else }}{{ .Arch }}{{ end }}' maintainer: "GPU Health Team " license: Apache-2.0 - homepage: https://github.com/leptonai/gpud + homepage: https://github.com/NVIDIA/gpuhealth description: GPU Health monitoring agent for datacenter environments dependencies: [systemd, curl] recommends: [nvidia-driver] @@ -56,7 +56,7 @@ release: replace_existing_draft: true make_latest: true mode: replace - github: { owner: leptonai, name: gpud } + github: { owner: NVIDIA, name: gpuhealth } name_template: "gpuhealth-v{{.Version}}" checksum: { disable: true } diff --git a/.goreleaser_linux_amd64.yaml b/.goreleaser_linux_amd64.yaml deleted file mode 100644 index 6f6a444e..00000000 --- a/.goreleaser_linux_amd64.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X 
github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - amd64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }} - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_amd64_amzn2.yaml b/.goreleaser_linux_amd64_amzn2.yaml deleted file mode 100644 index 051febe5..00000000 --- a/.goreleaser_linux_amd64_amzn2.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - amd64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary 
}}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_amzn2 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_amd64_amzn2023.yaml b/.goreleaser_linux_amd64_amzn2023.yaml deleted file mode 100644 index 8e492fef..00000000 --- a/.goreleaser_linux_amd64_amzn2023.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - amd64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_amzn2023 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! 
- - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_amd64_ubuntu22.04.yaml b/.goreleaser_linux_amd64_ubuntu22.04.yaml deleted file mode 100644 index f3e56989..00000000 --- a/.goreleaser_linux_amd64_ubuntu22.04.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - amd64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_ubuntu22.04 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_amd64_ubuntu24.04.yaml b/.goreleaser_linux_amd64_ubuntu24.04.yaml deleted file mode 100644 index 308c9b0b..00000000 --- a/.goreleaser_linux_amd64_ubuntu24.04.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - amd64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_ubuntu24.04 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_arm64_amzn2.yaml b/.goreleaser_linux_arm64_amzn2.yaml deleted file mode 100644 index bc436014..00000000 --- a/.goreleaser_linux_arm64_amzn2.yaml +++ /dev/null @@ -1,67 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - - CC=gcc - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - arm64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_amzn2 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_arm64_amzn2023.yaml b/.goreleaser_linux_arm64_amzn2023.yaml deleted file mode 100644 index 6ef85c25..00000000 --- a/.goreleaser_linux_arm64_amzn2023.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - # ref. 
https://docs.aws.amazon.com/linux/al2023/ug/compare-with-al2.html#compiler-triplet - - CC=aarch64-amazon-linux-gcc - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - arm64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_amzn2023 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_arm64_ubuntu22.04.yaml b/.goreleaser_linux_arm64_ubuntu22.04.yaml deleted file mode 100644 index d8b33859..00000000 --- a/.goreleaser_linux_arm64_ubuntu22.04.yaml +++ /dev/null @@ -1,67 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - - CC=aarch64-linux-gnu-gcc - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - arm64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_ubuntu22.04 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_arm64_ubuntu24.04.yaml b/.goreleaser_linux_arm64_ubuntu24.04.yaml deleted file mode 100644 index 9e0055c8..00000000 --- a/.goreleaser_linux_arm64_ubuntu24.04.yaml +++ /dev/null @@ -1,67 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - - CC=aarch64-linux-gnu-gcc - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - arm64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_ubuntu24.04 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/Makefile b/Makefile index 15fb6f60..196d18ae 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ ROOTDIR=$(dir $(abspath $(lastword $(MAKEFILE_LIST)))) BUILD_TIMESTAMP ?= $(shell date -u +"%Y-%m-%dT%H:%M:%SZ") VERSION ?= $(shell git describe --match 'v[0-9]*' --dirty='.m' --always) REVISION=$(shell git rev-parse HEAD)$(shell if ! git diff --no-ext-diff --quiet --exit-code; then echo .m; fi) -PACKAGE=github.com/leptonai/gpud +PACKAGE=github.com/NVIDIA/gpuhealth ifneq "$(strip $(shell command -v $(GO) 2>/dev/null))" "" GOOS ?= $(shell $(GO) env GOOS) @@ -49,9 +49,9 @@ endif WHALE = "🇩" ONI = "👹" -RELEASE=gpud-$(VERSION:v%=%)-${GOOS}-${GOARCH} +RELEASE=gpuhealth-$(VERSION:v%=%)-${GOOS}-${GOARCH} -COMMANDS=gpud swagger +COMMANDS=gpuhealth GO_BUILD_FLAGS=-ldflags '-s -X $(PACKAGE)/version.BuildTimestamp=$(BUILD_TIMESTAMP) -X $(PACKAGE)/version.Version=$(VERSION) -X $(PACKAGE)/version.Revision=$(REVISION) -X $(PACKAGE)/version.Package=$(PACKAGE)' @@ -71,7 +71,7 @@ GOPATHS=$(shell echo ${GOPATH} | tr ":" "\n" | tr ";" "\n") BINARIES=$(addprefix bin/,$(COMMANDS)) -.PHONY: clean all binaries +.PHONY: clean all binaries gpuhealth .DEFAULT: default all: binaries @@ -90,6 +90,10 @@ bin/%: cmd/% FORCE binaries: $(BINARIES) ## build binaries @echo "$(WHALE) $@" +# Convenience target: build only the gpuhealth binary (bin/gpuhealth) +gpuhealth: bin/gpuhealth ## build gpuhealth binary + @echo "$(WHALE) gpuhealth built successfully" + clean: ## clean up binaries @echo "$(WHALE) $@" @rm -f $(BINARIES) diff --git a/README.md b/README.md index f1d2e6ec..9785f668 100644 --- a/README.md +++ b/README.md @@ -1,143 +1,241 @@ -GPUd logo +# GPUHealth -[![Go Report Card](https://goreportcard.com/badge/github.com/leptonai/gpud)](https://goreportcard.com/report/github.com/leptonai/gpud) -![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/leptonai/gpud?sort=semver) -[![Go 
Reference](https://pkg.go.dev/badge/github.com/leptonai/gpud.svg)](https://pkg.go.dev/github.com/leptonai/gpud) -[![codecov](https://codecov.io/gh/leptonai/gpud/graph/badge.svg?token=G8MGRK9X4A)](https://codecov.io/gh/leptonai/gpud) +[![Go Report Card](https://goreportcard.com/badge/github.com/NVIDIA/gpuhealth)](https://goreportcard.com/report/github.com/NVIDIA/gpuhealth) +![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/NVIDIA/gpuhealth?sort=semver) +[![Go Reference](https://pkg.go.dev/badge/github.com/NVIDIA/gpuhealth.svg)](https://pkg.go.dev/github.com/NVIDIA/gpuhealth) ## Overview -[GPUd](https://www.gpud.ai) is designed to ensure GPU efficiency and reliability by actively monitoring GPUs and effectively managing AI/ML workloads. +**GPUHealth** is a streamlined GPU health monitoring and reporting tool designed to ensure GPU reliability by actively monitoring GPU status and exporting health metrics for analysis. -## Why GPUd +## About GPUHealth -GPUd is built on years of experience operating large-scale GPU clusters at Meta, Alibaba Cloud, Uber, and Lepton AI. It is carefully designed to be self-contained and to integrate seamlessly with other systems such as Docker, containerd, Kubernetes, and Nvidia ecosystems. +GPUHealth is based on the upstream [leptonai/gpud](https://github.com/leptonai/gpud) project but focuses specifically on GPU health monitoring without management overhead. It is built on years of experience operating large-scale GPU clusters and is carefully designed to be self-contained with seamless integration into existing monitoring infrastructure. -- **First-class GPU support**: GPUd is GPU-centric, providing a unified view of critical GPU metrics and issues. -- **Easy to run at scale**: GPUd is a self-contained binary that runs on any machine with a low footprint. -- **Production grade**: GPUd is used in [Lepton AI](https://lepton.ai/)'s production infrastructure. 
+### Key Characteristics -Most importantly, GPUd operates with minimal CPU and memory overhead in a non-critical path and requires only read-only operations. See [*architecture*](./docs/ARCHITECTURE.md) for more details. +- **Health-Focused**: Concentrates purely on GPU health monitoring and metrics export +- **Lightweight**: Self-contained binary with minimal CPU and memory footprint +- **Non-Intrusive**: Operates with read-only operations in a non-critical path +- **Integration-Ready**: Easy to integrate with existing monitoring and alerting systems +- **Production-Ready**: Built for reliability in datacenter environments + +### Architecture + +GPUHealth operates as a standalone monitoring agent that: +- Collects GPU health metrics and status information +- Detects hardware issues and performance anomalies +- Exports data in standard formats (JSON, CSV) +- Supports both online (HTTP endpoint) and offline (file-based) modes ## Get Started - -gpud-2025-06-01-01-install-and-scan - +### Quick Start + +To quickly check your GPU health status: -See [Tutorials](./docs/TUTORIALS.md) for more. 
+```bash +# Run a quick scan (requires gpuhealth to be installed; see Installation below) +gpuhealth scan +``` ### Installation -To install from the official release on Linux and amd64 (x86_64) machine: +#### From GitHub Releases + +Download the latest release for your platform from [GitHub Releases](https://github.com/NVIDIA/gpuhealth/releases): ```bash -curl -fsSL https://pkg.gpud.dev/install.sh | sh +# Example for Linux x86_64 +wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth_linux_amd64.tar.gz +tar -xzf gpuhealth_linux_amd64.tar.gz +sudo mv gpuhealth /usr/local/bin/ ``` -To specify a version +#### Package Installation +**Debian/Ubuntu:** ```bash -curl -fsSL https://pkg.gpud.dev/install.sh | sh -s v0.6.0 +# Download .deb package +wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth_amd64.deb +sudo dpkg -i gpuhealth_amd64.deb ``` -Note that the install script doesn't support other architectures (arm64) and OSes (macos), yet. +**RHEL/CentOS:** +```bash +# Download .rpm package +wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth-x86_64.rpm +sudo rpm -i gpuhealth-x86_64.rpm +``` -### Run GPUd with Lepton Platform +#### Build from Source -Sign up at [lepton.ai](https://www.lepton.ai/) and get the workspace token from the ["Settings" and "Tokens" page](https://dashboard.lepton.ai/workspace-redirect/settings/api-tokens): +```bash +git clone https://github.com/NVIDIA/gpuhealth.git +cd gpuhealth +make gpuhealth +sudo mv bin/gpuhealth /usr/local/bin/ +``` -GPUd lepton.ai machines settings +### Usage -Copy the token and pass it to the `gpud up --token` flag: +#### Health Monitoring Server + +Start the health monitoring server: ```bash -sudo gpud up --token -``` +# Start server (runs on port 15133 by default) +gpuhealth run -You can go to the [dashboard](https://dashboard.lepton.ai/workspace-redirect/machines/self-managed-nodes) to check the self-managed machine status. 
+# Start with custom configuration +gpuhealth run --listen-address=0.0.0.0:8080 --log-level=debug +``` -### Run GPUd standalone +#### One-time Health Check -For linux, run the following command to start the service: +Perform a quick health scan: ```bash -sudo gpud up +gpuhealth scan ``` -You can also start with the standalone mode and later switch to the managed option: +#### Offline Data Collection + +Collect health data to files: ```bash -# when the token is ready, run the following command -sudo gpud login --token +# Collect data for 1 hour to /tmp/gpu-health/ +gpuhealth run --offline-mode --path=/tmp/gpu-health --duration=1h +``` -# to logout -sudo gpud logout +#### Check Service Status -# to logout and reset the state file -sudo gpud logout --reset-state +```bash +gpuhealth status ``` -#### Run GPUd with Kubernetes +### API Access -The recommended way to deploy GPUd on Kubernetes is with our official [Helm chart](./deployments/helm/gpud/README.md). +Once running, access health data via HTTP API: -#### If your system doesn't have systemd +```bash +# Health endpoint +curl http://localhost:15133/healthz -To run on Mac (without systemd): +# Machine information +curl http://localhost:15133/machine-info -```bash -gpud run +# Health states +curl http://localhost:15133/v1/states ``` -Or +## Key Features + +### GPU Health Monitoring +- **Hardware Metrics**: Power consumption, temperature, clock speeds, utilization +- **Error Detection**: NVML Xid events, hardware slowdown, row remapping failures +- **Fabric Health**: GPU fabric status and interconnect monitoring +- **Performance Tracking**: GPU performance counters and throughput metrics + +### System Health Monitoring +- **Basic System Metrics**: CPU, memory, and disk usage +- **Driver Status**: NVIDIA driver version and compatibility checks +- **Process Monitoring**: GPU process information and resource allocation + +### Data Export & Integration +- **Multiple Formats**: JSON and CSV output formats +- **HTTP API**: 
RESTful endpoints for real-time data access +- **Offline Mode**: File-based data collection for batch processing +- **Configurable Intervals**: Customizable health check frequencies + +### Production Features +- **Low Overhead**: Minimal CPU and memory footprint +- **Read-Only**: Non-intrusive monitoring with no system modifications +- **Reliability**: Built for 24/7 operation in datacenter environments +- **Scalability**: Deploy across large GPU clusters with consistent performance + +Check out [*components*](./docs/COMPONENTS.md) for a detailed list of monitoring components and their capabilities. + +## FAQs + +### Does GPUHealth send data externally? + +**No.** GPUHealth operates in a fully self-contained mode and does not send any data to external services by default. All health monitoring data is: + +- Stored locally on your system +- Accessed only through the local HTTP API (if enabled) +- Exported to local files in offline mode +- **Never transmitted** to external services without explicit configuration +GPUHealth is designed for environments where data privacy and security are paramount. + +### How do I integrate GPUHealth with my monitoring system? + +GPUHealth provides multiple integration options: + +**HTTP API Integration:** ```bash -nohup sudo /usr/local/bin run &>> & -``` +# Prometheus-style metrics +curl http://localhost:15133/metrics -### Stop and uninstall +# JSON health data +curl http://localhost:15133/v1/states +``` +**File-based Integration:** ```bash -sudo gpud down -sudo rm /usr/local/bin -sudo rm /etc/systemd/system/gpud.service +# Export to files for processing +gpuhealth run --offline-mode --path=/monitoring/data --duration=24h ``` -## Key Features +**Custom Endpoints:** +Configure your monitoring system to scrape the GPUHealth API endpoints at your desired interval. -- Monitor critical GPU and GPU fabric metrics (power, temperature). -- Reports GPU and GPU fabric status (nvidia-smi parser, error checking). 
-- Detects critical GPU and GPU fabric errors (kmsg, hardware slowdown, NVML Xid event, DCGM). -- Monitor overall system metrics (CPU, memory, disk). +### How do I update GPUHealth? -Check out [*components*](./docs/COMPONENTS.md) for a detailed list of components and their features. +1. **Download latest release** from [GitHub Releases](https://github.com/NVIDIA/gpuhealth/releases) +2. **Stop running instance**: `gpuhealth status` to check, then stop if needed +3. **Replace binary**: Update `/usr/local/bin/gpuhealth` or your installation path +4. **Restart**: Launch gpuhealth with your previous configuration -## Integration +For package installations (deb/rpm), use your system's package manager to update. -For users looking to set up a platform to collect and process data from gpud, please refer to [INTEGRATION](./docs/INTEGRATION.md). +### What are the system requirements? -## FAQs +- **OS**: Linux (primary support), basic support for other Unix-like systems +- **Architecture**: x86_64 (amd64), ARM64 (aarch64) +- **NVIDIA Driver**: Version 535+ recommended (not required for basic system monitoring) +- **Memory**: ~10-50MB RAM usage +- **CPU**: Minimal overhead, typically <1% CPU usage +- **Storage**: ~100MB for binary and logs -### Does GPUd send data to lepton.ai? +### Can I run GPUHealth without NVIDIA drivers? -GPUd collects a small anonymous usage signal by default to help the engineering team better understand usage frequencies. The data is strictly anonymized and **does not contain any sensitive data**. You can disable this behavior by setting `GPUD_NO_USAGE_STATS=true`. If GPUd is run with systemd (default option for the `gpud up` command), you can add the line `GPUD_NO_USAGE_STATS=true` to the `/etc/default/gpud` environment file and restart the service. +Yes! 
GPUHealth will operate in a reduced functionality mode: +- ✅ **System monitoring**: CPU, memory, disk metrics still available +- ✅ **Basic GPU detection**: PCI device enumeration +- ❌ **NVIDIA-specific monitoring**: Requires NVIDIA drivers for full GPU health data -If you opt-in to log in to the Lepton AI platform, to assist you with more helpful GPU health states, GPUd periodically sends system runtime related information about the host to the platform. All these info are system workload and health info, and contain no user data. The data are sent via secure channels. +## Documentation -### How to update GPUd? +- [Components Guide](./docs/COMPONENTS.md) - Detailed component information and configuration +- [Architecture Overview](./docs/ARCHITECTURE.md) - System design and technical details +- [Installation Guide](./docs/INSTALL.md) - Comprehensive installation instructions +- [Integration Guide](./docs/INTEGRATION.md) - How to integrate with monitoring systems -GPUd is still in active development, regularly releasing new versions for critical bug fixes and new features. We strongly recommend always being on the latest version of GPUd. +## Related Projects -When GPUd is registered with the Lepton platform, the platform will automatically update GPUd to the latest version. To disable such auto-updates, if GPUd is run with systemd (default option for the `gpud up` command), you may add the flag `FLAGS="--enable-auto-update=false"` to the `/etc/default/gpud` environment file and restart the service. +- **Upstream Project**: [leptonai/gpud](https://github.com/leptonai/gpud) - Full-featured GPU management daemon +- **NVIDIA Tools**: Compatible with NVIDIA's GPU monitoring ecosystem -## Learn more +## Contributing -- [Why GPUd](./docs/WHY.md) -- [Install GPUd](./docs/INSTALL.md) -- [GPUd components](./docs/COMPONENTS.md) -- [GPUd architecture](./docs/ARCHITECTURE.md) +We welcome contributions! 
Please see [CONTRIBUTING.md](CONTRIBUTING.md) for: +- Development setup and build instructions +- Code style and contribution guidelines +- How to report issues and submit pull requests +- Upstream sync procedures for maintainers -## Contributing +## License -Please see the [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on how to contribute to this project. +This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. diff --git a/version/version.go b/version/version.go index bce07a4d..fa3bd9af 100644 --- a/version/version.go +++ b/version/version.go @@ -5,7 +5,7 @@ import "runtime" var ( // Package is filled at linking time - Package = "github.com/leptonai/gpud" + Package = "github.com/NVIDIA/gpuhealth" // Version holds the complete version number. Filled in at linking time. Version = "0.0.1+unknown" From 1279a78f1f8ee559982bbebfe3cb70e0d745990c Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 26 Aug 2025 16:30:50 -0700 Subject: [PATCH 02/26] change readme --- README.md | 95 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index 9785f668..495e28e1 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,8 @@ -# GPUHealth - -[![Go Report Card](https://goreportcard.com/badge/github.com/NVIDIA/gpuhealth)](https://goreportcard.com/report/github.com/NVIDIA/gpuhealth) -![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/NVIDIA/gpuhealth?sort=semver) -[![Go Reference](https://pkg.go.dev/badge/github.com/NVIDIA/gpuhealth.svg)](https://pkg.go.dev/github.com/NVIDIA/gpuhealth) +# NVIDIA GPU Health Monitoring and Reporting Agent ## Overview -**GPUHealth** is a streamlined GPU health monitoring and reporting tool designed to ensure GPU reliability by actively monitoring GPU status and exporting health metrics for analysis. 
- -## About GPUHealth - -GPUHealth is based on the upstream [leptonai/gpud](https://github.com/leptonai/gpud) project but focuses specifically on GPU health monitoring without management overhead. It is built on years of experience operating large-scale GPU clusters and is carefully designed to be self-contained with seamless integration into existing monitoring infrastructure. +**GPUHealth** is a streamlined GPU health monitoring and reporting tool designed to ensure GPU reliability by actively monitoring GPU status and exporting health metrics for analysis. **GPUHealth** is based on the upstream [leptonai/gpud](https://github.com/leptonai/gpud) project but focuses specifically on GPU health monitoring without management overhead. ### Key Characteristics @@ -26,57 +18,69 @@ GPUHealth operates as a standalone monitoring agent that: - Collects GPU health metrics and status information - Detects hardware issues and performance anomalies - Exports data in standard formats (JSON, CSV) -- Supports both online (HTTP endpoint) and offline (file-based) modes +- Supports multiple deployment modes: + - **Local API**: HTTP endpoints for on-demand access + - **Offline Collection**: File-based batch data export + - **Centralized Reporting**: Optional push-mode to control planes (configurable) ## Get Started -### Quick Start - -To quickly check your GPU health status: - -```bash -# Download and run a quick scan -gpuhealth scan -``` - ### Installation -#### From GitHub Releases +Choose between **package installation** (recommended for production) or **building from source** (for development/customization): -Download the latest release for your platform from [GitHub Releases](https://github.com/NVIDIA/gpuhealth/releases): - -```bash -# Example for Linux x86_64 -wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth_linux_amd64.tar.gz -tar -xzf gpuhealth_linux_amd64.tar.gz -sudo mv gpuhealth /usr/local/bin/ -``` +#### Package Installation (Recommended) -#### 
Package Installation +**Includes systemd integration and auto-start capability** **Debian/Ubuntu:** ```bash -# Download .deb package +# Download and install .deb package wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth_amd64.deb sudo dpkg -i gpuhealth_amd64.deb + +# Check the gpuhealthd service status +systemctl status gpuhealthd ``` **RHEL/CentOS:** ```bash -# Download .rpm package +# Download and install .rpm package wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth-x86_64.rpm sudo rpm -i gpuhealth-x86_64.rpm + +# Check the gpuhealthd service status +systemctl status gpuhealthd ``` +**Package installation provides:** +- ✅ **Systemd integration**: Service management with `systemctl` +- ✅ **Auto-start**: Automatically starts on system boot +- ✅ **Service configuration**: Pre-configured service files and environment +- ✅ **Standard paths**: Binary, logs, and data stored in standard system locations + #### Build from Source +**For development, customization, or manual deployment** + ```bash git clone https://github.com/NVIDIA/gpuhealth.git cd gpuhealth make gpuhealth sudo mv bin/gpuhealth /usr/local/bin/ + +# Manual setup required: +# - No systemd integration (run manually or create your own service) +# - No auto-start capability +# - Manual configuration of paths and permissions ``` +**Source installation provides:** +- ✅ **Latest code**: Access to newest features and bug fixes +- ✅ **Customization**: Modify source code as needed +- ✅ **Minimal installation**: Just the binary, no additional system integration +- ❌ **Manual setup**: You handle service management, auto-start, and configuration + ### Usage #### Health Monitoring Server @@ -146,7 +150,9 @@ curl http://localhost:15133/v1/states - **Multiple Formats**: JSON and CSV output formats - **HTTP API**: RESTful endpoints for real-time data access - **Offline Mode**: File-based data collection for batch processing -- **Configurable Intervals**: Customizable 
health check frequencies +- **Centralized Reporting**: Optional push-mode data export to control planes +- **Configurable Intervals**: Customizable health check and export frequencies +- **Flexible Endpoints**: Support for custom monitoring infrastructure integration ### Production Features - **Low Overhead**: Minimal CPU and memory footprint @@ -160,14 +166,21 @@ Check out [*components*](./docs/COMPONENTS.md) for a detailed list of monitoring ### Does GPUHealth send data externally? -**No.** GPUHealth operates in a fully self-contained mode and does not send any data to external services by default. All health monitoring data is: +**By default, no.** GPUHealth operates in a fully self-contained mode and does not send any data to external services by default. However, it **can be configured** to send health data to a centralized control plane for further analysis if desired. +**Default behavior:** - Stored locally on your system - Accessed only through the local HTTP API (if enabled) - Exported to local files in offline mode -- **Never transmitted** to external services without explicit configuration +- **No external data transmission** without explicit configuration + +**Optional centralized reporting:** +- Can be configured to send health data to a centralized monitoring platform +- Configurable endpoints, intervals, and data filtering +- All data transmission is **opt-in** and under your control +- Supports secure channels for data transmission -GPUHealth is designed for environments where data privacy and security are paramount. +GPUHealth is designed for environments where data privacy and security are paramount, giving you full control over where and how your GPU health data is used. ### How do I integrate GPUHealth with my monitoring system? 
@@ -188,8 +201,16 @@ curl http://localhost:15133/v1/states gpuhealth run --offline-mode --path=/monitoring/data --duration=24h ``` +**Centralized Control Plane Integration:** +```bash +# Configure centralized reporting (optional) +gpuhealth run --health-exporter-endpoint=https://monitoring.company.com/gpu-health \ + --health-exporter-interval=5m \ + --include-metrics=true +``` + **Custom Endpoints:** -Configure your monitoring system to scrape the GPUHealth API endpoints at your desired interval. +Configure your monitoring system to scrape the GPUHealth API endpoints at your desired interval, or set up centralized reporting to push data to your monitoring infrastructure. ### How do I update GPUHealth? From 13ff7912e4e5a9b97b2d4ffc1ef6fc39f80263d7 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 26 Aug 2025 16:32:00 -0700 Subject: [PATCH 03/26] change the readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 495e28e1..f9b506b1 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## Overview -**GPUHealth** is a streamlined GPU health monitoring and reporting tool designed to ensure GPU reliability by actively monitoring GPU status and exporting health metrics for analysis. **GPUHealth** is based on the upstream [leptonai/gpud](https://github.com/leptonai/gpud) project but focuses specifically on GPU health monitoring without management overhead. +`gpuhealth` is a streamlined GPU health monitoring and reporting agent designed to ensure GPU reliability by actively monitoring GPU status and exporting health metrics for analysis. `gpuhealth` is based on the upstream [leptonai/gpud](https://github.com/leptonai/gpud) project but focuses specifically on GPU health monitoring without management overhead. 
### Key Characteristics From ac2d47a142e49bd8a048d587ed6448251f3f39a8 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 26 Aug 2025 16:40:56 -0700 Subject: [PATCH 04/26] update readme --- README.md | 58 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index f9b506b1..73052089 100644 --- a/README.md +++ b/README.md @@ -33,23 +33,37 @@ Choose between **package installation** (recommended for production) or **buildi **Includes systemd integration and auto-start capability** -**Debian/Ubuntu:** +**Ubuntu (22.04, 24.04):** ```bash -# Download and install .deb package -wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth_amd64.deb -sudo dpkg -i gpuhealth_amd64.deb +# Download the appropriate package from releases page: +# https://github.com/NVIDIA/gpuhealth/releases -# Check the gpuhealthd service status +# For x86_64/amd64 +wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth_*_amd64.deb +sudo dpkg -i gpuhealth_*_amd64.deb + +# For ARM64/aarch64 +wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth_*_arm64.deb +sudo dpkg -i gpuhealth_*_arm64.deb + +# Verify installation and service status systemctl status gpuhealthd ``` -**RHEL/CentOS:** +**RHEL, Rocky Linux, AlmaLinux (8, 9, 10) & Amazon Linux 2023:** ```bash -# Download and install .rpm package -wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth-x86_64.rpm -sudo rpm -i gpuhealth-x86_64.rpm +# Download the appropriate package from releases page: +# https://github.com/NVIDIA/gpuhealth/releases + +# For x86_64 +wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth-*-1.x86_64.rpm +sudo rpm -i gpuhealth-*-1.x86_64.rpm -# Check the gpuhealthd service status +# For ARM64/aarch64 +wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth-*-1.aarch64.rpm +sudo rpm -i gpuhealth-*-1.aarch64.rpm + +# 
Verify installation and service status systemctl status gpuhealthd ``` @@ -223,12 +237,30 @@ For package installations (deb/rpm), use your system's package manager to update ### What are the system requirements? -- **OS**: Linux (primary support), basic support for other Unix-like systems -- **Architecture**: x86_64 (amd64), ARM64 (aarch64) +#### Supported Operating Systems + +| OS Distribution | Version | x86_64 (amd64) | ARM64 (aarch64) | +|-----------------|---------|:--------------:|:---------------:| +| **Ubuntu** | 22.04 | ✅ | ✅ | +| **Ubuntu** | 24.04 | ✅ | ✅ | +| **RHEL/Rocky Linux/AlmaLinux** | 8 | ✅ | ✅ | +| **RHEL/Rocky Linux/AlmaLinux** | 9 | ✅ | ✅ | +| **RHEL/Rocky Linux/AlmaLinux** | 10 | ✅ | ✅ | +| **Amazon Linux** | 2023 | ✅ | ✅ | + +#### System Resources + - **NVIDIA Driver**: Version 535+ recommended (not required for basic system monitoring) -- **Memory**: ~10-50MB RAM usage +- **Memory**: <100MB RAM usage - **CPU**: Minimal overhead, typically <1% CPU usage - **Storage**: ~100MB for binary and logs +- **Network**: HTTP/HTTPS access for centralized reporting (optional) + +#### Additional Requirements + +- **systemd**: Version 230+ (for package installations) +- **curl**: Required for installation scripts and HTTP exports +- **Root privileges**: Required for full system monitoring capabilities ### Can I run GPUHealth without NVIDIA drivers? 
From aed5a16a20adc71210ab4995ae00285b721c3874 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Thu, 28 Aug 2025 13:55:46 -0700 Subject: [PATCH 05/26] fix ci job --- .github/workflows/goreleaser-dev.yml | 148 ++++++++--------- .github/workflows/goreleaser.yml | 152 +++++++++--------- .goreleaser_gpuhealth_rpm_aarch64.yaml | 4 +- .goreleaser_gpuhealth_rpm_x86_64.yaml | 4 +- deployments/packages/gpuhealth/doc/README.md | 5 +- .../gpuhealth/systemd/gpuhealthd.service | 3 +- 6 files changed, 159 insertions(+), 157 deletions(-) diff --git a/.github/workflows/goreleaser-dev.yml b/.github/workflows/goreleaser-dev.yml index 3b8be418..d46f9fc2 100644 --- a/.github/workflows/goreleaser-dev.yml +++ b/.github/workflows/goreleaser-dev.yml @@ -1,88 +1,88 @@ -name: goreleaser-dev +# name: goreleaser-dev -# ref. https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions -on: - push: - branches: ["main"] - pull_request: - paths: - - .github/workflows/golangci-lint.yml - - .github/workflows/goreleaser-dev.yml - - "**.go" - - go.mod - - go.sum - - .goreleaser* - branches: ["**"] +# # ref. 
https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions +# on: +# push: +# branches: ["main"] +# pull_request: +# paths: +# - .github/workflows/golangci-lint.yml +# - .github/workflows/goreleaser-dev.yml +# - "**.go" +# - go.mod +# - go.sum +# - .goreleaser* +# branches: ["**"] -permissions: - contents: write +# permissions: +# contents: write -jobs: - release: - # https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idstrategymatrix - strategy: - matrix: - job: - - os: ubuntu-22.04 - platform: linux - target: linux_amd64 - goreleaser_suffix: _ubuntu22.04 +# jobs: +# release: +# # https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idstrategymatrix +# strategy: +# matrix: +# job: +# - os: ubuntu-22.04 +# platform: linux +# target: linux_amd64 +# goreleaser_suffix: _ubuntu22.04 - - os: ubuntu-22.04 - platform: linux - target: linux_arm64 - goreleaser_suffix: _ubuntu22.04 +# - os: ubuntu-22.04 +# platform: linux +# target: linux_arm64 +# goreleaser_suffix: _ubuntu22.04 - - os: ubuntu-24.04 - platform: linux - target: linux_amd64 - goreleaser_suffix: _ubuntu24.04 +# - os: ubuntu-24.04 +# platform: linux +# target: linux_amd64 +# goreleaser_suffix: _ubuntu24.04 - - os: ubuntu-24.04 - platform: linux - target: linux_arm64 - goreleaser_suffix: _ubuntu24.04 +# - os: ubuntu-24.04 +# platform: linux +# target: linux_arm64 +# goreleaser_suffix: _ubuntu24.04 - - os: macos-latest - platform: darwin - target: darwin_amd64 +# - os: macos-latest +# platform: darwin +# target: darwin_amd64 - - os: macos-latest - platform: darwin - target: darwin_arm64 +# - os: macos-latest +# platform: darwin +# target: darwin_arm64 - name: Release ${{ matrix.job.target }} (${{ matrix.job.os }}) - runs-on: ${{ matrix.job.os }} +# name: Release ${{ matrix.job.target }} (${{ matrix.job.os }}) +# runs-on: ${{ matrix.job.os }} - steps: - - name: Checkout code - uses: actions/checkout@v3 
+# steps: +# - name: Checkout code +# uses: actions/checkout@v3 - - name: Install OS dependencies - shell: bash - run: | - case ${{ matrix.job.target }} in - linux_arm64) sudo apt-get -y update ; sudo apt-get -y install gcc-aarch64-linux-gnu ;; - esac +# - name: Install OS dependencies +# shell: bash +# run: | +# case ${{ matrix.job.target }} in +# linux_arm64) sudo apt-get -y update ; sudo apt-get -y install gcc-aarch64-linux-gnu ;; +# esac - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod +# - name: Set up Go +# uses: actions/setup-go@v5 +# with: +# go-version-file: go.mod - - name: Show version information - shell: bash - run: | - gcc --version || true - go version +# - name: Show version information +# shell: bash +# run: | +# gcc --version || true +# go version - # https://github.com/goreleaser/goreleaser-action - - name: Run GoReleaser in snapshot mode - uses: goreleaser/goreleaser-action@v6 - with: - distribution: goreleaser - version: latest - args: release --snapshot --config .goreleaser_${{ matrix.job.target }}${{ matrix.job.goreleaser_suffix }}.yaml - workdir: . - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +# # https://github.com/goreleaser/goreleaser-action +# - name: Run GoReleaser in snapshot mode +# uses: goreleaser/goreleaser-action@v6 +# with: +# distribution: goreleaser +# version: latest +# args: release --snapshot --config .goreleaser_${{ matrix.job.target }}${{ matrix.job.goreleaser_suffix }}.yaml +# workdir: . +# env: +# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/goreleaser.yml b/.github/workflows/goreleaser.yml index 687abc59..53fcc25a 100644 --- a/.github/workflows/goreleaser.yml +++ b/.github/workflows/goreleaser.yml @@ -1,90 +1,90 @@ -name: goreleaser +# name: goreleaser -# ref. https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions -on: - push: - tags: - - "*" +# # ref. 
https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions +# on: +# push: +# tags: +# - "*" -permissions: - contents: write +# permissions: +# contents: write -jobs: - release: - # https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idstrategymatrix - strategy: - matrix: - job: - - os: ubuntu-22.04 - platform: linux - target: gpuhealth_deb_amd64 - goreleaser_suffix: "" +# jobs: +# release: +# # https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idstrategymatrix +# strategy: +# matrix: +# job: +# - os: ubuntu-22.04 +# platform: linux +# target: gpuhealth_deb_amd64 +# goreleaser_suffix: "" - - os: ubuntu-22.04 - platform: linux - target: gpuhealth_deb_arm64 - goreleaser_suffix: "" +# - os: ubuntu-22.04 +# platform: linux +# target: gpuhealth_deb_arm64 +# goreleaser_suffix: "" - - os: ubuntu-22.04 - platform: linux - target: gpuhealth_rpm_x86_64 - goreleaser_suffix: "" +# - os: ubuntu-22.04 +# platform: linux +# target: gpuhealth_rpm_x86_64 +# goreleaser_suffix: "" - - os: ubuntu-22.04 - platform: linux - target: gpuhealth_rpm_aarch64 - goreleaser_suffix: "" +# - os: ubuntu-22.04 +# platform: linux +# target: gpuhealth_rpm_aarch64 +# goreleaser_suffix: "" - name: Release ${{ matrix.job.target }} (${{ matrix.job.os }}) - runs-on: ${{ matrix.job.os }} +# name: Release ${{ matrix.job.target }} (${{ matrix.job.os }}) +# runs-on: ${{ matrix.job.os }} - steps: - - name: Checkout code - uses: actions/checkout@v3 +# steps: +# - name: Checkout code +# uses: actions/checkout@v3 - - name: Install OS dependencies - shell: bash - run: | - case ${{ matrix.job.target }} in - linux_arm64) sudo apt-get -y update ; sudo apt-get -y install gcc-aarch64-linux-gnu ;; - esac +# - name: Install OS dependencies +# shell: bash +# run: | +# case ${{ matrix.job.target }} in +# linux_arm64) sudo apt-get -y update ; sudo apt-get -y install gcc-aarch64-linux-gnu ;; +# esac - - name: 
Set up Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod +# - name: Set up Go +# uses: actions/setup-go@v5 +# with: +# go-version-file: go.mod - - name: Show version information - shell: bash - run: | - gcc --version || true - go version +# - name: Show version information +# shell: bash +# run: | +# gcc --version || true +# go version - # https://github.com/goreleaser/goreleaser-action - - name: Run GoReleaser - uses: goreleaser/goreleaser-action@v6 - with: - distribution: goreleaser - version: latest - args: release --config .goreleaser_${{ matrix.job.target }}.yaml - workdir: . - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +# # https://github.com/goreleaser/goreleaser-action +# - name: Run GoReleaser +# uses: goreleaser/goreleaser-action@v6 +# with: +# distribution: goreleaser +# version: latest +# args: release --config .goreleaser_${{ matrix.job.target }}.yaml +# workdir: . +# env: +# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Get tag version - if: startsWith(github.ref, 'refs/tags/') - id: get_tag_version - run: echo TAG_VERSION=${GITHUB_REF/refs\/tags\//} >> $GITHUB_OUTPUT +# - name: Get tag version +# if: startsWith(github.ref, 'refs/tags/') +# id: get_tag_version +# run: echo TAG_VERSION=${GITHUB_REF/refs\/tags\//} >> $GITHUB_OUTPUT - - name: Release latest - uses: softprops/action-gh-release@v1 - if: ${{ github.ref == 'refs/heads/main' }} - with: - name: Latest release - tag_name: latest - draft: false - prerelease: false - body: Latest builds from the last commit - files: | - ./dist/gpuhealth_*.deb - ./dist/gpuhealth_*.rpm +# - name: Release latest +# uses: softprops/action-gh-release@v1 +# if: ${{ github.ref == 'refs/heads/main' }} +# with: +# name: Latest release +# tag_name: latest +# draft: false +# prerelease: false +# body: Latest builds from the last commit +# files: | +# ./dist/gpuhealth_*.deb +# ./dist/gpuhealth_*.rpm diff --git a/.goreleaser_gpuhealth_rpm_aarch64.yaml b/.goreleaser_gpuhealth_rpm_aarch64.yaml index 
0827b621..352bc01d 100644 --- a/.goreleaser_gpuhealth_rpm_aarch64.yaml +++ b/.goreleaser_gpuhealth_rpm_aarch64.yaml @@ -30,10 +30,10 @@ nfpms: recommends: [nvidia-driver] contents: - src: deployments/packages/gpuhealth/systemd/gpuhealthd.service - dst: /lib/systemd/system/gpuhealthd.service + dst: /usr/lib/systemd/system/gpuhealthd.service file_info: { mode: 0644 } - src: deployments/packages/gpuhealth/systemd/gpuhealth.env - dst: /etc/default/gpuhealth + dst: /etc/sysconfig/gpuhealth type: config file_info: { mode: 0644 } - src: deployments/packages/gpuhealth/doc/README.md diff --git a/.goreleaser_gpuhealth_rpm_x86_64.yaml b/.goreleaser_gpuhealth_rpm_x86_64.yaml index 66efd974..4f723257 100644 --- a/.goreleaser_gpuhealth_rpm_x86_64.yaml +++ b/.goreleaser_gpuhealth_rpm_x86_64.yaml @@ -30,10 +30,10 @@ nfpms: recommends: [nvidia-driver] contents: - src: deployments/packages/gpuhealth/systemd/gpuhealthd.service - dst: /lib/systemd/system/gpuhealthd.service + dst: /usr/lib/systemd/system/gpuhealthd.service file_info: { mode: 0644 } - src: deployments/packages/gpuhealth/systemd/gpuhealth.env - dst: /etc/default/gpuhealth + dst: /etc/sysconfig/gpuhealth type: config file_info: { mode: 0644 } - src: deployments/packages/gpuhealth/doc/README.md diff --git a/deployments/packages/gpuhealth/doc/README.md b/deployments/packages/gpuhealth/doc/README.md index 7a8e24f5..f040c7f3 100644 --- a/deployments/packages/gpuhealth/doc/README.md +++ b/deployments/packages/gpuhealth/doc/README.md @@ -7,10 +7,11 @@ Configuration ------------- The service configuration is located at: - /etc/default/gpuhealth + deb based system: /etc/default/gpuhealth + rpm based system: /etc/sysconfig/gpuhealth To configure the service: - 1. Edit: sudo vi /etc/default/gpuhealth + 1. Edit: sudo vi /etc/default/gpuhealth or sudo vi /etc/sysconfig/gpuhealth 2. Restart: sudo systemctl restart gpuhealthd The default configuration sets log level to 'info'. 
Available levels: diff --git a/deployments/packages/gpuhealth/systemd/gpuhealthd.service b/deployments/packages/gpuhealth/systemd/gpuhealthd.service index bc3458db..802b0a57 100644 --- a/deployments/packages/gpuhealth/systemd/gpuhealthd.service +++ b/deployments/packages/gpuhealth/systemd/gpuhealthd.service @@ -3,7 +3,8 @@ Description=GPU Health Daemon After=network.target local-fs.target [Service] -EnvironmentFile=/etc/default/gpuhealth +EnvironmentFile=-/etc/default/gpuhealth +EnvironmentFile=-/etc/sysconfig/gpuhealth ExecStart=/usr/bin/gpuhealth run $GPUHEALTH_FLAGS ExecReload=/bin/kill -HUP $MAINPID From 6388849b26178836a23afe18a29ab932f3cf5256 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Thu, 28 Aug 2025 14:16:31 -0700 Subject: [PATCH 06/26] workflow file --- .github/workflows/build-self-hosted.yml | 176 ++++++++++++++++++++++ .github/workflows/release-self-hosted.yml | 172 +++++++++++++++++++++ 2 files changed, 348 insertions(+) create mode 100644 .github/workflows/build-self-hosted.yml create mode 100644 .github/workflows/release-self-hosted.yml diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml new file mode 100644 index 00000000..9f8a61ad --- /dev/null +++ b/.github/workflows/build-self-hosted.yml @@ -0,0 +1,176 @@ +name: Build and Test with Self-Hosted Runners + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + # Lint on a single runner (fastest) + lint: + name: Lint and Static Analysis + runs-on: gpuhealth-ubuntu-22.04-x86 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y build-essential linux-headers-$(uname -r) + + - name: Install golangci-lint + run: go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0 + + - name: Build custom golangci-lint with nilaway + run: 
golangci-lint custom + + - name: Run linter + run: ./custom-gcl run --verbose --config=.golangci.yml ./... + + # Test on multiple platforms + test: + strategy: + fail-fast: false + matrix: + include: + - runner: gpuhealth-ubuntu-22.04-x86 + os_name: ubuntu-22.04 + arch: amd64 + + - runner: gpuhealth-ubuntu-22.04-arm + os_name: ubuntu-22.04 + arch: arm64 + + - runner: gpuhealth-rhel-8-x86 + os_name: rhel-8 + arch: amd64 + + - runner: gpuhealth-rhel-8-arm + os_name: rhel-8 + arch: aarch64 + + name: Test on ${{ matrix.os_name }}-${{ matrix.arch }} + runs-on: ${{ matrix.runner }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install build dependencies (Ubuntu) + if: contains(matrix.os_name, 'ubuntu') + run: | + sudo apt-get update + sudo apt-get install -y build-essential linux-headers-$(uname -r) + + - name: Install build dependencies (RHEL) + if: contains(matrix.os_name, 'rhel') + run: | + sudo dnf update -y + sudo dnf groupinstall -y "Development Tools" + sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + - name: Verify environment + run: | + go version + go env GOOS GOARCH + gcc --version || true + + - name: Run unit tests + run: | + make test || go test -v ./... 
+ + - name: Build binary + run: | + make gpuhealth || go build -o bin/gpuhealth ./cmd/gpuhealth + + - name: Test binary + run: | + ./bin/gpuhealth --version + ./bin/gpuhealth --help + + - name: Upload test artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results-${{ matrix.os_name }}-${{ matrix.arch }} + path: | + bin/gpuhealth + *.out + coverage.html + retention-days: 3 + + # Build packages on PR for validation + build-packages: + if: github.event_name == 'pull_request' + needs: [lint, test] + strategy: + fail-fast: false + matrix: + include: + - runner: gpuhealth-ubuntu-22.04-x86 + os_name: ubuntu-22.04 + arch: amd64 + goreleaser_config: .goreleaser_gpuhealth_deb_amd64.yaml + + - runner: gpuhealth-rhel-8-x86 + os_name: rhel-8 + arch: amd64 + goreleaser_config: .goreleaser_gpuhealth_rpm_x86_64.yaml + + name: Build Package Test on ${{ matrix.os_name }}-${{ matrix.arch }} + runs-on: ${{ matrix.runner }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install build dependencies (Ubuntu) + if: contains(matrix.os_name, 'ubuntu') + run: | + sudo apt-get update + sudo apt-get install -y build-essential linux-headers-$(uname -r) + + - name: Install build dependencies (RHEL) + if: contains(matrix.os_name, 'rhel') + run: | + sudo dnf update -y + sudo dnf groupinstall -y "Development Tools" + sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + - name: Install GoReleaser + run: | + go install github.com/goreleaser/goreleaser@latest + + - name: Test package build (dry-run) + run: | + goreleaser build --config ${{ matrix.goreleaser_config }} --snapshot --clean + + - name: Upload package test artifacts + uses: actions/upload-artifact@v4 + with: + name: package-test-${{ matrix.os_name }}-${{ matrix.arch }} + path: | + dist/ + retention-days: 1 diff --git 
a/.github/workflows/release-self-hosted.yml b/.github/workflows/release-self-hosted.yml new file mode 100644 index 00000000..96a6219f --- /dev/null +++ b/.github/workflows/release-self-hosted.yml @@ -0,0 +1,172 @@ +name: Release with Self-Hosted Runners + +on: + push: + tags: + - "v*" + workflow_dispatch: + inputs: + tag: + description: 'Tag to release (leave empty for latest commit)' + required: false + type: string + +permissions: + contents: write + +jobs: + build-packages: + strategy: + fail-fast: false + matrix: + include: + # Ubuntu 22.04 DEB packages + - runner: gpuhealth-ubuntu-22.04-x86 + os_name: ubuntu-22.04 + arch: amd64 + package_type: deb + goreleaser_config: .goreleaser_gpuhealth_deb_amd64.yaml + build_deps: "build-essential" + + - runner: gpuhealth-ubuntu-22.04-arm + os_name: ubuntu-22.04 + arch: arm64 + package_type: deb + goreleaser_config: .goreleaser_gpuhealth_deb_arm64.yaml + build_deps: "build-essential" + + # RHEL 8 RPM packages + - runner: gpuhealth-rhel-8-x86 + os_name: rhel-8 + arch: amd64 + package_type: rpm + goreleaser_config: .goreleaser_gpuhealth_rpm_x86_64.yaml + build_deps: "gcc gcc-c++ make" + + - runner: gpuhealth-rhel-8-arm + os_name: rhel-8 + arch: aarch64 + package_type: rpm + goreleaser_config: .goreleaser_gpuhealth_rpm_aarch64.yaml + build_deps: "gcc gcc-c++ make" + + name: Build ${{ matrix.package_type }} for ${{ matrix.os_name }}-${{ matrix.arch }} + runs-on: ${{ matrix.runner }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 # Full history for proper versioning + + - name: Install build dependencies (Ubuntu) + if: contains(matrix.os_name, 'ubuntu') + run: | + sudo apt-get update + sudo apt-get install -y ${{ matrix.build_deps }} linux-headers-$(uname -r) + + - name: Install build dependencies (RHEL) + if: contains(matrix.os_name, 'rhel') + run: | + sudo dnf update -y + sudo dnf groupinstall -y "Development Tools" + sudo dnf install -y ${{ matrix.build_deps }} kernel-headers 
kernel-devel + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + - name: Verify Go installation + run: | + go version + go env GOOS GOARCH + + - name: Install GoReleaser + run: | + go install github.com/goreleaser/goreleaser@latest + goreleaser --version + + - name: Validate GoReleaser config + run: | + goreleaser check --config ${{ matrix.goreleaser_config }} + + - name: Run GoReleaser + run: | + goreleaser release --config ${{ matrix.goreleaser_config }} --clean + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload package artifacts + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.package_type }}-${{ matrix.os_name }}-${{ matrix.arch }} + path: | + dist/*.deb + dist/*.rpm + retention-days: 7 + + - name: List generated artifacts + run: | + echo "Generated artifacts:" + find dist/ -name "*.deb" -o -name "*.rpm" | head -20 + + # Collect all artifacts and create unified release + create-release: + needs: build-packages + runs-on: ubuntu-latest + if: always() && (needs.build-packages.result == 'success') + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: ./artifacts + + - name: Organize artifacts + run: | + mkdir -p ./release-assets + find ./artifacts -name "*.deb" -exec cp {} ./release-assets/ \; + find ./artifacts -name "*.rpm" -exec cp {} ./release-assets/ \; + ls -la ./release-assets/ + + - name: Get tag information + id: tag_info + run: | + if [[ "${{ github.event_name }}" == "workflow_dispatch" && -n "${{ github.event.inputs.tag }}" ]]; then + TAG="${{ github.event.inputs.tag }}" + else + TAG=${GITHUB_REF#refs/tags/} + fi + echo "tag=${TAG}" >> $GITHUB_OUTPUT + echo "release_name=gpuhealth-${TAG}" >> $GITHUB_OUTPUT + + - name: Create Release + uses: softprops/action-gh-release@v1 + with: + tag_name: ${{ steps.tag_info.outputs.tag }} + name: ${{ steps.tag_info.outputs.release_name }} + 
draft: false + prerelease: false + generate_release_notes: true + files: ./release-assets/* + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Release Summary + run: | + echo "## Release Summary" >> $GITHUB_STEP_SUMMARY + echo "- **Tag:** ${{ steps.tag_info.outputs.tag }}" >> $GITHUB_STEP_SUMMARY + echo "- **Packages Built:**" >> $GITHUB_STEP_SUMMARY + echo " - Ubuntu 22.04 DEB (amd64, arm64)" >> $GITHUB_STEP_SUMMARY + echo " - RHEL 8 RPM (x86_64, aarch64)" >> $GITHUB_STEP_SUMMARY + echo "- **Artifacts:** $(ls ./release-assets/ | wc -l) packages" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Package List:" >> $GITHUB_STEP_SUMMARY + for file in ./release-assets/*; do + echo "- $(basename $file)" >> $GITHUB_STEP_SUMMARY + done From d64f143d525af0e3ce77e0d86bf5cdce5835eb42 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Thu, 28 Aug 2025 16:33:27 -0700 Subject: [PATCH 07/26] CI pipeline --- .github/workflows/build.yml | 31 --- .github/workflows/ci-build.yml | 123 +++++++++++ .github/workflows/ci-e2e-tests.yml | 121 +++++++++++ .github/workflows/ci-lint.yml | 51 +++++ ...{build-self-hosted.yml => ci-packages.yml} | 194 +++++++++--------- .github/workflows/ci-security.yml | 54 +++++ .github/workflows/ci-unit-tests.yml | 121 +++++++++++ .github/workflows/golangci-lint.yml | 41 ---- .github/workflows/goreleaser-dev.yml | 88 -------- .github/workflows/goreleaser.yml | 90 -------- .github/workflows/release-hybrid.yml | 187 +++++++++++++++++ .github/workflows/release-self-hosted.yml | 172 ---------------- .github/workflows/tests-e2e.yml | 32 --- .github/workflows/tests-unit.yml | 36 ---- 14 files changed, 758 insertions(+), 583 deletions(-) delete mode 100644 .github/workflows/build.yml create mode 100644 .github/workflows/ci-build.yml create mode 100644 .github/workflows/ci-e2e-tests.yml create mode 100644 .github/workflows/ci-lint.yml rename .github/workflows/{build-self-hosted.yml => ci-packages.yml} (52%) create mode 100644 
.github/workflows/ci-security.yml create mode 100644 .github/workflows/ci-unit-tests.yml delete mode 100644 .github/workflows/golangci-lint.yml delete mode 100644 .github/workflows/goreleaser-dev.yml delete mode 100644 .github/workflows/goreleaser.yml create mode 100644 .github/workflows/release-hybrid.yml delete mode 100644 .github/workflows/release-self-hosted.yml delete mode 100644 .github/workflows/tests-e2e.yml delete mode 100644 .github/workflows/tests-unit.yml diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index b276995b..00000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: Build and Store gpuhealth binary - -on: - push: - branches: - - main - pull_request: - branches: ["**"] - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Set up environment - run: | - sudo apt-get update - sudo apt-get install -y linux-headers-$(uname -r) - - - name: Build project - run: | - make gpuhealth - - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: gpuhealth - path: bin/gpuhealth \ No newline at end of file diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml new file mode 100644 index 00000000..a4b77e4b --- /dev/null +++ b/.github/workflows/ci-build.yml @@ -0,0 +1,123 @@ +name: CI - Build Binary + +on: + push: + branches: ["main", "dev/**", "feature/**"] + pull_request: + paths: + - ".github/workflows/ci-build.yml" + - "**.go" + - "go.mod" + - "go.sum" + branches: ["**"] + +permissions: + contents: read + +jobs: + build-dev: + if: github.ref != 'refs/heads/main' + name: Build (Dev) + runs-on: ubuntu-22.04 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + cache: true + go-version-file: go.mod + + - name: Build binary + run: | + echo "🔨 Building gpuhealth binary for dev validation..." 
+ make gpuhealth || go build -o bin/gpuhealth ./cmd/gpuhealth + + - name: Test binary + run: | + ./bin/gpuhealth --version + ./bin/gpuhealth --help + + - name: Upload binary + uses: actions/upload-artifact@v4 + with: + name: gpuhealth-binary-dev + path: bin/gpuhealth + if-no-files-found: warn + retention-days: 1 + + build-full: + # Full platform builds for PR/main/release - Self-hosted runners + if: github.ref == 'refs/heads/main' || github.event_name == 'pull_request' || startsWith(github.ref, 'refs/tags/') + strategy: + fail-fast: false + matrix: + include: + - runner: gpuhealth-ubuntu-22.04-x86 + os_name: ubuntu-22.04 + arch: amd64 + + - runner: gpuhealth-ubuntu-22.04-arm + os_name: ubuntu-22.04 + arch: arm64 + + - runner: gpuhealth-rhel-8-x86 + os_name: rhel-8 + arch: amd64 + + - runner: gpuhealth-rhel-8-arm + os_name: rhel-8 + arch: aarch64 + + name: Build (${{ matrix.os_name }}-${{ matrix.arch }}) + runs-on: ${{ matrix.runner }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install build dependencies (Ubuntu) + if: contains(matrix.os_name, 'ubuntu') + run: | + sudo apt-get update + sudo apt-get install -y build-essential linux-headers-$(uname -r) + + - name: Install build dependencies (RHEL) + if: contains(matrix.os_name, 'rhel') + run: | + sudo dnf update -y + sudo dnf groupinstall -y "Development Tools" + sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel + + - name: Set up Go + uses: actions/setup-go@v5 + with: + cache: true + go-version-file: go.mod + + - name: Build binary (native) + run: | + echo "🔨 Building gpuhealth binary natively on ${{ matrix.os_name }}-${{ matrix.arch }}..." + make gpuhealth || go build -o bin/gpuhealth ./cmd/gpuhealth + + - name: Test binary + run: | + echo "Testing binary functionality..." 
+ ./bin/gpuhealth --version + ./bin/gpuhealth --help + file ./bin/gpuhealth + ldd ./bin/gpuhealth || echo "Static binary or different libc" + + - name: Upload binary + uses: actions/upload-artifact@v4 + with: + name: gpuhealth-binary-${{ matrix.os_name }}-${{ matrix.arch }} + path: bin/gpuhealth + if-no-files-found: warn + retention-days: 7 diff --git a/.github/workflows/ci-e2e-tests.yml b/.github/workflows/ci-e2e-tests.yml new file mode 100644 index 00000000..15750fc7 --- /dev/null +++ b/.github/workflows/ci-e2e-tests.yml @@ -0,0 +1,121 @@ +name: CI - E2E Tests + +on: + push: + branches: ["main"] + pull_request: + paths: + - ".github/workflows/ci-e2e-tests.yml" + - "**.go" + - "go.mod" + - "go.sum" + branches: ["**"] + +permissions: + contents: read + pull-requests: read + +jobs: + e2e-smoke-pr: + if: github.event_name == 'pull_request' + name: E2E Smoke Tests (PR) + runs-on: ubuntu-22.04 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Go + uses: actions/setup-go@v5 + with: + cache: true + go-version-file: go.mod + + - name: Run E2E smoke tests + run: | + echo "Running E2E smoke tests for PR validation..." 
+ # Create a lightweight E2E test for PRs + KMSG_FILE_PATH=/dev/null timeout 5m ./scripts/tests-e2e.sh || { + echo "E2E smoke tests completed or timed out (expected for smoke tests)" + } + + - name: Upload smoke test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: e2e-smoke-results + path: | + e2e/**/*.log + **/*test.xml + if-no-files-found: warn + retention-days: 1 + + e2e-full: + # Full E2E tests for main/release - Self-hosted runners + if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/') + strategy: + fail-fast: false + matrix: + include: + - runner: gpuhealth-ubuntu-22.04-x86 + os_name: ubuntu-22.04 + arch: amd64 + + - runner: gpuhealth-ubuntu-22.04-arm + os_name: ubuntu-22.04 + arch: arm64 + + - runner: gpuhealth-rhel-8-x86 + os_name: rhel-8 + arch: amd64 + + - runner: gpuhealth-rhel-8-arm + os_name: rhel-8 + arch: aarch64 + + name: E2E Tests (${{ matrix.os_name }}-${{ matrix.arch }}) + runs-on: ${{ matrix.runner }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Install build dependencies (Ubuntu) + if: contains(matrix.os_name, 'ubuntu') + run: | + sudo apt-get update + sudo apt-get install -y build-essential linux-headers-$(uname -r) + + - name: Install build dependencies (RHEL) + if: contains(matrix.os_name, 'rhel') + run: | + sudo dnf update -y + sudo dnf groupinstall -y "Development Tools" + sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel + + - name: Set up Go + uses: actions/setup-go@v5 + with: + cache: true + go-version-file: go.mod + + - name: Run E2E tests + run: | + echo "Running full E2E tests on ${{ matrix.os_name }}-${{ matrix.arch }}..." 
+ KMSG_FILE_PATH=/dev/null ./scripts/tests-e2e.sh + + - name: Upload E2E test results + if: always() + uses: actions/upload-artifact@v4 + with: + name: e2e-results-${{ matrix.os_name }}-${{ matrix.arch }} + path: | + e2e/**/*.log + **/*test.xml + bin/ + if-no-files-found: warn + retention-days: 7 diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml new file mode 100644 index 00000000..25a1f662 --- /dev/null +++ b/.github/workflows/ci-lint.yml @@ -0,0 +1,51 @@ +name: CI - Lint & Code Quality + +on: + push: + branches: ["**"] + pull_request: + paths: + - ".github/workflows/ci-lint.yml" + - "**.go" + - "go.mod" + - "go.sum" + - ".golangci.yml" + branches: ["**"] + +permissions: + contents: read + pull-requests: read + +jobs: + lint: + name: Lint & Code Quality + runs-on: ubuntu-22.04 # GitHub-hosted runner + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Go + uses: actions/setup-go@v5 + with: + cache: true + go-version-file: go.mod + + - name: Install golangci-lint + run: go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0 + + - name: Build custom golangci-lint with nilaway + run: golangci-lint custom + + - name: Run golangci-lint + run: ./custom-gcl run --verbose --config=.golangci.yml ./... + + - name: Check go mod tidy + run: | + go mod tidy + if ! 
git diff --exit-code go.mod go.sum; then + echo "go.mod or go.sum is not tidy" + exit 1 + fi diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/ci-packages.yml similarity index 52% rename from .github/workflows/build-self-hosted.yml rename to .github/workflows/ci-packages.yml index 9f8a61ad..852513c4 100644 --- a/.github/workflows/build-self-hosted.yml +++ b/.github/workflows/ci-packages.yml @@ -1,42 +1,25 @@ -name: Build and Test with Self-Hosted Runners +name: CI - Package Building on: push: - branches: [main] + branches: ["main"] pull_request: - branches: [main] + paths: + - ".github/workflows/ci-packages.yml" + - "**.go" + - "go.mod" + - "go.sum" + - ".goreleaser_*.yaml" + - "deployments/packages/**" + branches: ["**"] + +permissions: + contents: read jobs: - # Lint on a single runner (fastest) - lint: - name: Lint and Static Analysis - runs-on: gpuhealth-ubuntu-22.04-x86 - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - cache: true - - - name: Install dependencies - run: | - sudo apt-get update - sudo apt-get install -y build-essential linux-headers-$(uname -r) - - - name: Install golangci-lint - run: go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0 - - - name: Build custom golangci-lint with nilaway - run: golangci-lint custom - - - name: Run linter - run: ./custom-gcl run --verbose --config=.golangci.yml ./... 
- - # Test on multiple platforms - test: + package-pr: + # Package validation for PRs - subset of platforms + if: github.event_name == 'pull_request' strategy: fail-fast: false matrix: @@ -44,94 +27,99 @@ jobs: - runner: gpuhealth-ubuntu-22.04-x86 os_name: ubuntu-22.04 arch: amd64 - - - runner: gpuhealth-ubuntu-22.04-arm - os_name: ubuntu-22.04 - arch: arm64 + package_type: deb + goreleaser_config: .goreleaser_gpuhealth_deb_amd64.yaml - runner: gpuhealth-rhel-8-x86 os_name: rhel-8 arch: amd64 - - - runner: gpuhealth-rhel-8-arm - os_name: rhel-8 - arch: aarch64 + package_type: rpm + goreleaser_config: .goreleaser_gpuhealth_rpm_x86_64.yaml - name: Test on ${{ matrix.os_name }}-${{ matrix.arch }} + name: Package Test (${{ matrix.package_type }}-${{ matrix.arch }}) runs-on: ${{ matrix.runner }} steps: - name: Checkout code uses: actions/checkout@v4 - + with: + fetch-depth: 0 + - name: Install build dependencies (Ubuntu) if: contains(matrix.os_name, 'ubuntu') run: | sudo apt-get update sudo apt-get install -y build-essential linux-headers-$(uname -r) - + - name: Install build dependencies (RHEL) if: contains(matrix.os_name, 'rhel') run: | sudo dnf update -y sudo dnf groupinstall -y "Development Tools" - sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel - + sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel rpm-build + - name: Set up Go uses: actions/setup-go@v5 with: - go-version-file: go.mod cache: true - - - name: Verify environment - run: | - go version - go env GOOS GOARCH - gcc --version || true - - - name: Run unit tests + go-version-file: go.mod + + - name: Install GoReleaser run: | - make test || go test -v ./... 
- - - name: Build binary + go install github.com/goreleaser/goreleaser@latest + goreleaser --version + + - name: Validate GoReleaser config run: | - make gpuhealth || go build -o bin/gpuhealth ./cmd/gpuhealth - - - name: Test binary + goreleaser check --config ${{ matrix.goreleaser_config }} + + - name: Test package build (dry-run) run: | - ./bin/gpuhealth --version - ./bin/gpuhealth --help - - - name: Upload test artifacts - if: always() + echo "📦 Testing ${{ matrix.package_type }} package build..." + goreleaser build --config ${{ matrix.goreleaser_config }} --snapshot --clean + + - name: Upload package test artifacts uses: actions/upload-artifact@v4 with: - name: test-results-${{ matrix.os_name }}-${{ matrix.arch }} - path: | - bin/gpuhealth - *.out - coverage.html - retention-days: 3 - - # Build packages on PR for validation - build-packages: - if: github.event_name == 'pull_request' - needs: [lint, test] + name: package-test-${{ matrix.package_type }}-${{ matrix.arch }} + path: dist/ + if-no-files-found: warn + retention-days: 1 + + package-full: + # Full package builds for main branch - all platforms + if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/') strategy: fail-fast: false matrix: include: + # Ubuntu DEB packages - runner: gpuhealth-ubuntu-22.04-x86 os_name: ubuntu-22.04 arch: amd64 + package_type: deb goreleaser_config: .goreleaser_gpuhealth_deb_amd64.yaml + - runner: gpuhealth-ubuntu-22.04-arm + os_name: ubuntu-22.04 + arch: arm64 + package_type: deb + goreleaser_config: .goreleaser_gpuhealth_deb_arm64.yaml + + # RHEL RPM packages - runner: gpuhealth-rhel-8-x86 os_name: rhel-8 arch: amd64 + package_type: rpm goreleaser_config: .goreleaser_gpuhealth_rpm_x86_64.yaml + + - runner: gpuhealth-rhel-8-arm + os_name: rhel-8 + arch: aarch64 + package_type: rpm + goreleaser_config: .goreleaser_gpuhealth_rpm_aarch64.yaml - name: Build Package Test on ${{ matrix.os_name }}-${{ matrix.arch }} + name: Build Package (${{ matrix.package_type 
}}-${{ matrix.os_name }}-${{ matrix.arch }}) runs-on: ${{ matrix.runner }} steps: @@ -139,38 +127,58 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 - + - name: Install build dependencies (Ubuntu) if: contains(matrix.os_name, 'ubuntu') run: | sudo apt-get update sudo apt-get install -y build-essential linux-headers-$(uname -r) - + - name: Install build dependencies (RHEL) if: contains(matrix.os_name, 'rhel') run: | sudo dnf update -y sudo dnf groupinstall -y "Development Tools" - sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel - + sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel rpm-build + - name: Set up Go uses: actions/setup-go@v5 with: - go-version-file: go.mod cache: true - + go-version-file: go.mod + - name: Install GoReleaser run: | go install github.com/goreleaser/goreleaser@latest - - - name: Test package build (dry-run) + goreleaser --version + + - name: Validate GoReleaser config run: | - goreleaser build --config ${{ matrix.goreleaser_config }} --snapshot --clean - - - name: Upload package test artifacts + goreleaser check --config ${{ matrix.goreleaser_config }} + + - name: Build package + run: | + echo "📦 Building ${{ matrix.package_type }} package on ${{ matrix.os_name }}-${{ matrix.arch }}..." 
+ if [ "${{ github.ref }}" = "refs/heads/main" ]; then + goreleaser build --config ${{ matrix.goreleaser_config }} --snapshot --clean + else + # Release build for tags + goreleaser release --config ${{ matrix.goreleaser_config }} --clean + fi + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: List generated packages + run: | + echo "Generated packages:" + find dist/ -name "*.deb" -o -name "*.rpm" | head -10 + + - name: Upload package artifacts uses: actions/upload-artifact@v4 with: - name: package-test-${{ matrix.os_name }}-${{ matrix.arch }} - path: | - dist/ - retention-days: 1 + name: package-${{ matrix.package_type }}-${{ matrix.os_name }}-${{ matrix.arch }} + path: | # action inputs must be scalars; upload-artifact takes newline-separated globs + dist/*.deb + dist/*.rpm + if-no-files-found: warn + retention-days: 7 diff --git a/.github/workflows/ci-security.yml b/.github/workflows/ci-security.yml new file mode 100644 index 00000000..60557e46 --- /dev/null +++ b/.github/workflows/ci-security.yml @@ -0,0 +1,54 @@ +name: CI - Security & Compliance + +# Security scans always run on GitHub-hosted (platform-agnostic) +on: + push: + branches: ["main"] + pull_request: + branches: ["**"] + schedule: + - cron: '0 6 * * 1' # Weekly on Monday at 6 AM + +permissions: + contents: read + security-events: write + +jobs: + security-scan: + name: Security Scan + runs-on: ubuntu-22.04 # GitHub-hosted (security tools are platform-agnostic) + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + cache: true + go-version-file: go.mod + + - name: Run Gosec security scanner + uses: securego/gosec@master + with: + args: '-fmt sarif -out gosec.sarif ./...' + + - name: Upload SARIF file + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: gosec.sarif + + - name: Vulnerability scan with Nancy + run: | + go install github.com/sonatype-nexus-community/nancy@latest + go list -json -deps ./... 
| nancy sleuth + + - name: Check for known vulnerabilities + run: | + go install golang.org/x/vuln/cmd/govulncheck@latest + govulncheck ./... + + - name: License compliance check + run: | + go install github.com/google/go-licenses@latest + go-licenses check ./... diff --git a/.github/workflows/ci-unit-tests.yml b/.github/workflows/ci-unit-tests.yml new file mode 100644 index 00000000..16017bf5 --- /dev/null +++ b/.github/workflows/ci-unit-tests.yml @@ -0,0 +1,121 @@ +name: CI - Unit Tests + +on: + push: + branches: ["**"] + pull_request: + paths: + - ".github/workflows/ci-unit-tests.yml" + - "**.go" + - "go.mod" + - "go.sum" + branches: ["**"] + +permissions: + contents: read + pull-requests: read + +jobs: + unit-test-dev: + if: github.ref != 'refs/heads/main' + name: Unit Tests (Dev) + runs-on: ubuntu-22.04 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Go + uses: actions/setup-go@v5 + with: + cache: true + go-version-file: go.mod + + - name: Run unit tests + run: | + echo "Running unit tests for dev branch..." 
+ KMSG_FILE_PATH=/dev/null ./scripts/tests-unit.sh + + - name: Upload coverage + uses: actions/upload-artifact@v4 + with: + name: coverage-dev + path: coverage.txt + if-no-files-found: warn + retention-days: 1 + + unit-test-full: + # Full platform testing for PR/main/release - Self-hosted runners + if: github.ref == 'refs/heads/main' || github.event_name == 'pull_request' || startsWith(github.ref, 'refs/tags/') + strategy: + fail-fast: false + matrix: + include: + - runner: gpuhealth-ubuntu-22.04-x86 + os_name: ubuntu-22.04 + arch: amd64 + + - runner: gpuhealth-ubuntu-22.04-arm + os_name: ubuntu-22.04 + arch: arm64 + + - runner: gpuhealth-rhel-8-x86 + os_name: rhel-8 + arch: amd64 + + - runner: gpuhealth-rhel-8-arm + os_name: rhel-8 + arch: aarch64 + + name: Unit Tests (${{ matrix.os_name }}-${{ matrix.arch }}) + runs-on: ${{ matrix.runner }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Install build dependencies (Ubuntu) + if: contains(matrix.os_name, 'ubuntu') + run: | + sudo apt-get update + sudo apt-get install -y build-essential linux-headers-$(uname -r) + + - name: Install build dependencies (RHEL) + if: contains(matrix.os_name, 'rhel') + run: | + sudo dnf update -y + sudo dnf groupinstall -y "Development Tools" + sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel + + - name: Set up Go + uses: actions/setup-go@v5 + with: + cache: true + go-version-file: go.mod + + - name: Run unit tests + run: | + echo "Running unit tests on ${{ matrix.os_name }}-${{ matrix.arch }}..." 
+ KMSG_FILE_PATH=/dev/null ./scripts/tests-unit.sh + + - name: Upload coverage reports + if: matrix.os_name == 'ubuntu-22.04' && matrix.arch == 'amd64' + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: coverage.txt + + - name: Upload test artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: unit-test-results-${{ matrix.os_name }}-${{ matrix.arch }} + path: | # action inputs must be scalars; upload-artifact takes newline-separated globs + coverage.txt + **/*test.xml + if-no-files-found: warn + retention-days: 3 diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml deleted file mode 100644 index 0ec356b5..00000000 --- a/.github/workflows/golangci-lint.yml +++ /dev/null @@ -1,41 +0,0 @@ - -# https://github.com/golangci/golangci-lint-action?tab=readme-ov-file#options -name: golangci-lint.run - -on: - push: - branches: ["main"] - pull_request: - paths: - - .github/workflows/golangci-lint.yml - - "**.go" - - go.mod - - go.sum - branches: ["**"] - -permissions: - contents: read - pull-requests: read - -jobs: - golangci-lint: - name: golangci-lint - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - uses: actions/setup-go@v5 - with: - cache: false - go-version-file: go.mod - - - name: Install golangci-lint - run: go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0 - - - name: Build custom golangci-lint - run: golangci-lint custom - - - name: Run golangci-lint - run: ./custom-gcl run --verbose --config=.golangci.yml ./... diff --git a/.github/workflows/goreleaser-dev.yml b/.github/workflows/goreleaser-dev.yml deleted file mode 100644 index d46f9fc2..00000000 --- a/.github/workflows/goreleaser-dev.yml +++ /dev/null @@ -1,88 +0,0 @@ -# name: goreleaser-dev - -# # ref. 
https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions -# on: -# push: -# branches: ["main"] -# pull_request: -# paths: -# - .github/workflows/golangci-lint.yml -# - .github/workflows/goreleaser-dev.yml -# - "**.go" -# - go.mod -# - go.sum -# - .goreleaser* -# branches: ["**"] - -# permissions: -# contents: write - -# jobs: -# release: -# # https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idstrategymatrix -# strategy: -# matrix: -# job: -# - os: ubuntu-22.04 -# platform: linux -# target: linux_amd64 -# goreleaser_suffix: _ubuntu22.04 - -# - os: ubuntu-22.04 -# platform: linux -# target: linux_arm64 -# goreleaser_suffix: _ubuntu22.04 - -# - os: ubuntu-24.04 -# platform: linux -# target: linux_amd64 -# goreleaser_suffix: _ubuntu24.04 - -# - os: ubuntu-24.04 -# platform: linux -# target: linux_arm64 -# goreleaser_suffix: _ubuntu24.04 - -# - os: macos-latest -# platform: darwin -# target: darwin_amd64 - -# - os: macos-latest -# platform: darwin -# target: darwin_arm64 - -# name: Release ${{ matrix.job.target }} (${{ matrix.job.os }}) -# runs-on: ${{ matrix.job.os }} - -# steps: -# - name: Checkout code -# uses: actions/checkout@v3 - -# - name: Install OS dependencies -# shell: bash -# run: | -# case ${{ matrix.job.target }} in -# linux_arm64) sudo apt-get -y update ; sudo apt-get -y install gcc-aarch64-linux-gnu ;; -# esac - -# - name: Set up Go -# uses: actions/setup-go@v5 -# with: -# go-version-file: go.mod - -# - name: Show version information -# shell: bash -# run: | -# gcc --version || true -# go version - -# # https://github.com/goreleaser/goreleaser-action -# - name: Run GoReleaser in snapshot mode -# uses: goreleaser/goreleaser-action@v6 -# with: -# distribution: goreleaser -# version: latest -# args: release --snapshot --config .goreleaser_${{ matrix.job.target }}${{ matrix.job.goreleaser_suffix }}.yaml -# workdir: . 
-# env: -# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/goreleaser.yml b/.github/workflows/goreleaser.yml deleted file mode 100644 index 53fcc25a..00000000 --- a/.github/workflows/goreleaser.yml +++ /dev/null @@ -1,90 +0,0 @@ -# name: goreleaser - -# # ref. https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions -# on: -# push: -# tags: -# - "*" - -# permissions: -# contents: write - -# jobs: -# release: -# # https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idstrategymatrix -# strategy: -# matrix: -# job: -# - os: ubuntu-22.04 -# platform: linux -# target: gpuhealth_deb_amd64 -# goreleaser_suffix: "" - -# - os: ubuntu-22.04 -# platform: linux -# target: gpuhealth_deb_arm64 -# goreleaser_suffix: "" - -# - os: ubuntu-22.04 -# platform: linux -# target: gpuhealth_rpm_x86_64 -# goreleaser_suffix: "" - -# - os: ubuntu-22.04 -# platform: linux -# target: gpuhealth_rpm_aarch64 -# goreleaser_suffix: "" - -# name: Release ${{ matrix.job.target }} (${{ matrix.job.os }}) -# runs-on: ${{ matrix.job.os }} - -# steps: -# - name: Checkout code -# uses: actions/checkout@v3 - -# - name: Install OS dependencies -# shell: bash -# run: | -# case ${{ matrix.job.target }} in -# linux_arm64) sudo apt-get -y update ; sudo apt-get -y install gcc-aarch64-linux-gnu ;; -# esac - -# - name: Set up Go -# uses: actions/setup-go@v5 -# with: -# go-version-file: go.mod - -# - name: Show version information -# shell: bash -# run: | -# gcc --version || true -# go version - -# # https://github.com/goreleaser/goreleaser-action -# - name: Run GoReleaser -# uses: goreleaser/goreleaser-action@v6 -# with: -# distribution: goreleaser -# version: latest -# args: release --config .goreleaser_${{ matrix.job.target }}.yaml -# workdir: . 
-# env: -# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - -# - name: Get tag version -# if: startsWith(github.ref, 'refs/tags/') -# id: get_tag_version -# run: echo TAG_VERSION=${GITHUB_REF/refs\/tags\//} >> $GITHUB_OUTPUT - -# - name: Release latest -# uses: softprops/action-gh-release@v1 -# if: ${{ github.ref == 'refs/heads/main' }} -# with: -# name: Latest release -# tag_name: latest -# draft: false -# prerelease: false -# body: Latest builds from the last commit -# files: | -# ./dist/gpuhealth_*.deb -# ./dist/gpuhealth_*.rpm diff --git a/.github/workflows/release-hybrid.yml b/.github/workflows/release-hybrid.yml new file mode 100644 index 00000000..3cddafa6 --- /dev/null +++ b/.github/workflows/release-hybrid.yml @@ -0,0 +1,187 @@ +name: Release - Production + +on: + push: + tags: + - "v*" + workflow_dispatch: + inputs: + tag: + description: 'Tag to release' + required: true + type: string + +permissions: + contents: write + +jobs: + # Pre-release validation on GitHub-hosted (fast) + pre-release-validation: + name: Pre-Release Validation + runs-on: ubuntu-22.04 # GitHub-hosted + outputs: + tag: ${{ steps.tag.outputs.tag }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get tag + id: tag + run: | + if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + TAG="${{ github.event.inputs.tag }}" + else + TAG=${GITHUB_REF#refs/tags/} + fi + echo "tag=${TAG}" >> $GITHUB_OUTPUT + echo "Release tag: ${TAG}" + + - name: Validate tag format + run: | + TAG="${{ steps.tag.outputs.tag }}" + if [[ ! $TAG =~ ^v[0-9]+\.[0-9]+\.[0-9]+(-[a-zA-Z0-9]+)?$ ]]; then + echo "Invalid tag format: $TAG" + echo "Expected format: v1.2.3 or v1.2.3-alpha" + exit 1 + fi + + - name: Check changelog + run: | + if [ -f CHANGELOG.md ]; then + TAG="${{ steps.tag.outputs.tag }}" + if ! 
grep -q "$TAG" CHANGELOG.md; then + echo "Warning: Tag $TAG not found in CHANGELOG.md" + fi + fi + + # Build packages on self-hosted runners + build-release-packages: + needs: pre-release-validation + strategy: + fail-fast: true # Fail fast for releases + matrix: + include: + # Ubuntu DEB packages + - runner: gpuhealth-ubuntu-22.04-x86 + os_name: ubuntu-22.04 + arch: amd64 + package_type: deb + goreleaser_config: .goreleaser_gpuhealth_deb_amd64.yaml + + - runner: gpuhealth-ubuntu-22.04-arm + os_name: ubuntu-22.04 + arch: arm64 + package_type: deb + goreleaser_config: .goreleaser_gpuhealth_deb_arm64.yaml + + # RHEL RPM packages + - runner: gpuhealth-rhel-8-x86 + os_name: rhel-8 + arch: amd64 + package_type: rpm + goreleaser_config: .goreleaser_gpuhealth_rpm_x86_64.yaml + + - runner: gpuhealth-rhel-8-arm + os_name: rhel-8 + arch: aarch64 + package_type: rpm + goreleaser_config: .goreleaser_gpuhealth_rpm_aarch64.yaml + + name: Release Package (${{ matrix.package_type }}-${{ matrix.arch }}) + runs-on: ${{ matrix.runner }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install build dependencies (Ubuntu) + if: contains(matrix.os_name, 'ubuntu') + run: | + sudo apt-get update + sudo apt-get install -y build-essential linux-headers-$(uname -r) + + - name: Install build dependencies (RHEL) + if: contains(matrix.os_name, 'rhel') + run: | + sudo dnf update -y + sudo dnf groupinstall -y "Development Tools" + sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel rpm-build + + - name: Set up Go + uses: actions/setup-go@v5 + with: + cache: true + go-version-file: go.mod + + - name: Install GoReleaser + run: | + go install github.com/goreleaser/goreleaser@latest + goreleaser --version + + - name: Build release package + run: | + echo "🚀 Building release ${{ matrix.package_type }} package..." 
+ goreleaser release --config ${{ matrix.goreleaser_config }} --clean + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload release artifacts + uses: actions/upload-artifact@v4 + with: + name: release-${{ matrix.package_type }}-${{ matrix.arch }} + path: + - "dist/*.deb" + - "dist/*.rpm" + if-no-files-found: warn + retention-days: 90 + + # Create GitHub release on GitHub-hosted (API operations) + create-github-release: + needs: [pre-release-validation, build-release-packages] + name: Create GitHub Release + runs-on: ubuntu-22.04 # GitHub-hosted for API operations + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Download all release artifacts + uses: actions/download-artifact@v4 + with: + path: ./release-assets + pattern: release-* + + - name: Organize release assets + run: | + mkdir -p ./final-release + find ./release-assets -name "*.deb" -exec cp {} ./final-release/ \; + find ./release-assets -name "*.rpm" -exec cp {} ./final-release/ \; + ls -la ./final-release/ + + - name: Create Release + uses: softprops/action-gh-release@v1 + with: + tag_name: ${{ needs.pre-release-validation.outputs.tag }} + name: "gpuhealth ${{ needs.pre-release-validation.outputs.tag }}" + draft: false + prerelease: ${{ contains(needs.pre-release-validation.outputs.tag, '-') }} + generate_release_notes: true + files: ./final-release/* + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Release Summary + run: | + echo "## Release Summary" >> $GITHUB_STEP_SUMMARY + echo "- **Tag:** ${{ needs.pre-release-validation.outputs.tag }}" >> $GITHUB_STEP_SUMMARY + echo "- **Packages:** $(ls ./final-release/ | wc -l) files" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Release Assets:" >> $GITHUB_STEP_SUMMARY + for file in ./final-release/*; do + echo "- $(basename $file)" >> $GITHUB_STEP_SUMMARY + done diff --git a/.github/workflows/release-self-hosted.yml b/.github/workflows/release-self-hosted.yml deleted file mode 100644 
index 96a6219f..00000000 --- a/.github/workflows/release-self-hosted.yml +++ /dev/null @@ -1,172 +0,0 @@ -name: Release with Self-Hosted Runners - -on: - push: - tags: - - "v*" - workflow_dispatch: - inputs: - tag: - description: 'Tag to release (leave empty for latest commit)' - required: false - type: string - -permissions: - contents: write - -jobs: - build-packages: - strategy: - fail-fast: false - matrix: - include: - # Ubuntu 22.04 DEB packages - - runner: gpuhealth-ubuntu-22.04-x86 - os_name: ubuntu-22.04 - arch: amd64 - package_type: deb - goreleaser_config: .goreleaser_gpuhealth_deb_amd64.yaml - build_deps: "build-essential" - - - runner: gpuhealth-ubuntu-22.04-arm - os_name: ubuntu-22.04 - arch: arm64 - package_type: deb - goreleaser_config: .goreleaser_gpuhealth_deb_arm64.yaml - build_deps: "build-essential" - - # RHEL 8 RPM packages - - runner: gpuhealth-rhel-8-x86 - os_name: rhel-8 - arch: amd64 - package_type: rpm - goreleaser_config: .goreleaser_gpuhealth_rpm_x86_64.yaml - build_deps: "gcc gcc-c++ make" - - - runner: gpuhealth-rhel-8-arm - os_name: rhel-8 - arch: aarch64 - package_type: rpm - goreleaser_config: .goreleaser_gpuhealth_rpm_aarch64.yaml - build_deps: "gcc gcc-c++ make" - - name: Build ${{ matrix.package_type }} for ${{ matrix.os_name }}-${{ matrix.arch }} - runs-on: ${{ matrix.runner }} - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 # Full history for proper versioning - - - name: Install build dependencies (Ubuntu) - if: contains(matrix.os_name, 'ubuntu') - run: | - sudo apt-get update - sudo apt-get install -y ${{ matrix.build_deps }} linux-headers-$(uname -r) - - - name: Install build dependencies (RHEL) - if: contains(matrix.os_name, 'rhel') - run: | - sudo dnf update -y - sudo dnf groupinstall -y "Development Tools" - sudo dnf install -y ${{ matrix.build_deps }} kernel-headers kernel-devel - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - cache: true - - - 
name: Verify Go installation - run: | - go version - go env GOOS GOARCH - - - name: Install GoReleaser - run: | - go install github.com/goreleaser/goreleaser@latest - goreleaser --version - - - name: Validate GoReleaser config - run: | - goreleaser check --config ${{ matrix.goreleaser_config }} - - - name: Run GoReleaser - run: | - goreleaser release --config ${{ matrix.goreleaser_config }} --clean - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Upload package artifacts - uses: actions/upload-artifact@v4 - with: - name: ${{ matrix.package_type }}-${{ matrix.os_name }}-${{ matrix.arch }} - path: | - dist/*.deb - dist/*.rpm - retention-days: 7 - - - name: List generated artifacts - run: | - echo "Generated artifacts:" - find dist/ -name "*.deb" -o -name "*.rpm" | head -20 - - # Collect all artifacts and create unified release - create-release: - needs: build-packages - runs-on: ubuntu-latest - if: always() && (needs.build-packages.result == 'success') - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Download all artifacts - uses: actions/download-artifact@v4 - with: - path: ./artifacts - - - name: Organize artifacts - run: | - mkdir -p ./release-assets - find ./artifacts -name "*.deb" -exec cp {} ./release-assets/ \; - find ./artifacts -name "*.rpm" -exec cp {} ./release-assets/ \; - ls -la ./release-assets/ - - - name: Get tag information - id: tag_info - run: | - if [[ "${{ github.event_name }}" == "workflow_dispatch" && -n "${{ github.event.inputs.tag }}" ]]; then - TAG="${{ github.event.inputs.tag }}" - else - TAG=${GITHUB_REF#refs/tags/} - fi - echo "tag=${TAG}" >> $GITHUB_OUTPUT - echo "release_name=gpuhealth-${TAG}" >> $GITHUB_OUTPUT - - - name: Create Release - uses: softprops/action-gh-release@v1 - with: - tag_name: ${{ steps.tag_info.outputs.tag }} - name: ${{ steps.tag_info.outputs.release_name }} - draft: false - prerelease: false - generate_release_notes: true - files: ./release-assets/* - env: - GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} - - - name: Release Summary - run: | - echo "## Release Summary" >> $GITHUB_STEP_SUMMARY - echo "- **Tag:** ${{ steps.tag_info.outputs.tag }}" >> $GITHUB_STEP_SUMMARY - echo "- **Packages Built:**" >> $GITHUB_STEP_SUMMARY - echo " - Ubuntu 22.04 DEB (amd64, arm64)" >> $GITHUB_STEP_SUMMARY - echo " - RHEL 8 RPM (x86_64, aarch64)" >> $GITHUB_STEP_SUMMARY - echo "- **Artifacts:** $(ls ./release-assets/ | wc -l) packages" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Package List:" >> $GITHUB_STEP_SUMMARY - for file in ./release-assets/*; do - echo "- $(basename $file)" >> $GITHUB_STEP_SUMMARY - done diff --git a/.github/workflows/tests-e2e.yml b/.github/workflows/tests-e2e.yml deleted file mode 100644 index 794e9fbb..00000000 --- a/.github/workflows/tests-e2e.yml +++ /dev/null @@ -1,32 +0,0 @@ -# https://github.com/golangci/golangci-lint-action?tab=readme-ov-file#options -name: tests-e2e - -on: - push: - branches: ["main"] - pull_request: - paths: - - "**.go" - - go.mod - - go.sum - branches: ["**"] - -permissions: - contents: read - pull-requests: read - -jobs: - tests-e2e: - name: tests-e2e - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - uses: actions/setup-go@v5 - with: - cache: false - go-version-file: go.mod - - name: run e2e tests - run: | - KMSG_FILE_PATH=/dev/null ./scripts/tests-e2e.sh diff --git a/.github/workflows/tests-unit.yml b/.github/workflows/tests-unit.yml deleted file mode 100644 index 02e505ac..00000000 --- a/.github/workflows/tests-unit.yml +++ /dev/null @@ -1,36 +0,0 @@ -# https://github.com/golangci/golangci-lint-action?tab=readme-ov-file#options -name: tests-unit - -on: - push: - branches: ["main"] - pull_request: - paths: - - "**.go" - - go.mod - - go.sum - branches: ["**"] - -permissions: - contents: read - pull-requests: read - -jobs: - tests-unit: - name: tests-unit - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - 
submodules: recursive - - uses: actions/setup-go@v5 - with: - cache: false - go-version-file: go.mod - - name: run unit tests - run: | - KMSG_FILE_PATH=/dev/null ./scripts/tests-unit.sh - - name: Upload coverage reports to Codecov - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} From 4432b857ce987e1baa0538515bb1b817e31e4d61 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Thu, 28 Aug 2025 16:48:45 -0700 Subject: [PATCH 08/26] CI change --- .github/workflows/ci-build.yml | 7 +++- .github/workflows/ci-e2e-tests.yml | 5 +++ .github/workflows/ci-lint.yml | 15 ++++---- .github/workflows/ci-packages.yml | 7 +++- .github/workflows/ci-security.yml | 54 ---------------------------- .github/workflows/ci-unit-tests.yml | 5 +++ .github/workflows/release-hybrid.yml | 2 +- 7 files changed, 31 insertions(+), 64 deletions(-) delete mode 100644 .github/workflows/ci-security.yml diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index a4b77e4b..0f794693 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -2,7 +2,8 @@ name: CI - Build Binary on: push: - branches: ["main", "dev/**", "feature/**"] + branches: ["**"] + tags: ['*.*.*'] pull_request: paths: - ".github/workflows/ci-build.yml" @@ -14,6 +15,10 @@ on: permissions: contents: read +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: build-dev: if: github.ref != 'refs/heads/main' diff --git a/.github/workflows/ci-e2e-tests.yml b/.github/workflows/ci-e2e-tests.yml index 15750fc7..3fa23db9 100644 --- a/.github/workflows/ci-e2e-tests.yml +++ b/.github/workflows/ci-e2e-tests.yml @@ -3,6 +3,7 @@ name: CI - E2E Tests on: push: branches: ["main"] + tags: ['*.*.*'] pull_request: paths: - ".github/workflows/ci-e2e-tests.yml" @@ -15,6 +16,10 @@ permissions: contents: read pull-requests: read +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: e2e-smoke-pr: 
if: github.event_name == 'pull_request' diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml index 25a1f662..54cfb465 100644 --- a/.github/workflows/ci-lint.yml +++ b/.github/workflows/ci-lint.yml @@ -16,6 +16,10 @@ permissions: contents: read pull-requests: read +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: lint: name: Lint & Code Quality @@ -33,14 +37,11 @@ jobs: cache: true go-version-file: go.mod - - name: Install golangci-lint - run: go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0 - - - name: Build custom golangci-lint with nilaway - run: golangci-lint custom - - name: Run golangci-lint - run: ./custom-gcl run --verbose --config=.golangci.yml ./... + uses: golangci/golangci-lint-action@v6 + with: + version: v1.61.0 + args: --verbose --config=.golangci.yml - name: Check go mod tidy run: | diff --git a/.github/workflows/ci-packages.yml b/.github/workflows/ci-packages.yml index 852513c4..d6dc5b8a 100644 --- a/.github/workflows/ci-packages.yml +++ b/.github/workflows/ci-packages.yml @@ -3,6 +3,7 @@ name: CI - Package Building on: push: branches: ["main"] + tags: ['*.*.*'] pull_request: paths: - ".github/workflows/ci-packages.yml" @@ -14,7 +15,11 @@ on: branches: ["**"] permissions: - contents: read + contents: write + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true jobs: package-pr: diff --git a/.github/workflows/ci-security.yml b/.github/workflows/ci-security.yml deleted file mode 100644 index 60557e46..00000000 --- a/.github/workflows/ci-security.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: CI - Security & Compliance - -# Security scans always run on GitHub-hosted (platform-agnostic) -on: - push: - branches: ["main"] - pull_request: - branches: ["**"] - schedule: - - cron: '0 6 * * 1' # Weekly on Monday at 6 AM - -permissions: - contents: read - security-events: write - -jobs: - security-scan: - name: Security Scan - runs-on: 
ubuntu-22.04 # GitHub-hosted (security tools are platform-agnostic) - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - cache: true - go-version-file: go.mod - - - name: Run Gosec security scanner - uses: securecodewarrior/github-action-gosec@master - with: - args: '-fmt sarif -out gosec.sarif ./...' - - - name: Upload SARIF file - uses: github/codeql-action/upload-sarif@v3 - with: - sarif_file: gosec.sarif - - - name: Vulnerability scan with Nancy - run: | - go install github.com/sonatypeoss/nancy@latest - go list -json -deps ./... | nancy sleuth - - - name: Check for known vulnerabilities - run: | - go install golang.org/x/vuln/cmd/govulncheck@latest - govulncheck ./... - - - name: License compliance check - run: | - go install github.com/google/go-licenses@latest - go-licenses check ./... diff --git a/.github/workflows/ci-unit-tests.yml b/.github/workflows/ci-unit-tests.yml index 16017bf5..3da2f336 100644 --- a/.github/workflows/ci-unit-tests.yml +++ b/.github/workflows/ci-unit-tests.yml @@ -3,6 +3,7 @@ name: CI - Unit Tests on: push: branches: ["**"] + tags: ['*.*.*'] pull_request: paths: - ".github/workflows/ci-unit-tests.yml" @@ -15,6 +16,10 @@ permissions: contents: read pull-requests: read +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: unit-test-dev: if: github.ref != 'refs/heads/main' diff --git a/.github/workflows/release-hybrid.yml b/.github/workflows/release-hybrid.yml index 3cddafa6..72c60f7e 100644 --- a/.github/workflows/release-hybrid.yml +++ b/.github/workflows/release-hybrid.yml @@ -3,7 +3,7 @@ name: Release - Production on: push: tags: - - "v*" + - "*.*.*" workflow_dispatch: inputs: tag: From e992c3f8d221d845752bf45ec12a84baab787351 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Thu, 28 Aug 2025 16:59:40 -0700 Subject: [PATCH 09/26] ci pipeline --- .github/workflows/ci-build.yml | 2 +- .github/workflows/ci-lint.yml | 9 
+--- .github/workflows/ci-unit-tests.yml | 83 +---------------------------- 3 files changed, 3 insertions(+), 91 deletions(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 0f794693..5270c3c4 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -10,7 +10,7 @@ on: - "**.go" - "go.mod" - "go.sum" - branches: ["**"] + branches: ["main"] permissions: contents: read diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml index 54cfb465..4e9dd2e4 100644 --- a/.github/workflows/ci-lint.yml +++ b/.github/workflows/ci-lint.yml @@ -3,14 +3,7 @@ name: CI - Lint & Code Quality on: push: branches: ["**"] - pull_request: - paths: - - ".github/workflows/ci-lint.yml" - - "**.go" - - "go.mod" - - "go.sum" - - ".golangci.yml" - branches: ["**"] + tags: ["*.*.*"] permissions: contents: read diff --git a/.github/workflows/ci-unit-tests.yml b/.github/workflows/ci-unit-tests.yml index 3da2f336..34818512 100644 --- a/.github/workflows/ci-unit-tests.yml +++ b/.github/workflows/ci-unit-tests.yml @@ -4,13 +4,6 @@ on: push: branches: ["**"] tags: ['*.*.*'] - pull_request: - paths: - - ".github/workflows/ci-unit-tests.yml" - - "**.go" - - "go.mod" - - "go.sum" - branches: ["**"] permissions: contents: read @@ -21,7 +14,7 @@ concurrency: cancel-in-progress: true jobs: - unit-test-dev: + unit-test: if: github.ref != 'refs/heads/main' name: Unit Tests (Dev) runs-on: ubuntu-22.04 @@ -50,77 +43,3 @@ jobs: path: coverage.txt if-no-files-found: warn retention-days: 1 - - unit-test-full: - # Full platform testing for PR/main/release - Self-hosted runners - if: github.ref == 'refs/heads/main' || github.event_name == 'pull_request' || startsWith(github.ref, 'refs/tags/') - strategy: - fail-fast: false - matrix: - include: - - runner: gpuhealth-ubuntu-22.04-x86 - os_name: ubuntu-22.04 - arch: amd64 - - - runner: gpuhealth-ubuntu-22.04-arm - os_name: ubuntu-22.04 - arch: arm64 - - - runner: gpuhealth-rhel-8-x86 - 
os_name: rhel-8 - arch: amd64 - - - runner: gpuhealth-rhel-8-arm - os_name: rhel-8 - arch: aarch64 - - name: Unit Tests (${{ matrix.os_name }}-${{ matrix.arch }}) - runs-on: ${{ matrix.runner }} - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Install build dependencies (Ubuntu) - if: contains(matrix.os_name, 'ubuntu') - run: | - sudo apt-get update - sudo apt-get install -y build-essential linux-headers-$(uname -r) - - - name: Install build dependencies (RHEL) - if: contains(matrix.os_name, 'rhel') - run: | - sudo dnf update -y - sudo dnf groupinstall -y "Development Tools" - sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel - - - name: Set up Go - uses: actions/setup-go@v5 - with: - cache: true - go-version-file: go.mod - - - name: Run unit tests - run: | - echo "Running unit tests on ${{ matrix.os_name }}-${{ matrix.arch }}..." - KMSG_FILE_PATH=/dev/null ./scripts/tests-unit.sh - - - name: Upload coverage reports - if: matrix.os_name == 'ubuntu-22.04' && matrix.arch == 'amd64' - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: coverage.txt - - - name: Upload test artifacts - if: always() - uses: actions/upload-artifact@v4 - with: - name: unit-test-results-${{ matrix.os_name }}-${{ matrix.arch }} - path: - - coverage.txt - - "**/*test.xml" - if-no-files-found: warn - retention-days: 3 From a62778dcd738843117ca5890a1b0a6d96da82a54 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Thu, 28 Aug 2025 17:03:33 -0700 Subject: [PATCH 10/26] Update build binary to only on main and tag an pr --- .github/workflows/ci-build.yml | 39 +--------------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 5270c3c4..c34d8384 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -2,7 +2,7 @@ name: CI - Build Binary on: push: - branches: ["**"] + 
branches: ["main"] tags: ['*.*.*'] pull_request: paths: @@ -20,44 +20,7 @@ concurrency: cancel-in-progress: true jobs: - build-dev: - if: github.ref != 'refs/heads/main' - name: Build (Dev) - runs-on: ubuntu-22.04 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Go - uses: actions/setup-go@v5 - with: - cache: true - go-version-file: go.mod - - - name: Build binary - run: | - echo "🔨 Building gpuhealth binary for dev validation..." - make gpuhealth || go build -o bin/gpuhealth ./cmd/gpuhealth - - - name: Test binary - run: | - ./bin/gpuhealth --version - ./bin/gpuhealth --help - - - name: Upload binary - uses: actions/upload-artifact@v4 - with: - name: gpuhealth-binary-dev - path: bin/gpuhealth - if-no-files-found: warn - retention-days: 1 - build-full: - # Full platform builds for PR/main/release - Self-hosted runners - if: github.ref == 'refs/heads/main' || github.event_name == 'pull_request' || startsWith(github.ref, 'refs/tags/') strategy: fail-fast: false matrix: From cbd1225412b7148d1ebd59bf5c18b016c206f706 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Thu, 28 Aug 2025 17:07:55 -0700 Subject: [PATCH 11/26] simplify ci --- .github/workflows/ci-build.yml | 2 +- .github/workflows/ci-e2e-tests.yml | 42 +--------------- .github/workflows/ci-packages.yml | 75 +--------------------------- .github/workflows/release-hybrid.yml | 2 +- 4 files changed, 6 insertions(+), 115 deletions(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index c34d8384..826e8418 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -35,7 +35,7 @@ jobs: - runner: gpuhealth-rhel-8-x86 os_name: rhel-8 - arch: amd64 + arch: x86_64 - runner: gpuhealth-rhel-8-arm os_name: rhel-8 diff --git a/.github/workflows/ci-e2e-tests.yml b/.github/workflows/ci-e2e-tests.yml index 3fa23db9..04902ec3 100644 --- a/.github/workflows/ci-e2e-tests.yml +++ b/.github/workflows/ci-e2e-tests.yml @@ 
-21,45 +21,7 @@ concurrency: cancel-in-progress: true jobs: - e2e-smoke-pr: - if: github.event_name == 'pull_request' - name: E2E Smoke Tests (PR) - runs-on: ubuntu-22.04 - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Set up Go - uses: actions/setup-go@v5 - with: - cache: true - go-version-file: go.mod - - - name: Run E2E smoke tests - run: | - echo "Running E2E smoke tests for PR validation..." - # Create a lightweight E2E test for PRs - KMSG_FILE_PATH=/dev/null timeout 5m ./scripts/tests-e2e.sh || { - echo "E2E smoke tests completed or timed out (expected for smoke tests)" - } - - - name: Upload smoke test results - if: always() - uses: actions/upload-artifact@v4 - with: - name: e2e-smoke-results - path: | - e2e/**/*.log - **/*test.xml - if-no-files-found: warn - retention-days: 1 - - e2e-full: - # Full E2E tests for main/release - Self-hosted runners - if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/') + e2e-test: strategy: fail-fast: false matrix: @@ -74,7 +36,7 @@ jobs: - runner: gpuhealth-rhel-8-x86 os_name: rhel-8 - arch: amd64 + arch: x86_64 - runner: gpuhealth-rhel-8-arm os_name: rhel-8 diff --git a/.github/workflows/ci-packages.yml b/.github/workflows/ci-packages.yml index d6dc5b8a..f1d4b9c1 100644 --- a/.github/workflows/ci-packages.yml +++ b/.github/workflows/ci-packages.yml @@ -22,78 +22,7 @@ concurrency: cancel-in-progress: true jobs: - package-pr: - # Package validation for PRs - subset of platforms - if: github.event_name == 'pull_request' - strategy: - fail-fast: false - matrix: - include: - - runner: gpuhealth-ubuntu-22.04-x86 - os_name: ubuntu-22.04 - arch: amd64 - package_type: deb - goreleaser_config: .goreleaser_gpuhealth_deb_amd64.yaml - - - runner: gpuhealth-rhel-8-x86 - os_name: rhel-8 - arch: amd64 - package_type: rpm - goreleaser_config: .goreleaser_gpuhealth_rpm_x86_64.yaml - - name: Package Test (${{ matrix.package_type }}-${{ matrix.arch }}) - runs-on: 
${{ matrix.runner }} - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install build dependencies (Ubuntu) - if: contains(matrix.os_name, 'ubuntu') - run: | - sudo apt-get update - sudo apt-get install -y build-essential linux-headers-$(uname -r) - - - name: Install build dependencies (RHEL) - if: contains(matrix.os_name, 'rhel') - run: | - sudo dnf update -y - sudo dnf groupinstall -y "Development Tools" - sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel rpm-build - - - name: Set up Go - uses: actions/setup-go@v5 - with: - cache: true - go-version-file: go.mod - - - name: Install GoReleaser - run: | - go install github.com/goreleaser/goreleaser@latest - goreleaser --version - - - name: Validate GoReleaser config - run: | - goreleaser check --config ${{ matrix.goreleaser_config }} - - - name: Test package build (dry-run) - run: | - echo "📦 Testing ${{ matrix.package_type }} package build..." - goreleaser build --config ${{ matrix.goreleaser_config }} --snapshot --clean - - - name: Upload package test artifacts - uses: actions/upload-artifact@v4 - with: - name: package-test-${{ matrix.package_type }}-${{ matrix.arch }} - path: dist/ - if-no-files-found: warn - retention-days: 1 - - package-full: - # Full package builds for main branch - all platforms - if: github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/') + package: strategy: fail-fast: false matrix: @@ -114,7 +43,7 @@ jobs: # RHEL RPM packages - runner: gpuhealth-rhel-8-x86 os_name: rhel-8 - arch: amd64 + arch: x86_64 package_type: rpm goreleaser_config: .goreleaser_gpuhealth_rpm_x86_64.yaml diff --git a/.github/workflows/release-hybrid.yml b/.github/workflows/release-hybrid.yml index 72c60f7e..7c5bf8f2 100644 --- a/.github/workflows/release-hybrid.yml +++ b/.github/workflows/release-hybrid.yml @@ -80,7 +80,7 @@ jobs: # RHEL RPM packages - runner: gpuhealth-rhel-8-x86 os_name: rhel-8 - arch: amd64 + arch: x86_64 package_type: 
rpm goreleaser_config: .goreleaser_gpuhealth_rpm_x86_64.yaml From 4b0a2642a5cc7becf0a6ab1a9ccfbe4e8bf12ec2 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Thu, 28 Aug 2025 17:11:46 -0700 Subject: [PATCH 12/26] ci simplify --- .github/workflows/ci-e2e-tests.yml | 44 ++++------------------------- .github/workflows/ci-unit-tests.yml | 3 +- 2 files changed, 6 insertions(+), 41 deletions(-) diff --git a/.github/workflows/ci-e2e-tests.yml b/.github/workflows/ci-e2e-tests.yml index 04902ec3..514a95df 100644 --- a/.github/workflows/ci-e2e-tests.yml +++ b/.github/workflows/ci-e2e-tests.yml @@ -21,49 +21,15 @@ concurrency: cancel-in-progress: true jobs: - e2e-test: - strategy: - fail-fast: false - matrix: - include: - - runner: gpuhealth-ubuntu-22.04-x86 - os_name: ubuntu-22.04 - arch: amd64 - - - runner: gpuhealth-ubuntu-22.04-arm - os_name: ubuntu-22.04 - arch: arm64 - - - runner: gpuhealth-rhel-8-x86 - os_name: rhel-8 - arch: x86_64 - - - runner: gpuhealth-rhel-8-arm - os_name: rhel-8 - arch: aarch64 - - name: E2E Tests (${{ matrix.os_name }}-${{ matrix.arch }}) - runs-on: ${{ matrix.runner }} - + e2e: + name: E2E Tests + runs-on: ubuntu-22.04 steps: - name: Checkout code uses: actions/checkout@v4 with: submodules: recursive - - name: Install build dependencies (Ubuntu) - if: contains(matrix.os_name, 'ubuntu') - run: | - sudo apt-get update - sudo apt-get install -y build-essential linux-headers-$(uname -r) - - - name: Install build dependencies (RHEL) - if: contains(matrix.os_name, 'rhel') - run: | - sudo dnf update -y - sudo dnf groupinstall -y "Development Tools" - sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel - - name: Set up Go uses: actions/setup-go@v5 with: @@ -72,14 +38,14 @@ jobs: - name: Run E2E tests run: | - echo "Running full E2E tests on ${{ matrix.os_name }}-${{ matrix.arch }}..." + echo "Running E2E tests on ubuntu-22.04 (GitHub-hosted)..." 
KMSG_FILE_PATH=/dev/null ./scripts/tests-e2e.sh - name: Upload E2E test results if: always() uses: actions/upload-artifact@v4 with: - name: e2e-results-${{ matrix.os_name }}-${{ matrix.arch }} + name: e2e-results path: | e2e/**/*.log **/*test.xml diff --git a/.github/workflows/ci-unit-tests.yml b/.github/workflows/ci-unit-tests.yml index 34818512..e0d3a3ed 100644 --- a/.github/workflows/ci-unit-tests.yml +++ b/.github/workflows/ci-unit-tests.yml @@ -15,8 +15,7 @@ concurrency: jobs: unit-test: - if: github.ref != 'refs/heads/main' - name: Unit Tests (Dev) + name: Unit Tests runs-on: ubuntu-22.04 steps: From d6b6b4e8bd0d62cc4ce97ac023759d5b66e80dcc Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Fri, 29 Aug 2025 09:49:09 -0700 Subject: [PATCH 13/26] update linter version --- .github/workflows/ci-build.yml | 2 +- .github/workflows/ci-lint.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 826e8418..74433231 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -71,7 +71,7 @@ jobs: - name: Build binary (native) run: | - echo "🔨 Building gpuhealth binary natively on ${{ matrix.os_name }}-${{ matrix.arch }}..." + echo "Building gpuhealth binary on ${{ matrix.os_name }}-${{ matrix.arch }}..." 
make gpuhealth || go build -o bin/gpuhealth ./cmd/gpuhealth - name: Test binary diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml index 4e9dd2e4..3b5ad8ca 100644 --- a/.github/workflows/ci-lint.yml +++ b/.github/workflows/ci-lint.yml @@ -33,7 +33,7 @@ jobs: - name: Run golangci-lint uses: golangci/golangci-lint-action@v6 with: - version: v1.61.0 + version: latest args: --verbose --config=.golangci.yml - name: Check go mod tidy From 8cc0b8151b49a235dd261f521dae37fdff4c7ff5 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Fri, 29 Aug 2025 10:00:03 -0700 Subject: [PATCH 14/26] Remove e2e test for gpud --- .github/workflows/ci-e2e-tests.yml | 54 ------------------------------ 1 file changed, 54 deletions(-) delete mode 100644 .github/workflows/ci-e2e-tests.yml diff --git a/.github/workflows/ci-e2e-tests.yml b/.github/workflows/ci-e2e-tests.yml deleted file mode 100644 index 514a95df..00000000 --- a/.github/workflows/ci-e2e-tests.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: CI - E2E Tests - -on: - push: - branches: ["main"] - tags: ['*.*.*'] - pull_request: - paths: - - ".github/workflows/ci-e2e-tests.yml" - - "**.go" - - "go.mod" - - "go.sum" - branches: ["**"] - -permissions: - contents: read - pull-requests: read - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - e2e: - name: E2E Tests - runs-on: ubuntu-22.04 - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Set up Go - uses: actions/setup-go@v5 - with: - cache: true - go-version-file: go.mod - - - name: Run E2E tests - run: | - echo "Running E2E tests on ubuntu-22.04 (GitHub-hosted)..." 
- KMSG_FILE_PATH=/dev/null ./scripts/tests-e2e.sh - - - name: Upload E2E test results - if: always() - uses: actions/upload-artifact@v4 - with: - name: e2e-results - path: | - e2e/**/*.log - **/*test.xml - bin/ - if-no-files-found: warn - retention-days: 7 From 2abbce56ad8ec4461f94e12c6dead63e99ff4ab4 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Fri, 29 Aug 2025 16:06:17 -0700 Subject: [PATCH 15/26] Simpify readme --- README.md | 279 ++++++++++-------------------------------------------- 1 file changed, 48 insertions(+), 231 deletions(-) diff --git a/README.md b/README.md index 73052089..b8291e23 100644 --- a/README.md +++ b/README.md @@ -1,294 +1,111 @@ -# NVIDIA GPU Health Monitoring and Reporting Agent +# NVIDIA GPU Health Monitoring Agent ## Overview -`gpuhealth` is a streamlined GPU health monitoring and reporting agent designed to ensure GPU reliability by actively monitoring GPU status and exporting health metrics for analysis. `gpuhealth` is based on the upstream [leptonai/gpud](https://github.com/leptonai/gpud) project but focuses specifically on GPU health monitoring without management overhead. +`gpuhealth` is a lightweight GPU health monitoring agent that tracks GPU status and exports health metrics. Based on [leptonai/gpud](https://github.com/leptonai/gpud), it focuses specifically on monitoring without management overhead. 
-### Key Characteristics +**Key Features:** +- **Health-Focused**: GPU health monitoring and metrics export +- **Lightweight**: Minimal CPU and memory footprint (<100MB RAM, <1% CPU) +- **Non-Intrusive**: Read-only operations, no system modifications +- **Integration-Ready**: HTTP API, file export, optional centralized reporting +- **Production-Ready**: Built for 24/7 datacenter operation -- **Health-Focused**: Concentrates purely on GPU health monitoring and metrics export -- **Lightweight**: Self-contained binary with minimal CPU and memory footprint -- **Non-Intrusive**: Operates with read-only operations in a non-critical path -- **Integration-Ready**: Easy to integrate with existing monitoring and alerting systems -- **Production-Ready**: Built for reliability in datacenter environments - -### Architecture - -GPUHealth operates as a standalone monitoring agent that: -- Collects GPU health metrics and status information -- Detects hardware issues and performance anomalies -- Exports data in standard formats (JSON, CSV) -- Supports multiple deployment modes: - - **Local API**: HTTP endpoints for on-demand access - - **Offline Collection**: File-based batch data export - - **Centralized Reporting**: Optional push-mode to control planes (configurable) - -## Get Started +## Quick Start ### Installation -Choose between **package installation** (recommended for production) or **building from source** (for development/customization): - -#### Package Installation (Recommended) - -**Includes systemd integration and auto-start capability** - -**Ubuntu (22.04, 24.04):** +**Package Installation (Recommended):** ```bash -# Download the appropriate package from releases page: -# https://github.com/NVIDIA/gpuhealth/releases - -# For x86_64/amd64 +# Ubuntu/Debian wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth_*_amd64.deb sudo dpkg -i gpuhealth_*_amd64.deb -# For ARM64/aarch64 -wget 
https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth_*_arm64.deb -sudo dpkg -i gpuhealth_*_arm64.deb - -# Verify installation and service status -systemctl status gpuhealthd -``` - -**RHEL, Rocky Linux, AlmaLinux (8, 9, 10) & Amazon Linux 2023:** -```bash -# Download the appropriate package from releases page: -# https://github.com/NVIDIA/gpuhealth/releases - -# For x86_64 +# RHEL/Rocky/AlmaLinux/AmazonLinux wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth-*-1.x86_64.rpm sudo rpm -i gpuhealth-*-1.x86_64.rpm -# For ARM64/aarch64 -wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth-*-1.aarch64.rpm -sudo rpm -i gpuhealth-*-1.aarch64.rpm - -# Verify installation and service status +# Verify installation systemctl status gpuhealthd ``` -**Package installation provides:** -- ✅ **Systemd integration**: Service management with `systemctl` -- ✅ **Auto-start**: Automatically starts on system boot -- ✅ **Service configuration**: Pre-configured service files and environment -- ✅ **Standard paths**: Binary, logs, and data stored in standard system locations - -#### Build from Source - -**For development, customization, or manual deployment** - +**Build from Source:** ```bash git clone https://github.com/NVIDIA/gpuhealth.git -cd gpuhealth +cd gpuhealth make gpuhealth sudo mv bin/gpuhealth /usr/local/bin/ - -# Manual setup required: -# - No systemd integration (run manually or create your own service) -# - No auto-start capability -# - Manual configuration of paths and permissions ``` -**Source installation provides:** -- ✅ **Latest code**: Access to newest features and bug fixes -- ✅ **Customization**: Modify source code as needed -- ✅ **Minimal installation**: Just the binary, no additional system integration -- ❌ **Manual setup**: You handle service management, auto-start, and configuration - ### Usage -#### Health Monitoring Server - -Start the health monitoring server: - ```bash -# Start server (runs on 
port 15133 by default) +# Start monitoring server (port 15133) gpuhealth run -# Start with custom configuration -gpuhealth run --listen-address=0.0.0.0:8080 --log-level=debug -``` - -#### One-time Health Check - -Perform a quick health scan: - -```bash -gpuhealth scan -``` - -#### Offline Data Collection - -Collect health data to files: +# Quick health check +gpuhealth scan -```bash -# Collect data for 1 hour to /tmp/gpu-health/ -gpuhealth run --offline-mode --path=/tmp/gpu-health --duration=1h -``` +# Offline data collection +gpuhealth run --offline-mode --path=/tmp/gpu-health --duration=00:05:00 --format csv -#### Check Service Status - -```bash +# Check status gpuhealth status ``` ### API Access -Once running, access health data via HTTP API: - ```bash -# Health endpoint +# Health status curl http://localhost:15133/healthz -# Machine information +# Machine info & health states curl http://localhost:15133/machine-info - -# Health states curl http://localhost:15133/v1/states -``` - -## Key Features - -### GPU Health Monitoring -- **Hardware Metrics**: Power consumption, temperature, clock speeds, utilization -- **Error Detection**: NVML Xid events, hardware slowdown, row remapping failures -- **Fabric Health**: GPU fabric status and interconnect monitoring -- **Performance Tracking**: GPU performance counters and throughput metrics - -### System Health Monitoring -- **Basic System Metrics**: CPU, memory, and disk usage -- **Driver Status**: NVIDIA driver version and compatibility checks -- **Process Monitoring**: GPU process information and resource allocation - -### Data Export & Integration -- **Multiple Formats**: JSON and CSV output formats -- **HTTP API**: RESTful endpoints for real-time data access -- **Offline Mode**: File-based data collection for batch processing -- **Centralized Reporting**: Optional push-mode data export to control planes -- **Configurable Intervals**: Customizable health check and export frequencies -- **Flexible Endpoints**: Support 
for custom monitoring infrastructure integration - -### Production Features -- **Low Overhead**: Minimal CPU and memory footprint -- **Read-Only**: Non-intrusive monitoring with no system modifications -- **Reliability**: Built for 24/7 operation in datacenter environments -- **Scalability**: Deploy across large GPU clusters with consistent performance - -Check out [*components*](./docs/COMPONENTS.md) for a detailed list of monitoring components and their capabilities. - -## FAQs - -### Does GPUHealth send data externally? - -**By default, no.** GPUHealth operates in a fully self-contained mode and does not send any data to external services by default. However, it **can be configured** to send health data to a centralized control plane for further analysis if desired. - -**Default behavior:** -- Stored locally on your system -- Accessed only through the local HTTP API (if enabled) -- Exported to local files in offline mode -- **No external data transmission** without explicit configuration - -**Optional centralized reporting:** -- Can be configured to send health data to a centralized monitoring platform -- Configurable endpoints, intervals, and data filtering -- All data transmission is **opt-in** and under your control -- Supports secure channels for data transmission - -GPUHealth is designed for environments where data privacy and security are paramount, giving you full control over where and how your GPU health data is used. - -### How do I integrate GPUHealth with my monitoring system? 
- -GPUHealth provides multiple integration options: -**HTTP API Integration:** -```bash -# Prometheus-style metrics +# Prometheus metrics curl http://localhost:15133/metrics - -# JSON health data -curl http://localhost:15133/v1/states -``` - -**File-based Integration:** -```bash -# Export to files for processing -gpuhealth run --offline-mode --path=/monitoring/data --duration=24h -``` - -**Centralized Control Plane Integration:** -```bash -# Configure centralized reporting (optional) -gpuhealth run --health-exporter-endpoint=https://monitoring.company.com/gpu-health \ - --health-exporter-interval=5m \ - --include-metrics=true ``` -**Custom Endpoints:** -Configure your monitoring system to scrape the GPUHealth API endpoints at your desired interval, or set up centralized reporting to push data to your monitoring infrastructure. - -### How do I update GPUHealth? - -1. **Download latest release** from [GitHub Releases](https://github.com/NVIDIA/gpuhealth/releases) -2. **Stop running instance**: `gpuhealth status` to check, then stop if needed -3. **Replace binary**: Update `/usr/local/bin/gpuhealth` or your installation path -4. **Restart**: Launch gpuhealth with your previous configuration - -For package installations (deb/rpm), use your system's package manager to update. +## What It Monitors -### What are the system requirements? 
+- **GPU Health**: Power, temperature, clocks, utilization, Xid events +- **System Metrics**: CPU, memory, disk usage +- **Driver Status**: NVIDIA driver version and compatibility +- **Process Info**: GPU process allocation and resource usage -#### Supported Operating Systems +## Data Export -| OS Distribution | Version | x86_64 (amd64) | ARM64 (aarch64) | -|-----------------|---------|:--------------:|:---------------:| -| **Ubuntu** | 22.04 | ✅ | ✅ | -| **Ubuntu** | 24.04 | ✅ | ✅ | -| **RHEL/Rocky Linux/AlmaLinux** | 8 | ✅ | ✅ | -| **RHEL/Rocky Linux/AlmaLinux** | 9 | ✅ | ✅ | -| **RHEL/Rocky Linux/AlmaLinux** | 10 | ✅ | ✅ | -| **Amazon Linux** | 2023 | ✅ | ✅ | +- **HTTP API**: Real-time JSON/Prometheus metrics +- **Offline Mode**: File-based data collection (JSON/CSV) +- **Centralized Reporting**: Optional push to control planes -#### System Resources +See [Components Guide](./docs/COMPONENTS.md) for detailed monitoring capabilities. -- **NVIDIA Driver**: Version 535+ recommended (not required for basic system monitoring) -- **Memory**: <100MB RAM usage -- **CPU**: Minimal overhead, typically <1% CPU usage -- **Storage**: ~100MB for binary and logs -- **Network**: HTTP/HTTPS access for centralized reporting (optional) +## FAQ -#### Additional Requirements +**Does it send data externally?** +No, by default all data stays local. Optional centralized reporting can be configured if desired. -- **systemd**: Version 230+ (for package installations) -- **curl**: Required for installation scripts and HTTP exports -- **Root privileges**: Required for full system monitoring capabilities +**System requirements?** +Ubuntu 22.04+, RHEL 8+, <100MB RAM, <1% CPU. NVIDIA drivers recommended but not required. -### Can I run GPUHealth without NVIDIA drivers? - -Yes! 
GPUHealth will operate in a reduced functionality mode: -- ✅ **System monitoring**: CPU, memory, disk metrics still available -- ✅ **Basic GPU detection**: PCI device enumeration -- ❌ **NVIDIA-specific monitoring**: Requires NVIDIA drivers for full GPU health data +**Integration options?** +HTTP API (JSON/Prometheus), offline file export, or optional push to monitoring systems. ## Documentation -- [Components Guide](./docs/COMPONENTS.md) - Detailed component information and configuration +- [Components Guide](./docs/COMPONENTS.md) - Monitoring capabilities and configuration - [Architecture Overview](./docs/ARCHITECTURE.md) - System design and technical details -- [Installation Guide](./docs/INSTALL.md) - Comprehensive installation instructions -- [Integration Guide](./docs/INTEGRATION.md) - How to integrate with monitoring systems - -## Related Projects - -- **Upstream Project**: [leptonai/gpud](https://github.com/leptonai/gpud) - Full-featured GPU management daemon -- **NVIDIA Tools**: Compatible with NVIDIA's GPU monitoring ecosystem +- [Installation Guide](./docs/INSTALL.md) - Comprehensive setup instructions +- [Integration Guide](./docs/INTEGRATION.md) - Monitoring system integration ## Contributing -We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for: -- Development setup and build instructions -- Code style and contribution guidelines -- How to report issues and submit pull requests -- Upstream sync procedures for maintainers +Contributions welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and guidelines. + +**Related Projects:** [leptonai/gpud](https://github.com/leptonai/gpud) (upstream full-featured GPU management daemon) ## License -This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. +Apache License 2.0 - see [LICENSE](LICENSE) for details. 
From 48df3f35303cee07e7961a04ae17b6c7d1e89327 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 2 Sep 2025 14:28:23 -0700 Subject: [PATCH 16/26] CI Pipeline change --- .github/workflows/ci-build.yml | 7 +- .github/workflows/ci-lint.yml | 11 +- .github/workflows/ci-packages.yml | 8 +- .github/workflows/ci-release.yml | 304 +++++++++++++++++++++++++++ .github/workflows/ci-unit-tests.yml | 11 +- .github/workflows/release-hybrid.yml | 187 ---------------- 6 files changed, 330 insertions(+), 198 deletions(-) create mode 100644 .github/workflows/ci-release.yml delete mode 100644 .github/workflows/release-hybrid.yml diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 74433231..2e3a1406 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -2,15 +2,16 @@ name: CI - Build Binary on: push: - branches: ["main"] - tags: ['*.*.*'] + branches: ["main"] # Main branch commits + tags: ['*.*.*'] # Release tags pull_request: paths: - ".github/workflows/ci-build.yml" - "**.go" - "go.mod" - "go.sum" - branches: ["main"] + - "Makefile" + branches: ["main"] # PRs to main permissions: contents: read diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml index 3b5ad8ca..63532746 100644 --- a/.github/workflows/ci-lint.yml +++ b/.github/workflows/ci-lint.yml @@ -2,8 +2,15 @@ name: CI - Lint & Code Quality on: push: - branches: ["**"] - tags: ["*.*.*"] + branches: ["**"] # Every commit on every branch + pull_request: + paths: + - ".github/workflows/ci-lint.yml" + - "**.go" + - "go.mod" + - "go.sum" + - ".golangci.yml" + branches: ["main"] permissions: contents: read diff --git a/.github/workflows/ci-packages.yml b/.github/workflows/ci-packages.yml index f1d4b9c1..a9aead13 100644 --- a/.github/workflows/ci-packages.yml +++ b/.github/workflows/ci-packages.yml @@ -2,8 +2,8 @@ name: CI - Package Building on: push: - branches: ["main"] - tags: ['*.*.*'] + branches: ["main"] # Main branch commits + tags: 
['*.*.*'] # Release tags pull_request: paths: - ".github/workflows/ci-packages.yml" @@ -12,7 +12,7 @@ on: - "go.sum" - ".goreleaser_*.yaml" - "deployments/packages/**" - branches: ["**"] + branches: ["main"] # PRs to main permissions: contents: write @@ -92,7 +92,7 @@ jobs: - name: Build package run: | - echo "📦 Building ${{ matrix.package_type }} package on ${{ matrix.os_name }}-${{ matrix.arch }}..." + echo "Building ${{ matrix.package_type }} package on ${{ matrix.os_name }}-${{ matrix.arch }}..." if [ "${{ github.ref }}" = "refs/heads/main" ]; then goreleaser build --config ${{ matrix.goreleaser_config }} --snapshot --clean else diff --git a/.github/workflows/ci-release.yml b/.github/workflows/ci-release.yml new file mode 100644 index 00000000..7410a886 --- /dev/null +++ b/.github/workflows/ci-release.yml @@ -0,0 +1,304 @@ +name: Release - Production + +on: + workflow_dispatch: + inputs: + tag: + description: 'Tag to release (must have existing CI artifacts)' + required: true + type: string + +permissions: + contents: write + actions: read # Required to download artifacts from other workflows + +jobs: + # Validate that required artifacts exist for the specified tag + validate-artifacts: + name: Validate Required Artifacts + runs-on: ubuntu-22.04 + outputs: + tag: ${{ github.event.inputs.tag }} + artifacts-valid: ${{ steps.validate.outputs.artifacts-valid }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Validate tag exists + id: validate + run: | + TAG="${{ github.event.inputs.tag }}" + echo "Validating release for tag: $TAG" + + # Check if tag exists + if ! 
git rev-parse --verify "refs/tags/$TAG" >/dev/null 2>&1; then + echo "Tag '$TAG' does not exist in repository" + echo "Available tags:" + git tag --sort=-version:refname | head -10 + exit 1 + fi + + echo "Tag '$TAG' exists" + echo "artifacts-valid=true" >> $GITHUB_OUTPUT + + - name: Check for required CI artifacts + uses: actions/github-script@v7 + with: + script: | + const tag = '${{ github.event.inputs.tag }}'; + console.log(`Checking for CI artifacts for tag: ${tag}`); + + // Get the commit SHA for the tag + const tagRef = await github.rest.git.getRef({ + owner: context.repo.owner, + repo: context.repo.repo, + ref: `tags/${tag}` + }); + const tagSha = tagRef.data.object.sha; + console.log(`Tag SHA: ${tagSha}`); + + // Check CI Build workflow runs + const buildRuns = await github.rest.actions.listWorkflowRuns({ + owner: context.repo.owner, + repo: context.repo.repo, + workflow_id: 'ci-build.yml', + head_sha: tagSha + }); + + const successfulBuildRun = buildRuns.data.workflow_runs.find(run => run.conclusion === 'success'); + if (!successfulBuildRun) { + core.setFailed(`No successful CI build workflow run found for tag ${tag}`); + return; + } + console.log(`Found successful build run: ${successfulBuildRun.id}`); + + // Check CI Package workflow runs + const packageRuns = await github.rest.actions.listWorkflowRuns({ + owner: context.repo.owner, + repo: context.repo.repo, + workflow_id: 'ci-packages.yml', + head_sha: tagSha + }); + + const successfulPackageRun = packageRuns.data.workflow_runs.find(run => run.conclusion === 'success'); + if (!successfulPackageRun) { + core.setFailed(`No successful CI package workflow run found for tag ${tag}`); + return; + } + console.log(`Found successful package run: ${successfulPackageRun.id}`); + + // Verify build artifacts exist + const buildArtifacts = await github.rest.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: successfulBuildRun.id + }); + + const requiredBinaries = [ + 
'gpuhealth-binary-ubuntu-22.04-amd64', + 'gpuhealth-binary-ubuntu-22.04-arm64', + 'gpuhealth-binary-rhel-8-x86_64', + 'gpuhealth-binary-rhel-8-aarch64' + ]; + + const foundBinaries = buildArtifacts.data.artifacts.filter(a => + requiredBinaries.includes(a.name) && a.expired === false + ); + + if (foundBinaries.length !== requiredBinaries.length) { + const missing = requiredBinaries.filter(req => + !foundBinaries.some(found => found.name === req) + ); + core.setFailed(`Missing binary artifacts: ${missing.join(', ')}`); + return; + } + console.log(`✅ All binary artifacts found: ${foundBinaries.map(a => a.name).join(', ')}`); + + // Verify package artifacts exist + const packageArtifacts = await github.rest.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: successfulPackageRun.id + }); + + const requiredPackages = [ + 'package-deb-ubuntu-22.04-amd64', + 'package-deb-ubuntu-22.04-arm64', + 'package-rpm-rhel-8-x86_64', + 'package-rpm-rhel-8-aarch64' + ]; + + const foundPackages = packageArtifacts.data.artifacts.filter(a => + requiredPackages.includes(a.name) && a.expired === false + ); + + if (foundPackages.length !== requiredPackages.length) { + const missing = requiredPackages.filter(req => + !foundPackages.some(found => found.name === req) + ); + core.setFailed(`Missing package artifacts: ${missing.join(', ')}`); + return; + } + console.log(`All package artifacts found: ${foundPackages.map(a => a.name).join(', ')}`); + + console.log(`All required artifacts validated for tag ${tag}`); + + # Download artifacts from successful CI workflows + collect-artifacts: + name: Collect Build Artifacts + runs-on: ubuntu-22.04 + needs: validate-artifacts + if: needs.validate-artifacts.outputs.artifacts-valid == 'true' + steps: + - name: Download all build artifacts + uses: actions/download-artifact@v4 + with: + pattern: gpuhealth-binary-* + path: ./binaries/ + + - name: Download all package artifacts + uses: 
actions/download-artifact@v4 + with: + pattern: package-* + path: ./packages/ + + - name: List collected artifacts + run: | + echo "=== Binaries ===" + find ./binaries/ -type f -name "gpuhealth" -exec ls -la {} \; + echo "=== Packages ===" + find ./packages/ -type f \( -name "*.deb" -o -name "*.rpm" \) -exec ls -la {} \; + + - name: Get release tag + id: tag + run: | + if [[ "${{ github.event_name }}" == "push" ]]; then + echo "tag=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT + else + echo "tag=${{ github.event.inputs.tag }}" >> $GITHUB_OUTPUT + fi + + - name: Prepare release artifacts + run: | + mkdir -p ./release-assets + TAG="${{ needs.validate-artifacts.outputs.tag }}" + + # Copy binaries with proper naming + for binary in ./binaries/*/gpuhealth; do + if [[ -f "$binary" ]]; then + dir_name=$(basename $(dirname "$binary")) + # Extract platform info from artifact name + platform=$(echo "$dir_name" | sed 's/gpuhealth-binary-//') + cp "$binary" "./release-assets/gpuhealth-${TAG}-${platform}" + fi + done + + # Copy packages + find ./packages/ -type f \( -name "*.deb" -o -name "*.rpm" \) -exec cp {} ./release-assets/ \; + + echo "=== Final Release Assets ===" + ls -la ./release-assets/ + + - name: Upload consolidated release artifacts + uses: actions/upload-artifact@v4 + with: + name: release-assets-${{ needs.validate-artifacts.outputs.tag }} + path: ./release-assets/ + retention-days: 30 + + # Pre-release validation on GitHub-hosted (fast) + pre-release-validation: + needs: [validate-artifacts, collect-artifacts] + if: needs.validate-artifacts.outputs.artifacts-valid == 'true' + name: Pre-Release Validation + runs-on: ubuntu-22.04 # GitHub-hosted + outputs: + tag: ${{ needs.validate-artifacts.outputs.tag }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Get tag + id: tag + run: | + if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + TAG="${{ github.event.inputs.tag }}" + else + 
TAG=${GITHUB_REF#refs/tags/} + fi + echo "tag=${TAG}" >> $GITHUB_OUTPUT + echo "Release tag: ${TAG}" + + - name: Validate tag format + run: | + TAG="${{ steps.tag.outputs.tag }}" + if [[ ! $TAG =~ ^v[0-9]+\.[0-9]+\.[0-9]+(-[a-zA-Z0-9]+)?$ ]]; then + echo "Invalid tag format: $TAG" + echo "Expected format: v1.2.3 or v1.2.3-alpha" + exit 1 + fi + + - name: Check changelog + run: | + if [ -f CHANGELOG.md ]; then + TAG="${{ steps.tag.outputs.tag }}" + if ! grep -q "$TAG" CHANGELOG.md; then + echo "Warning: Tag $TAG not found in CHANGELOG.md" + fi + fi + + + + # Create GitHub release using pre-built artifacts + create-github-release: + needs: [validate-artifacts, collect-artifacts, pre-release-validation] + if: needs.validate-artifacts.outputs.artifacts-valid == 'true' + name: Create GitHub Release + runs-on: ubuntu-22.04 # GitHub-hosted for API operations + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Download consolidated release artifacts + uses: actions/download-artifact@v4 + with: + name: release-assets-${{ needs.validate-artifacts.outputs.tag }} + path: ./release-assets + + - name: Verify release assets + run: | + echo "=== Release Assets Ready for Upload ===" + ls -la ./release-assets/ + echo "=== Binary Count ===" + find ./release-assets/ -name "gpuhealth-*" -type f | wc -l + echo "=== Package Count ===" + find ./release-assets/ -name "*.deb" -o -name "*.rpm" | wc -l + + - name: Create Release + uses: softprops/action-gh-release@v1 + with: + tag_name: ${{ needs.validate-artifacts.outputs.tag }} + name: "gpuhealth ${{ needs.validate-artifacts.outputs.tag }}" + draft: false + prerelease: ${{ contains(needs.validate-artifacts.outputs.tag, '-') }} + generate_release_notes: true + files: ./release-assets/* + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Release Summary + run: | + echo "## Release Summary" >> $GITHUB_STEP_SUMMARY + echo "- **Tag:** ${{ needs.pre-release-validation.outputs.tag }}" >> $GITHUB_STEP_SUMMARY + 
echo "- **Packages:** $(ls ./release-assets/ | wc -l) files" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Release Assets:" >> $GITHUB_STEP_SUMMARY + for file in ./release-assets/*; do + echo "- $(basename $file)" >> $GITHUB_STEP_SUMMARY + done diff --git a/.github/workflows/ci-unit-tests.yml b/.github/workflows/ci-unit-tests.yml index e0d3a3ed..7cb41285 100644 --- a/.github/workflows/ci-unit-tests.yml +++ b/.github/workflows/ci-unit-tests.yml @@ -2,8 +2,15 @@ name: CI - Unit Tests on: push: - branches: ["**"] - tags: ['*.*.*'] + branches: ["**"] # Every commit on every branch + pull_request: + paths: + - ".github/workflows/ci-unit-tests.yml" + - "**.go" + - "go.mod" + - "go.sum" + - "scripts/tests-unit.sh" + branches: ["main"] permissions: contents: read diff --git a/.github/workflows/release-hybrid.yml b/.github/workflows/release-hybrid.yml deleted file mode 100644 index 7c5bf8f2..00000000 --- a/.github/workflows/release-hybrid.yml +++ /dev/null @@ -1,187 +0,0 @@ -name: Release - Production - -on: - push: - tags: - - "*.*.*" - workflow_dispatch: - inputs: - tag: - description: 'Tag to release' - required: true - type: string - -permissions: - contents: write - -jobs: - # Pre-release validation on GitHub-hosted (fast) - pre-release-validation: - name: Pre-Release Validation - runs-on: ubuntu-22.04 # GitHub-hosted - outputs: - tag: ${{ steps.tag.outputs.tag }} - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Get tag - id: tag - run: | - if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then - TAG="${{ github.event.inputs.tag }}" - else - TAG=${GITHUB_REF#refs/tags/} - fi - echo "tag=${TAG}" >> $GITHUB_OUTPUT - echo "Release tag: ${TAG}" - - - name: Validate tag format - run: | - TAG="${{ steps.tag.outputs.tag }}" - if [[ !
$TAG =~ ^v[0-9]+\.[0-9]+\.[0-9]+(-[a-zA-Z0-9]+)?$ ]]; then - echo "Invalid tag format: $TAG" - echo "Expected format: v1.2.3 or v1.2.3-alpha" - exit 1 - fi - - - name: Check changelog - run: | - if [ -f CHANGELOG.md ]; then - TAG="${{ steps.tag.outputs.tag }}" - if ! grep -q "$TAG" CHANGELOG.md; then - echo "Warning: Tag $TAG not found in CHANGELOG.md" - fi - fi - - # Build packages on self-hosted runners - build-release-packages: - needs: pre-release-validation - strategy: - fail-fast: true # Fail fast for releases - matrix: - include: - # Ubuntu DEB packages - - runner: gpuhealth-ubuntu-22.04-x86 - os_name: ubuntu-22.04 - arch: amd64 - package_type: deb - goreleaser_config: .goreleaser_gpuhealth_deb_amd64.yaml - - - runner: gpuhealth-ubuntu-22.04-arm - os_name: ubuntu-22.04 - arch: arm64 - package_type: deb - goreleaser_config: .goreleaser_gpuhealth_deb_arm64.yaml - - # RHEL RPM packages - - runner: gpuhealth-rhel-8-x86 - os_name: rhel-8 - arch: x86_64 - package_type: rpm - goreleaser_config: .goreleaser_gpuhealth_rpm_x86_64.yaml - - - runner: gpuhealth-rhel-8-arm - os_name: rhel-8 - arch: aarch64 - package_type: rpm - goreleaser_config: .goreleaser_gpuhealth_rpm_aarch64.yaml - - name: Release Package (${{ matrix.package_type }}-${{ matrix.arch }}) - runs-on: ${{ matrix.runner }} - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Install build dependencies (Ubuntu) - if: contains(matrix.os_name, 'ubuntu') - run: | - sudo apt-get update - sudo apt-get install -y build-essential linux-headers-$(uname -r) - - - name: Install build dependencies (RHEL) - if: contains(matrix.os_name, 'rhel') - run: | - sudo dnf update -y - sudo dnf groupinstall -y "Development Tools" - sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel rpm-build - - - name: Set up Go - uses: actions/setup-go@v5 - with: - cache: true - go-version-file: go.mod - - - name: Install GoReleaser - run: | - go install 
github.com/goreleaser/goreleaser@latest - goreleaser --version - - - name: Build release package - run: | - echo "🚀 Building release ${{ matrix.package_type }} package..." - goreleaser release --config ${{ matrix.goreleaser_config }} --clean - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Upload release artifacts - uses: actions/upload-artifact@v4 - with: - name: release-${{ matrix.package_type }}-${{ matrix.arch }} - path: - - "dist/*.deb" - - "dist/*.rpm" - if-no-files-found: warn - retention-days: 90 - - # Create GitHub release on GitHub-hosted (API operations) - create-github-release: - needs: [pre-release-validation, build-release-packages] - name: Create GitHub Release - runs-on: ubuntu-22.04 # GitHub-hosted for API operations - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Download all release artifacts - uses: actions/download-artifact@v4 - with: - path: ./release-assets - pattern: release-* - - - name: Organize release assets - run: | - mkdir -p ./final-release - find ./release-assets -name "*.deb" -exec cp {} ./final-release/ \; - find ./release-assets -name "*.rpm" -exec cp {} ./final-release/ \; - ls -la ./final-release/ - - - name: Create Release - uses: softprops/action-gh-release@v1 - with: - tag_name: ${{ needs.pre-release-validation.outputs.tag }} - name: "gpuhealth ${{ needs.pre-release-validation.outputs.tag }}" - draft: false - prerelease: ${{ contains(needs.pre-release-validation.outputs.tag, '-') }} - generate_release_notes: true - files: ./final-release/* - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Release Summary - run: | - echo "## Release Summary" >> $GITHUB_STEP_SUMMARY - echo "- **Tag:** ${{ needs.pre-release-validation.outputs.tag }}" >> $GITHUB_STEP_SUMMARY - echo "- **Packages:** $(ls ./final-release/ | wc -l) files" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Release Assets:" >> $GITHUB_STEP_SUMMARY - for file in ./final-release/*; do - echo "- 
$(basename $file)" >> $GITHUB_STEP_SUMMARY - done From d29034830c40f08fe2be3903ce9debdc082d533d Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 2 Sep 2025 14:30:13 -0700 Subject: [PATCH 17/26] Disable unit test and lint on PR --- .github/workflows/ci-lint.yml | 8 -------- .github/workflows/ci-unit-tests.yml | 8 -------- 2 files changed, 16 deletions(-) diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml index 63532746..a85b5423 100644 --- a/.github/workflows/ci-lint.yml +++ b/.github/workflows/ci-lint.yml @@ -3,14 +3,6 @@ name: CI - Lint & Code Quality on: push: branches: ["**"] # Every commit on every branch - pull_request: - paths: - - ".github/workflows/ci-lint.yml" - - "**.go" - - "go.mod" - - "go.sum" - - ".golangci.yml" - branches: ["main"] permissions: contents: read diff --git a/.github/workflows/ci-unit-tests.yml b/.github/workflows/ci-unit-tests.yml index 7cb41285..3718ce77 100644 --- a/.github/workflows/ci-unit-tests.yml +++ b/.github/workflows/ci-unit-tests.yml @@ -3,14 +3,6 @@ name: CI - Unit Tests on: push: branches: ["**"] # Every commit on every branch - pull_request: - paths: - - ".github/workflows/ci-unit-tests.yml" - - "**.go" - - "go.mod" - - "go.sum" - - "scripts/tests-unit.sh" - branches: ["main"] permissions: contents: read From cebc537b1ca6420e01ab4a0ab59e25b20ffe3c0a Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 2 Sep 2025 15:37:16 -0700 Subject: [PATCH 18/26] Change yaml format --- .github/workflows/ci-packages.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-packages.yml b/.github/workflows/ci-packages.yml index a9aead13..588987dd 100644 --- a/.github/workflows/ci-packages.yml +++ b/.github/workflows/ci-packages.yml @@ -115,4 +115,4 @@ jobs: - "dist/*.deb" - "dist/*.rpm" if-no-files-found: warn - retention-days: 7 + retention-days: 7 \ No newline at end of file From 87665f50a1e430d1f977c7701a523dade21cccf6 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang 
Date: Tue, 2 Sep 2025 15:39:57 -0700 Subject: [PATCH 19/26] fix package built --- .github/workflows/ci-packages.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/ci-packages.yml b/.github/workflows/ci-packages.yml index 588987dd..4959345d 100644 --- a/.github/workflows/ci-packages.yml +++ b/.github/workflows/ci-packages.yml @@ -111,8 +111,6 @@ jobs: uses: actions/upload-artifact@v4 with: name: package-${{ matrix.package_type }}-${{ matrix.os_name }}-${{ matrix.arch }} - path: - - "dist/*.deb" - - "dist/*.rpm" + path: "dist/" if-no-files-found: warn retention-days: 7 \ No newline at end of file From 8b8eed6391bbbbc718c73218ac7c1947261a23c1 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 2 Sep 2025 15:44:03 -0700 Subject: [PATCH 20/26] goreleaser version --- .github/workflows/ci-packages.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-packages.yml b/.github/workflows/ci-packages.yml index 4959345d..24d769a8 100644 --- a/.github/workflows/ci-packages.yml +++ b/.github/workflows/ci-packages.yml @@ -83,7 +83,7 @@ jobs: - name: Install GoReleaser run: | - go install github.com/goreleaser/goreleaser@latest + go install github.com/goreleaser/goreleaser/v2@latest goreleaser --version - name: Validate GoReleaser config From 35aa73c625bc9474d8ff19847ba09b65ff8650e0 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 2 Sep 2025 15:54:38 -0700 Subject: [PATCH 21/26] test build --- .github/workflows/ci-packages.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-packages.yml b/.github/workflows/ci-packages.yml index 24d769a8..af1f8d65 100644 --- a/.github/workflows/ci-packages.yml +++ b/.github/workflows/ci-packages.yml @@ -93,11 +93,10 @@ jobs: - name: Build package run: | echo "Building ${{ matrix.package_type }} package on ${{ matrix.os_name }}-${{ matrix.arch }}..." 
- if [ "${{ github.ref }}" = "refs/heads/main" ]; then - goreleaser build --config ${{ matrix.goreleaser_config }} --snapshot --clean - else - # Release build for tags + if [[ "${{ github.ref }}" == refs/tags/* ]]; then goreleaser release --config ${{ matrix.goreleaser_config }} --clean + else + goreleaser release --snapshot --config ${{ matrix.goreleaser_config }} --clean # For branches and PRs fi env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 5f6699113673a25e033a1dbf1a69e2d5bfd00326 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 2 Sep 2025 16:09:37 -0700 Subject: [PATCH 22/26] release pipeline --- .github/workflows/ci-build.yml | 12 +++++++++++- .github/workflows/ci-packages.yml | 13 +++++++++++-- .github/workflows/ci-release.yml | 23 +++++++++++------------ 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 2e3a1406..3588da95 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -67,8 +67,18 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - cache: true + cache: false # Disable automatic caching to avoid tar issues go-version-file: go.mod + + - name: Setup Go cache manually + uses: actions/cache@v4 + with: + path: | + ~/.cache/go-build + ~/go/pkg/mod + key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} + restore-keys: | + ${{ runner.os }}-go- - name: Build binary (native) run: | diff --git a/.github/workflows/ci-packages.yml b/.github/workflows/ci-packages.yml index af1f8d65..498058e4 100644 --- a/.github/workflows/ci-packages.yml +++ b/.github/workflows/ci-packages.yml @@ -78,8 +78,18 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - cache: true + cache: false # Disable automatic caching to avoid tar issues go-version-file: go.mod + + - name: Setup Go cache manually + uses: actions/cache@v4 + with: + path: | + ~/.cache/go-build + ~/go/pkg/mod + key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} + 
restore-keys: | + ${{ runner.os }}-go- - name: Install GoReleaser run: | @@ -109,7 +119,6 @@ jobs: - name: Upload package artifacts uses: actions/upload-artifact@v4 with: - name: package-${{ matrix.package_type }}-${{ matrix.os_name }}-${{ matrix.arch }} path: "dist/" if-no-files-found: warn retention-days: 7 \ No newline at end of file diff --git a/.github/workflows/ci-release.yml b/.github/workflows/ci-release.yml index 7410a886..759924ae 100644 --- a/.github/workflows/ci-release.yml +++ b/.github/workflows/ci-release.yml @@ -123,22 +123,21 @@ jobs: run_id: successfulPackageRun.id }); - const requiredPackages = [ - 'package-deb-ubuntu-22.04-amd64', - 'package-deb-ubuntu-22.04-arm64', - 'package-rpm-rhel-8-x86_64', - 'package-rpm-rhel-8-aarch64' + // Check for package artifacts with actual GoReleaser-generated names + const requiredPackagePatterns = [ + /^gpuhealth.*amd64\.deb$/, + /^gpuhealth.*arm64\.deb$/, + /^gpuhealth.*x86_64\.rpm$/, + /^gpuhealth.*aarch64\.rpm$/ ]; const foundPackages = packageArtifacts.data.artifacts.filter(a => - requiredPackages.includes(a.name) && a.expired === false + requiredPackagePatterns.some(pattern => pattern.test(a.name)) && a.expired === false ); - if (foundPackages.length !== requiredPackages.length) { - const missing = requiredPackages.filter(req => - !foundPackages.some(found => found.name === req) - ); - core.setFailed(`Missing package artifacts: ${missing.join(', ')}`); + if (foundPackages.length !== requiredPackagePatterns.length) { + const foundNames = foundPackages.map(a => a.name); + core.setFailed(`Expected 4 package artifacts, found ${foundPackages.length}. 
Found: ${foundNames.join(', ')}`); return; } console.log(`All package artifacts found: ${foundPackages.map(a => a.name).join(', ')}`); @@ -161,7 +160,7 @@ jobs: - name: Download all package artifacts uses: actions/download-artifact@v4 with: - pattern: package-* + pattern: "gpuhealth*" path: ./packages/ - name: List collected artifacts From 08189007f20564830b0a4db26fb032d04e53a7be Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 2 Sep 2025 16:16:56 -0700 Subject: [PATCH 23/26] Revert "release pipeline" This reverts commit 5f6699113673a25e033a1dbf1a69e2d5bfd00326. --- .github/workflows/ci-build.yml | 12 +----------- .github/workflows/ci-packages.yml | 13 ++----------- .github/workflows/ci-release.yml | 23 ++++++++++++----------- 3 files changed, 15 insertions(+), 33 deletions(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 3588da95..2e3a1406 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -67,18 +67,8 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - cache: false # Disable automatic caching to avoid tar issues + cache: true go-version-file: go.mod - - - name: Setup Go cache manually - uses: actions/cache@v4 - with: - path: | - ~/.cache/go-build - ~/go/pkg/mod - key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} - restore-keys: | - ${{ runner.os }}-go- - name: Build binary (native) run: | diff --git a/.github/workflows/ci-packages.yml b/.github/workflows/ci-packages.yml index 498058e4..af1f8d65 100644 --- a/.github/workflows/ci-packages.yml +++ b/.github/workflows/ci-packages.yml @@ -78,18 +78,8 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - cache: false # Disable automatic caching to avoid tar issues + cache: true go-version-file: go.mod - - - name: Setup Go cache manually - uses: actions/cache@v4 - with: - path: | - ~/.cache/go-build - ~/go/pkg/mod - key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} - restore-keys: | - ${{ runner.os }}-go- - name: 
Install GoReleaser run: | @@ -119,6 +109,7 @@ jobs: - name: Upload package artifacts uses: actions/upload-artifact@v4 with: + name: package-${{ matrix.package_type }}-${{ matrix.os_name }}-${{ matrix.arch }} path: "dist/" if-no-files-found: warn retention-days: 7 \ No newline at end of file diff --git a/.github/workflows/ci-release.yml b/.github/workflows/ci-release.yml index 759924ae..7410a886 100644 --- a/.github/workflows/ci-release.yml +++ b/.github/workflows/ci-release.yml @@ -123,21 +123,22 @@ jobs: run_id: successfulPackageRun.id }); - // Check for package artifacts with actual GoReleaser-generated names - const requiredPackagePatterns = [ - /^gpuhealth.*amd64\.deb$/, - /^gpuhealth.*arm64\.deb$/, - /^gpuhealth.*x86_64\.rpm$/, - /^gpuhealth.*aarch64\.rpm$/ + const requiredPackages = [ + 'package-deb-ubuntu-22.04-amd64', + 'package-deb-ubuntu-22.04-arm64', + 'package-rpm-rhel-8-x86_64', + 'package-rpm-rhel-8-aarch64' ]; const foundPackages = packageArtifacts.data.artifacts.filter(a => - requiredPackagePatterns.some(pattern => pattern.test(a.name)) && a.expired === false + requiredPackages.includes(a.name) && a.expired === false ); - if (foundPackages.length !== requiredPackagePatterns.length) { - const foundNames = foundPackages.map(a => a.name); - core.setFailed(`Expected 4 package artifacts, found ${foundPackages.length}. 
Found: ${foundNames.join(', ')}`); + if (foundPackages.length !== requiredPackages.length) { + const missing = requiredPackages.filter(req => + !foundPackages.some(found => found.name === req) + ); + core.setFailed(`Missing package artifacts: ${missing.join(', ')}`); return; } console.log(`All package artifacts found: ${foundPackages.map(a => a.name).join(', ')}`); @@ -160,7 +161,7 @@ jobs: - name: Download all package artifacts uses: actions/download-artifact@v4 with: - pattern: "gpuhealth*" + pattern: package-* path: ./packages/ - name: List collected artifacts From 5d2b4167923683fe6e85f71c62fc517b76db46a1 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 2 Sep 2025 16:18:29 -0700 Subject: [PATCH 24/26] cache file --- .github/workflows/ci-build.yml | 12 +++++++++++- .github/workflows/ci-packages.yml | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 2e3a1406..aa2bfe96 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -67,9 +67,19 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - cache: true + cache: false go-version-file: go.mod + - name: Setup Go cache manually + uses: actions/cache@v4 + with: + path: | + ~/.cache/go-build + ~/go/pkg/mod + key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} + restore-keys: | + ${{ runner.os }}-go- + - name: Build binary (native) run: | echo "Building gpuhealth binary on ${{ matrix.os_name }}-${{ matrix.arch }}..." 
diff --git a/.github/workflows/ci-packages.yml b/.github/workflows/ci-packages.yml index af1f8d65..0e51c4cb 100644 --- a/.github/workflows/ci-packages.yml +++ b/.github/workflows/ci-packages.yml @@ -78,9 +78,19 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - cache: true + cache: false go-version-file: go.mod + - name: Setup Go cache manually + uses: actions/cache@v4 + with: + path: | + ~/.cache/go-build + ~/go/pkg/mod + key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} + restore-keys: | + ${{ runner.os }}-go- + - name: Install GoReleaser run: | go install github.com/goreleaser/goreleaser/v2@latest From d90a65e0653e501922e2ca037f07f689d705f659 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 2 Sep 2025 16:28:39 -0700 Subject: [PATCH 25/26] disable cache --- .github/workflows/ci-build.yml | 11 ----------- .github/workflows/ci-packages.yml | 11 ----------- 2 files changed, 22 deletions(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index aa2bfe96..c83604a4 100644 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -67,19 +67,8 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - cache: false go-version-file: go.mod - - name: Setup Go cache manually - uses: actions/cache@v4 - with: - path: | - ~/.cache/go-build - ~/go/pkg/mod - key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} - restore-keys: | - ${{ runner.os }}-go- - - name: Build binary (native) run: | echo "Building gpuhealth binary on ${{ matrix.os_name }}-${{ matrix.arch }}..." 
diff --git a/.github/workflows/ci-packages.yml b/.github/workflows/ci-packages.yml index 0e51c4cb..8cde2d65 100644 --- a/.github/workflows/ci-packages.yml +++ b/.github/workflows/ci-packages.yml @@ -78,19 +78,8 @@ jobs: - name: Set up Go uses: actions/setup-go@v5 with: - cache: false go-version-file: go.mod - - name: Setup Go cache manually - uses: actions/cache@v4 - with: - path: | - ~/.cache/go-build - ~/go/pkg/mod - key: ${{ runner.os }}-go-${{ hashFiles('**/go.sum') }} - restore-keys: | - ${{ runner.os }}-go- - - name: Install GoReleaser run: | go install github.com/goreleaser/goreleaser/v2@latest From 82da5a983e221a15ebd79ae2ce1fb4b5eedd0984 Mon Sep 17 00:00:00 2001 From: Jingxiang Zhang Date: Tue, 2 Sep 2025 16:45:02 -0700 Subject: [PATCH 26/26] pipeline --- .github/workflows/ci-release.yml | 304 --------------------------- .goreleaser_gpuhealth_deb_amd64.yaml | 1 + .goreleaser_gpuhealth_deb_arm64.yaml | 1 + 3 files changed, 2 insertions(+), 304 deletions(-) delete mode 100644 .github/workflows/ci-release.yml diff --git a/.github/workflows/ci-release.yml b/.github/workflows/ci-release.yml deleted file mode 100644 index 7410a886..00000000 --- a/.github/workflows/ci-release.yml +++ /dev/null @@ -1,304 +0,0 @@ -name: Release - Production - -on: - workflow_dispatch: - inputs: - tag: - description: 'Tag to release (must have existing CI artifacts)' - required: true - type: string - -permissions: - contents: write - actions: read # Required to download artifacts from other workflows - -jobs: - # Validate that required artifacts exist for the specified tag - validate-artifacts: - name: Validate Required Artifacts - runs-on: ubuntu-22.04 - outputs: - tag: ${{ github.event.inputs.tag }} - artifacts-valid: ${{ steps.validate.outputs.artifacts-valid }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Validate tag exists - id: validate - run: | - TAG="${{ github.event.inputs.tag }}" - echo "Validating release for tag: 
$TAG" - - # Check if tag exists - if ! git rev-parse --verify "refs/tags/$TAG" >/dev/null 2>&1; then - echo "Tag '$TAG' does not exist in repository" - echo "Available tags:" - git tag --sort=-version:refname | head -10 - exit 1 - fi - - echo "Tag '$TAG' exists" - echo "artifacts-valid=true" >> $GITHUB_OUTPUT - - - name: Check for required CI artifacts - uses: actions/github-script@v7 - with: - script: | - const tag = '${{ github.event.inputs.tag }}'; - console.log(`Checking for CI artifacts for tag: ${tag}`); - - // Get the commit SHA for the tag - const tagRef = await github.rest.git.getRef({ - owner: context.repo.owner, - repo: context.repo.repo, - ref: `tags/${tag}` - }); - const tagSha = tagRef.data.object.sha; - console.log(`Tag SHA: ${tagSha}`); - - // Check CI Build workflow runs - const buildRuns = await github.rest.actions.listWorkflowRuns({ - owner: context.repo.owner, - repo: context.repo.repo, - workflow_id: 'ci-build.yml', - head_sha: tagSha - }); - - const successfulBuildRun = buildRuns.data.workflow_runs.find(run => run.conclusion === 'success'); - if (!successfulBuildRun) { - core.setFailed(`No successful CI build workflow run found for tag ${tag}`); - return; - } - console.log(`Found successful build run: ${successfulBuildRun.id}`); - - // Check CI Package workflow runs - const packageRuns = await github.rest.actions.listWorkflowRuns({ - owner: context.repo.owner, - repo: context.repo.repo, - workflow_id: 'ci-packages.yml', - head_sha: tagSha - }); - - const successfulPackageRun = packageRuns.data.workflow_runs.find(run => run.conclusion === 'success'); - if (!successfulPackageRun) { - core.setFailed(`No successful CI package workflow run found for tag ${tag}`); - return; - } - console.log(`Found successful package run: ${successfulPackageRun.id}`); - - // Verify build artifacts exist - const buildArtifacts = await github.rest.actions.listWorkflowRunArtifacts({ - owner: context.repo.owner, - repo: context.repo.repo, - run_id: successfulBuildRun.id 
- }); - - const requiredBinaries = [ - 'gpuhealth-binary-ubuntu-22.04-amd64', - 'gpuhealth-binary-ubuntu-22.04-arm64', - 'gpuhealth-binary-rhel-8-x86_64', - 'gpuhealth-binary-rhel-8-aarch64' - ]; - - const foundBinaries = buildArtifacts.data.artifacts.filter(a => - requiredBinaries.includes(a.name) && a.expired === false - ); - - if (foundBinaries.length !== requiredBinaries.length) { - const missing = requiredBinaries.filter(req => - !foundBinaries.some(found => found.name === req) - ); - core.setFailed(`Missing binary artifacts: ${missing.join(', ')}`); - return; - } - console.log(`✅ All binary artifacts found: ${foundBinaries.map(a => a.name).join(', ')}`); - - // Verify package artifacts exist - const packageArtifacts = await github.rest.actions.listWorkflowRunArtifacts({ - owner: context.repo.owner, - repo: context.repo.repo, - run_id: successfulPackageRun.id - }); - - const requiredPackages = [ - 'package-deb-ubuntu-22.04-amd64', - 'package-deb-ubuntu-22.04-arm64', - 'package-rpm-rhel-8-x86_64', - 'package-rpm-rhel-8-aarch64' - ]; - - const foundPackages = packageArtifacts.data.artifacts.filter(a => - requiredPackages.includes(a.name) && a.expired === false - ); - - if (foundPackages.length !== requiredPackages.length) { - const missing = requiredPackages.filter(req => - !foundPackages.some(found => found.name === req) - ); - core.setFailed(`Missing package artifacts: ${missing.join(', ')}`); - return; - } - console.log(`All package artifacts found: ${foundPackages.map(a => a.name).join(', ')}`); - - console.log(`All required artifacts validated for tag ${tag}`); - - # Download artifacts from successful CI workflows - collect-artifacts: - name: Collect Build Artifacts - runs-on: ubuntu-22.04 - needs: validate-artifacts - if: needs.validate-artifacts.outputs.artifacts-valid == 'true' - steps: - - name: Download all build artifacts - uses: actions/download-artifact@v4 - with: - pattern: gpuhealth-binary-* - path: ./binaries/ - - - name: Download all package 
artifacts - uses: actions/download-artifact@v4 - with: - pattern: package-* - path: ./packages/ - - - name: List collected artifacts - run: | - echo "=== Binaries ===" - find ./binaries/ -type f -name "gpuhealth" -exec ls -la {} \; - echo "=== Packages ===" - find ./packages/ -type f \( -name "*.deb" -o -name "*.rpm" \) -exec ls -la {} \; - - - name: Get release tag - id: tag - run: | - if [[ "${{ github.event_name }}" == "push" ]]; then - echo "tag=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT - else - echo "tag=${{ github.event.inputs.tag }}" >> $GITHUB_OUTPUT - fi - - - name: Prepare release artifacts - run: | - mkdir -p ./release-assets - TAG="${{ needs.validate-artifacts.outputs.tag }}" - - # Copy binaries with proper naming - for binary in ./binaries/*/gpuhealth; do - if [[ -f "$binary" ]]; then - dir_name=$(basename $(dirname "$binary")) - # Extract platform info from artifact name - platform=$(echo "$dir_name" | sed 's/gpuhealth-binary-//') - cp "$binary" "./release-assets/gpuhealth-${TAG}-${platform}" - fi - done - - # Copy packages - find ./packages/ -type f \( -name "*.deb" -o -name "*.rpm" \) -exec cp {} ./release-assets/ \; - - echo "=== Final Release Assets ===" - ls -la ./release-assets/ - - - name: Upload consolidated release artifacts - uses: actions/upload-artifact@v4 - with: - name: release-assets-${{ needs.validate-artifacts.outputs.tag }} - path: ./release-assets/ - retention-days: 30 - - # Pre-release validation on GitHub-hosted (fast) - pre-release-validation: - needs: [validate-artifacts, collect-artifacts] - if: needs.validate-artifacts.outputs.artifacts-valid == 'true' - name: Pre-Release Validation - runs-on: ubuntu-22.04 # GitHub-hosted - outputs: - tag: ${{ needs.validate-artifacts.outputs.tag }} - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Get tag - id: tag - run: | - if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then - TAG="${{ github.event.inputs.tag }}" - else - 
TAG=${GITHUB_REF#refs/tags/} - fi - echo "tag=${TAG}" >> $GITHUB_OUTPUT - echo "Release tag: ${TAG}" - - - name: Validate tag format - run: | - TAG="${{ steps.tag.outputs.tag }}" - if [[ ! $TAG =~ ^v[0-9]+\.[0-9]+\.[0-9]+(-[a-zA-Z0-9]+)?$ ]]; then - echo "Invalid tag format: $TAG" - echo "Expected format: v1.2.3 or v1.2.3-alpha" - exit 1 - fi - - - name: Check changelog - run: | - if [ -f CHANGELOG.md ]; then - TAG="${{ steps.tag.outputs.tag }}" - if ! grep -q "$TAG" CHANGELOG.md; then - echo "Warning: Tag $TAG not found in CHANGELOG.md" - fi - fi - - - - # Create GitHub release using pre-built artifacts - create-github-release: - needs: [validate-artifacts, collect-artifacts, pre-release-validation] - if: needs.validate-artifacts.outputs.artifacts-valid == 'true' - name: Create GitHub Release - runs-on: ubuntu-22.04 # GitHub-hosted for API operations - - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Download consolidated release artifacts - uses: actions/download-artifact@v4 - with: - name: release-assets-${{ needs.validate-artifacts.outputs.tag }} - path: ./release-assets - - - name: Verify release assets - run: | - echo "=== Release Assets Ready for Upload ===" - ls -la ./release-assets/ - echo "=== Binary Count ===" - find ./release-assets/ -name "gpuhealth-*" -type f | wc -l - echo "=== Package Count ===" - find ./release-assets/ -name "*.deb" -o -name "*.rpm" | wc -l - - - name: Create Release - uses: softprops/action-gh-release@v1 - with: - tag_name: ${{ needs.validate-artifacts.outputs.tag }} - name: "gpuhealth ${{ needs.validate-artifacts.outputs.tag }}" - draft: false - prerelease: ${{ contains(needs.validate-artifacts.outputs.tag, '-') }} - generate_release_notes: true - files: ./release-assets/* - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Release Summary - run: | - echo "## Release Summary" >> $GITHUB_STEP_SUMMARY - echo "- **Tag:** ${{ needs.pre-release-validation.outputs.tag }}" >> $GITHUB_STEP_SUMMARY - 
echo "- **Packages:** $(ls ./final-release/ | wc -l) files" >> $GITHUB_STEP_SUMMARY - echo "" >> $GITHUB_STEP_SUMMARY - echo "### Release Assets:" >> $GITHUB_STEP_SUMMARY - for file in ./final-release/*; do - echo "- $(basename $file)" >> $GITHUB_STEP_SUMMARY - done diff --git a/.goreleaser_gpuhealth_deb_amd64.yaml b/.goreleaser_gpuhealth_deb_amd64.yaml index f0601a40..81da0606 100644 --- a/.goreleaser_gpuhealth_deb_amd64.yaml +++ b/.goreleaser_gpuhealth_deb_amd64.yaml @@ -20,6 +20,7 @@ nfpms: ids: [gpuhealth] formats: [deb] bindir: /usr/bin + file_name_template: '{{ .PackageName }}_{{ .Version }}.{{ .Arch }}' maintainer: "GPU Health Team " license: Apache-2.0 homepage: https://github.com/NVIDIA/gpuhealth diff --git a/.goreleaser_gpuhealth_deb_arm64.yaml b/.goreleaser_gpuhealth_deb_arm64.yaml index 6988e931..123b7c29 100644 --- a/.goreleaser_gpuhealth_deb_arm64.yaml +++ b/.goreleaser_gpuhealth_deb_arm64.yaml @@ -20,6 +20,7 @@ nfpms: ids: [gpuhealth] formats: [deb] bindir: /usr/bin + file_name_template: '{{ .PackageName }}_{{ .Version }}.{{ .Arch }}' maintainer: "GPU Health Team " license: Apache-2.0 homepage: https://github.com/NVIDIA/gpuhealth