diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 5fa31df6..00000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: Build and Store gpud binary - -on: - push: - branches: - - main - pull_request: - branches: ["**"] - -jobs: - build: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Set up environment - run: | - sudo apt-get update - sudo apt-get install -y linux-headers-$(uname -r) - - - name: Build project - run: | - make - - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: gpud - path: bin/gpud \ No newline at end of file diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml new file mode 100644 index 00000000..c83604a4 --- /dev/null +++ b/.github/workflows/ci-build.yml @@ -0,0 +1,91 @@ +name: CI - Build Binary + +on: + push: + branches: ["main"] # Main branch commits + tags: ['*.*.*'] # Release tags + pull_request: + paths: + - ".github/workflows/ci-build.yml" + - "**.go" + - "go.mod" + - "go.sum" + - "Makefile" + branches: ["main"] # PRs to main + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-full: + strategy: + fail-fast: false + matrix: + include: + - runner: gpuhealth-ubuntu-22.04-x86 + os_name: ubuntu-22.04 + arch: amd64 + + - runner: gpuhealth-ubuntu-22.04-arm + os_name: ubuntu-22.04 + arch: arm64 + + - runner: gpuhealth-rhel-8-x86 + os_name: rhel-8 + arch: x86_64 + + - runner: gpuhealth-rhel-8-arm + os_name: rhel-8 + arch: aarch64 + + name: Build (${{ matrix.os_name }}-${{ matrix.arch }}) + runs-on: ${{ matrix.runner }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install build dependencies (Ubuntu) + if: contains(matrix.os_name, 'ubuntu') + run: | + sudo apt-get update + sudo apt-get install -y build-essential 
linux-headers-$(uname -r) + + - name: Install build dependencies (RHEL) + if: contains(matrix.os_name, 'rhel') + run: | + sudo dnf update -y + sudo dnf groupinstall -y "Development Tools" + sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Build binary (native) + run: | + echo "Building gpuhealth binary on ${{ matrix.os_name }}-${{ matrix.arch }}..." + make gpuhealth || go build -o bin/gpuhealth ./cmd/gpuhealth + + - name: Test binary + run: | + echo "Testing binary functionality..." + ./bin/gpuhealth --version + ./bin/gpuhealth --help + file ./bin/gpuhealth + ldd ./bin/gpuhealth || echo "Static binary or different libc" + + - name: Upload binary + uses: actions/upload-artifact@v4 + with: + name: gpuhealth-binary-${{ matrix.os_name }}-${{ matrix.arch }} + path: bin/gpuhealth + if-no-files-found: warn + retention-days: 7 diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml new file mode 100644 index 00000000..a85b5423 --- /dev/null +++ b/.github/workflows/ci-lint.yml @@ -0,0 +1,44 @@ +name: CI - Lint & Code Quality + +on: + push: + branches: ["**"] # Every commit on every branch + +permissions: + contents: read + pull-requests: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + name: Lint & Code Quality + runs-on: ubuntu-22.04 # GitHub-hosted runner + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Go + uses: actions/setup-go@v5 + with: + cache: true + go-version-file: go.mod + + - name: Run golangci-lint + uses: golangci/golangci-lint-action@v6 + with: + version: latest + args: --verbose --config=.golangci.yml + + - name: Check go mod tidy + run: | + go mod tidy + if ! 
git diff --exit-code go.mod go.sum; then + echo "go.mod or go.sum is not tidy" + exit 1 + fi diff --git a/.github/workflows/ci-packages.yml b/.github/workflows/ci-packages.yml new file mode 100644 index 00000000..8cde2d65 --- /dev/null +++ b/.github/workflows/ci-packages.yml @@ -0,0 +1,114 @@ +name: CI - Package Building + +on: + push: + branches: ["main"] # Main branch commits + tags: ['*.*.*'] # Release tags + pull_request: + paths: + - ".github/workflows/ci-packages.yml" + - "**.go" + - "go.mod" + - "go.sum" + - ".goreleaser_*.yaml" + - "deployments/packages/**" + branches: ["main"] # PRs to main + +permissions: + contents: write + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + package: + strategy: + fail-fast: false + matrix: + include: + # Ubuntu DEB packages + - runner: gpuhealth-ubuntu-22.04-x86 + os_name: ubuntu-22.04 + arch: amd64 + package_type: deb + goreleaser_config: .goreleaser_gpuhealth_deb_amd64.yaml + + - runner: gpuhealth-ubuntu-22.04-arm + os_name: ubuntu-22.04 + arch: arm64 + package_type: deb + goreleaser_config: .goreleaser_gpuhealth_deb_arm64.yaml + + # RHEL RPM packages + - runner: gpuhealth-rhel-8-x86 + os_name: rhel-8 + arch: x86_64 + package_type: rpm + goreleaser_config: .goreleaser_gpuhealth_rpm_x86_64.yaml + + - runner: gpuhealth-rhel-8-arm + os_name: rhel-8 + arch: aarch64 + package_type: rpm + goreleaser_config: .goreleaser_gpuhealth_rpm_aarch64.yaml + + name: Build Package (${{ matrix.package_type }}-${{ matrix.os_name }}-${{ matrix.arch }}) + runs-on: ${{ matrix.runner }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Install build dependencies (Ubuntu) + if: contains(matrix.os_name, 'ubuntu') + run: | + sudo apt-get update + sudo apt-get install -y build-essential linux-headers-$(uname -r) + + - name: Install build dependencies (RHEL) + if: contains(matrix.os_name, 'rhel') + run: | + sudo dnf update -y + sudo dnf 
groupinstall -y "Development Tools" + sudo dnf install -y gcc gcc-c++ make kernel-headers kernel-devel rpm-build + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Install GoReleaser + run: | + go install github.com/goreleaser/goreleaser/v2@latest + goreleaser --version + + - name: Validate GoReleaser config + run: | + goreleaser check --config ${{ matrix.goreleaser_config }} + + - name: Build package + run: | + echo "Building ${{ matrix.package_type }} package on ${{ matrix.os_name }}-${{ matrix.arch }}..." + if [[ "${{ github.ref }}" == refs/tags/* ]]; then + goreleaser release --config ${{ matrix.goreleaser_config }} --clean + else + goreleaser release --snapshot --config ${{ matrix.goreleaser_config }} --clean # For branches and PRs + fi + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: List generated packages + run: | + echo "Generated packages:" + find dist/ -name "*.deb" -o -name "*.rpm" | head -10 + + - name: Upload package artifacts + uses: actions/upload-artifact@v4 + with: + name: package-${{ matrix.package_type }}-${{ matrix.os_name }}-${{ matrix.arch }} + path: "dist/" + if-no-files-found: warn + retention-days: 7 \ No newline at end of file diff --git a/.github/workflows/ci-unit-tests.yml b/.github/workflows/ci-unit-tests.yml new file mode 100644 index 00000000..3718ce77 --- /dev/null +++ b/.github/workflows/ci-unit-tests.yml @@ -0,0 +1,43 @@ +name: CI - Unit Tests + +on: + push: + branches: ["**"] # Every commit on every branch + +permissions: + contents: read + pull-requests: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + unit-test: + name: Unit Tests + runs-on: ubuntu-22.04 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Set up Go + uses: actions/setup-go@v5 + with: + cache: true + go-version-file: go.mod + + - name: Run unit tests + run: | + echo "Running unit tests for 
dev branch..." + KMSG_FILE_PATH=/dev/null ./scripts/tests-unit.sh + + - name: Upload coverage + uses: actions/upload-artifact@v4 + with: + name: coverage-dev + path: coverage.txt + if-no-files-found: warn + retention-days: 1 diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml deleted file mode 100644 index 0ec356b5..00000000 --- a/.github/workflows/golangci-lint.yml +++ /dev/null @@ -1,41 +0,0 @@ - -# https://github.com/golangci/golangci-lint-action?tab=readme-ov-file#options -name: golangci-lint.run - -on: - push: - branches: ["main"] - pull_request: - paths: - - .github/workflows/golangci-lint.yml - - "**.go" - - go.mod - - go.sum - branches: ["**"] - -permissions: - contents: read - pull-requests: read - -jobs: - golangci-lint: - name: golangci-lint - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - - uses: actions/setup-go@v5 - with: - cache: false - go-version-file: go.mod - - - name: Install golangci-lint - run: go install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0 - - - name: Build custom golangci-lint - run: golangci-lint custom - - - name: Run golangci-lint - run: ./custom-gcl run --verbose --config=.golangci.yml ./... diff --git a/.github/workflows/goreleaser-dev.yml b/.github/workflows/goreleaser-dev.yml deleted file mode 100644 index 3b8be418..00000000 --- a/.github/workflows/goreleaser-dev.yml +++ /dev/null @@ -1,88 +0,0 @@ -name: goreleaser-dev - -# ref. 
https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions -on: - push: - branches: ["main"] - pull_request: - paths: - - .github/workflows/golangci-lint.yml - - .github/workflows/goreleaser-dev.yml - - "**.go" - - go.mod - - go.sum - - .goreleaser* - branches: ["**"] - -permissions: - contents: write - -jobs: - release: - # https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idstrategymatrix - strategy: - matrix: - job: - - os: ubuntu-22.04 - platform: linux - target: linux_amd64 - goreleaser_suffix: _ubuntu22.04 - - - os: ubuntu-22.04 - platform: linux - target: linux_arm64 - goreleaser_suffix: _ubuntu22.04 - - - os: ubuntu-24.04 - platform: linux - target: linux_amd64 - goreleaser_suffix: _ubuntu24.04 - - - os: ubuntu-24.04 - platform: linux - target: linux_arm64 - goreleaser_suffix: _ubuntu24.04 - - - os: macos-latest - platform: darwin - target: darwin_amd64 - - - os: macos-latest - platform: darwin - target: darwin_arm64 - - name: Release ${{ matrix.job.target }} (${{ matrix.job.os }}) - runs-on: ${{ matrix.job.os }} - - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Install OS dependencies - shell: bash - run: | - case ${{ matrix.job.target }} in - linux_arm64) sudo apt-get -y update ; sudo apt-get -y install gcc-aarch64-linux-gnu ;; - esac - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - - - name: Show version information - shell: bash - run: | - gcc --version || true - go version - - # https://github.com/goreleaser/goreleaser-action - - name: Run GoReleaser in snapshot mode - uses: goreleaser/goreleaser-action@v6 - with: - distribution: goreleaser - version: latest - args: release --snapshot --config .goreleaser_${{ matrix.job.target }}${{ matrix.job.goreleaser_suffix }}.yaml - workdir: . 
- env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/goreleaser.yml b/.github/workflows/goreleaser.yml deleted file mode 100644 index 45b99e35..00000000 --- a/.github/workflows/goreleaser.yml +++ /dev/null @@ -1,97 +0,0 @@ -name: goreleaser - -# ref. https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions -on: - push: - tags: - - "*" - -permissions: - contents: write - -jobs: - release: - # https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idstrategymatrix - strategy: - matrix: - job: - - os: ubuntu-22.04 - platform: linux - target: linux_amd64 - goreleaser_suffix: _ubuntu22.04 - - - os: ubuntu-22.04 - platform: linux - target: linux_arm64 - goreleaser_suffix: _ubuntu22.04 - - - os: ubuntu-24.04 - platform: linux - target: linux_amd64 - goreleaser_suffix: _ubuntu24.04 - - - os: ubuntu-24.04 - platform: linux - target: linux_arm64 - goreleaser_suffix: _ubuntu24.04 - - - os: macos-latest - platform: darwin - target: darwin_amd64 - - - os: macos-latest - platform: darwin - target: darwin_arm64 - - name: Release ${{ matrix.job.target }} (${{ matrix.job.os }}) - runs-on: ${{ matrix.job.os }} - - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Install OS dependencies - shell: bash - run: | - case ${{ matrix.job.target }} in - linux_arm64) sudo apt-get -y update ; sudo apt-get -y install gcc-aarch64-linux-gnu ;; - esac - - - name: Set up Go - uses: actions/setup-go@v5 - with: - go-version-file: go.mod - - - name: Show version information - shell: bash - run: | - gcc --version || true - go version - - # https://github.com/goreleaser/goreleaser-action - - name: Run GoReleaser - uses: goreleaser/goreleaser-action@v6 - with: - distribution: goreleaser - version: latest - args: release --config .goreleaser_${{ matrix.job.target }}${{ matrix.job.goreleaser_suffix }}.yaml - workdir: . 
- env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Get tag version - if: startsWith(github.ref, 'refs/tags/') - id: get_tag_version - run: echo TAG_VERSION=${GITHUB_REF/refs\/tags\//} >> $GITHUB_OUTPUT - - - name: Release latest - uses: softprops/action-gh-release@v1 - if: ${{ github.ref == 'refs/heads/main' }} - with: - name: Latest release - tag_name: latest - draft: false - prerelease: false - body: Latest builds from the last commit - files: | - ./dist/gpud_v${{steps.get_tag_version.outputs.TAG_VERSION}}_${{ matrix.job.target }}${{ matrix.job.goreleaser_suffix }}.tgz diff --git a/.github/workflows/tests-e2e.yml b/.github/workflows/tests-e2e.yml deleted file mode 100644 index 794e9fbb..00000000 --- a/.github/workflows/tests-e2e.yml +++ /dev/null @@ -1,32 +0,0 @@ -# https://github.com/golangci/golangci-lint-action?tab=readme-ov-file#options -name: tests-e2e - -on: - push: - branches: ["main"] - pull_request: - paths: - - "**.go" - - go.mod - - go.sum - branches: ["**"] - -permissions: - contents: read - pull-requests: read - -jobs: - tests-e2e: - name: tests-e2e - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - uses: actions/setup-go@v5 - with: - cache: false - go-version-file: go.mod - - name: run e2e tests - run: | - KMSG_FILE_PATH=/dev/null ./scripts/tests-e2e.sh diff --git a/.github/workflows/tests-unit.yml b/.github/workflows/tests-unit.yml deleted file mode 100644 index 02e505ac..00000000 --- a/.github/workflows/tests-unit.yml +++ /dev/null @@ -1,36 +0,0 @@ -# https://github.com/golangci/golangci-lint-action?tab=readme-ov-file#options -name: tests-unit - -on: - push: - branches: ["main"] - pull_request: - paths: - - "**.go" - - go.mod - - go.sum - branches: ["**"] - -permissions: - contents: read - pull-requests: read - -jobs: - tests-unit: - name: tests-unit - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - with: - submodules: recursive - - uses: actions/setup-go@v5 - with: - 
cache: false - go-version-file: go.mod - - name: run unit tests - run: | - KMSG_FILE_PATH=/dev/null ./scripts/tests-unit.sh - - name: Upload coverage reports to Codecov - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} diff --git a/.goreleaser_darwin_amd64.yaml b/.goreleaser_darwin_amd64.yaml deleted file mode 100644 index 562f96d6..00000000 --- a/.goreleaser_darwin_amd64.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - darwin - goarch: - - amd64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }} - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_darwin_arm64.yaml b/.goreleaser_darwin_arm64.yaml deleted file mode 100644 index f017436d..00000000 --- a/.goreleaser_darwin_arm64.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - darwin - goarch: - - arm64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }} - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_gpuhealth_deb_amd64.yaml b/.goreleaser_gpuhealth_deb_amd64.yaml index 1e8b0339..81da0606 100644 --- a/.goreleaser_gpuhealth_deb_amd64.yaml +++ b/.goreleaser_gpuhealth_deb_amd64.yaml @@ -8,9 +8,9 @@ builds: flags: [-v] ldflags: - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} + - -X github.com/NVIDIA/gpuhealth/version.BuildTimestamp={{.CommitTimestamp}} + - -X github.com/NVIDIA/gpuhealth/version.Revision={{.Commit}} + - -X github.com/NVIDIA/gpuhealth/version.Version=v{{.Version}} goos: [linux] goarch: [amd64] @@ -20,9 +20,10 @@ nfpms: ids: [gpuhealth] formats: [deb] bindir: /usr/bin + file_name_template: '{{ .PackageName }}_{{ .Version }}.{{ .Arch }}' maintainer: "GPU Health Team " license: Apache-2.0 - homepage: https://github.com/leptonai/gpud + homepage: https://github.com/NVIDIA/gpuhealth description: GPU Health monitoring agent for datacenter environments dependencies: [systemd, curl] recommends: [nvidia-driver] @@ -52,7 +53,7 @@ release: replace_existing_draft: true make_latest: true mode: replace - github: { owner: leptonai, name: gpud } + github: { owner: NVIDIA, name: gpuhealth } name_template: "gpuhealth-v{{.Version}}" checksum: { disable: true } diff --git a/.goreleaser_gpuhealth_deb_arm64.yaml b/.goreleaser_gpuhealth_deb_arm64.yaml index c74d3ae2..123b7c29 100644 --- a/.goreleaser_gpuhealth_deb_arm64.yaml +++ b/.goreleaser_gpuhealth_deb_arm64.yaml @@ -8,9 +8,9 @@ builds: flags: [-v] ldflags: - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} + - -X github.com/NVIDIA/gpuhealth/version.BuildTimestamp={{.CommitTimestamp}} + - -X 
github.com/NVIDIA/gpuhealth/version.Revision={{.Commit}} + - -X github.com/NVIDIA/gpuhealth/version.Version=v{{.Version}} goos: [linux] goarch: [arm64] @@ -20,9 +20,10 @@ nfpms: ids: [gpuhealth] formats: [deb] bindir: /usr/bin + file_name_template: '{{ .PackageName }}_{{ .Version }}.{{ .Arch }}' maintainer: "GPU Health Team " license: Apache-2.0 - homepage: https://github.com/leptonai/gpud + homepage: https://github.com/NVIDIA/gpuhealth description: GPU Health monitoring agent for datacenter environments dependencies: [systemd, curl] recommends: [nvidia-driver] @@ -52,7 +53,7 @@ release: replace_existing_draft: true make_latest: true mode: replace - github: { owner: leptonai, name: gpud } + github: { owner: NVIDIA, name: gpuhealth } name_template: "gpuhealth-v{{.Version}}" checksum: { disable: true } diff --git a/.goreleaser_gpuhealth_rpm_aarch64.yaml b/.goreleaser_gpuhealth_rpm_aarch64.yaml index 8ba3e225..352bc01d 100644 --- a/.goreleaser_gpuhealth_rpm_aarch64.yaml +++ b/.goreleaser_gpuhealth_rpm_aarch64.yaml @@ -8,9 +8,9 @@ builds: flags: [-v] ldflags: - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} + - -X github.com/NVIDIA/gpuhealth/version.BuildTimestamp={{.CommitTimestamp}} + - -X github.com/NVIDIA/gpuhealth/version.Revision={{.Commit}} + - -X github.com/NVIDIA/gpuhealth/version.Version=v{{.Version}} goos: [linux] goarch: [arm64] @@ -24,16 +24,16 @@ nfpms: file_name_template: '{{ .PackageName }}-{{ .Version }}-{{ .Release }}.{{ if eq .Arch "amd64" }}x86_64{{ else if eq .Arch "arm64" }}aarch64{{ else }}{{ .Arch }}{{ end }}' maintainer: "GPU Health Team " license: Apache-2.0 - homepage: https://github.com/leptonai/gpud + homepage: https://github.com/NVIDIA/gpuhealth description: GPU Health monitoring agent for datacenter environments dependencies: [systemd, curl] recommends: [nvidia-driver] contents: - 
src: deployments/packages/gpuhealth/systemd/gpuhealthd.service - dst: /lib/systemd/system/gpuhealthd.service + dst: /usr/lib/systemd/system/gpuhealthd.service file_info: { mode: 0644 } - src: deployments/packages/gpuhealth/systemd/gpuhealth.env - dst: /etc/default/gpuhealth + dst: /etc/sysconfig/gpuhealth type: config file_info: { mode: 0644 } - src: deployments/packages/gpuhealth/doc/README.md @@ -56,7 +56,7 @@ release: replace_existing_draft: true make_latest: true mode: replace - github: { owner: leptonai, name: gpud } + github: { owner: NVIDIA, name: gpuhealth } name_template: "gpuhealth-v{{.Version}}" checksum: { disable: true } diff --git a/.goreleaser_gpuhealth_rpm_x86_64.yaml b/.goreleaser_gpuhealth_rpm_x86_64.yaml index 68fa06a0..4f723257 100644 --- a/.goreleaser_gpuhealth_rpm_x86_64.yaml +++ b/.goreleaser_gpuhealth_rpm_x86_64.yaml @@ -8,9 +8,9 @@ builds: flags: [-v] ldflags: - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} + - -X github.com/NVIDIA/gpuhealth/version.BuildTimestamp={{.CommitTimestamp}} + - -X github.com/NVIDIA/gpuhealth/version.Revision={{.Commit}} + - -X github.com/NVIDIA/gpuhealth/version.Version=v{{.Version}} goos: [linux] goarch: [amd64] @@ -24,16 +24,16 @@ nfpms: file_name_template: '{{ .PackageName }}-{{ .Version }}-{{ .Release }}.{{ if eq .Arch "amd64" }}x86_64{{ else if eq .Arch "arm64" }}aarch64{{ else }}{{ .Arch }}{{ end }}' maintainer: "GPU Health Team " license: Apache-2.0 - homepage: https://github.com/leptonai/gpud + homepage: https://github.com/NVIDIA/gpuhealth description: GPU Health monitoring agent for datacenter environments dependencies: [systemd, curl] recommends: [nvidia-driver] contents: - src: deployments/packages/gpuhealth/systemd/gpuhealthd.service - dst: /lib/systemd/system/gpuhealthd.service + dst: /usr/lib/systemd/system/gpuhealthd.service file_info: { 
mode: 0644 } - src: deployments/packages/gpuhealth/systemd/gpuhealth.env - dst: /etc/default/gpuhealth + dst: /etc/sysconfig/gpuhealth type: config file_info: { mode: 0644 } - src: deployments/packages/gpuhealth/doc/README.md @@ -56,7 +56,7 @@ release: replace_existing_draft: true make_latest: true mode: replace - github: { owner: leptonai, name: gpud } + github: { owner: NVIDIA, name: gpuhealth } name_template: "gpuhealth-v{{.Version}}" checksum: { disable: true } diff --git a/.goreleaser_linux_amd64.yaml b/.goreleaser_linux_amd64.yaml deleted file mode 100644 index 6f6a444e..00000000 --- a/.goreleaser_linux_amd64.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - amd64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }} - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_amd64_amzn2.yaml b/.goreleaser_linux_amd64_amzn2.yaml deleted file mode 100644 index 051febe5..00000000 --- a/.goreleaser_linux_amd64_amzn2.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - amd64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_amzn2 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_amd64_amzn2023.yaml b/.goreleaser_linux_amd64_amzn2023.yaml deleted file mode 100644 index 8e492fef..00000000 --- a/.goreleaser_linux_amd64_amzn2023.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - amd64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_amzn2023 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_amd64_ubuntu22.04.yaml b/.goreleaser_linux_amd64_ubuntu22.04.yaml deleted file mode 100644 index f3e56989..00000000 --- a/.goreleaser_linux_amd64_ubuntu22.04.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - amd64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_ubuntu22.04 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_amd64_ubuntu24.04.yaml b/.goreleaser_linux_amd64_ubuntu24.04.yaml deleted file mode 100644 index 308c9b0b..00000000 --- a/.goreleaser_linux_amd64_ubuntu24.04.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - amd64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_ubuntu24.04 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_arm64_amzn2.yaml b/.goreleaser_linux_arm64_amzn2.yaml deleted file mode 100644 index bc436014..00000000 --- a/.goreleaser_linux_arm64_amzn2.yaml +++ /dev/null @@ -1,67 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - - CC=gcc - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - arm64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_amzn2 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_arm64_amzn2023.yaml b/.goreleaser_linux_arm64_amzn2023.yaml deleted file mode 100644 index 6ef85c25..00000000 --- a/.goreleaser_linux_arm64_amzn2023.yaml +++ /dev/null @@ -1,68 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - # ref. 
https://docs.aws.amazon.com/linux/al2023/ug/compare-with-al2.html#compiler-triplet - - CC=aarch64-amazon-linux-gcc - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - arm64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_amzn2023 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_arm64_ubuntu22.04.yaml b/.goreleaser_linux_arm64_ubuntu22.04.yaml deleted file mode 100644 index d8b33859..00000000 --- a/.goreleaser_linux_arm64_ubuntu22.04.yaml +++ /dev/null @@ -1,67 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - - CC=aarch64-linux-gnu-gcc - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - arm64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_ubuntu22.04 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/.goreleaser_linux_arm64_ubuntu24.04.yaml b/.goreleaser_linux_arm64_ubuntu24.04.yaml deleted file mode 100644 index 9e0055c8..00000000 --- a/.goreleaser_linux_arm64_ubuntu24.04.yaml +++ /dev/null @@ -1,67 +0,0 @@ -# https://goreleaser.com/customization/builds/ -version: 2 -builds: - - - id: gpud - binary: gpud - main: ./cmd/gpud - env: - - CGO_ENABLED=1 - - CC=aarch64-linux-gnu-gcc - flags: - - -v - - # NOTE: the "v" prefix from the git tag is stripped for the {{.Version}} - ldflags: - - -s -w - - -X github.com/leptonai/gpud/version.BuildTimestamp={{.CommitTimestamp}} - - -X github.com/leptonai/gpud/version.Revision={{.Commit}} - - -X github.com/leptonai/gpud/version.Version=v{{.Version}} - - goos: - - linux - goarch: - - arm64 - -# https://goreleaser.com/customization/archive/ -archives: - - - id: gpud - formats: [ 'tgz' ] - - # "builds" reference which build instances should be archived in this archive - builds: - - gpud - - name_template: >- - {{ .Binary }}_v{{- .Version }}_{{- .Os }}_{{ .Arch }}_ubuntu24.04 - - # use zip for windows archives - format_overrides: - - goos: windows - formats: ['zip'] - -# https://goreleaser.com/customization/changelog/ -changelog: - use: github-native - -release: - draft: false - replace_existing_draft: true - make_latest: true - mode: replace - - github: - owner: leptonai - name: gpud - - header: | - ## GPUd release notes ({{ .Date }}) - - Welcome to this new release! - - name_template: "{{.ProjectName}}-v{{.Version}}" - -# ref. 
https://goreleaser.com/customization/checksum/?h=checksum -checksum: - disable: true diff --git a/Makefile b/Makefile index 15fb6f60..196d18ae 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ ROOTDIR=$(dir $(abspath $(lastword $(MAKEFILE_LIST)))) BUILD_TIMESTAMP ?= $(shell date -u +"%Y-%m-%dT%H:%M:%SZ") VERSION ?= $(shell git describe --match 'v[0-9]*' --dirty='.m' --always) REVISION=$(shell git rev-parse HEAD)$(shell if ! git diff --no-ext-diff --quiet --exit-code; then echo .m; fi) -PACKAGE=github.com/leptonai/gpud +PACKAGE=github.com/NVIDIA/gpuhealth ifneq "$(strip $(shell command -v $(GO) 2>/dev/null))" "" GOOS ?= $(shell $(GO) env GOOS) @@ -49,9 +49,9 @@ endif WHALE = "🇩" ONI = "👹" -RELEASE=gpud-$(VERSION:v%=%)-${GOOS}-${GOARCH} +RELEASE=gpuhealth-$(VERSION:v%=%)-${GOOS}-${GOARCH} -COMMANDS=gpud swagger +COMMANDS=gpuhealth GO_BUILD_FLAGS=-ldflags '-s -X $(PACKAGE)/version.BuildTimestamp=$(BUILD_TIMESTAMP) -X $(PACKAGE)/version.Version=$(VERSION) -X $(PACKAGE)/version.Revision=$(REVISION) -X $(PACKAGE)/version.Package=$(PACKAGE)' @@ -71,7 +71,7 @@ GOPATHS=$(shell echo ${GOPATH} | tr ":" "\n" | tr ";" "\n") BINARIES=$(addprefix bin/,$(COMMANDS)) -.PHONY: clean all binaries +.PHONY: clean all binaries gpuhealth .DEFAULT: default all: binaries @@ -90,6 +90,10 @@ bin/%: cmd/% FORCE binaries: $(BINARIES) ## build binaries @echo "$(WHALE) $@" +# Specific target for gpuhealth (your main binary) +gpuhealth: bin/gpuhealth ## build gpuhealth binary + @echo "$(WHALE) gpuhealth built successfully" + clean: ## clean up binaries @echo "$(WHALE) $@" @rm -f $(BINARIES) diff --git a/README.md b/README.md index f1d2e6ec..b8291e23 100644 --- a/README.md +++ b/README.md @@ -1,143 +1,111 @@ -GPUd logo - -[![Go Report Card](https://goreportcard.com/badge/github.com/leptonai/gpud)](https://goreportcard.com/report/github.com/leptonai/gpud) -![GitHub release (latest SemVer)](https://img.shields.io/github/v/release/leptonai/gpud?sort=semver) -[![Go 
Reference](https://pkg.go.dev/badge/github.com/leptonai/gpud.svg)](https://pkg.go.dev/github.com/leptonai/gpud) -[![codecov](https://codecov.io/gh/leptonai/gpud/graph/badge.svg?token=G8MGRK9X4A)](https://codecov.io/gh/leptonai/gpud) +# NVIDIA GPU Health Monitoring Agent ## Overview -[GPUd](https://www.gpud.ai) is designed to ensure GPU efficiency and reliability by actively monitoring GPUs and effectively managing AI/ML workloads. - -## Why GPUd - -GPUd is built on years of experience operating large-scale GPU clusters at Meta, Alibaba Cloud, Uber, and Lepton AI. It is carefully designed to be self-contained and to integrate seamlessly with other systems such as Docker, containerd, Kubernetes, and Nvidia ecosystems. - -- **First-class GPU support**: GPUd is GPU-centric, providing a unified view of critical GPU metrics and issues. -- **Easy to run at scale**: GPUd is a self-contained binary that runs on any machine with a low footprint. -- **Production grade**: GPUd is used in [Lepton AI](https://lepton.ai/)'s production infrastructure. +`gpuhealth` is a lightweight GPU health monitoring agent that tracks GPU status and exports health metrics. Based on [leptonai/gpud](https://github.com/leptonai/gpud), it focuses specifically on monitoring without management overhead. -Most importantly, GPUd operates with minimal CPU and memory overhead in a non-critical path and requires only read-only operations. See [*architecture*](./docs/ARCHITECTURE.md) for more details. +**Key Features:** +- **Health-Focused**: GPU health monitoring and metrics export +- **Lightweight**: Minimal CPU and memory footprint (<100MB RAM, <1% CPU) +- **Non-Intrusive**: Read-only operations, no system modifications +- **Integration-Ready**: HTTP API, file export, optional centralized reporting +- **Production-Ready**: Built for 24/7 datacenter operation -## Get Started - - -gpud-2025-06-01-01-install-and-scan - - -See [Tutorials](./docs/TUTORIALS.md) for more. 
+## Quick Start ### Installation -To install from the official release on Linux and amd64 (x86_64) machine: - +**Package Installation (Recommended):** ```bash -curl -fsSL https://pkg.gpud.dev/install.sh | sh -``` - -To specify a version - -```bash -curl -fsSL https://pkg.gpud.dev/install.sh | sh -s v0.6.0 -``` - -Note that the install script doesn't support other architectures (arm64) and OSes (macos), yet. - -### Run GPUd with Lepton Platform - -Sign up at [lepton.ai](https://www.lepton.ai/) and get the workspace token from the ["Settings" and "Tokens" page](https://dashboard.lepton.ai/workspace-redirect/settings/api-tokens): +# NOTE: replace * in the URLs below with the release version; wget does not expand wildcards in HTTP URLs +# Ubuntu/Debian +wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth_*_amd64.deb +sudo dpkg -i gpuhealth_*_amd64.deb -GPUd lepton.ai machines settings +# RHEL/Rocky/AlmaLinux/AmazonLinux +wget https://github.com/NVIDIA/gpuhealth/releases/latest/download/gpuhealth-*-1.x86_64.rpm +sudo rpm -i gpuhealth-*-1.x86_64.rpm -Copy the token and pass it to the `gpud up --token` flag: - -```bash -sudo gpud up --token +# Verify installation +systemctl status gpuhealthd ``` -You can go to the [dashboard](https://dashboard.lepton.ai/workspace-redirect/machines/self-managed-nodes) to check the self-managed machine status. 
- -### Run GPUd standalone - -For linux, run the following command to start the service: - +**Build from Source:** ```bash -sudo gpud up +git clone https://github.com/NVIDIA/gpuhealth.git +cd gpuhealth +make gpuhealth +sudo mv bin/gpuhealth /usr/local/bin/ ``` -You can also start with the standalone mode and later switch to the managed option: +### Usage ```bash -# when the token is ready, run the following command -sudo gpud login --token - -# to logout -sudo gpud logout - -# to logout and reset the state file -sudo gpud logout --reset-state -``` +# Start monitoring server (port 15133) +gpuhealth run -#### Run GPUd with Kubernetes +# Quick health check +gpuhealth scan -The recommended way to deploy GPUd on Kubernetes is with our official [Helm chart](./deployments/helm/gpud/README.md). +# Offline data collection +gpuhealth run --offline-mode --path=/tmp/gpu-health --duration=00:05:00 --format csv -#### If your system doesn't have systemd - -To run on Mac (without systemd): - -```bash -gpud run +# Check status +gpuhealth status ``` -Or +### API Access ```bash -nohup sudo /usr/local/bin run &>> & -``` +# Health status +curl http://localhost:15133/healthz -### Stop and uninstall +# Machine info & health states +curl http://localhost:15133/machine-info +curl http://localhost:15133/v1/states -```bash -sudo gpud down -sudo rm /usr/local/bin -sudo rm /etc/systemd/system/gpud.service +# Prometheus metrics +curl http://localhost:15133/metrics ``` -## Key Features +## What It Monitors -- Monitor critical GPU and GPU fabric metrics (power, temperature). -- Reports GPU and GPU fabric status (nvidia-smi parser, error checking). -- Detects critical GPU and GPU fabric errors (kmsg, hardware slowdown, NVML Xid event, DCGM). -- Monitor overall system metrics (CPU, memory, disk). 
+- **GPU Health**: Power, temperature, clocks, utilization, Xid events +- **System Metrics**: CPU, memory, disk usage +- **Driver Status**: NVIDIA driver version and compatibility +- **Process Info**: GPU process allocation and resource usage -Check out [*components*](./docs/COMPONENTS.md) for a detailed list of components and their features. +## Data Export -## Integration +- **HTTP API**: Real-time JSON/Prometheus metrics +- **Offline Mode**: File-based data collection (JSON/CSV) +- **Centralized Reporting**: Optional push to control planes -For users looking to set up a platform to collect and process data from gpud, please refer to [INTEGRATION](./docs/INTEGRATION.md). +See [Components Guide](./docs/COMPONENTS.md) for detailed monitoring capabilities. -## FAQs +## FAQ -### Does GPUd send data to lepton.ai? +**Does it send data externally?** +No, by default all data stays local. Optional centralized reporting can be configured if desired. -GPUd collects a small anonymous usage signal by default to help the engineering team better understand usage frequencies. The data is strictly anonymized and **does not contain any sensitive data**. You can disable this behavior by setting `GPUD_NO_USAGE_STATS=true`. If GPUd is run with systemd (default option for the `gpud up` command), you can add the line `GPUD_NO_USAGE_STATS=true` to the `/etc/default/gpud` environment file and restart the service. +**System requirements?** +Ubuntu 22.04+, RHEL 8+, <100MB RAM, <1% CPU. NVIDIA drivers recommended but not required. -If you opt-in to log in to the Lepton AI platform, to assist you with more helpful GPU health states, GPUd periodically sends system runtime related information about the host to the platform. All these info are system workload and health info, and contain no user data. The data are sent via secure channels. +**Integration options?** +HTTP API (JSON/Prometheus), offline file export, or optional push to monitoring systems. -### How to update GPUd? 
+## Documentation -GPUd is still in active development, regularly releasing new versions for critical bug fixes and new features. We strongly recommend always being on the latest version of GPUd. +- [Components Guide](./docs/COMPONENTS.md) - Monitoring capabilities and configuration +- [Architecture Overview](./docs/ARCHITECTURE.md) - System design and technical details +- [Installation Guide](./docs/INSTALL.md) - Comprehensive setup instructions +- [Integration Guide](./docs/INTEGRATION.md) - Monitoring system integration -When GPUd is registered with the Lepton platform, the platform will automatically update GPUd to the latest version. To disable such auto-updates, if GPUd is run with systemd (default option for the `gpud up` command), you may add the flag `FLAGS="--enable-auto-update=false"` to the `/etc/default/gpud` environment file and restart the service. +## Contributing -## Learn more +Contributions welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and guidelines. -- [Why GPUd](./docs/WHY.md) -- [Install GPUd](./docs/INSTALL.md) -- [GPUd components](./docs/COMPONENTS.md) -- [GPUd architecture](./docs/ARCHITECTURE.md) +**Related Projects:** [leptonai/gpud](https://github.com/leptonai/gpud) (upstream full-featured GPU management daemon) -## Contributing +## License -Please see the [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on how to contribute to this project. +Apache License 2.0 - see [LICENSE](LICENSE) for details. diff --git a/deployments/packages/gpuhealth/doc/README.md b/deployments/packages/gpuhealth/doc/README.md index 7a8e24f5..f040c7f3 100644 --- a/deployments/packages/gpuhealth/doc/README.md +++ b/deployments/packages/gpuhealth/doc/README.md @@ -7,10 +7,11 @@ Configuration ------------- The service configuration is located at: - /etc/default/gpuhealth + deb based system: /etc/default/gpuhealth + rpm based system: /etc/sysconfig/gpuhealth To configure the service: - 1. Edit: sudo vi /etc/default/gpuhealth + 1. 
Edit: sudo vi /etc/default/gpuhealth (deb-based) or sudo vi /etc/sysconfig/gpuhealth (rpm-based) 2. Restart: sudo systemctl restart gpuhealthd The default configuration sets log level to 'info'. Available levels: diff --git a/deployments/packages/gpuhealth/systemd/gpuhealthd.service b/deployments/packages/gpuhealth/systemd/gpuhealthd.service index bc3458db..802b0a57 100644 --- a/deployments/packages/gpuhealth/systemd/gpuhealthd.service +++ b/deployments/packages/gpuhealth/systemd/gpuhealthd.service @@ -3,7 +3,8 @@ Description=GPU Health Daemon After=network.target local-fs.target [Service] -EnvironmentFile=/etc/default/gpuhealth +EnvironmentFile=-/etc/default/gpuhealth +EnvironmentFile=-/etc/sysconfig/gpuhealth ExecStart=/usr/bin/gpuhealth run $GPUHEALTH_FLAGS ExecReload=/bin/kill -HUP $MAINPID diff --git a/version/version.go b/version/version.go index bce07a4d..fa3bd9af 100644 --- a/version/version.go +++ b/version/version.go @@ -5,7 +5,7 @@ import "runtime" var ( // Package is filled at linking time - Package = "github.com/leptonai/gpud" + Package = "github.com/NVIDIA/gpuhealth" // Version holds the complete version number. Filled in at linking time. Version = "0.0.1+unknown"