diff --git a/bin/lib/onboard.js b/bin/lib/onboard.js index 5ab4e8629..5b133c5a5 100644 --- a/bin/lib/onboard.js +++ b/bin/lib/onboard.js @@ -868,6 +868,12 @@ function getSandboxInferenceConfig(model, provider = null, preferredInferenceApi supportsStore: false, }; break; + case "ollama-local": + case "vllm-local": + providerKey = "inference"; + primaryModelRef = `inference/${model}`; + inferenceBaseUrl = getLocalProviderBaseUrl(provider); + break; case "nvidia-prod": case "nvidia-nim": default: @@ -1829,7 +1835,10 @@ async function preflight() { // GPU const gpu = nim.detectGpu(); - if (gpu && gpu.type === "nvidia") { + if (gpu && gpu.type === "nvidia" && gpu.jetson) { + console.log(` ✓ NVIDIA Jetson detected: ${gpu.name}, ${gpu.totalMemoryMB} MB unified memory`); + console.log(" ⓘ NIM containers not supported on Jetson — will use Ollama or cloud inference"); + } else if (gpu && gpu.type === "nvidia") { console.log(` ✓ NVIDIA GPU detected: ${gpu.count} GPU(s), ${gpu.totalMemoryMB} MB VRAM`); if (!gpu.nimCapable) { console.log(" ⓘ GPU VRAM too small for local NIM — will use cloud inference"); @@ -1889,9 +1898,85 @@ async function preflight() { return gpu; } +// ── Jetson gateway image patch ─────────────────────────────────── +// +// JetPack kernels (Tegra) ship without nft_chain_filter and related +// nf_tables modules. The OpenShell gateway image embeds k3s, whose +// network policy controller calls iptables in nf_tables mode by default. +// Without kernel support the controller panics on startup. +// +// This function rebuilds the gateway image locally, switching the +// default iptables alternative to iptables-legacy so all rule +// manipulation uses the classic xtables backend that Tegra kernels +// fully support. + +/** Extracts the semver tag from the installed openshell CLI version. 
*/ +function getGatewayImageTag() { + const openshellVersion = + runCapture("openshell --version 2>/dev/null", { ignoreError: true }) || ""; + const match = openshellVersion.match(/(\d+\.\d+\.\d+)/); + return match ? match[1] : "latest"; +} + +/** + * Rebuilds the OpenShell gateway container image with iptables-legacy as the + * default backend. Idempotent — skips rebuild if the image is already patched + * (checked via Docker label). Required on Jetson because the Tegra kernel + * lacks nft_chain_filter modules that k3s's network policy controller needs. + */ +function patchGatewayImageForJetson() { + const tag = getGatewayImageTag(); + const image = `ghcr.io/nvidia/openshell/cluster:${tag}`; + + // Check if already patched (look for our label) + const inspectOut = ( + runCapture( + `docker inspect --format='{{index .Config.Labels "io.nemoclaw.jetson-patched"}}' ${shellQuote(image)} 2>/dev/null`, + { ignoreError: true }, + ) || "" + ).trim(); + if (inspectOut === "true") { + console.log(" ✓ Gateway image already patched for Jetson"); + return; + } + + console.log(" Patching gateway image for Jetson (iptables-legacy)..."); + console.log(" (this may take a moment on first run if the base image needs to be pulled)"); + + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-jetson-")); + try { + const dockerfile = path.join(tmpDir, "Dockerfile"); + fs.writeFileSync( + dockerfile, + [ + `FROM ${image}`, + `RUN if command -v update-alternatives >/dev/null 2>&1 && \\`, + ` update-alternatives --set iptables /usr/sbin/iptables-legacy 2>/dev/null && \\`, + ` update-alternatives --set ip6tables /usr/sbin/ip6tables-legacy 2>/dev/null; then \\`, + ` :; \\`, + ` elif [ -f /usr/sbin/iptables-legacy ] && [ -f /usr/sbin/ip6tables-legacy ]; then \\`, + ` ln -sf /usr/sbin/iptables-legacy /usr/sbin/iptables; \\`, + ` ln -sf /usr/sbin/ip6tables-legacy /usr/sbin/ip6tables; \\`, + ` else \\`, + ` echo "iptables-legacy not available in base image" >&2; exit 1; \\`, + ` fi`, + 
`LABEL io.nemoclaw.jetson-patched="true"`, + "", + ].join("\n"), + ); + + run(`docker build --quiet -t ${shellQuote(image)} ${shellQuote(tmpDir)}`, { + ignoreError: false, + }); + console.log(" ✓ Gateway image patched for Jetson (iptables-legacy)"); + } finally { + fs.rmSync(tmpDir, { recursive: true, force: true }); + } +} + // ── Step 2: Gateway ────────────────────────────────────────────── -async function startGatewayWithOptions(_gpu, { exitOnFailure = true } = {}) { +async function startGatewayWithOptions(gpu, { exitOnFailure = true } = {}) { step(2, 7, "Starting OpenShell gateway"); const gatewayStatus = runCaptureOpenshell(["status"], { ignoreError: true }); @@ -1906,6 +1991,15 @@ async function startGatewayWithOptions(_gpu, { exitOnFailure = true } = {}) { return; } + // Jetson (Tegra kernel): The k3s container image ships iptables v1.8.10 in + // nf_tables mode, but JetPack kernels lack the nft_chain_filter module, + // causing the k3s network policy controller to panic on startup. + // Workaround: rebuild the gateway image locally with iptables-legacy as the + // default so iptables commands use the legacy (xtables) backend instead. + if (gpu && gpu.jetson) { + patchGatewayImageForJetson(); + } + // When a stale gateway is detected (metadata exists but container is gone, // e.g. after a Docker/Colima restart), skip the destroy — `gateway start` // can recover the container without wiping metadata and mTLS certs. @@ -2122,6 +2216,10 @@ async function createSandbox( registry.removeSandbox(sandboxName); } + // Kill stale dashboard-forward processes only when we are actually + // creating or recreating — avoids breaking a healthy forward on no-op reruns. 
+ run("kill $(lsof -a -ti :18789 -c openclaw) 2>/dev/null || true", { ignoreError: true }); // -a ANDs the port and command-name filters; lsof ORs selections by default + // Stage build context const buildCtx = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-build-")); const stagedDockerfile = path.join(buildCtx, "Dockerfile"); @@ -3770,6 +3868,7 @@ module.exports = { classifySandboxCreateFailure, createSandbox, getFutureShellPathHint, + getGatewayImageTag, getGatewayStartEnv, getGatewayReuseState, getSandboxInferenceConfig, @@ -3805,6 +3904,7 @@ module.exports = { arePolicyPresetsApplied, setupPoliciesWithSelection, hydrateCredentialEnv, + patchGatewayImageForJetson, shouldIncludeBuildContextPath, writeSandboxConfigSyncFile, patchStagedDockerfile, diff --git a/bin/nemoclaw.js b/bin/nemoclaw.js index 63013a67e..a03b2082b 100755 --- a/bin/nemoclaw.js +++ b/bin/nemoclaw.js @@ -55,6 +55,7 @@ const GLOBAL_COMMANDS = new Set([ "deploy", "setup", "setup-spark", + "setup-jetson", "start", "stop", "status", @@ -651,6 +652,11 @@ async function setupSpark() { run(`sudo bash "${SCRIPTS}/setup-spark.sh"`); } +async function setupJetson() { + // setup-jetson.sh configures the Docker NVIDIA runtime and kernel modules for Jetson; the iptables-legacy gateway patch is applied by `nemoclaw onboard`.
+ run(`sudo bash "${SCRIPTS}/setup-jetson.sh"`); +} + // eslint-disable-next-line complexity async function deploy(instanceName) { if (!instanceName) { @@ -1199,6 +1205,7 @@ function help() { ${B}nemoclaw onboard${R} Configure inference endpoint and credentials ${D}(non-interactive: ${NOTICE_ACCEPT_FLAG} or ${NOTICE_ACCEPT_ENV}=1)${R} nemoclaw setup-spark Set up on DGX Spark ${D}(fixes cgroup v2 + Docker)${R} + nemoclaw setup-jetson Set up on Jetson ${D}(Docker runtime + iptables-legacy)${R} ${G}Sandbox Management:${R} ${B}nemoclaw list${R} List all sandboxes @@ -1261,6 +1268,9 @@ const [cmd, ...args] = process.argv.slice(2); case "setup-spark": await setupSpark(); break; + case "setup-jetson": + await setupJetson(); + break; case "deploy": await deploy(args[0]); break; diff --git a/docs/get-started/quickstart.md b/docs/get-started/quickstart.md index 182120b73..da1c0dca1 100644 --- a/docs/get-started/quickstart.md +++ b/docs/get-started/quickstart.md @@ -64,6 +64,7 @@ The sandbox image is approximately 2.4 GB compressed. During image push, the Doc | macOS (Intel) | Podman | Not supported yet. Depends on OpenShell support for Podman on macOS. | | Windows WSL | Docker Desktop (WSL backend) | Supported target path. | | DGX Spark | Docker | Refer to the [DGX Spark setup guide](https://github.com/NVIDIA/NemoClaw/blob/main/spark-install.md) for cgroup v2 and Docker configuration. | +| Jetson (Orin Nano, Orin NX, AGX Orin, Xavier) | Docker | Run `sudo nemoclaw setup-jetson` before onboarding. See [commands reference](../reference/commands.md#nemoclaw-setup-jetson). 
| ## Install NemoClaw and Onboard OpenClaw Agent diff --git a/docs/reference/commands.md b/docs/reference/commands.md index dae816471..09de7d5d1 100644 --- a/docs/reference/commands.md +++ b/docs/reference/commands.md @@ -216,6 +216,16 @@ After the fixes complete, the script prompts you to run `nemoclaw onboard` to co $ sudo nemoclaw setup-spark ``` +### `nemoclaw setup-jetson` + +Set up NemoClaw on NVIDIA Jetson devices (Orin Nano, Orin NX, AGX Orin, Xavier). +This command configures the NVIDIA container runtime as Docker's default runtime and loads the kernel modules (`br_netfilter`, `xt_comment`) needed for k3s networking; the iptables-legacy gateway image patch required by Jetson's Tegra kernel is applied automatically by `nemoclaw onboard`. +Run with `sudo` on the Jetson host. + +```console +$ sudo nemoclaw setup-jetson +``` + ### `nemoclaw debug` Collect diagnostics for bug reports. diff --git a/scripts/setup-jetson.sh b/scripts/setup-jetson.sh new file mode 100755 index 000000000..746f8474b --- /dev/null +++ b/scripts/setup-jetson.sh @@ -0,0 +1,236 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# NemoClaw setup for NVIDIA Jetson devices (Orin Nano, Orin NX, AGX Orin, Thor). +# +# Jetson devices use unified memory and a Tegra kernel that lacks nf_tables +# chain modules (nft_chain_filter, nft_chain_nat, etc.). The OpenShell gateway +# runs k3s inside a Docker container, and k3s's network policy controller +# uses iptables in nf_tables mode by default, which panics on Tegra kernels. +# +# This script prepares the Jetson host so that `nemoclaw onboard` succeeds: +# 1. Verifies the Jetson platform and prerequisites (Docker, python3, Node.js) +# 2. Adds the invoking user to the docker group +# 3. Configures the Docker daemon with default-runtime=nvidia (restarting Docker if needed) +# 4. Loads and persists required kernel modules (br_netfilter, xt_comment) +# +# The iptables-legacy patch for the gateway container image is handled +# automatically by `nemoclaw onboard` when it detects a Jetson GPU.
+# +# Usage: +# sudo nemoclaw setup-jetson +# # or directly: +# sudo bash scripts/setup-jetson.sh + +set -euo pipefail + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' +MIN_NODE_VERSION="22.16.0" + +info() { echo -e "${GREEN}>>>${NC} $1"; } +warn() { echo -e "${YELLOW}>>>${NC} $1"; } +fail() { + echo -e "${RED}>>>${NC} $1" + exit 1 +} + +version_gte() { + # Returns 0 (true) if $1 >= $2 — portable, no sort -V (BSD compat) + local IFS=. + local -a a b + read -r -a a <<<"$1" + read -r -a b <<<"$2" + for i in 0 1 2; do + local ai=${a[$i]:-0} bi=${b[$i]:-0} + if ((ai > bi)); then return 0; fi + if ((ai < bi)); then return 1; fi + done + return 0 +} + +# ── Pre-flight checks ───────────────────────────────────────────── + +if [ "$(uname -s)" != "Linux" ]; then + fail "This script is for NVIDIA Jetson (Linux). Use 'nemoclaw setup' for macOS." +fi + +if [ "$(uname -m)" != "aarch64" ]; then + fail "Jetson devices are aarch64. This system is $(uname -m)." +fi + +if [ "$(id -u)" -ne 0 ]; then + fail "Must run as root: sudo nemoclaw setup-jetson" +fi + +# Verify Jetson platform +JETSON_MODEL="" +if [ -f /proc/device-tree/model ]; then + JETSON_MODEL=$(tr -d '\0' </proc/device-tree/model 2>/dev/null || echo "") +fi + +# NOTE(review): the lines from the device-tree read through the GPU_NAME assignment were lost in extraction and are reconstructed here — confirm against the original script. +if ! echo "$JETSON_MODEL" | grep -qi "jetson"; then + # Fallback: identify the platform from the GPU name reported by nvidia-smi + GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null || echo "") + if ! echo "$GPU_NAME" | grep -qiE "orin|thor"; then + fail "This does not appear to be a Jetson device. Use 'nemoclaw onboard' directly." + fi + # Exclude discrete GPUs that happen to contain matching strings + if echo "$GPU_NAME" | grep -qiE "geforce|rtx|quadro"; then + fail "Discrete GPU detected ('$GPU_NAME'). This script is for Jetson only." + fi + JETSON_MODEL="${GPU_NAME}" +fi + +info "Detected Jetson platform: ${JETSON_MODEL}" + +# Detect the real user (not root) for docker group add +REAL_USER="${SUDO_USER:-$(logname 2>/dev/null || echo "")}" + +command -v docker >/dev/null || fail "Docker not found. Install docker.io: sudo apt-get install -y docker.io" +command -v python3 >/dev/null || fail "python3 not found.
Install with: sudo apt-get install -y python3-minimal" +command -v node >/dev/null || fail "Node.js not found. NemoClaw requires Node.js >= ${MIN_NODE_VERSION}. Install Node.js before running 'nemoclaw onboard'." + +NODE_VERSION_RAW="$(node --version 2>/dev/null || true)" +NODE_VERSION="${NODE_VERSION_RAW#v}" +if ! echo "$NODE_VERSION" | grep -Eq '^[0-9]+\.[0-9]+\.[0-9]+$'; then + fail "Could not parse Node.js version from '${NODE_VERSION_RAW}'. NemoClaw requires Node.js >= ${MIN_NODE_VERSION}." +fi +if ! version_gte "$NODE_VERSION" "$MIN_NODE_VERSION"; then + fail "Node.js ${NODE_VERSION_RAW} is too old. NemoClaw requires Node.js >= ${MIN_NODE_VERSION}." +fi +info "Node.js ${NODE_VERSION_RAW} OK" + +# ── 1. Docker group ─────────────────────────────────────────────── + +if [ -n "$REAL_USER" ]; then + if id -nG "$REAL_USER" | grep -qw docker; then + info "User '$REAL_USER' already in docker group" + else + info "Adding '$REAL_USER' to docker group..." + usermod -aG docker "$REAL_USER" + info "Added. Group will take effect on next login (or use 'newgrp docker')." + fi +fi + +# ── 2. NVIDIA Container Runtime ────────────────────────────────── +# +# Jetson JetPack pre-installs nvidia-container-runtime but Docker may +# not be configured to use it as the default runtime. + +DAEMON_JSON="/etc/docker/daemon.json" +NEEDS_RESTART=false + +configure_nvidia_runtime() { + if ! command -v nvidia-container-runtime >/dev/null 2>&1; then + warn "nvidia-container-runtime not found. GPU passthrough may not work." 
+ warn "Install with: sudo apt-get install -y nvidia-container-toolkit" + return + fi + + if [ -f "$DAEMON_JSON" ]; then + # Check if nvidia runtime is already configured + if python3 -c " +import json, sys +try: + d = json.load(open('$DAEMON_JSON')) + runtimes = d.get('runtimes', {}) if isinstance(d, dict) else {} + if 'nvidia' in runtimes and d.get('default-runtime') == 'nvidia': + sys.exit(0) + sys.exit(1) +except (IOError, ValueError, KeyError, AttributeError): + sys.exit(1) +" 2>/dev/null; then + info "NVIDIA runtime already configured in Docker daemon" + else + info "Adding NVIDIA runtime to Docker daemon config..." + python3 -c " +import json +try: + with open('$DAEMON_JSON') as f: + d = json.load(f) +except (IOError, ValueError, KeyError): + d = {} +if not isinstance(d, dict): + d = {} +d.setdefault('runtimes', {})['nvidia'] = { + 'path': 'nvidia-container-runtime', + 'runtimeArgs': [] +} +d['default-runtime'] = 'nvidia' +with open('$DAEMON_JSON', 'w') as f: + json.dump(d, f, indent=2) +" + NEEDS_RESTART=true + fi + else + info "Creating Docker daemon config with NVIDIA runtime..." + mkdir -p "$(dirname "$DAEMON_JSON")" + cat >"$DAEMON_JSON" <<'DAEMONJSON' +{ + "runtimes": { + "nvidia": { + "path": "nvidia-container-runtime", + "runtimeArgs": [] + } + }, + "default-runtime": "nvidia" +} +DAEMONJSON + NEEDS_RESTART=true + fi +} + +configure_nvidia_runtime + +# ── 3. Kernel modules ──────────────────────────────────────────── + +info "Loading required kernel modules..." +modprobe br_netfilter 2>/dev/null || warn "Could not load br_netfilter" +modprobe xt_comment 2>/dev/null || warn "Could not load xt_comment" + +# Persist across reboots +MODULES_FILE="/etc/modules-load.d/nemoclaw-jetson.conf" +if [ ! -f "$MODULES_FILE" ]; then + info "Persisting kernel modules for boot..." + cat >"$MODULES_FILE" <<'MODULES' +# NemoClaw: required for k3s networking inside Docker +br_netfilter +xt_comment +MODULES +fi + +# ── 4. 
Restart Docker if needed ────────────────────────────────── + +if [ "$NEEDS_RESTART" = true ]; then + info "Restarting Docker daemon..." + if command -v systemctl >/dev/null 2>&1; then + systemctl restart docker + else + service docker restart 2>/dev/null || dockerd & + fi + for i in $(seq 1 15); do + if docker info >/dev/null 2>&1; then + break + fi + [ "$i" -eq 15 ] && fail "Docker didn't come back after restart. Check 'systemctl status docker'." + sleep 2 + done + info "Docker restarted with NVIDIA runtime" +fi + +# ── Done ───────────────────────────────────────────────────────── + +echo "" +info "Jetson setup complete." +info "" +info "Device: ${JETSON_MODEL}" +info "" +info "Next step: run 'nemoclaw onboard' to set up your sandbox." +info " nemoclaw onboard" +info "" +info "The onboard wizard will automatically patch the gateway image" +info "for Jetson iptables compatibility." diff --git a/src/lib/local-inference.test.ts b/src/lib/local-inference.test.ts index 34040c814..535d52b29 100644 --- a/src/lib/local-inference.test.ts +++ b/src/lib/local-inference.test.ts @@ -8,6 +8,7 @@ import { CONTAINER_REACHABILITY_IMAGE, DEFAULT_OLLAMA_MODEL, LARGE_OLLAMA_MIN_MEMORY_MB, + DEFAULT_OLLAMA_MODEL_JETSON, getDefaultOllamaModel, getBootstrapOllamaModelOptions, getLocalProviderBaseUrl, @@ -23,6 +24,8 @@ import { validateLocalProvider, } from "../../dist/lib/local-inference"; +const FAKE_JETSON_GPU = { type: "nvidia", totalMemoryMB: 7627, jetson: true, unifiedMemory: true }; + describe("local inference helpers", () => { it("returns the expected base URL for vllm-local", () => { expect(getLocalProviderBaseUrl("vllm-local")).toBe("http://host.openshell.internal:8000/v1"); @@ -251,4 +254,14 @@ describe("local inference helpers", () => { it("treats non-JSON probe output as success once the model responds", () => { expect(validateOllamaModel("nemotron-3-nano:30b", () => "ok")).toEqual({ ok: true }); }); + + it("returns jetson 4b model as default on jetson when available", () 
=> { + const list = "nemotron-3-nano:4b abc 2.8 GB now\nqwen3:32b def 20 GB now"; + expect(getDefaultOllamaModel(() => list, FAKE_JETSON_GPU)).toBe(DEFAULT_OLLAMA_MODEL_JETSON); + }); + + it("falls back to jetson 4b model when ollama list is empty on jetson", () => { + expect(getBootstrapOllamaModelOptions(FAKE_JETSON_GPU)).toEqual([DEFAULT_OLLAMA_MODEL_JETSON]); + expect(getDefaultOllamaModel(() => "", FAKE_JETSON_GPU)).toBe(DEFAULT_OLLAMA_MODEL_JETSON); + }); }); diff --git a/src/lib/local-inference.ts b/src/lib/local-inference.ts index 9390bb70e..f3ac6658e 100644 --- a/src/lib/local-inference.ts +++ b/src/lib/local-inference.ts @@ -14,11 +14,13 @@ export const CONTAINER_REACHABILITY_IMAGE = "curlimages/curl:8.10.1"; export const DEFAULT_OLLAMA_MODEL = "nemotron-3-nano:30b"; export const SMALL_OLLAMA_MODEL = "qwen2.5:7b"; export const LARGE_OLLAMA_MIN_MEMORY_MB = 32768; +export const DEFAULT_OLLAMA_MODEL_JETSON = "nemotron-3-nano:4b"; export type RunCaptureFn = (cmd: string, opts?: { ignoreError?: boolean }) => string; export interface GpuInfo { totalMemoryMB: number; + jetson?: boolean; } export interface ValidationResult { @@ -164,6 +166,11 @@ export function getOllamaModelOptions(runCapture: RunCaptureFn): string[] { } export function getBootstrapOllamaModelOptions(gpu: GpuInfo | null): string[] { + // Jetson: fall back to the 4B model that fits in 8GB unified memory + // instead of the 30B default which would OOM. + if (gpu && gpu.jetson) { + return [DEFAULT_OLLAMA_MODEL_JETSON]; + } const options = [SMALL_OLLAMA_MODEL]; if (gpu && gpu.totalMemoryMB >= LARGE_OLLAMA_MIN_MEMORY_MB) { options.push(DEFAULT_OLLAMA_MODEL); @@ -180,6 +187,10 @@ export function getDefaultOllamaModel( const bootstrap = getBootstrapOllamaModelOptions(gpu); return bootstrap[0]; } + if (gpu && gpu.jetson) { + if (models.includes(DEFAULT_OLLAMA_MODEL_JETSON)) return DEFAULT_OLLAMA_MODEL_JETSON; + return models[0]; + } return models.includes(DEFAULT_OLLAMA_MODEL) ? 
DEFAULT_OLLAMA_MODEL : models[0]; } diff --git a/src/lib/nim.test.ts b/src/lib/nim.test.ts index ab3cbe5f9..f1c99a89a 100644 --- a/src/lib/nim.test.ts +++ b/src/lib/nim.test.ts @@ -159,6 +159,7 @@ describe("nim", () => { nimCapable: false, unifiedMemory: true, spark: false, + jetson: true, }); } finally { restore(); @@ -256,4 +257,37 @@ describe("nim", () => { } }); }); + + it("detects Jetson Orin and sets jetson flag", () => { + const runCapture = vi.fn((cmd) => { + if (cmd.includes("memory.total")) return ""; + if (cmd.includes("query-gpu=name")) return "Orin"; + if (cmd.includes("free -m")) return "7627"; + return ""; + }); + const { nimModule, restore } = loadNimWithMockedRunner(runCapture); + try { + const gpu = nimModule.detectGpu(); + expect(gpu).toMatchObject({ type: "nvidia", jetson: true, unifiedMemory: true }); + } finally { + restore(); + } + }); + + it("detects Jetson via /proc/device-tree/model fallback", () => { + const runCapture = vi.fn((cmd) => { + if (cmd.includes("memory.total")) return ""; + if (cmd.includes("query-gpu=name")) return ""; + if (cmd.includes("device-tree/model")) return "NVIDIA Jetson Orin Nano Super Developer Kit"; + if (cmd.includes("free -m")) return "7627"; + return ""; + }); + const { nimModule, restore } = loadNimWithMockedRunner(runCapture); + try { + const gpu = nimModule.detectGpu(); + expect(gpu).toMatchObject({ type: "nvidia", jetson: true }); + } finally { + restore(); + } + }); }); diff --git a/src/lib/nim.ts b/src/lib/nim.ts index 2e10d1e60..cd237c7e1 100644 --- a/src/lib/nim.ts +++ b/src/lib/nim.ts @@ -26,6 +26,7 @@ export interface GpuDetection { nimCapable: boolean; unifiedMemory?: boolean; spark?: boolean; + jetson?: boolean; } export interface NimStatus { @@ -107,6 +108,9 @@ export function detectGpu(): GpuDetection | null { const count = unifiedGpuNames.length; const perGpuMB = count > 0 ? 
Math.floor(totalMemoryMB / count) : totalMemoryMB; const isSpark = unifiedGpuNames.some((name: string) => /GB10/i.test(name)); + const isJetson = + unifiedGpuNames.some((name: string) => /orin|thor|xavier/i.test(name)) && + !unifiedGpuNames.some((name: string) => /geforce|rtx|quadro/i.test(name)); return { type: "nvidia", name: unifiedGpuNames[0], @@ -116,6 +120,35 @@ export function detectGpu(): GpuDetection | null { nimCapable: canRunNimWithMemory(totalMemoryMB), unifiedMemory: true, spark: isSpark, + jetson: isJetson, + }; + } + } catch { + /* ignored */ + } + + // Jetson fallback: /proc/device-tree/model (for cases where nvidia-smi is absent) + try { + const dtModel = runCapture("cat /proc/device-tree/model 2>/dev/null | tr -d '\\0'", { + ignoreError: true, + }); + if (dtModel && /jetson/i.test(dtModel)) { + let totalMemoryMB = 0; + try { + const memLine = runCapture("free -m | awk '/Mem:/ {print $2}'", { ignoreError: true }); + if (memLine) totalMemoryMB = parseInt(memLine.trim(), 10) || 0; + } catch { + /* ignored */ + } + return { + type: "nvidia", + name: dtModel.trim(), + count: 1, + totalMemoryMB, + perGpuMB: totalMemoryMB, + nimCapable: false, + unifiedMemory: true, + jetson: true, }; } } catch { diff --git a/test/onboard.test.js b/test/onboard.test.js index 267696119..6bda7721f 100644 --- a/test/onboard.test.js +++ b/test/onboard.test.js @@ -11,6 +11,7 @@ import { describe, expect, it } from "vitest"; import { buildSandboxConfigSyncScript, classifySandboxCreateFailure, + getGatewayImageTag, getGatewayReuseState, getPortConflictServiceHints, getFutureShellPathHint, @@ -27,6 +28,7 @@ import { classifyValidationFailure, isLoopbackHostname, normalizeProviderBaseUrl, + patchGatewayImageForJetson, patchStagedDockerfile, printSandboxCreateRecoveryHints, resolveDashboardForwardTarget, @@ -241,6 +243,26 @@ describe("onboard helpers", () => { }); }); + it("maps ollama-local to direct endpoint with correct base URL", () => { + 
assert.deepEqual(getSandboxInferenceConfig("nemotron-3-nano:4b", "ollama-local"), { + providerKey: "inference", + primaryModelRef: "inference/nemotron-3-nano:4b", + inferenceBaseUrl: "http://host.openshell.internal:11434/v1", + inferenceApi: "openai-completions", + inferenceCompat: null, + }); + }); + + it("maps vllm-local to direct endpoint with correct base URL", () => { + assert.deepEqual(getSandboxInferenceConfig("meta-llama/llama-3.1-8b", "vllm-local"), { + providerKey: "inference", + primaryModelRef: "inference/meta-llama/llama-3.1-8b", + inferenceBaseUrl: "http://host.openshell.internal:8000/v1", + inferenceApi: "openai-completions", + inferenceCompat: null, + }); + }); + it("pins the gateway image to the installed OpenShell release version", () => { expect(getInstalledOpenshellVersion("openshell 0.0.12")).toBe("0.0.12"); expect(getInstalledOpenshellVersion("openshell 0.0.13-dev.8+gbbcaed2ea")).toBe("0.0.13"); @@ -1824,4 +1846,329 @@ const { setupInference } = require(${onboardPath}); assert.match(fnBody, /isNonInteractive\(\)/); assert.match(fnBody, /process\.exit\(1\)/); }); + + it("exports getGatewayImageTag and patchGatewayImageForJetson as functions", () => { + assert.equal(typeof getGatewayImageTag, "function"); + assert.equal(typeof patchGatewayImageForJetson, "function"); + }); + + it("patchGatewayImageForJetson skips rebuild when image is already patched (idempotency)", () => { + const repoRoot = path.join(import.meta.dirname, ".."); + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-jetson-patch-idem-")); + const scriptPath = path.join(tmpDir, "jetson-patch-idempotent.js"); + const fakeBin = path.join(tmpDir, "bin"); + const onboardPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "onboard.js")); + const runnerPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "runner.js")); + + fs.mkdirSync(fakeBin, { recursive: true }); + fs.writeFileSync(path.join(fakeBin, "openshell"), "#!/usr/bin/env bash\necho 0.0.10\n", { + mode: 
0o755, + }); + + const script = String.raw` +const runner = require(${runnerPath}); +const commands = []; +runner.run = (command, opts = {}) => { + commands.push({ command, type: "run" }); + return { status: 0 }; +}; +runner.runCapture = (command) => { + commands.push({ command, type: "runCapture" }); + if (command.includes("openshell --version")) return "0.0.10"; + // Already patched — docker inspect returns "true" + if (command.includes("docker inspect") && command.includes("jetson-patched")) return "true"; + return ""; +}; + +const { patchGatewayImageForJetson } = require(${onboardPath}); +patchGatewayImageForJetson(); + +// No docker build should have been called +const buildCalls = commands.filter(c => c.command && c.command.includes("docker build")); +console.log(JSON.stringify({ buildCalls: buildCalls.length, totalCommands: commands.length })); +`; + fs.writeFileSync(scriptPath, script); + + const result = spawnSync(process.execPath, [scriptPath], { + cwd: repoRoot, + encoding: "utf-8", + env: { ...process.env, HOME: tmpDir, PATH: `${fakeBin}:${process.env.PATH || ""}` }, + }); + + assert.equal(result.status, 0, `stderr: ${result.stderr}`); + const output = JSON.parse(result.stdout.trim().split("\n").pop()); + assert.equal(output.buildCalls, 0, "docker build should NOT be called when already patched"); + }); + + it("patchGatewayImageForJetson builds image when not yet patched", () => { + const repoRoot = path.join(import.meta.dirname, ".."); + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-jetson-patch-build-")); + const scriptPath = path.join(tmpDir, "jetson-patch-build.js"); + const fakeBin = path.join(tmpDir, "bin"); + const onboardPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "onboard.js")); + const runnerPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "runner.js")); + + fs.mkdirSync(fakeBin, { recursive: true }); + fs.writeFileSync(path.join(fakeBin, "openshell"), "#!/usr/bin/env bash\necho 0.0.10\n", { + mode: 0o755, + 
}); + + const script = String.raw` +const runner = require(${runnerPath}); +const commands = []; +runner.run = (command, opts = {}) => { + commands.push({ command, type: "run" }); + return { status: 0 }; +}; +runner.runCapture = (command) => { + commands.push({ command, type: "runCapture" }); + if (command.includes("openshell --version")) return "0.0.10"; + // Not yet patched — docker inspect returns empty + if (command.includes("docker inspect") && command.includes("jetson-patched")) return ""; + return ""; +}; + +const { patchGatewayImageForJetson } = require(${onboardPath}); +patchGatewayImageForJetson(); + +const buildCalls = commands.filter(c => c.command && c.command.includes("docker build")); +const buildCmd = buildCalls.length > 0 ? buildCalls[0].command : ""; +console.log(JSON.stringify({ + buildCalls: buildCalls.length, + usesShellQuote: buildCmd.includes("'ghcr.io/nvidia/openshell/cluster:0.0.10'"), + hasImage: buildCmd.includes("ghcr.io/nvidia/openshell/cluster:0.0.10"), +})); +`; + fs.writeFileSync(scriptPath, script); + + const result = spawnSync(process.execPath, [scriptPath], { + cwd: repoRoot, + encoding: "utf-8", + env: { ...process.env, HOME: tmpDir, PATH: `${fakeBin}:${process.env.PATH || ""}` }, + }); + + assert.equal(result.status, 0, `stderr: ${result.stderr}`); + const output = JSON.parse(result.stdout.trim().split("\n").pop()); + assert.equal(output.buildCalls, 1, "docker build should be called once"); + assert.ok(output.hasImage, "docker build should reference the correct image tag"); + assert.ok(output.usesShellQuote, "docker build should use shellQuote for image name"); + }); + + it("patchGatewayImageForJetson cleans up temp directory even on build failure", () => { + const repoRoot = path.join(import.meta.dirname, ".."); + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-jetson-patch-cleanup-")); + const scriptPath = path.join(tmpDir, "jetson-patch-cleanup.js"); + const fakeBin = path.join(tmpDir, "bin"); + const 
onboardPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "onboard.js")); + const runnerPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "runner.js")); + const fsPath = JSON.stringify("fs"); + const osPath = JSON.stringify("os"); + + fs.mkdirSync(fakeBin, { recursive: true }); + fs.writeFileSync(path.join(fakeBin, "openshell"), "#!/usr/bin/env bash\necho 0.0.10\n", { + mode: 0o755, + }); + + const script = String.raw` +const fs = require(${fsPath}); +const nodeOs = require(${osPath}); +const runner = require(${runnerPath}); + +runner.run = (command) => { + if (command.includes("docker build")) { + throw new Error("simulated docker build failure"); + } + return { status: 0 }; +}; +runner.runCapture = (command) => { + if (command.includes("openshell --version")) return "0.0.10"; + if (command.includes("docker inspect") && command.includes("jetson-patched")) return ""; + return ""; +}; + +const { patchGatewayImageForJetson } = require(${onboardPath}); + +// Collect temp dirs before +const tmpBefore = fs.readdirSync(nodeOs.tmpdir()).filter(d => d.startsWith("nemoclaw-jetson-")); + +let threw = false; +try { + patchGatewayImageForJetson(); +} catch (e) { + threw = true; +} + +// Check that no new nemoclaw-jetson-* temp dirs remain +const tmpAfter = fs.readdirSync(nodeOs.tmpdir()).filter(d => d.startsWith("nemoclaw-jetson-")); +const newDirs = tmpAfter.filter(d => !tmpBefore.includes(d)); +console.log(JSON.stringify({ threw, leakedDirs: newDirs.length })); +`; + fs.writeFileSync(scriptPath, script); + + const result = spawnSync(process.execPath, [scriptPath], { + cwd: repoRoot, + encoding: "utf-8", + env: { ...process.env, HOME: tmpDir, PATH: `${fakeBin}:${process.env.PATH || ""}` }, + }); + + assert.equal(result.status, 0, `stderr: ${result.stderr}`); + const output = JSON.parse(result.stdout.trim().split("\n").pop()); + assert.ok(output.threw, "should throw on docker build failure"); + assert.equal(output.leakedDirs, 0, "temp directory should be cleaned up 
in finally block");
+  });
+
+  it("preflight preserves healthy gateway and skips destroy on idempotent rerun", () => {
+    // Regression guard: when getGatewayReuseState() reports "healthy", the
+    // cleanup branch must not issue "gateway destroy" or "forward stop".
+    //
+    // NOTE: the child script does not call preflight(); it mirrors preflight's
+    // cleanup conditional. If that conditional changes in onboard.js, update
+    // the copy in the script below as well.
+    const repoRoot = path.join(import.meta.dirname, "..");
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-preflight-idem-"));
+    try {
+      const scriptPath = path.join(tmpDir, "preflight-idempotent.js");
+      const fakeBin = path.join(tmpDir, "bin");
+      // JSON.stringify turns the paths into valid JS string literals for the
+      // generated script.
+      const onboardPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "onboard.js"));
+      const runnerPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "runner.js"));
+
+      // Fake "openshell" placed first on PATH; it only prints a version string.
+      fs.mkdirSync(fakeBin, { recursive: true });
+      fs.writeFileSync(path.join(fakeBin, "openshell"), "#!/usr/bin/env bash\necho 0.0.10\n", {
+        mode: 0o755,
+      });
+
+      // String.raw keeps "\n" as a two-character escape so it is interpreted by
+      // the child script's own string literals, not by this template.
+      const script = String.raw`
+const runner = require(${runnerPath});
+
+const GATEWAY_STATUS = "Gateway status: Connected\nGateway: nemoclaw";
+const GATEWAY_INFO = "Gateway Info\n\n Gateway: nemoclaw\n Gateway endpoint: https://127.0.0.1:8080";
+
+// Record every command passed to the mocked runner.
+const openshellCommands = [];
+runner.run = (command) => {
+  openshellCommands.push(command);
+  return { status: 0 };
+};
+runner.runCapture = (command) => {
+  openshellCommands.push(command);
+  // Simulate a healthy, named gateway.
+  if (command.includes("openshell") && command.includes("status")) return GATEWAY_STATUS;
+  if (command.includes("gateway") && command.includes("info")) return GATEWAY_INFO;
+  return "";
+};
+
+// The runner mocks are installed before onboard.js is loaded.
+const { getGatewayReuseState } = require(${onboardPath});
+
+const gatewayReuseState = getGatewayReuseState(GATEWAY_STATUS, GATEWAY_INFO, GATEWAY_INFO);
+
+// Only commands issued by the cleanup decision below should be counted.
+openshellCommands.length = 0;
+
+// Mirror of the cleanup conditional in preflight(); keep in sync with onboard.js.
+if (gatewayReuseState === "stale" || gatewayReuseState === "active-unnamed") {
+  runner.run("openshell forward stop 18789");
+  runner.run("openshell gateway destroy -g nemoclaw");
+}
+
+const destroyCalls = openshellCommands.filter(
+  (c) => c.includes("gateway destroy") || c.includes("forward stop"),
+);
+console.log(JSON.stringify({
+  gatewayReuseState,
+  destroyCalls: destroyCalls.length,
+}));
+`;
+      fs.writeFileSync(scriptPath, script);
+
+      const result = spawnSync(process.execPath, [scriptPath], {
+        cwd: repoRoot,
+        encoding: "utf-8",
+        env: { ...process.env, HOME: tmpDir, PATH: `${fakeBin}:${process.env.PATH || ""}` },
+      });
+
+      assert.equal(result.status, 0, `stderr: ${result.stderr}`);
+      // The JSON summary is the last line the child prints.
+      const output = JSON.parse(result.stdout.trim().split("\n").pop());
+      assert.equal(output.gatewayReuseState, "healthy", "healthy gateway should be detected");
+      assert.equal(
+        output.destroyCalls,
+        0,
+        "healthy gateway must NOT be destroyed on idempotent rerun",
+      );
+    } finally {
+      // Remove the scratch directory even when an assertion above fails.
+      fs.rmSync(tmpDir, { recursive: true, force: true });
+    }
+  });
+
+  it("preflight cleans up stale gateway but preserves healthy ones", () => {
+    // Checks the state getGatewayReuseState() returns for three input
+    // combinations (stale, healthy, active-unnamed) and that only "stale" and
+    // "active-unnamed" satisfy the cleanup predicate evaluated in the script.
+    const repoRoot = path.join(import.meta.dirname, "..");
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-preflight-stale-"));
+    try {
+      const scriptPath = path.join(tmpDir, "preflight-stale.js");
+      const fakeBin = path.join(tmpDir, "bin");
+      const onboardPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "onboard.js"));
+      const runnerPath = JSON.stringify(path.join(repoRoot, "bin", "lib", "runner.js"));
+
+      // Fake "openshell" placed first on PATH; it only prints a version string.
+      fs.mkdirSync(fakeBin, { recursive: true });
+      fs.writeFileSync(path.join(fakeBin, "openshell"), "#!/usr/bin/env bash\necho 0.0.10\n", {
+        mode: 0o755,
+      });
+
+      const script = String.raw`
+const runner = require(${runnerPath});
+
+// Stub the runner before onboard.js is loaded.
+runner.run = () => ({ status: 0 });
+runner.runCapture = () => "";
+
+const { getGatewayReuseState } = require(${onboardPath});
+
+const CONNECTED = "Gateway status: Connected\nGateway: nemoclaw";
+const GATEWAY_INFO = "Gateway Info\n\n Gateway: nemoclaw\n Gateway endpoint: https://127.0.0.1:8080";
+
+// Cleanup predicate evaluated for each state.
+const needsCleanup = (state) => state === "stale" || state === "active-unnamed";
+
+// Stale: disconnected but gateway info exists
+const staleState = getGatewayReuseState("Gateway status: Disconnected", GATEWAY_INFO, "");
+
+// Healthy: connected + named
+const healthyState = getGatewayReuseState(CONNECTED, GATEWAY_INFO, GATEWAY_INFO);
+
+// Active-unnamed: connected but no named gateway info
+const unnamedState = getGatewayReuseState(CONNECTED, "", GATEWAY_INFO);
+
+console.log(JSON.stringify({
+  staleState,
+  healthyState,
+  unnamedState,
+  shouldCleanStale: needsCleanup(staleState),
+  shouldCleanHealthy: needsCleanup(healthyState),
+  shouldCleanUnnamed: needsCleanup(unnamedState),
+}));
+`;
+      fs.writeFileSync(scriptPath, script);
+
+      const result = spawnSync(process.execPath, [scriptPath], {
+        cwd: repoRoot,
+        encoding: "utf-8",
+        env: { ...process.env, HOME: tmpDir, PATH: `${fakeBin}:${process.env.PATH || ""}` },
+      });
+
+      assert.equal(result.status, 0, `stderr: ${result.stderr}`);
+      // The JSON summary is the last line the child prints.
+      const output = JSON.parse(result.stdout.trim().split("\n").pop());
+      assert.equal(output.staleState, "stale");
+      assert.equal(output.healthyState, "healthy");
+      assert.equal(output.unnamedState, "active-unnamed");
+      assert.ok(output.shouldCleanStale, "stale gateway should be cleaned up");
+      assert.ok(!output.shouldCleanHealthy, "healthy gateway must NOT be cleaned up");
+      assert.ok(output.shouldCleanUnnamed, "active-unnamed gateway should be cleaned up");
+    } finally {
+      // Remove the scratch directory even when an assertion above fails.
+      fs.rmSync(tmpDir, { recursive: true, force: true });
+    }
+  });
+});