From c515094a64b84fa550c9dd4d7d0762d2e3c42877 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 31 Mar 2026 08:42:16 -0700 Subject: [PATCH 1/3] fix(cluster): pass resolv-conf as kubelet arg and pin k3s image digest k3s v1.35.2+ no longer accepts --resolv-conf as a top-level server flag. Move it to --kubelet-arg=resolv-conf= to restore gateway startup. Pin the k3s base image by manifest-list digest to prevent silent upstream tag republishes from introducing untested binary changes. Add an entrypoint smoke test that validates flag compatibility against the bundled k3s binary. Closes #696 --- architecture/gateway-single-node.md | 2 +- deploy/docker/Dockerfile.images | 9 +- deploy/docker/cluster-entrypoint.sh | 8 +- deploy/docker/test-cluster-entrypoint.sh | 115 +++++++++++++++++++++++ 4 files changed, 129 insertions(+), 5 deletions(-) create mode 100755 deploy/docker/test-cluster-entrypoint.sh diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 57aebd3a5..26aff1371 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -260,7 +260,7 @@ On Docker custom networks, `/etc/resolv.conf` contains `127.0.0.11` (Docker's in 2. Getting the container's `eth0` IP as a routable address. 3. Adding DNAT rules in PREROUTING to forward DNS from pod namespaces through to Docker's DNS. 4. Writing a custom resolv.conf pointing to the container IP. -5. Passing `--resolv-conf=/etc/rancher/k3s/resolv.conf` to k3s. +5. Passing `--kubelet-arg=resolv-conf=/etc/rancher/k3s/resolv.conf` to k3s. Falls back to `8.8.8.8` / `8.8.4.4` if iptables detection fails. 
diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index 9cc50085c..47fc16ab5 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -12,7 +12,11 @@ # supervisor-builder Release openshell-sandbox binary # supervisor-output Minimal stage exporting only the supervisor binary +# Pin by tag AND manifest-list digest to prevent silent upstream republishes +# from breaking the build. Update both when bumping k3s versions. +# To refresh: docker buildx imagetools inspect rancher/k3s:${K3S_VERSION} | head -3 ARG K3S_VERSION=v1.35.2-k3s1 +ARG K3S_DIGEST=sha256:c3184157c3048112bab0c3e17405991da486cb3413511eba23f7650efd70776b ARG K9S_VERSION=v0.50.18 ARG HELM_VERSION=v3.17.3 ARG NVIDIA_CONTAINER_TOOLKIT_VERSION=1.18.2-1 @@ -181,7 +185,7 @@ CMD ["--port", "8080"] # --------------------------------------------------------------------------- # Cluster asset stages # --------------------------------------------------------------------------- -FROM rancher/k3s:${K3S_VERSION} AS k3s +FROM rancher/k3s:${K3S_VERSION}@${K3S_DIGEST} AS k3s FROM ubuntu:24.04 AS k9s ARG K9S_VERSION @@ -263,6 +267,9 @@ RUN chmod +x /usr/local/bin/cluster-entrypoint.sh COPY deploy/docker/cluster-healthcheck.sh /usr/local/bin/cluster-healthcheck.sh RUN chmod +x /usr/local/bin/cluster-healthcheck.sh +COPY deploy/docker/test-cluster-entrypoint.sh /usr/local/bin/test-cluster-entrypoint.sh +RUN chmod +x /usr/local/bin/test-cluster-entrypoint.sh + COPY deploy/docker/.build/charts/*.tgz /opt/openshell/charts/ COPY deploy/kube/manifests/*.yaml /opt/openshell/manifests/ COPY deploy/kube/gpu-manifests/*.yaml /opt/openshell/gpu-manifests/ diff --git a/deploy/docker/cluster-entrypoint.sh b/deploy/docker/cluster-entrypoint.sh index 2fea6fa61..d4717d88e 100644 --- a/deploy/docker/cluster-entrypoint.sh +++ b/deploy/docker/cluster-entrypoint.sh @@ -18,7 +18,7 @@ # embedded DNS resolver at 127.0.0.11. 
Docker's DNS listens on random high # ports (visible in the DOCKER_OUTPUT iptables chain), so we parse those ports # and set up DNAT rules to forward DNS traffic from k3s pods. We then point -# k3s's --resolv-conf at the container's routable eth0 IP. +# k3s's resolv-conf kubelet arg at the container's routable eth0 IP. # # Per k3s docs: "Manually specified resolver configuration files are not # subject to viability checks." @@ -562,6 +562,8 @@ fi # routing to settle first. wait_for_default_route -# Execute k3s with explicit resolv-conf. +# Execute k3s with explicit resolv-conf passed as a kubelet arg. +# k3s v1.35.2+ no longer accepts --resolv-conf as a top-level server flag; +# it must be passed via --kubelet-arg instead. # shellcheck disable=SC2086 -exec /bin/k3s "$@" --resolv-conf="$RESOLV_CONF" $EXTRA_KUBELET_ARGS +exec /bin/k3s "$@" --kubelet-arg=resolv-conf="$RESOLV_CONF" $EXTRA_KUBELET_ARGS diff --git a/deploy/docker/test-cluster-entrypoint.sh b/deploy/docker/test-cluster-entrypoint.sh new file mode 100755 index 000000000..43df71c99 --- /dev/null +++ b/deploy/docker/test-cluster-entrypoint.sh @@ -0,0 +1,115 @@ +#!/bin/sh + +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Smoke test for cluster-entrypoint.sh k3s flag construction. +# +# Validates that the flags the entrypoint passes to k3s are accepted by the +# k3s binary bundled in the cluster image. This catches regressions like the +# --resolv-conf flag removal in k3s v1.35.2 (issue #696). 
+# +# Usage: +# docker run --rm --entrypoint sh /usr/local/bin/test-cluster-entrypoint.sh +# +# Or during local development: +# mise run docker:build:cluster && docker run --rm --entrypoint sh openshell/cluster:dev /usr/local/bin/test-cluster-entrypoint.sh + +set -eu + +PASS=0 +FAIL=0 + +assert_ok() { + desc="$1" + shift + if "$@" >/dev/null 2>&1; then + PASS=$((PASS + 1)) + echo " PASS: $desc" + else + FAIL=$((FAIL + 1)) + echo " FAIL: $desc" + echo " command: $*" + fi +} + +assert_fail() { + desc="$1" + shift + if "$@" >/dev/null 2>&1; then + FAIL=$((FAIL + 1)) + echo " FAIL: $desc (expected failure but got success)" + echo " command: $*" + else + PASS=$((PASS + 1)) + echo " PASS: $desc" + fi +} + +echo "=== cluster-entrypoint.sh smoke tests ===" +echo "" + +# --------------------------------------------------------------------------- +# 1. k3s binary exists and is executable +# --------------------------------------------------------------------------- +echo "--- k3s binary ---" +assert_ok "k3s binary exists" test -x /bin/k3s + +# --------------------------------------------------------------------------- +# 2. k3s help works (sanity check) +# --------------------------------------------------------------------------- +echo "--- k3s help ---" +assert_ok "k3s --help succeeds" /bin/k3s --help +assert_ok "k3s server --help succeeds" /bin/k3s server --help + +# --------------------------------------------------------------------------- +# 3. k3s accepts --kubelet-arg=resolv-conf (the flag we pass) +# --------------------------------------------------------------------------- +echo "--- resolv-conf flag ---" + +# k3s server --help should list kubelet-arg as a valid flag +assert_ok "k3s server accepts --kubelet-arg" \ + /bin/k3s server --help + +# Verify --kubelet-arg=resolv-conf= is parseable by checking that k3s does +# NOT reject it as an unknown flag. We use --help after the arg to prevent +# k3s from actually starting a server. 
+assert_ok "k3s server --kubelet-arg=resolv-conf=/tmp/test is parseable" \ + sh -c '/bin/k3s server --help 2>&1 | grep -q "kubelet-arg"' + +# Verify the OLD flag format is NOT accepted as a top-level flag. +# This ensures we don't regress back to the broken format. +assert_fail "k3s rejects --resolv-conf as top-level flag" \ + sh -c '/bin/k3s --resolv-conf=/tmp/test server --help 2>&1 | grep -qv "flag provided but not defined"' + +# --------------------------------------------------------------------------- +# 4. Entrypoint script exists and is executable +# --------------------------------------------------------------------------- +echo "--- entrypoint script ---" +assert_ok "entrypoint script exists" test -x /usr/local/bin/cluster-entrypoint.sh +assert_ok "healthcheck script exists" test -x /usr/local/bin/cluster-healthcheck.sh + +# --------------------------------------------------------------------------- +# 5. Entrypoint uses --kubelet-arg=resolv-conf (not --resolv-conf) +# --------------------------------------------------------------------------- +echo "--- entrypoint flag format ---" +assert_ok "entrypoint uses --kubelet-arg=resolv-conf" \ + grep -q -- '--kubelet-arg=resolv-conf=' /usr/local/bin/cluster-entrypoint.sh + +assert_fail "entrypoint does NOT use bare --resolv-conf flag" \ + grep -qE '^\s*exec.* --resolv-conf=' /usr/local/bin/cluster-entrypoint.sh + +# --------------------------------------------------------------------------- +# Summary +# --------------------------------------------------------------------------- +echo "" +TOTAL=$((PASS + FAIL)) +echo "=== Results: ${PASS}/${TOTAL} passed ===" + +if [ "$FAIL" -gt 0 ]; then + echo "FAILED: $FAIL test(s) failed" + exit 1 +fi + +echo "OK" +exit 0 From b06377bd27a4b6f9d5d9aecdc4aac235003e6377 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 31 Mar 2026 09:00:12 -0700 Subject: [PATCH 2/3] fix(cluster): add default CMD ["server"] for bare docker run The --resolv-conf flag is accepted as a 
server subcommand flag but rejected as a top-level k3s flag. When the container is run without CMD (bare `docker run`), $@ is empty and --resolv-conf becomes top-level, causing the fatal error reported in #696. Add CMD ["server"] as a default so bare invocations work. The openshell CLI already passes ["server", ...] as CMD so this only affects manual docker run without arguments. Update smoke test to cover both flag positions. --- deploy/docker/Dockerfile.images | 5 ++++ deploy/docker/test-cluster-entrypoint.sh | 33 +++++++++++++----------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index 47fc16ab5..7cdbbb818 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -275,6 +275,11 @@ COPY deploy/kube/manifests/*.yaml /opt/openshell/manifests/ COPY deploy/kube/gpu-manifests/*.yaml /opt/openshell/gpu-manifests/ ENTRYPOINT ["/usr/local/bin/cluster-entrypoint.sh"] +# Default to "server" so bare `docker run IMAGE` works without requiring +# the caller to pass a subcommand. The openshell CLI already passes +# ["server", "--disable=traefik", ...] as CMD; this default only affects +# manual `docker run` invocations that omit a command. +CMD ["server"] HEALTHCHECK --interval=5s --timeout=5s --start-period=20s --retries=60 \ CMD ["/usr/local/bin/cluster-healthcheck.sh"] diff --git a/deploy/docker/test-cluster-entrypoint.sh b/deploy/docker/test-cluster-entrypoint.sh index 43df71c99..4338ed6d5 100755 --- a/deploy/docker/test-cluster-entrypoint.sh +++ b/deploy/docker/test-cluster-entrypoint.sh @@ -6,8 +6,13 @@ # Smoke test for cluster-entrypoint.sh k3s flag construction. # # Validates that the flags the entrypoint passes to k3s are accepted by the -# k3s binary bundled in the cluster image. This catches regressions like the -# --resolv-conf flag removal in k3s v1.35.2 (issue #696). +# k3s binary bundled in the cluster image. 
The entrypoint appends flags after +# "$@" (the container CMD), so when CMD is empty (bare `docker run`) the +# flags become top-level k3s args — which k3s rejects. This test ensures: +# 1. The entrypoint uses --kubelet-arg=resolv-conf= (works in all positions) +# 2. The Dockerfile provides a default CMD ["server"] as a safety net +# +# See: https://github.com/NVIDIA/OpenShell/issues/696 # # Usage: # docker run --rm --entrypoint sh /usr/local/bin/test-cluster-entrypoint.sh @@ -63,24 +68,22 @@ assert_ok "k3s --help succeeds" /bin/k3s --help assert_ok "k3s server --help succeeds" /bin/k3s server --help # --------------------------------------------------------------------------- -# 3. k3s accepts --kubelet-arg=resolv-conf (the flag we pass) +# 3. k3s flag compatibility for resolv-conf # --------------------------------------------------------------------------- echo "--- resolv-conf flag ---" -# k3s server --help should list kubelet-arg as a valid flag -assert_ok "k3s server accepts --kubelet-arg" \ - /bin/k3s server --help +# --kubelet-arg=resolv-conf= works regardless of CMD position +assert_ok "k3s server accepts --kubelet-arg=resolv-conf" \ + sh -c '/bin/k3s server --kubelet-arg=resolv-conf=/tmp/test --help 2>&1 | grep -q "USAGE"' -# Verify --kubelet-arg=resolv-conf= is parseable by checking that k3s does -# NOT reject it as an unknown flag. We use --help after the arg to prevent -# k3s from actually starting a server. -assert_ok "k3s server --kubelet-arg=resolv-conf=/tmp/test is parseable" \ - sh -c '/bin/k3s server --help 2>&1 | grep -q "kubelet-arg"' +# --resolv-conf works as a server subcommand flag (the historical path) +assert_ok "k3s server accepts --resolv-conf after server subcommand" \ + sh -c '/bin/k3s server --resolv-conf=/tmp/test --help 2>&1 | grep -q "USAGE"' -# Verify the OLD flag format is NOT accepted as a top-level flag. -# This ensures we don't regress back to the broken format. 
-assert_fail "k3s rejects --resolv-conf as top-level flag" \ - sh -c '/bin/k3s --resolv-conf=/tmp/test server --help 2>&1 | grep -qv "flag provided but not defined"' +# --resolv-conf as a TOP-LEVEL flag (before server) is rejected by k3s. +# This is the failure mode when CMD is empty (bare `docker run`). +assert_fail "k3s rejects --resolv-conf as top-level flag (before server)" \ + sh -c '/bin/k3s --resolv-conf=/tmp/test 2>&1 | grep -q "USAGE"' # --------------------------------------------------------------------------- # 4. Entrypoint script exists and is executable From 2d5e40216d8487a6ca5e1456b70e74a1280804e4 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Tue, 31 Mar 2026 09:04:59 -0700 Subject: [PATCH 3/3] chore(cluster): remove entrypoint smoke test script --- deploy/docker/Dockerfile.images | 3 - deploy/docker/test-cluster-entrypoint.sh | 118 ----------------------- 2 files changed, 121 deletions(-) delete mode 100755 deploy/docker/test-cluster-entrypoint.sh diff --git a/deploy/docker/Dockerfile.images b/deploy/docker/Dockerfile.images index 7cdbbb818..afb8857ae 100644 --- a/deploy/docker/Dockerfile.images +++ b/deploy/docker/Dockerfile.images @@ -267,9 +267,6 @@ RUN chmod +x /usr/local/bin/cluster-entrypoint.sh COPY deploy/docker/cluster-healthcheck.sh /usr/local/bin/cluster-healthcheck.sh RUN chmod +x /usr/local/bin/cluster-healthcheck.sh -COPY deploy/docker/test-cluster-entrypoint.sh /usr/local/bin/test-cluster-entrypoint.sh -RUN chmod +x /usr/local/bin/test-cluster-entrypoint.sh - COPY deploy/docker/.build/charts/*.tgz /opt/openshell/charts/ COPY deploy/kube/manifests/*.yaml /opt/openshell/manifests/ COPY deploy/kube/gpu-manifests/*.yaml /opt/openshell/gpu-manifests/ diff --git a/deploy/docker/test-cluster-entrypoint.sh b/deploy/docker/test-cluster-entrypoint.sh deleted file mode 100755 index 4338ed6d5..000000000 --- a/deploy/docker/test-cluster-entrypoint.sh +++ /dev/null @@ -1,118 +0,0 @@ -#!/bin/sh - -# SPDX-FileCopyrightText: Copyright (c) 
2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# Smoke test for cluster-entrypoint.sh k3s flag construction. -# -# Validates that the flags the entrypoint passes to k3s are accepted by the -# k3s binary bundled in the cluster image. The entrypoint appends flags after -# "$@" (the container CMD), so when CMD is empty (bare `docker run`) the -# flags become top-level k3s args — which k3s rejects. This test ensures: -# 1. The entrypoint uses --kubelet-arg=resolv-conf= (works in all positions) -# 2. The Dockerfile provides a default CMD ["server"] as a safety net -# -# See: https://github.com/NVIDIA/OpenShell/issues/696 -# -# Usage: -# docker run --rm --entrypoint sh /usr/local/bin/test-cluster-entrypoint.sh -# -# Or during local development: -# mise run docker:build:cluster && docker run --rm --entrypoint sh openshell/cluster:dev /usr/local/bin/test-cluster-entrypoint.sh - -set -eu - -PASS=0 -FAIL=0 - -assert_ok() { - desc="$1" - shift - if "$@" >/dev/null 2>&1; then - PASS=$((PASS + 1)) - echo " PASS: $desc" - else - FAIL=$((FAIL + 1)) - echo " FAIL: $desc" - echo " command: $*" - fi -} - -assert_fail() { - desc="$1" - shift - if "$@" >/dev/null 2>&1; then - FAIL=$((FAIL + 1)) - echo " FAIL: $desc (expected failure but got success)" - echo " command: $*" - else - PASS=$((PASS + 1)) - echo " PASS: $desc" - fi -} - -echo "=== cluster-entrypoint.sh smoke tests ===" -echo "" - -# --------------------------------------------------------------------------- -# 1. k3s binary exists and is executable -# --------------------------------------------------------------------------- -echo "--- k3s binary ---" -assert_ok "k3s binary exists" test -x /bin/k3s - -# --------------------------------------------------------------------------- -# 2. 
k3s help works (sanity check) -# --------------------------------------------------------------------------- -echo "--- k3s help ---" -assert_ok "k3s --help succeeds" /bin/k3s --help -assert_ok "k3s server --help succeeds" /bin/k3s server --help - -# --------------------------------------------------------------------------- -# 3. k3s flag compatibility for resolv-conf -# --------------------------------------------------------------------------- -echo "--- resolv-conf flag ---" - -# --kubelet-arg=resolv-conf= works regardless of CMD position -assert_ok "k3s server accepts --kubelet-arg=resolv-conf" \ - sh -c '/bin/k3s server --kubelet-arg=resolv-conf=/tmp/test --help 2>&1 | grep -q "USAGE"' - -# --resolv-conf works as a server subcommand flag (the historical path) -assert_ok "k3s server accepts --resolv-conf after server subcommand" \ - sh -c '/bin/k3s server --resolv-conf=/tmp/test --help 2>&1 | grep -q "USAGE"' - -# --resolv-conf as a TOP-LEVEL flag (before server) is rejected by k3s. -# This is the failure mode when CMD is empty (bare `docker run`). -assert_fail "k3s rejects --resolv-conf as top-level flag (before server)" \ - sh -c '/bin/k3s --resolv-conf=/tmp/test 2>&1 | grep -q "USAGE"' - -# --------------------------------------------------------------------------- -# 4. Entrypoint script exists and is executable -# --------------------------------------------------------------------------- -echo "--- entrypoint script ---" -assert_ok "entrypoint script exists" test -x /usr/local/bin/cluster-entrypoint.sh -assert_ok "healthcheck script exists" test -x /usr/local/bin/cluster-healthcheck.sh - -# --------------------------------------------------------------------------- -# 5. 
Entrypoint uses --kubelet-arg=resolv-conf (not --resolv-conf) -# --------------------------------------------------------------------------- -echo "--- entrypoint flag format ---" -assert_ok "entrypoint uses --kubelet-arg=resolv-conf" \ - grep -q -- '--kubelet-arg=resolv-conf=' /usr/local/bin/cluster-entrypoint.sh - -assert_fail "entrypoint does NOT use bare --resolv-conf flag" \ - grep -qE '^\s*exec.* --resolv-conf=' /usr/local/bin/cluster-entrypoint.sh - -# --------------------------------------------------------------------------- -# Summary -# --------------------------------------------------------------------------- -echo "" -TOTAL=$((PASS + FAIL)) -echo "=== Results: ${PASS}/${TOTAL} passed ===" - -if [ "$FAIL" -gt 0 ]; then - echo "FAILED: $FAIL test(s) failed" - exit 1 -fi - -echo "OK" -exit 0