From 82c4fd1f6646276031cd703c61aa02a46f7c10bd Mon Sep 17 00:00:00 2001 From: Aaron Bockelie Date: Tue, 16 Jun 2026 11:15:03 -0500 Subject: [PATCH 1/3] docs(adr): ADR-119 appliance config delivery + first-boot orchestration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Presence-driven first boot (autoinstall/kickstart convention): a config artifact's presence decides headless-apply vs zero-config first-run; rich config (hostname/TLS/DNS creds) moves post-boot to the DCUI + operator warm path. Carrier is a stock cloud-init NoCloud vfat 'cidata' volume — no bespoke reader, no countdown, no signature file. Deliberately minimizes novelty to match familiar appliance shapes (pfSense/TrueNAS/OVF). Cross-refs ADR-104 (claim) and ADR-103 (convergence: OVA is a bootstrap seed, currency via operator upgrade). Corrects ADR-104's appliance interactivity row. --- docs/architecture/INDEX.md | 1 + ...onvergence-and-first-run-claim-protocol.md | 3 +- ...n-delivery-and-first-boot-orchestration.md | 224 ++++++++++++++++++ 3 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 docs/architecture/infrastructure/ADR-119-appliance-configuration-delivery-and-first-boot-orchestration.md diff --git a/docs/architecture/INDEX.md b/docs/architecture/INDEX.md index 498cb0bae..13f0bece8 100644 --- a/docs/architecture/INDEX.md +++ b/docs/architecture/INDEX.md @@ -39,6 +39,7 @@ _Containers, deployment, backup, storage, networking_ | [ADR-116](./infrastructure/ADR-116-artifact-persistence-pattern.md) | Artifact Persistence Pattern | Accepted | | [ADR-117](./infrastructure/ADR-117-deployment-topology.md) | Deployment Topology (Dev/Stable Split) | Draft | | [ADR-118](./infrastructure/ADR-118-semantic-election-protocol.md) | Semantic Election Protocol for Distributed Concept Placement | Proposed | +| [ADR-119](./infrastructure/ADR-119-appliance-configuration-delivery-and-first-boot-orchestration.md) | appliance configuration delivery and 
first-boot orchestration | Draft | ## Database/Schema _Apache AGE, migrations, schema design, PostgreSQL_ diff --git a/docs/architecture/infrastructure/ADR-104-unified-provisioning-architecture-install-path-convergence-and-first-run-claim-protocol.md b/docs/architecture/infrastructure/ADR-104-unified-provisioning-architecture-install-path-convergence-and-first-run-claim-protocol.md index da261c9d9..041f315dc 100644 --- a/docs/architecture/infrastructure/ADR-104-unified-provisioning-architecture-install-path-convergence-and-first-run-claim-protocol.md +++ b/docs/architecture/infrastructure/ADR-104-unified-provisioning-architecture-install-path-convergence-and-first-run-claim-protocol.md @@ -7,6 +7,7 @@ deciders: related: - ADR-103 - ADR-117 + - ADR-119 - ADR-211 - ADR-400 - ADR-401 @@ -29,7 +30,7 @@ they have drifted into duplicating the parts that should be identical. |------|-------|--------|---------------|---------| | `operator.sh init` | repo present | **build locally** (default) or pull GHCR | guided wizard / `--headless` | development, source-present hosts | | `install.sh` | **curl** from GitHub | **pull GHCR** | wizard / flags | bare-metal production one-liner | -| appliance first-boot | **pre-baked** repo + Docker | pull GHCR on first boot | declarative (cloud-init) / none | VM / fleet (ADR-103) | +| appliance first-boot | **pre-baked** repo + Docker | pull GHCR on first boot | declarative / interactive console / none (ADR-119) | VM / fleet (ADR-103) | The irreducible differences are real and worth keeping: **how files arrive** (repo / curl / baked), **where images come from** (build / pull), and **how much diff --git a/docs/architecture/infrastructure/ADR-119-appliance-configuration-delivery-and-first-boot-orchestration.md b/docs/architecture/infrastructure/ADR-119-appliance-configuration-delivery-and-first-boot-orchestration.md new file mode 100644 index 000000000..d05fc9a31 --- /dev/null +++ 
b/docs/architecture/infrastructure/ADR-119-appliance-configuration-delivery-and-first-boot-orchestration.md @@ -0,0 +1,224 @@ +--- +status: Draft +date: 2026-06-16 +deciders: + - aaronsb + - claude +related: + - ADR-103 + - ADR-104 + - ADR-105 + - ADR-117 +--- + +# ADR-119: appliance configuration delivery and first-boot orchestration + +## Context + +ADR-103 ships the platform as a thin appliance (OVA/qcow2). ADR-104 defines what +happens *once configuration is present* — the shared provisioning contract and the +console-token-gated first-run claim. This ADR fills the gap **before** that: how +configuration arrives, and how first boot behaves when it is — or isn't — there. + +Dogfooding the OVA path surfaced the problem. ADR-104 line 32 recorded the +appliance's interactivity as *"declarative (cloud-init) / none"* — either you +hand-craft a cloud-init seed, or you get defaults. That splits the audience badly: + +- **The seed, as we documented it, is a cloud/IaC idiom mis-applied to the + desktop.** Our own runbook told a VirtualBox/qemu user to hand-build a `cidata` + ISO with `xorriso` and attach it as a virtio disk — friction a "download the OVA + and import it" user will not push through. +- **"none" is a dead-end, not a first-run.** Without a seed the box comes up on a + DHCP IP with HTTP and no guided way to set the hostname, TLS, or a reasoning key + — the operator has to discover the rest. + +**Design value driving this ADR: minimize novelty.** Appliance provisioning is a +well-trodden, well-engineered space; our platform is not special, and the closer +its shape is to other appliance VMs the more *familiar* — and the less we have to +invent, document, and debug. So the decision below deliberately reaches for +established idioms (autoinstall/preseed, OVF properties, the pfSense/TrueNAS +console wizard) and rejects clever-but-novel machinery, even where the novel +version looked elegant. 
+ +## Decision + +Adopt the conventional appliance shape: **a config artifact's *presence* decides +the path** (the autoinstall/kickstart convention), and **all richer configuration +happens after boot** through the console (DCUI) and operator — the always-reachable +out-of-band surface every appliance already uses. + +### A — `provision.env` is the rendezvous; the carrier is a standard NoCloud seed + +`/etc/kg/provision.env` (ADR-103's declarative control surface) stays the single +meeting point; one consumer — `operator.sh init` (ADR-104's contract) — reads it. +Configuration is delivered by **the standard cloud-init datasources**, no bespoke +reader: + +| Producer | Context | Mechanism | +|----------|---------|-----------| +| **cloud-init NoCloud** | desktop / offline / orchestrator-attached volume | a vfat volume labeled `cidata`; cloud-init's `write_files` drops `provision.env` (today's mechanism) | +| **cloud-init network datasources** | EC2 / OpenStack / Proxmox cloud-drive | platform metadata, same `write_files` | +| **interactive console + operator** | human, **post-boot** | DCUI / `operator.sh set-*` write `provision.env` and apply | + +The external carrier is a **vfat** volume, FS-label **`cidata`**, holding the +standard `user-data` + `meta-data` (cloud-init NoCloud recognizes `vfat` or +`iso9660` by that label — not ext4). vfat because it is **writable** (the box can +export/refresh the config on it) and **builds without root or ISO tooling** +(`mcopy`), which drops cleanly into Terraform/Packer. This is precisely the carrier +**Ubuntu autoinstall** uses — maximally familiar, zero custom discovery code. + +Forward-compat needs only a `KG_PROVISION_SCHEMA=N` key inside `provision.env` +(unknown keys are already ignored) — **no separate volume signature file**. 
The +carrier holds **declarative inputs only** (hostname, TLS mode, DNS-01 creds, +optionally admin password / reasoning key); the per-instance infra secrets +(`ENCRYPTION_KEY`, `POSTGRES_PASSWORD`, signing keys) are **always minted fresh** +per install (ADR-104 Part A) and never ride the carrier. Reload = same inputs, new +secrets. The carrier is sensitive when it holds creds and is documented "treat like +a private key"; the builder (§D) offers an omit-secrets variant. + +Because we build the OVA, the OVF **pre-declares an empty virtio slot** at a +predictable address (`/dev/vdb`) so the user points an existing slot at their +`.img` rather than adding a controller. The cloud kernel has no AHCI, so the bus is +virtio ("attach this `.img` as a disk"); we accept that over a one-click +"attach ISO to CD" because broad compatibility beats the convenience of a kernel +swap. (Revisitable — an AHCI kernel later would unlock the CD-drive UX without +changing this model.) + +### B — First boot is presence-driven, not interactive + +`kg-firstboot` does **not** pause, prompt, or run a countdown. It branches on +whether configuration is present — the kickstart/autoinstall convention: + +```mermaid +flowchart TD + S[first boot · sentinel-guarded] --> R{cloud-init produced<br/>/etc/kg/provision.env?} + R -- yes --> APPLY[operator.sh init reads provision.env · start stack] + R -- no --> ZC[zero-config: mint secrets · start stack<br/>DHCP · HTTP · box usable] + APPLY --> CLAIM[hand to ADR-104 claim:<br/>UNCLAIMED + token, or pre-CLAIMED if creds supplied] + ZC --> CLAIM + CLAIM --> DONE[write sentinel → getty/console-TUI takes tty1] +``` + +This keeps first boot a headless oneshot (no tty ownership, no race with the +console getty), exactly as today — the *only* change is that the no-config branch +now lands the operator in a **real first-run experience**, not a dead-end (§C). +cloud-init keeps its standard `datasource_list` (NoCloud **and** the network +sources); attached labeled volumes are NoCloud-format — the same format +Proxmox/OpenStack config-drives use — so one path covers hand-rolled vfat *and* +platform config-drives. + +### C — All richer configuration is post-boot, via the DCUI + operator + +The no-config box boots usable, then the operator configures it through the +**always-reachable console (DCUI)** and the operator container — the +pfSense/TrueNAS shape, and the surface ADR-104's claim flow already uses: + +- **Web first-run / claim** (ADR-104): set admin password, paste a reasoning key. +- **Console DCUI + `operator.sh set-*`** (warm-reconfig, follow-on work): public + hostname, TLS mode, DNS-01 credentials, Cockpit access — the knobs that today + force a pre-boot seed. These are *idempotent* `operator.sh` commands (the + `cockpit-access` verb is the template) and each encapsulates the full ripple of + changing the external URL (Traefik router, the registered OAuth `redirect_uri`, + web runtime config, Cockpit `Origins`, the ACME cert) so a hostname change can't + silently half-apply. + +Cold (carrier) and warm (DCUI/operator) paths share the **same `provision.env` +schema and the same apply logic**, so they cannot drift. The carrier is +**cold-provisioning/reload only**; editing it does not re-apply to a running box — +that is the warm path's job.
+ +### D — One builder, three contexts (the operator container) + +Generating a standard NoCloud seed from inputs is pure (inputs → vfat image; no DB, +no running platform), so it is a verb on the **operator container** — the config +authority that already lives baked in the appliance *and* pullable from GHCR: + +``` +docker run kg-operator config-volume --hostname kg.x --tls letsencrypt \ + --dns-provider porkbun ... > config.img # standard cidata/NoCloud vfat +``` + +The same code runs in someone's Terraform, in the console's interactive flow, and +in the console's *export* flow ("pack and download the config out for re-use"). It +emits a stock NoCloud seed any cloud-init consumes — it is "a tool to generate +cloud-init user-data easily," not a custom format. + +## Consequences + +### Positive + +- **One artifact, both audiences, the familiar way.** A single OVA boots zero-touch + when a NoCloud seed is attached and into a first-run wizard when it isn't — + presence-driven, exactly like autoinstall/kickstart and vendor OVF appliances. +- **Almost nothing new is built.** cloud-init NoCloud is the reader; first boot + stays the current headless oneshot; the DCUI already exists. The net additions + are small: make "no config" a real first-run, add post-boot `set-*` knobs, and a + seed-builder verb. +- **Removes the seed-ISO friction** for the download-and-import user (carrier is + `mcopy`-buildable; the console path needs no carrier at all). +- **Cattle/pet separation.** The OVA is stateless and secret-free; the config seed + is the portable, owner-held identity → deterministic wipe-and-reload and a clean + Terraform/Packer seam. + +### Negative + +- **Richer first-time config moves to a second step** (post-boot DCUI/operator) + rather than being injectable pre-boot for the desktop user — but that *is* the + appliance convention, and the seed still serves the zero-touch case. 
+- **The config seed is a secrets-bearing artifact** when it carries DNS/provider + creds; it cannot be encrypted-at-rest for cold boot (no key exists yet — + chicken/egg), so it relies on operator handling + the omit-secrets variant. +- **The warm `set-*` commands must encapsulate the external-URL ripple** correctly + (Traefik + OAuth redirect + web config + Cockpit + cert) or a hostname change + half-applies — security-relevant, needs care. + +### Neutral + +- cloud-init is used as-is (standard datasources), neither demoted nor extended. +- The bus/kernel trade is revisitable (AHCI kernel → CD-drive UX) without touching + this model. +- The builder lands as an operator verb (ADR-211 sink), not a new tool. +- **Convergence contract (ADR-103).** The OVA is a thin *bootstrap seed*: + downloaded once, run, then kept current by `operator.sh upgrade` pulling fresh + GHCR images. The per-release artifacts are the *container images*, not the OVA — + so the OVA is republished only occasionally to move the baseline (`publish.sh + appliance`, decoupled from `release`). This ADR governs the *bootstrap*; ADR-103 + governs ongoing *currency*; the seam is the first boot that pulls images. + +## Alternatives Considered + +- **A countdown that flips between automated and interactive at boot.** Considered + and rejected as **novel** — real appliances decide by config *presence* + (kickstart `inst.ks=`, OVF property presence), not a timer. The clever version + also dragged in a bespoke volume reader, a signature file, and first-boot tty1 + ownership; presence-driven removes all of it. Familiarity and a smaller surface + beat the elegance. +- **A bespoke appliance-native config-volume reader + `/.kg-appliance` signature.** + Rejected: cloud-init's NoCloud datasource already reads a labeled vfat volume — + reimplementing it is novelty for its own sake. A `KG_PROVISION_SCHEMA` key covers + forward-compat. 
+- **Keep ADR-104's "declarative (cloud-init) / none".** Rejected: "none" as a + half-configured dead-end is the gap; a proper first-run (DCUI/web) closes it + without inventing anything. +- **Two OVA flavors (cloud vs interactive).** Rejected: doubles the release surface + for a difference that is purely whether a seed is attached. +- **iso9660 / ext4 carrier.** ext4 is not in cloud-init's NoCloud probe set; + iso9660 is recognized but read-only (no export/refresh) and needs ISO tooling. + vfat is the writable, tool-light intersection — and what Ubuntu autoinstall uses. + +### Prior art + +The result is an assembly of established, shipping idioms, not an invention — which +is the point: + +- **Autoinstall / preseed / kickstart.** One image runs interactive by default and + unattended when a config artifact is present — Ubuntu *autoinstall* uses the very + same NoCloud `cidata` carrier; RHEL/Fedora *kickstart* (`inst.ks=`), Debian + *preseed*, SUSE *AutoYaST* share the shape. +- **OVF vApp properties / ISO transport.** Vendor virtual appliances (F5, Palo + Alto, Cisco, NetScaler) ship a single OVA that reads hypervisor-injected + properties for zero-touch deploy, else drops to a console wizard — a polymorphic + OVA, shipping for 15+ years. +- **First-run appliances.** pfSense/OPNsense/TrueNAS/Home Assistant OS boot to a + usable default and configure via console (DCUI) / web afterward — the no-config + branch and the post-boot warm path. 
From c53b1b728b70420afde19ff9641ed1eb41fb7e10 Mon Sep 17 00:00:00 2001 From: Aaron Bockelie Date: Tue, 16 Jun 2026 11:15:03 -0500 Subject: [PATCH 2/3] =?UTF-8?q?feat(appliance):=20publish.sh=20appliance?= =?UTF-8?q?=20=E2=80=94=20OVA=20as=20bootstrap-seed=20release=20asset?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Automates the previously-manual 'attach the OVA to a Release by hand' flow: builds the OVA, xz-compresses the qcow2, writes SHA256SUMS, and uploads to the matching GitHub release (gh release upload --clobber). Decoupled from 'release' on purpose — the OVA is a thin bootstrap seed (ADR-103/ADR-119), republished occasionally to move the baseline, while per-release currency flows through GHCR images + operator upgrade. Adds 'make publish-appliance' and refreshes the appliance README to document the convergence contract. --- Makefile | 3 ++ appliance/README.md | 16 ++++++--- scripts/publish.sh | 83 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 97 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index f97ced7dc..1b5795cc1 100644 --- a/Makefile +++ b/Makefile @@ -134,6 +134,9 @@ publish: ## Interactive publish wizard publish-status: ## Show current versions and auth status @scripts/publish.sh status +publish-appliance: ## Build + attach the appliance OVA to the GitHub release (bootstrap seed) + @scripts/publish.sh appliance + ##@ Platform start: ## Start the platform diff --git a/appliance/README.md b/appliance/README.md index c8ece9a3b..2b8f2fc3d 100644 --- a/appliance/README.md +++ b/appliance/README.md @@ -48,10 +48,18 @@ boot during the build. Artifacts land in `appliance/out/`. The Debian base is cached in `appliance/.cache/` between builds (both are git-ignored). 
-The qcow2/OVA is built **locally** and attached to a GitHub Release by hand, -together with a `SHA256SUMS` file (`sha256sum *.qcow2.xz *.ova > SHA256SUMS`) — -same philosophy as the container images (build where it's fast, push, let CI -integrate). CI does **not** build the image: emulated `virt-customize` under TCG +The qcow2/OVA is built **locally** and attached to the matching GitHub Release, +together with a `SHA256SUMS` file — `scripts/publish.sh appliance` (or `make publish-appliance`) automates the build, +`.xz` compression, checksums, and upload (the previously-manual step). Same +philosophy as the container images (build where it's fast, push, let CI +integrate). + +The OVA is a **bootstrap seed, not a per-release artifact** (ADR-103): you +download it once, run it, and thereafter `operator.sh upgrade` pulls fresh GHCR +images to stay current. The *images* are the per-release artifacts; the OVA is +republished only occasionally to move the baseline — hence `publish.sh appliance` +is its own command, decoupled from `release`. CI does **not** build the image: +emulated `virt-customize` under TCG
Instead, the `appliance-integration.yml` workflow pulls the published GHCR images and runs this appliance's real first-boot path (`operator.sh init --headless diff --git a/scripts/publish.sh b/scripts/publish.sh index 45ad9aa78..a7fa2a642 100755 --- a/scripts/publish.sh +++ b/scripts/publish.sh @@ -91,6 +91,8 @@ Publishing: Defaults to rocm72-host only; --force enables deferred variants cli Publish npm package (@aaronsb/kg-cli) fuse Publish Python package (kg-fuse) to PyPI + appliance Build + attach the thin-appliance OVA to the GitHub + release (bootstrap seed; stay current via operator upgrade) all Publish everything (images, cli, fuse) Options: @@ -114,7 +116,7 @@ EOF while [[ $# -gt 0 ]]; do case "$1" in - status|bump|sync-scripts|release|images|images-rocm|all) + status|bump|sync-scripts|release|images|images-rocm|appliance|all) COMMAND="$1" shift ;; @@ -1076,6 +1078,84 @@ cmd_fuse() { fi } +# cmd_appliance +# +# Publish the x86 thin-appliance OVA (+ compressed qcow2) as assets on the +# current GitHub release. DELIBERATELY decoupled from the per-release cadence: +# the OVA is a thin *bootstrap seed* (ADR-103) — downloaded once, run, then kept +# current by `operator.sh upgrade` pulling fresh GHCR images. The container +# images are the per-release artifacts; the OVA is refreshed occasionally to move +# the baseline, not on every patch. So this is its own command, not a step in +# `release`. Automates the previously-manual "attach to a Release by hand" flow +# (appliance/README.md). 
+cmd_appliance() { + get_versions + + local out_dir="$PROJECT_ROOT/appliance/out" + local label="$VERSION" + local tag="v${VERSION}" + local qcow2="$out_dir/kg-appliance-${label}.qcow2" + local qcow2_xz="${qcow2}.xz" + local ova="$out_dir/kg-appliance-${label}.ova" + + echo -e "${BOLD}Publishing appliance bootstrap image${NC}" + echo -e " Version: ${BLUE}${label}${NC} → GitHub release ${BLUE}${tag}${NC}" + [ "$DRY_RUN" = "true" ] && echo -e " Mode: ${YELLOW}DRY RUN${NC}" + echo "" + echo -e " ${DIM}Convergence contract (ADR-103): the OVA is a thin bootstrap seed —${NC}" + echo -e " ${DIM}download once, run, then 'operator.sh upgrade' keeps it current via${NC}" + echo -e " ${DIM}GHCR images. Not re-downloaded per release; published occasionally.${NC}" + echo "" + + if ! command -v gh &>/dev/null; then + echo -e "${RED}gh (GitHub CLI) not found${NC}"; echo " Install: https://cli.github.com/"; exit 1 + fi + + # --- Build (unless --skip-build) ----------------------------------------- + if [ "$SKIP_BUILD" = "false" ]; then + echo -e "${BLUE}→ Building OVA (kg-appliance-${label})...${NC}" + "$PROJECT_ROOT/appliance/build-appliance.sh" --ova --version "$label" + echo -e "${GREEN}✓ Build complete${NC}" + echo "" + fi + if [ ! -f "$ova" ]; then + echo -e "${RED}OVA not found: $ova${NC}" + echo -e " ${DIM}Build it: appliance/build-appliance.sh --ova --version $label${NC}" + exit 1 + fi + + # --- Compress the qcow2 (the Proxmox/QEMU asset; OVA is already compact) -- + if [ -f "$qcow2" ] && [ ! 
-f "$qcow2_xz" ]; then + echo -e "${BLUE}→ Compressing qcow2 → .xz (a few minutes)...${NC}" + xz -T0 -k -f "$qcow2" + fi + + # --- Checksums over exactly the assets we publish ------------------------ + local -a assets=( "$ova" ) + [ -f "$qcow2_xz" ] && assets+=( "$qcow2_xz" ) + echo -e "${BLUE}→ Writing SHA256SUMS${NC}" + ( cd "$out_dir" && sha256sum $(for a in "${assets[@]}"; do basename "$a"; done) > SHA256SUMS ) + sed 's/^/ /' "$out_dir/SHA256SUMS" + assets+=( "$out_dir/SHA256SUMS" ) + echo "" + + if [ "$DRY_RUN" = "true" ]; then + echo -e "${DIM}Would upload to release ${tag}:${NC}" + for a in "${assets[@]}"; do echo " $(basename "$a") ($(du -h "$a" | cut -f1))"; done + return 0 + fi + + # --- Ensure the release exists, then upload (clobber to refresh) ---------- + if ! gh release view "$tag" >/dev/null 2>&1; then + echo -e "${BLUE}→ Creating release ${tag}${NC}" + gh release create "$tag" --title "$tag" --notes "${DESCRIPTION:-Appliance bootstrap image ${tag}}" + fi + echo -e "${BLUE}→ Uploading ${#assets[@]} asset(s) to ${tag}${NC}" + gh release upload "$tag" "${assets[@]}" --clobber + echo -e "${GREEN}✓ Published appliance bootstrap image to release ${tag}${NC}" + echo -e " ${DIM}Download: gh release download ${tag} -p 'kg-appliance-*.ova'${NC}" +} + cmd_all() { echo -e "${BOLD}Publishing all packages${NC}" echo "" @@ -1100,5 +1180,6 @@ case "$COMMAND" in images-rocm) cmd_images_rocm ;; cli) cmd_cli ;; fuse) cmd_fuse ;; + appliance) cmd_appliance ;; all) cmd_all ;; esac From bcd008b9948cb0ae18675b0a52ac17fd0bdcefd6 Mon Sep 17 00:00:00 2001 From: Aaron Bockelie Date: Tue, 16 Jun 2026 11:21:44 -0500 Subject: [PATCH 3/3] =?UTF-8?q?fix(appliance):=20address=20PR=20#531=20rev?= =?UTF-8?q?iew=20=E2=80=94=20fresh=20.xz,=20side-effect-free=20dry-run?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Always recompress the qcow2 (drop the stale-.xz guard that would re-publish old bytes under a fresh checksum on a 
same-VERSION rebuild) [H1/H2] - Dry-run reports intent only; no xz, no SHA256SUMS write [L3] - Only publish a .xz when a source qcow2 exists this run (OVA-only --skip-build ships just the OVA) [H2] - Upload SHA256SUMS last as the commit marker; add a verify hint [M3] - Comment the release-minting decoupling [M2] - ADR-119: soften the OVF empty-slot claim to future tense + follow-on note [M1] --- ...n-delivery-and-first-boot-orchestration.md | 7 +++- scripts/publish.sh | 39 ++++++++++++------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/docs/architecture/infrastructure/ADR-119-appliance-configuration-delivery-and-first-boot-orchestration.md b/docs/architecture/infrastructure/ADR-119-appliance-configuration-delivery-and-first-boot-orchestration.md index d05fc9a31..f19a037ab 100644 --- a/docs/architecture/infrastructure/ADR-119-appliance-configuration-delivery-and-first-boot-orchestration.md +++ b/docs/architecture/infrastructure/ADR-119-appliance-configuration-delivery-and-first-boot-orchestration.md @@ -76,9 +76,12 @@ per install (ADR-104 Part A) and never ride the carrier. Reload = same inputs, n secrets. The carrier is sensitive when it holds creds and is documented "treat like a private key"; the builder (§D) offers an omit-secrets variant. -Because we build the OVA, the OVF **pre-declares an empty virtio slot** at a +Because we build the OVA, the OVF **will pre-declare an empty virtio slot** at a predictable address (`/dev/vdb`) so the user points an existing slot at their -`.img` rather than adding a controller. The cloud kernel has no AHCI, so the bus is +`.img` rather than adding a controller. *(Follow-on, not yet shipped: the current +`ovf/kg-appliance.ovf.template` declares only the primary disk; until the empty +slot lands, the carrier is attached as an added virtio disk — the same friction +this is meant to remove. 
Tracked with the first-boot orchestration work below.)* The cloud kernel has no AHCI, so the bus is virtio ("attach this `.img` as a disk"); we accept that over a one-click "attach ISO to CD" because broad compatibility beats the convenience of a kernel swap. (Revisitable — an AHCI kernel later would unlock the CD-drive UX without diff --git a/scripts/publish.sh b/scripts/publish.sh index a7fa2a642..d2d7d1703 100755 --- a/scripts/publish.sh +++ b/scripts/publish.sh @@ -1124,36 +1124,47 @@ cmd_appliance() { exit 1 fi - # --- Compress the qcow2 (the Proxmox/QEMU asset; OVA is already compact) -- - if [ -f "$qcow2" ] && [ ! -f "$qcow2_xz" ]; then + # --- Dry run: report intent only, NO side effects (no xz, no SHA file) --- + if [ "$DRY_RUN" = "true" ]; then + echo -e "${DIM}Would compress, checksum, and upload to release ${tag}:${NC}" + echo " $(basename "$ova") ($(du -h "$ova" | cut -f1))" + [ -f "$qcow2" ] && echo " $(basename "$qcow2_xz") (compressed from $(du -h "$qcow2" | cut -f1) qcow2)" + echo " SHA256SUMS" + return 0 + fi + + # --- Compress the qcow2 FRESH every run ----------------------------------- + # The .xz name is keyed on the clean VERSION, so a stale same-VERSION .xz + # from a prior build must never be reused (xz -f overwrites). Only publish a + # .xz when we have a source qcow2 *this* run — a --skip-build OVA-only run + # ships just the OVA, never a leftover .xz. 
+ if [ -f "$qcow2" ]; then echo -e "${BLUE}→ Compressing qcow2 → .xz (a few minutes)...${NC}" xz -T0 -k -f "$qcow2" fi - # --- Checksums over exactly the assets we publish ------------------------ + # --- Checksums over exactly the binary assets we publish ----------------- local -a assets=( "$ova" ) - [ -f "$qcow2_xz" ] && assets+=( "$qcow2_xz" ) + [ -f "$qcow2" ] && assets+=( "$qcow2_xz" ) echo -e "${BLUE}→ Writing SHA256SUMS${NC}" ( cd "$out_dir" && sha256sum $(for a in "${assets[@]}"; do basename "$a"; done) > SHA256SUMS ) sed 's/^/ /' "$out_dir/SHA256SUMS" - assets+=( "$out_dir/SHA256SUMS" ) echo "" - if [ "$DRY_RUN" = "true" ]; then - echo -e "${DIM}Would upload to release ${tag}:${NC}" - for a in "${assets[@]}"; do echo " $(basename "$a") ($(du -h "$a" | cut -f1))"; done - return 0 - fi - - # --- Ensure the release exists, then upload (clobber to refresh) ---------- + # --- Ensure the release exists, then upload ------------------------------ + # Decoupled from `release`: if $tag doesn't exist this MINTS the GitHub + # release — a bootstrap image can ship for a baseline that was never formally + # `release`d. Upload binaries first, then SHA256SUMS LAST as the commit + # marker, so a mid-upload failure never leaves a checksum ahead of its bytes. if ! 
gh release view "$tag" >/dev/null 2>&1; then echo -e "${BLUE}→ Creating release ${tag}${NC}" gh release create "$tag" --title "$tag" --notes "${DESCRIPTION:-Appliance bootstrap image ${tag}}" fi - echo -e "${BLUE}→ Uploading ${#assets[@]} asset(s) to ${tag}${NC}" + echo -e "${BLUE}→ Uploading ${#assets[@]} image asset(s) to ${tag}${NC}" gh release upload "$tag" "${assets[@]}" --clobber + gh release upload "$tag" "$out_dir/SHA256SUMS" --clobber echo -e "${GREEN}✓ Published appliance bootstrap image to release ${tag}${NC}" - echo -e " ${DIM}Download: gh release download ${tag} -p 'kg-appliance-*.ova'${NC}" + echo -e " ${DIM}Verify: gh release download ${tag} -p 'kg-appliance-*' -p SHA256SUMS && sha256sum -c SHA256SUMS${NC}" } cmd_all() {