dAppCore · Snider · May 8, 2026 · May 8, 2026 · May 8, 2026 · May 8, 2026
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 # Build artifacts
 build/
+bin/
 *.dylib
 *.so
 *.a

diff --git a/.gitmodules b/.gitmodules
@@ -22,3 +22,11 @@
 	path = external/go-io
 	url = https://github.com/dappcore/go-io.git
 	branch = dev
+[submodule "external/go-ai"]
+	path = external/go-ai
+	url = https://github.com/dappcore/go-ai.git
+	branch = dev
+[submodule "external/go-ml"]
+	path = external/go-ml
+	url = https://github.com/dappcore/go-ml.git
+	branch = dev
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -44,6 +44,7 @@ After Mantis #1241, all Go code lives under `go/`:
 ```
 go/                          Go module root (dappco.re/go/mlx)
   *.go                       Public root API: model, tokenizer, compute, training, eval, distill, GRPO, hf-fit, merge, gguf-quantize, kv-snapshot, lora-fuse
+  cmd/mlx/                   CLI tool (built with `-o core-mlx`; consumers rename: lthn-mlx)
   cmd/violet/                Unix-socket sidecar daemon
   internal/metal/            All CGO code (mlx-c bindings)
   mlxlm/                     CGO-free Python subprocess backend

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -3,6 +3,9 @@ cmake_minimum_required(VERSION 3.24)
 project(mlx)
 
 set(CMAKE_OSX_DEPLOYMENT_TARGET "26.0" CACHE STRING "Minimum macOS version")
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
 
 if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
   set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/dist" CACHE PATH "" FORCE)
@@ -17,7 +20,8 @@ set(CMAKE_INSTALL_RPATH "@loader_path")
 
 include(FetchContent)
 
-set(MLX_C_GIT_TAG "v0.4.1" CACHE STRING "")
+set(MLX_C_GIT_TAG "v0.6.0" CACHE STRING "")
+set(FETCHCONTENT_SOURCE_DIR_MLX "${CMAKE_CURRENT_SOURCE_DIR}/lib/mlx" CACHE PATH "Local patched MLX source")
 
 FetchContent_Declare(
   mlx-c

diff --git a/GOAL.md b/GOAL.md
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -1,7 +1,9 @@
 cmake_minimum_required(VERSION 3.24)
 project(go-mlx-cpp LANGUAGES C CXX)
 
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
 
 # Fetch mlx-c v0.4.1 — same version as the Go side
 include(FetchContent)

diff --git a/docs/README.md b/docs/README.md
@@ -0,0 +1,146 @@
+<!-- SPDX-Licence-Identifier: EUPL-1.2 -->
+
+# go-mlx — documentation index
+
+**Module**: `dappco.re/go/mlx`
+**Role**: Native Apple Metal GPU inference + research-grade training pipeline. Implements the go-inference `Backend` + `TextModel` + `Session/Forker` contracts for darwin/arm64.
+
+## Tetrad position
+
+```
+                 ┌──────────────────────────────┐
+                 │      dappco.re/go (core)     │
+                 └──────────────┬───────────────┘
+                                │
+                 ┌──────────────┴───────────────┐
+                 │   go-inference  (contract)   │
+                 └─────┬───────────────────┬────┘
+                       │                   │ register via init()
+                ┌──────┴──────┐     ┌──────┴──────┐
+ you are here → │   go-mlx    │     │  go-rocm /  │
+                │   darwin    │     │  go-cuda    │
+                │   arm64     │     │  (planned)  │
+                └──────┬──────┘     └─────────────┘
+                       │ consumed by
+           ┌───────────┴────┬────────────────┐
+           │  go-ml         │  go-ai         │
+           │  scoring/agent │  router/demos  │
+           └────────────────┴────────────────┘
+```
+
+## What this package owns
+
+Nine distinct areas, each with its own doc subtree:
+
+| Area | Owns | Doc |
+|------|------|-----|
+| `runtime/` | Backend registration + adapter + Metal allocator | [runtime/README.md](runtime/README.md) |
+| `memory/` | KV snapshots + bundles + memvid + Wake/Sleep/Fork | [memory/README.md](memory/README.md) |
+| `moe/` | MiniMax M2 + JANG/JANGTQ + codebook VQ + expert residency | [moe/README.md](moe/README.md) |
+| `training/` | SFT + GRPO + distillation + LoRA + eval + merge | [training/README.md](training/README.md) |
+| `model/` | Model-pack validation + memory planning + GGUF | [model/README.md](model/README.md) |
+| `inference/` | Scheduler + block cache + decode opt + parsers + thinking | [inference/README.md](inference/README.md) |
+| `compute/` | Non-LLM Metal compute (pixel buffers, kernels, frame pipelines) | [compute/compute.md](compute/compute.md) |
+| `observability/` | Probe emission (token / entropy / heads / router / cache / memory / training) | [observability/probe.md](observability/probe.md) |
+| `cmd/` | Sidecar daemons | [cmd/violet.md](cmd/violet.md) |
+
+## Mental model
+
+```
+                  ┌─────────────────────────────────┐
+                  │  caller: inference.LoadModel    │
+                  └──────────────┬──────────────────┘
+                                 │
+              ┌──────────────────┴───────────────────┐
+              │      go-inference Default()          │
+              │   picks "metal" → metalbackend       │
+              └──────────────────┬───────────────────┘
+                                 │
+                    runtime/ (register_metal.go)
+                                 │
+                                 ▼
+              ┌──────────────────────────────────────┐
+              │ memory_plan → load weights via       │
+              │ medium → metal.LoadAndInit → produce │
+              │ &metaladapter wrapping metal.Model   │
+              └──────────────────┬───────────────────┘
+                                 │
+        ┌────────────┬───────────┴────────┬──────────────┐
+        ▼            ▼                    ▼              ▼
+   inference/   memory/             training/       observability/
+   (scheduler   (Wake/Sleep         (SFT/LoRA/      (probe events)
+    cache       bundles             GRPO/distill/
+    decode-opt  memvid)              eval)
+    parsers
+    thinking)
+
+   moe/ adds MoE-specific paths into each area.
+   compute/ runs alongside on the same Metal device.
+```
+
+## Status snapshot (2026-05-11)
+
+**Production**: dense models (Gemma 3/4 dense, Qwen 2/3, Llama 3) — load, inference, scheduler, block cache, KV snapshots, agent memory wake/sleep/fork, SFT, LoRA, distillation, GRPO, eval, model pack validation, GGUF read+write, memory planning, frame compute. Qwen 3.6 model packs are recognised and planned through the `mlx_lm` fallback while native hybrid linear-attention kernels are pending.
+
+**Phase 1 in flight** (vMLX parity sprint, started 2026-05-09): MiniMax M2/2.7 MoE forward, JANGTQ_K weight load, codebook VQ kernels, expert residency native path, disk-backed block cache.
+
+**Planned**: speculative decoding (paired with Gemma 4 `-assistant`), prompt-lookup decoding, embeddings + rerank surfaces, OpenAI Responses handler, vision/audio (out-of-scope for core runner near-term).
+
+## Repository layout
+
+```
+go-mlx/
+├── go/                     Go module root (dappco.re/go/mlx)
+│   ├── *.go                ← root package (80+ files, this is where docs land)
+│   ├── internal/metal/     ← CGO bindings to mlx-c (44 files, internal)
+│   ├── mlxlm/              ← CGO-free Python subprocess fallback
+│   ├── cmd/violet/         ← Unix-socket sidecar daemon
+│   ├── cmd/mlx/            ← CLI tool (built with `-o core-mlx`; consumers rename: lthn-mlx, etc.)
+│   ├── pkg/daemon/         ← daemon implementation
+│   ├── pkg/memvid/         ← QR-video knowledge-pack codec
+│   └── tests/              ← integration tests
+├── cpp/                    C++ companion (CLion-side)
+├── docs/                   ← YOU ARE HERE
+├── examples/               per-feature usage walkthroughs
+├── external/               vendored core libraries
+├── lib/mlx/                upstream MLX submodule (v0.31.1)
+└── patches/                local patches to lib/mlx
+```
+
+## Where to start
+
+- **Caller (loading a model)** → [`runtime/register_metal.md`](runtime/register_metal.md) + [`runtime/adapter.md`](runtime/adapter.md)
+- **Local setup / autotune UI** → [`runtime/local_autotune.md`](runtime/local_autotune.md)
+- **Agent memory / book state** → [`memory/agent_memory.md`](memory/agent_memory.md)
+- **LTHN project context seed** → [`memory/agentic_project_seed.md`](memory/agentic_project_seed.md)
+- **Training Vi or a custom model** → [`training/README.md`](training/README.md) → [`training/sft.md`](training/sft.md) → [`training/distill.md`](training/distill.md)
+- **Understanding the vMLX parity work** → [`moe/README.md`](moe/README.md) + `docs/vmlx-feature-gap-report.md`
+- **Serving many requests** → [`inference/scheduler.md`](inference/scheduler.md)
+- **Frame compute (emulator UIs)** → [`compute/compute.md`](compute/compute.md)
+- **Sidecar deployment** → [`cmd/violet.md`](cmd/violet.md)
+
+## Legacy docs
+
+The flat docs in this folder (`architecture.md`, `compute.md`, `distillation.md`, `grpo.md`, `models.md`, `training.md`, `eval.md`, `model-operations.md`, `model-state-roadmap.md`, `build.md`, `development.md`, `history.md`, `index.md`, `vmlx-feature-gap-report.md`, `superpowers/plans/2026-05-09-vmlx-feature-parity.md`) pre-date this per-file pass and may rot. Keep `vmlx-feature-gap-report.md` and the parity plan (they're active references). Fold the rest into the per-package READMEs over time.
+
+## Measured
+
+| Operation | Bundle / model | Latency |
+|-----------|----------------|---------|
+| Wake — chapter (warm) | ~500MB | 998ms |
+| Wake — full book (warm) | ~10.5GB | 2.15s |
+| Wake — full book (cold runner) | ~10.5GB | 55.2s |
+| Sleep — incremental, parent-reuse | 200-token delta | <1s |
+| Gemma 4 E2B inference (M3 Ultra) | dense | ~80 tok/s decode |
+| Gemma 4 26B inference (M3 Ultra) | dense | ~25 tok/s decode |
+
+## Standards
+
+- UK English in code, comments, docs (colour, organisation, licence, serialise)
+- SPDX header on every new file: `// SPDX-Licence-Identifier: EUPL-1.2`
+- Conventional commits: `type(scope): description` — scopes per package + `metal`, `api`, `mlxlm`, `repo`, `deps`
+- Test triplets: `_Good` / `_Bad` / `_Ugly` + `*_example_test.go` runnable examples
+- Error wrapping via `core.E(scope, msg, cause)`
+- Co-Author: `Co-Authored-By: Virgil <virgil@lethean.io>`
+- Native files: `//go:build darwin && arm64` (or `&& !nomlx`); stubs return false on `MetalAvailable()`
+- CGO confined to `go/internal/metal/`
diff --git a/docs/architecture.md b/docs/architecture.md
@@ -41,23 +41,26 @@ internal/metal/                                   <-- All CGO code
     +-- metal.go       Init, error handler, Eval, Materialize
     |
     v
-mlx-c v0.4.1                                     <-- C API (fetched by CMake)
+mlx-c v0.6.0                                     <-- C API (fetched by CMake)
     |
     v
-Apple MLX / Metal / Accelerate                    <-- GPU compute
+Apple MLX v0.31.1 / Metal / Accelerate            <-- local patched lib/mlx
 ```
 
 ## CGO Binding
 
 ### Build Chain
 
-mlx-c is fetched and built by CMake via `go generate ./...`. The `CMakeLists.txt` at the module root pulls mlx-c v0.4.1 from GitHub:
+mlx-c is fetched and built by CMake via `go generate ./...`. The
+`CMakeLists.txt` at the module root pulls mlx-c v0.6.0 from GitHub and points
+mlx-c's nested MLX dependency at the local patched `lib/mlx` submodule:
 
 ```cmake
+set(FETCHCONTENT_SOURCE_DIR_MLX "${CMAKE_CURRENT_SOURCE_DIR}/lib/mlx" CACHE PATH "Local patched MLX source")
 FetchContent_Declare(
   mlx-c
   GIT_REPOSITORY "https://github.com/ml-explore/mlx-c.git"
-  GIT_TAG "v0.4.1"
+  GIT_TAG "v0.6.0"
 )
 ```
 
@@ -255,7 +258,7 @@ session, err := mlx.NewSession()
 
 Options from `inference.LoadConfig` understood by the Metal backend:
 
-- `ContextLen` -- replaces unbounded `KVCache` with `RotatingKVCache(contextLen)` for all layers; default 131072
+- `ContextLen` -- replaces unbounded `KVCache` with `RotatingKVCache(contextLen)` for all layers; default `131072` (`128Ki` tokens)
 - `ParallelSlots` -- caps concurrent native inference calls for one loaded model before KV/cache allocation; default 1
 - `AdapterPath` -- loads a trained LoRA adapter from disk at model load time
 - `GPULayers` -- logged as a warning if set to 0 (Metal always uses full GPU offload)

diff --git a/docs/build.md b/docs/build.md
@@ -47,7 +47,8 @@ The submodule initialisation is required because `internal/metal/` contains
 forwarding translation units that include sources from `lib/mlx`, `lib/mlx-c`,
 and `lib/generated`.
 
-CMake fetches mlx-c v0.4.1 from GitHub and builds it with:
+CMake fetches mlx-c v0.6.0 from GitHub and builds it against the local
+patched `lib/mlx` submodule with:
 
 - `MLX_BUILD_SAFETENSORS=ON` -- required for model loading
 - `MLX_BUILD_GGUF=ON` -- enables GGUF load/save support
@@ -133,7 +134,8 @@ set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE)
 set(CMAKE_INSTALL_RPATH "@loader_path")
 
 include(FetchContent)
-set(MLX_C_GIT_TAG "v0.4.1" CACHE STRING "")
+set(MLX_C_GIT_TAG "v0.6.0" CACHE STRING "")
+set(FETCHCONTENT_SOURCE_DIR_MLX "${CMAKE_CURRENT_SOURCE_DIR}/lib/mlx" CACHE PATH "Local patched MLX source")
 FetchContent_Declare(
   mlx-c
   GIT_REPOSITORY "https://github.com/ml-explore/mlx-c.git"
@@ -230,8 +232,8 @@ CGO call overhead floors at approximately 170 us per operation (Metal command bu
 ```
 go-mlx
 +-- forge.lthn.ai/core/go-inference  (shared interfaces, zero dependencies)
-+-- mlx-c v0.4.1                     (CMake, fetched at go generate time)
-    +-- Apple MLX (Metal GPU compute)
++-- mlx-c v0.6.0                     (CMake, fetched at go generate time)
+    +-- Apple MLX v0.31.1             (local patched lib/mlx submodule)
         +-- Foundation, Metal, Accelerate frameworks
 ```
 

diff --git a/docs/cmd/violet.md b/docs/cmd/violet.md
@@ -0,0 +1,112 @@
+<!-- SPDX-Licence-Identifier: EUPL-1.2 -->
+
+# cmd/violet — local-native inference sidecar
+
+**Package**: `dappco.re/go/mlx/cmd/violet`
+**Files**: `cmd/violet/main.go` (entry) + `pkg/daemon/` (server)
+
+## What this is
+
+The **Violet sidecar daemon** — a long-running process exposing inference + agent memory over a Unix socket. Lets local processes (CoreAgent, IDE, ml lab) call into a hot, model-loaded mlx runtime without each spawning their own.
+
+Violet is what Cladius posts to instead of burning Anthropic tokens for routine inference. It's the local substrate that survives Codex's uncertain status (per `project_codex_status_uncertain.md`) and the budget pressure (per `project_go_mlx_research_grade.md`).
+
+## Why a daemon
+
+Three reasons one shared process beats N short-lived processes:
+
+1. **Model load cost.** Loading Gemma 4 26B takes 30-60s on first touch. The daemon pays it once.
+2. **KV cache locality.** Sessions retain their KV across requests; a fresh process can't.
+3. **Memory budget.** Two LLM processes don't fit on a 96GB Ultra; one daemon serving many clients does.
+
+## Transport
+
+Unix domain socket — fast, secure-by-default (filesystem permissions), no TCP overhead.
+
+```bash
+violet --socket /var/run/violet/violet.sock --config /etc/violet.toml
+```
+
+Request envelope is line-delimited JSON over the socket; responses likewise (or SSE-like multi-line for streaming).
+
+## Surface
+
+Per-request operations (a subset; more will land as the parity sprint completes):
+
+- `Generate` / `Chat` — text generation
+- `Classify` / `BatchGenerate`
+- `WakeState` / `SleepState` / `ForkState` — agent memory
+- `CacheStats` / `WarmCache` / `ClearCache` — prompt cache
+- `CapabilityReport` — what this daemon supports right now
+- `LoadModel` / `UnloadModel` — admin (default off, opt-in via config)
+
+## Config
+
+```toml
+# /etc/violet.toml
+
+[runtime]
+socket = "/var/run/violet/violet.sock"
+default_model = "gemma-4-e2b"
+
+[models.gemma-4-e2b]
+path = "/Volumes/Data/models/gemma-4-e2b/"
+context_length = 32768
+
+[models.qwen-3-coding]
+path = "/Volumes/Data/models/qwen-3-coding-30b/"
+context_length = 16384
+
+[memory]
+bundles_dir = "/var/lib/violet/bundles"
+codec = "memvid"           # or "file"
+
+[scheduler]
+max_concurrent = 4
+max_queue      = 32
+
+[probe]
+log_dir = "/var/log/violet/probes"
+```
+
+The daemon pre-loads `default_model` at startup. Other models load lazily on first reference.
+
+## Lifecycle
+
+```
+violet starts
+   ↓
+read config + open socket
+   ↓
+pre-load default model
+   ↓
+warm prompt cache from on-disk seeds (if configured)
+   ↓
+serve requests until SIGINT/SIGTERM
+   ↓
+flush in-flight bundles to durable storage
+   ↓
+unload models cleanly
+   ↓
+close socket
+```
+
+## Used by
+
+- **Cladius's local-inference skills** — `mattermost`, `wiki`, code summarise — call violet for batch text processing instead of round-tripping Anthropic
+- **CoreAgent / core/ide** — chat-with-local-model surface
+- **Vi training pipeline** — distillation teacher endpoint
+- **LARQL vindex inspection** — pre/post-SFT model inference for diff
+
+## Status
+
+Production. Used in the daily Cladius workflow (the `wiki`, `mattermost`, and code-summarise skills route through it).
+
+## Related
+
+- `pkg/daemon/` — server implementation (planned dedicated doc)
+- `../memory/agent_memory.md` — Wake/Sleep exposed over the socket
+- `../inference/scheduler.md` — the scheduler that admits violet requests
+- `../runtime/register_metal.md` — Violet boots the metal backend
+- `project_local_inference_topology.md` — measured topology
+- `project_go_mlx_research_grade.md` — the substrate this is part of
-Original file line number
+Diff line change
@@ -1,5 +1,6 @@
     # Build artifacts
     build/
+    bin/
     *.dylib
     *.so
     *.a
@@ Expand Down @@