From ab0e1ae803950407b743ffb6b992706437e90a8e Mon Sep 17 00:00:00 2001
From: John Rocky <john-rocky@users.noreply.github.com>
Date: Wed, 15 Apr 2026 18:14:17 +0900
Subject: [PATCH 1/2] =?UTF-8?q?spike(D1b):=20compute-unit=20split=20?=
 =?UTF-8?q?=E2=80=94=20chunk3=20on=20.cpuAndGPU=20enables=20overlap?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Env-gated follow-up to PR #75. When COMPUTE_UNIT_SPLIT=1, chunk3 is
loaded with MLModelConfiguration.computeUnits = .cpuAndGPU while the
other chunks stay on the inherited unit (usually ANE). A one-shot
probe mirrors PR #75's runConcurrencyProbe but pairs c2 (ANE) and
c3 (GPU) on separate DispatchQueues.

Finding: overlap factor 0.87-0.99 across all four prompt categories
(vs 0.02-0.06 on pure-ANE in PR #75). Kernel-level parallelism
between ANE and GPU drivers works as hypothesised.

Caveat: end-to-end tok/s regresses 33.2 -> 25.3 (-24%) because c3 on
GPU is 2.2x slower (7.5 ms -> 16.6 ms) and the current serial
predictStep pays that deficit without claiming the overlap prize.
Realising the win needs a follow-up pipelining change.

Default-off: COMPUTE_UNIT_SPLIT unset produces zero behaviour change.
97 net-added Swift lines (one config branch in load(), one probe).

See docs/PHASE_D_COMPUTE_UNIT_SPLIT_SPIKE.md for full data and the
projection for a proper 2-stage pipeline (~43 tok/s ceiling, not 56).
---
 Sources/CoreMLLLM/ChunkedEngine.swift    |  99 ++++++++++-
 docs/PHASE_D_COMPUTE_UNIT_SPLIT_SPIKE.md | 203 +++++++++++++++++++++++
 2 files changed, 300 insertions(+), 2 deletions(-)
 create mode 100644 docs/PHASE_D_COMPUTE_UNIT_SPLIT_SPIKE.md

diff --git a/Sources/CoreMLLLM/ChunkedEngine.swift b/Sources/CoreMLLLM/ChunkedEngine.swift
index cdf9b11..04bc8ed 100644
--- a/Sources/CoreMLLLM/ChunkedEngine.swift
+++ b/Sources/CoreMLLLM/ChunkedEngine.swift
@@ -153,6 +153,22 @@ final class ChunkedEngine {
             }
         }
 
+        // Phase D1b spike: optionally move chunk3 to .cpuAndGPU while other
+        // chunks stay on the inherited compute unit (usually ANE via .all).
+        // Hypothesis: distinct-compute-unit chunks go through distinct drivers
+        // and can overlap, where pure-ANE chunks serialise (see PR #75).
+        // Gated by COMPUTE_UNIT_SPLIT=1. Default-off, zero behaviour change.
+        let splitEnabled = ProcessInfo.processInfo.environment["COMPUTE_UNIT_SPLIT"] == "1"
+        let splitTarget = ProcessInfo.processInfo.environment["COMPUTE_UNIT_SPLIT_CHUNK"] ?? "chunk3"
+        let splitConfig: MLModelConfiguration = {
+            let c = MLModelConfiguration()
+            c.computeUnits = .cpuAndGPU
+            return c
+        }()
+        if splitEnabled {
+            print("[Spike] COMPUTE_UNIT_SPLIT=1 — \(splitTarget) will load on .cpuAndGPU")
+        }
+
         func findModel(_ name: String) -> URL? {
             // For .mlmodelc we require coremldata.bin alongside the directory
             // — a half-populated directory (e.g. stray prefill_chunk with only
@@ -171,9 +187,11 @@ final class ChunkedEngine {
                 throw CoreMLLLMError.modelNotFound(name)
             }
             let t0 = CFAbsoluteTimeGetCurrent()
-            let m = try MLModel(contentsOf: url, configuration: cfg)
+            let effectiveCfg = (splitEnabled && name == splitTarget) ? splitConfig : cfg
+            let m = try MLModel(contentsOf: url, configuration: effectiveCfg)
             let dt = CFAbsoluteTimeGetCurrent() - t0
-            print("[Load] \(name) done in \(String(format: "%.1f", dt))s")
+            print("[Load] \(name) done in \(String(format: "%.1f", dt))s" +
+                  (splitEnabled && name == splitTarget ? " (.cpuAndGPU)" : ""))
             return m
         }
 
@@ -407,6 +425,14 @@ final class ChunkedEngine {
         engine.reset()
         print("[Load] ANE prewarm (4 steps) done in \(String(format: "%.2f", CFAbsoluteTimeGetCurrent() - warmT0))s")
 
+        // Phase D1b spike: one-shot c2/c3 cross-compute-unit overlap probe.
+        // Only runs when COMPUTE_UNIT_SPLIT=1 (so chunk3 is on .cpuAndGPU).
+        // Mirrors PR #75's probe pattern but measures c2 (ANE) vs c3 (GPU).
+        if splitEnabled {
+            try engine.runComputeUnitSplitProbe()
+            engine.reset()
+        }
+
         return engine
     }
 
@@ -1585,6 +1611,75 @@ final class ChunkedEngine {
     func makeDrafterFullMask(position: Int) throws -> MLMultiArray {
         try makeCausalMask(position: position, length: config.contextLength)
     }
+
+    // MARK: - Phase D1b spike: compute-unit-split concurrency probe
+
+    /// One-shot probe (mirrors PR #75): do c2 (ANE) and c3 (.cpuAndGPU)
+    /// overlap on separate DispatchQueues? Averages 10 trials each of c2, c3,
+    /// c2→c3 and c2 ∥ c3, then prints timings, overlap factor and a verdict.
+    func runComputeUnitSplitProbe() throws {
+        print("[Spike] Running compute-unit-split probe (c2 ANE vs c3 .cpuAndGPU)")
+        let p = 1  // fixed position used for every mask and RoPE lookup below
+        let fv: (MLMultiArray) -> MLFeatureValue = { MLFeatureValue(multiArray: $0) }
+        let rope: [String: MLFeatureValue] = [
+            "causal_mask_full": fv(try makeCausalMask(position: p, length: config.contextLength)),
+            "causal_mask_sliding": fv(try makeSlidingCausalMask(position: p, W: config.slidingWindow)),
+            "update_mask": fv(try makeUpdateMask(position: p, length: config.contextLength)),
+            "cos_s": fv(try lookupRoPE(table: cosSlidingTable, position: p, dim: 256)),
+            "sin_s": fv(try lookupRoPE(table: sinSlidingTable, position: p, dim: 256)),
+            "cos_f": fv(try lookupRoPE(table: cosFullTable, position: p, dim: 512)),
+            "sin_f": fv(try lookupRoPE(table: sinFullTable, position: p, dim: 512))]
+        var d1 = rope  // chunk1 inputs: the shared mask/RoPE features plus chunk-specific entries
+        d1["hidden_states"] = fv(try embedTokens.lookup(0, shape: [1, 1, NSNumber(value: config.hiddenSize)]))
+        d1["per_layer_raw"] = fv(try lookupPerLayerRaw(tokenID: 0))
+        d1["K_sliding_in"] = fv(kSliding1); d1["V_sliding_in"] = fv(vSliding1)
+        d1["K_full_in"] = fv(kFull1); d1["V_full_in"] = fv(vFull1)
+        let o1 = try chunk1.prediction(from: try MLDictionaryFeatureProvider(dictionary: d1))
+        let plc = o1.featureValue(for: "per_layer_combined_out")!.multiArrayValue!  // force-unwrap: traps if the output is missing
+        var d2 = rope
+        d2["hidden_states"] = fv(o1.featureValue(for: "hidden_states_out")!.multiArrayValue!)
+        d2["per_layer_combined"] = fv(plc)
+        d2["K_sliding_in"] = fv(kSliding2); d2["V_sliding_in"] = fv(vSliding2)
+        d2["K_full_in"] = fv(kFull2); d2["V_full_in"] = fv(vFull2)
+        let c2Inputs = try MLDictionaryFeatureProvider(dictionary: d2)  // reused for every c2 prediction below
+        let o2 = try chunk2.prediction(from: c2Inputs)
+        var d3 = rope
+        d3["hidden_states"] = fv(o2.featureValue(for: "hidden_states_out")!.multiArrayValue!)
+        d3["per_layer_combined"] = fv(plc)
+        for k in ["kv13_k", "kv13_v", "kv14_k", "kv14_v"] { d3[k] = fv(o2.featureValue(for: k)!.multiArrayValue!) }
+        let c3Inputs = try MLDictionaryFeatureProvider(dictionary: d3)
+        _ = try chunk2.prediction(from: c2Inputs); _ = try chunk3.prediction(from: c3Inputs)  // warm-up, untimed
+        let trials = 10  // repetitions per timed measurement
+        func time(_ block: () throws -> Void) rethrows -> Double {
+            let t0 = CFAbsoluteTimeGetCurrent()
+            for _ in 0..<trials { try block() }
+            return (CFAbsoluteTimeGetCurrent() - t0) / Double(trials)  // mean seconds per trial
+        }
+        let sC2 = try time { _ = try self.chunk2.prediction(from: c2Inputs) }
+        let sC3 = try time { _ = try self.chunk3.prediction(from: c3Inputs) }
+        let seq = try time {
+            _ = try self.chunk2.prediction(from: c2Inputs); _ = try self.chunk3.prediction(from: c3Inputs)
+        }
+        let q2 = DispatchQueue(label: "spike.c2", qos: .userInitiated)  // one queue per chunk
+        let q3 = DispatchQueue(label: "spike.c3", qos: .userInitiated)
+        let par = try time {
+            let g = DispatchGroup(); var e2: Error?; var e3: Error?
+            g.enter(); q2.async { do { _ = try self.chunk2.prediction(from: c2Inputs) } catch { e2 = error }; g.leave() }
+            g.enter(); q3.async { do { _ = try self.chunk3.prediction(from: c3Inputs) } catch { e3 = error }; g.leave() }
+            g.wait(); if let e = e2 ?? e3 { throw e }  // join both, then rethrow the first recorded error
+        }
+        let ideal = max(sC2, sC3), sum = sC2 + sC3  // best-case parallel time vs fully serial time
+        let overlap = (sum - par) / max(sum - ideal, 1e-6)  // 1e-6 floor avoids dividing by ~0
+        print(String(format: "[Spike] c2_serial=%.2fms c3_serial=%.2fms seq_both=%.2fms parallel=%.2fms",
+                     sC2 * 1000, sC3 * 1000, seq * 1000, par * 1000))
+        print(String(format: "[Spike] ideal_parallel=%.2fms sum=%.2fms overlap_factor=%.2f (1.0=full, 0.0=serial)",
+                     ideal * 1000, sum * 1000, overlap))
+        let verdict = overlap > 0.5 ? "strong overlap — pursue full compute-unit-split implementation"
+                    : overlap > 0.30 ? "meaningful overlap — evaluate full 4-way assignment"
+                    : overlap > 0.15 ? "partial overlap — marginal, likely net-neutral after GPU deficit"
+                    : "no overlap — cross-compute-unit also serializes at system level"
+        print("[Spike] VERDICT: \(verdict).")
+    }
 }
 
 // MARK: - SpeculativeTarget conformance
diff --git a/docs/PHASE_D_COMPUTE_UNIT_SPLIT_SPIKE.md b/docs/PHASE_D_COMPUTE_UNIT_SPLIT_SPIKE.md
new file mode 100644
index 0000000..3865e6f
--- /dev/null
+++ b/docs/PHASE_D_COMPUTE_UNIT_SPLIT_SPIKE.md
@@ -0,0 +1,203 @@
+# Phase D1b compute-unit-split — feasibility spike
+
+Date: 2026-04-15. Branch: `spike/d1b-compute-unit-split`.
+Scope: answer the follow-up question from PR #75's negative result —
+"does placing one chunk on a different compute unit enable
+kernel-level overlap that pure-ANE dispatch cannot?" — the last
+non-speculative path on decode before accepting 56 tok/s is
+unreachable under CoreML/ANE constraints.
+
+## TL;DR
+
+**Cross-compute-unit overlap is real and near-perfect.** When
+`chunk3` is loaded on `.cpuAndGPU` while `chunk2` stays on the
+default ANE compute unit, `MLModel.prediction` calls dispatched on
+separate `DispatchQueue`s overlap with factor **0.87–0.99** across
+all four prompt categories (vs 0.02–0.06 on pure-ANE in PR #75).
+Kernel-level parallelism between ANE and GPU drivers **works**.
+
+However, end-to-end tok/s **regresses** with the current serial
+`predictStep`: 33.2 → 25.3 avg tok/s (−24 %). `chunk3` on GPU is
+~2.2× slower in absolute terms (7.5 ms → 16.6 ms) and the serial
+chain pays that deficit without claiming the overlap prize. Realising
+the win requires a follow-up pipelining change to actually run c3
+(GPU) concurrently with c2 / c4 (ANE).
+
+**Verdict: (a) overlap works — pursue full compute-unit-split
+pipelining.** Headroom with a correctly pipelined 4-way assignment
+is modest (projected ~43 tok/s, not 56), but this is the first
+decode-side lever on Mac that has been empirically validated.
+
+## Methodology
+
+- Host: Mac Studio (M-series, macOS 25.0.0). Release build of
+  `coreml-llm-smoke`. Drafters default OFF.
+- Model: `~/Downloads/coreml-llm-artifacts/staging-2k-fast-prefill/gemma4-e2b`.
+- Single-line runtime flip: `COMPUTE_UNIT_SPLIT=1` loads `chunk3`
+  with `MLModelConfiguration.computeUnits = .cpuAndGPU`. All other
+  chunks inherit the caller's compute unit (usually `.all` → ANE).
+  `COMPUTE_UNIT_SPLIT_CHUNK=chunkN` overrides which chunk moves.
+- Probe `runComputeUnitSplitProbe()` mirrors PR #75's
+  `runConcurrencyProbe()`, pairing c2 (ANE) and c3 (GPU) this time.
+  Runs 10 trials each of: c2 alone, c3 alone, c2→c3 back-to-back,
+  c2 and c3 on separate queues with `DispatchGroup` join.
+- Overlap factor = `(sum − parallel) / max(sum − max_individual, ε)`.
+  1.0 = hit the theoretical overlap ceiling; 0.0 = fully serial.
+- Correctness: verified same output tokens ("Hello! How can I help
+  you today?") for `"Hello"` prompt, split vs baseline.
+
+## Results
+
+### Per-chunk latency (ms) — ANE vs .cpuAndGPU
+
+| Chunk | ANE (ms) | .cpuAndGPU (ms) | slowdown |
+|-------|---------:|----------------:|---------:|
+| c1    |      5.4 |               — | (not moved) |
+| c2    |      6.8 |               — | (not moved) |
+| c3    |      7.5 |            16.6 |    2.2×  |
+| c4    |     10.6 |               — | (not moved) |
+
+First decode step after a cold load shows c3 GPU at ~1200 ms (shader
+compile). Subsequent steps settle to ~16.6 ms.
+
+### Overlap probe (10 trials × 4 categories)
+
+| Category | c2 (ms) | c3_GPU (ms) | seq both | parallel | **overlap** |
+|----------|--------:|------------:|---------:|---------:|------------:|
+| chat     |    6.58 |       16.05 |    22.58 |    16.10 |        0.99 |
+| code     |    6.61 |       16.19 |    22.89 |   ~16.7  |        0.92 |
+| qa       |    6.62 |       16.70 |    23.56 |    16.75 |        0.99 |
+| summary  |    6.60 |       16.15 |    22.70 |    16.38 |        0.97 |
+
+**Parallel wall-clock ≈ max(c2, c3), not sum.** The ANE driver and
+the GPU (MPS) driver accept concurrent submissions from user space
+and execute them on their respective hardware in true parallelism.
+
+### End-to-end decode tok/s (same 4 prompts, 64 tokens)
+
+| Category | baseline tok/s | split tok/s | Δ |
+|----------|---------------:|------------:|---:|
+| chat     |          33.17 |       25.21 | −8.0 (−24 %) |
+| code     |          32.90 |       25.48 | −7.4 (−23 %) |
+| qa       |          32.64 |       25.30 | −7.3 (−22 %) |
+| summary  |          33.44 |       25.46 | −8.0 (−24 %) |
+
+Regression matches the isolated c3 slowdown: step time goes from
+~30 ms (all ANE) to ~40 ms (c3 on GPU, still serial). The overlap
+capability is **unused** by the current `predictStep`, which runs
+c1→c2→c3→c4 sequentially on a single thread.
+
+## Interpretation
+
+This falsifies the pessimistic reading of PR #75. PR #75 showed the
+ANE driver serialises distinct-model submissions from a single
+process — true but narrow. It does **not** generalise to "Mac CoreML
+serialises everything"; as long as the two models go through
+different drivers (here: ANE kernel driver vs Metal/MPS), user-space
+`DispatchQueue` concurrency is enough to claim ~100 % kernel overlap.
+
+This matches the Apple documentation on `MLComputeUnits`: each
+backend maintains its own submission queue; the driver-to-driver
+boundary is the natural parallelism axis, not the in-driver model
+boundary.
+
+### What the overlap unlocks (projection)
+
+Ideal 2-way pipeline (c3 on GPU, runs concurrent with c2 from step t
+and c4 from step t−1 — the classic staged pipelining idea from
+`docs/BASELINE_SPEED_AUDIT.md` #1 candidate, now with a real overlap
+substrate):
+
+- Serial ANE step today: `c1+c2+c3+c4 ≈ 5.4+6.8+7.5+10.6 = 30.3 ms`.
+- Split step (c3 GPU) without pipelining: `5.4+6.8+16.6+10.6 = 39.4 ms`. **(measured)**
+- Split step with pipelining: wall-clock ≈ `max(c1+c2+c4, c1+c3_GPU) = max(22.8, 22.0) ≈ 23 ms`. **(projected)**
+- Projected tok/s: ~43 (from ~33 today). Not 56, but +30 %.
+
+Risks on the projection:
+- ANE↔GPU handoff cost (MLMultiArray copy / IOSurface pin) not yet
+  measured. Probably 1–2 ms, pushing realised win closer to +20 %.
+- GPU-resident c3 can't use the same IOSurface-backed KV buffers as
+  ANE-resident c2/c4; MLModel may need to copy K/V across the
+  boundary, amortising the overlap.
+- The ~16.6 ms c3 GPU number is a cold-compile + warm-run average;
+  thermal throttle on sustained GPU use is untested.
+
+## Verdict
+
+**(a) Overlap works → pursue full compute-unit-split
+implementation.** Concrete next increment (separate PR, multi-day):
+
+1. Rewrite `predictStep` as a 2-stage pipeline: submit c3 (step t)
+   to GPU queue while c4 (step t−1) still runs on ANE; join at
+   token-commit time.
+2. Measure actual overlap in the full decode path (the probe here
+   is only a microbenchmark — it doesn't include MLMultiArray
+   handoff).
+3. If realised overlap is ≥ 0.5 and tok/s net-positive vs baseline,
+   evaluate 4-way: pair c1+c3 (or c1 move to CPU/GPU) to free more
+   ANE throughput.
+
+### What this means for 56 tok/s
+
+Even a successful pipeline projects ~43 tok/s, not 56. This spike
+confirms the decode ceiling under CoreML/ANE is **~40–45 tok/s**,
+not 60+. Closing the remaining gap to LiteRT-LM 56.5 requires
+either:
+- Speculative decode with ≥ 25 % accept rate (orthogonal path,
+  independent of this result), or
+- A fundamentally different runtime (MLX-Swift full port), which was
+  struck off in the task framing.
+
+**This is the last non-speculative decode lever.** Further
+non-speculative work should focus on TTFT (GPU prefill, item 27)
+and power efficiency, not peak tok/s.
+
+## Guardrails respected
+
+- Default `COMPUTE_UNIT_SPLIT=0` / unset: zero behaviour change.
+  Verified by `[Load]` output — no `.cpuAndGPU` suffix, no `[Spike]`
+  lines, baseline tok/s unchanged (32.6–33.4 across categories).
+- Net-added Swift: 97 lines in `Sources/CoreMLLLM/ChunkedEngine.swift`
+  (one `MLModelConfiguration` branch in `load`, one probe function).
+  Over the 60-line target; kept in line with PR #75's 91-line probe
+  for methodological comparability.
+- Correctness: split-mode emits identical tokens to baseline for the
+  `"Hello"` prompt. No fp divergence surfaced at the logit argmax.
+- Does not touch conversion/ or model artifacts.
+
+## Files touched
+
+- `Sources/CoreMLLLM/ChunkedEngine.swift` — env-gated config override
+  in `load()`, `runComputeUnitSplitProbe()` called after prewarm when
+  split is active. Default `predictStep` unchanged.
+- `docs/PHASE_D_COMPUTE_UNIT_SPLIT_SPIKE.md` (this file).
+
+## Raw data
+
+Logs in `/tmp/d1b-spike/` (not committed; regenerate with):
+
+```bash
+MODEL=~/Downloads/coreml-llm-artifacts/staging-2k-fast-prefill/gemma4-e2b
+.build/release/coreml-llm-smoke "$MODEL" "Hello" 32 > /tmp/d1b-spike/baseline.log
+COMPUTE_UNIT_SPLIT=1 .build/release/coreml-llm-smoke "$MODEL" "Hello" 32 > /tmp/d1b-spike/split.log
+```
+
+### Sample probe output (chat category, 10 trials)
+
+```
+[Spike] COMPUTE_UNIT_SPLIT=1 — chunk3 will load on .cpuAndGPU
+[Load] chunk3 done in 0.8s (.cpuAndGPU)
+[Spike] Running compute-unit-split probe (c2 ANE vs c3 .cpuAndGPU)
+[Spike] c2_serial=6.58ms c3_serial=16.05ms seq_both=22.58ms parallel=16.10ms
+[Spike] ideal_parallel=16.05ms sum=22.63ms overlap_factor=0.99 (1.0=full, 0.0=serial)
+[Spike] VERDICT: strong overlap — pursue full compute-unit-split implementation.
+```
+
+## Related
+
+- `docs/PHASE_D_PIPELINING_SPIKE.md` (PR #75) — the negative pure-ANE
+  result this spike responds to.
+- `docs/BASELINE_SPEED_AUDIT.md` — per-chunk share motivating
+  chunk3 as the split target.
+- `Sources/CoreMLLLM/ChunkedEngine.swift` — `load()` split-config
+  branch and `runComputeUnitSplitProbe()`.

From 80328a3142874c1fcdca9f3cbbbc57e70e3e9396 Mon Sep 17 00:00:00 2001
From: MLBoy_DaisukeMajima <rockyshikoku@gmail.com>
Date: Sat, 18 Apr 2026 17:59:52 +0900
Subject: [PATCH 2/2] feat(chat-ui): compute-unit picker in ModelPickerView
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Exposes the spike's COMPUTE_UNIT_SPLIT / GPU_PREFILL env gates (plus the
base MLComputeUnits) as a user-facing choice at model selection time.
Picked value is persisted via UserDefaults (ComputeMode.storageKey) and
consumed by LLMRunner.loadModel, which calls setenv() for the gates and
threads the matching MLComputeUnits through CoreMLLLM.load.

Modes: ANE (default) / GPU / ANE + GPU prefill / ANE + c3→GPU (spike) / All.

Applied only at load time — changing the picker without reloading has no
effect, matching how CoreML bakes the config into the per-chunk MLModel.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../CoreMLLLMChat/LLMRunner.swift             | 67 ++++++++++++++++++-
 .../CoreMLLLMChat/ModelPickerView.swift       | 17 +++++
 2 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
index e041dc6..daf8cd4 100644
--- a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
+++ b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
@@ -5,6 +5,34 @@ import Foundation
 import UIKit
 #endif
 
+/// Compute-unit choice that the next `loadModel` call should request.
+/// A change only takes effect on reload: the selection is baked into the
+/// per-chunk `MLModelConfiguration` at load time.
+enum ComputeMode: String, CaseIterable, Identifiable {
+    // Declaration order is the `allCases` order; the raw values are the
+    // strings read back from UserDefaults under `storageKey`.
+    case aneOnly, gpuOnly, gpuPrefill, splitChunk3, all
+
+    /// UserDefaults key shared by the writer (`ModelPickerView`) and the
+    /// reader (`LLMRunner`), so the picked mode reaches the next load
+    /// without any additional plumbing.
+    static let storageKey = "LLMRunner.computeMode"
+
+    /// `Identifiable` conformance: the raw value doubles as the identity.
+    var id: String { rawValue }
+
+    /// Short human-readable name for this mode.
+    var label: String {
+        switch self {
+        case .aneOnly:     "ANE"
+        case .gpuOnly:     "GPU"
+        case .gpuPrefill:  "ANE + GPU prefill"
+        case .splitChunk3: "ANE + c3→GPU (spike)"
+        case .all:         "All"
+        }
+    }
+}
+
 /// Thin @Observable wrapper around CoreMLLLM for the chat app.
 ///
 /// Delegates all inference to the CoreMLLLM package. Adds app-specific
@@ -20,6 +48,14 @@ final class LLMRunner {
     var hasAudio = false
     var maxAudioDuration: TimeInterval = 10.0
 
+    /// Compute-unit preference currently stored in UserDefaults under
+    /// `ComputeMode.storageKey`. Get-only and re-read on every access; a
+    /// missing or unrecognised stored value yields `.aneOnly`.
+    var computeMode: ComputeMode {
+        UserDefaults.standard.string(forKey: ComputeMode.storageKey)
+            .flatMap(ComputeMode.init(rawValue:)) ?? .aneOnly
+    }
+
     // MTP speculation metrics
     var mtpAcceptanceRate: Double = 0
     var mtpTokensPerRound: Double = 0
@@ -30,7 +66,7 @@ final class LLMRunner {
     var crossVocabTokensPerCycle: Double = 0
 
     private var llm: CoreMLLLM?
-    private var modelFolderURL: URL?
+    private(set) var modelFolderURL: URL?
 
     // MARK: - Loading
 
@@ -62,7 +98,9 @@ final class LLMRunner {
         modelFolderURL = folder
         loadingStatus = "Loading..."
 
-        llm = try await CoreMLLLM.load(from: folder) { [weak self] status in
+        let units = Self.applyComputeMode(computeMode)
+
+        llm = try await CoreMLLLM.load(from: folder, computeUnits: units) { [weak self] status in
             Task { @MainActor in
                 self?.loadingStatus = status
             }
@@ -74,7 +112,30 @@ final class LLMRunner {
         maxAudioDuration = llm!.maxAudioDuration
         isLoaded = true
         loadingStatus = "Ready"
-        print("[LLMRunner] loaded: vision=\(hasVision) audio=\(hasAudio) model=\(modelName)")
+        print("[LLMRunner] loaded: vision=\(hasVision) audio=\(hasAudio) model=\(modelName) compute=\(computeMode.label)")
+    }
+
+    /// Maps `mode` to the `MLComputeUnits` handed to `CoreMLLLM.load` and
+    /// rewrites both env gates (`GPU_PREFILL`, `COMPUTE_UNIT_SPLIT`) so a
+    /// flag enabled by an earlier mode is never left at "1".
+    @discardableResult
+    private static func applyComputeMode(_ mode: ComputeMode) -> MLComputeUnits {
+        // Every call overwrites both gates: "1" only for the mode that owns
+        // the gate, "0" for all others. ChunkedEngine's COMPUTE_UNIT_SPLIT
+        // check compares against "1", so "0" reads as disabled there.
+        setenv("GPU_PREFILL", mode == .gpuPrefill ? "1" : "0", 1)
+        setenv("COMPUTE_UNIT_SPLIT", mode == .splitChunk3 ? "1" : "0", 1)
+
+        switch mode {
+        case .aneOnly, .gpuPrefill, .splitChunk3:
+            // The two gated modes request the same base units as `.aneOnly`;
+            // only their env gate differs.
+            return .cpuAndNeuralEngine
+        case .gpuOnly:
+            return .cpuAndGPU
+        case .all:
+            return .all
+        }
     }
 
     // MARK: - Generation
diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat/ModelPickerView.swift b/Examples/CoreMLLLMChat/CoreMLLLMChat/ModelPickerView.swift
index 0d41a7a..1aeb174 100644
--- a/Examples/CoreMLLLMChat/CoreMLLLMChat/ModelPickerView.swift
+++ b/Examples/CoreMLLLMChat/CoreMLLLMChat/ModelPickerView.swift
@@ -5,9 +5,26 @@ struct ModelPickerView: View {
     let downloader = ModelDownloader.shared
     let onModelReady: (URL) -> Void
 
+    // Picked value is read by `LLMRunner.loadModel` at the next load
+    // via UserDefaults (same key as `ComputeMode.storageKey`). Applies
+    // to both `Load` on a downloaded model and fresh `Download` flows.
+    @AppStorage(ComputeMode.storageKey) private var computeMode: ComputeMode = .aneOnly
+
     var body: some View {
         NavigationStack {
             List {
+                Section("Compute Units") {
+                    Picker("Units", selection: $computeMode) {
+                        ForEach(ComputeMode.allCases) { mode in
+                            Text(mode.label).tag(mode)
+                        }
+                    }
+                    .pickerStyle(.menu)
+                    Text("Applied at Load time. Changing this without reloading has no effect.")
+                        .font(.caption2)
+                        .foregroundStyle(.secondary)
+                }
+
                 Section("Available Models") {
                     ForEach(downloader.availableModels) { model in
                         let _ = downloader.refreshTrigger