From 7468d213c9e4fac2cfe1eec82036104337493482 Mon Sep 17 00:00:00 2001
From: john-rocky <samuraibrothersmail@gmail.com>
Date: Tue, 28 Apr 2026 18:57:55 +0900
Subject: [PATCH 1/7] feat(gemma4): generalize stateful 3-chunk converter to
 E2B + E4B
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase A1-A3 of the E4B optimization stack. Brings the stage2-e4b 4-chunk
foundation (Phase 1 stateful + Phase 2a cross-turn KV) onto current main
and adds 3-chunk merged + multifunction prefill_bN support for E4B —
the lever that gave E2B its 33.4 tok/s iPhone 17 Pro decode.

Converter side
  - SWAStatefulMergedChunk23{,Prefill,Single,PrefillSingle} accept
    own_range / shared_range; defaults remain E2B (own=L8-14, shared=
    L15-24) for back-compat. E4B passes (12,24)/(24,33) derived from
    compute_chunk_boundaries(config) — kv13/kv14 names are kept as
    legacy aliases for the (sliding,full) producer slots.
  - build_gemma4_e2b_stateful_3chunks.py: drops the "E2B only"
    hardcoded help; --model gemma4-e4b now produces a 3-chunk merged
    bundle (chunk_1 L0-11 / chunk_2 L12-32 merged / chunk_3 L33-41 +
    lm_head). Chunk-2 layout printed dynamically.
  - sanity_stateful_chunks.py: from stage2-e4b — adds --model preset
    so /tmp/gemma4-{e2b,e4b}-stateful chunks share one verifier.

Bundle side
  - scripts/assemble_gemma4_stateful_e4b.sh: from stage2-e4b — pulls
    chunk_*.mlmodelc + legacy E4B sidecars into the bundle layout
    Gemma4StatefulEngine expects (subdir gemma4_e2b_stateful_chunks/
    is intentionally shared across E2B/E4B; engine reads hidden /
    layers / HKV from model_config.json).

Runtime side (Swift)
  - ModelDownloader.swift: gemma4e4bStateful + gemma4e4bStatefulLinear
    ModelInfo entries (slots 6/7 under LLM_SHOW_EXPERIMENTAL=1).
    downloadURL is intentionally blank — A6 will fill in the new
    mlboydaisuke/gemma-4-E4B-stateful-coreml repo URL once iPhone 17
    Pro A/B clears. Existing mlboydaisuke/gemma-4-E4B-coreml legacy
    repo is untouched, preserving the dual-repo pattern E2B uses.
  - LLMRunner.swift: stateful detection comment now lists all four
    folders that share the gemma4_e2b_stateful_chunks/ layout.

Build artefacts (A4) and iPhone validation (A5) follow.
---
 .../CoreMLLLMChat/LLMRunner.swift             |  16 +--
 Sources/CoreMLLLM/ModelDownloader.swift       |  27 +++++
 .../build_gemma4_e2b_stateful_3chunks.py      | 104 +++++++++++++-----
 .../models/gemma4_swa_stateful_chunks.py      |  62 ++++++++---
 conversion/sanity_stateful_chunks.py          |  66 ++++++++---
 scripts/assemble_gemma4_stateful_e4b.sh       | 102 +++++++++++++++++
 6 files changed, 308 insertions(+), 69 deletions(-)
 create mode 100755 scripts/assemble_gemma4_stateful_e4b.sh

diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
index ce75469..30f43bd 100644
--- a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
+++ b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
@@ -187,13 +187,15 @@ final class LLMRunner {
             return
         }
 
-        // Gemma 4 E2B STATEFUL detection: chunk_{1..4}.mlpackage/.mlmodelc
-        // + embed_tokens_q8.bin under gemma4_e2b_stateful_chunks/. Both
-        // the Conv2d wrapper variant (folder=gemma4-e2b-stateful) and the
-        // Linear variant (folder=gemma4-e2b-stateful-linear, Plan 3 A/B)
-        // share the same internal layout — Gemma4StatefulEngine handles
-        // both transparently because the only difference is the MIL graph
-        // inside each chunk_*.mlpackage.
+        // Gemma 4 STATEFUL detection: chunk_{1..4}.mlpackage/.mlmodelc
+        // + embed_tokens_q8.bin under gemma4_e2b_stateful_chunks/. The
+        // subdir name is shared across all four stateful variants —
+        //   E2B: gemma4-e2b-stateful{,-linear}  (Conv2d / Plan 3 Linear)
+        //   E4B: gemma4-e4b-stateful{,-linear}  (Stage 2 port)
+        // — because Gemma4StatefulEngine reads hidden_size / num_layers /
+        // num_kv_heads from model_config.json, so per-model differences
+        // (E2B 35 layers / HKV=1 vs E4B 42 layers / HKV=2) need no
+        // engine code change.
         // Require either:
         //  - chunks 1-3 (3-chunk or 4-chunk bundle — chunk_4 optional)
         //  - model.{mlpackage,mlmodelc} (1-chunk all-in-one)
diff --git a/Sources/CoreMLLLM/ModelDownloader.swift b/Sources/CoreMLLLM/ModelDownloader.swift
index 54c6fab..7ac9200 100644
--- a/Sources/CoreMLLLM/ModelDownloader.swift
+++ b/Sources/CoreMLLLM/ModelDownloader.swift
@@ -263,6 +263,31 @@ public final class ModelDownloader: NSObject {
             downloadURL: "https://huggingface.co/mlboydaisuke/gemma-4-E2B-stateful-coreml/resolve/main",
             folderName: "gemma4-e2b-stateful-linear")
 
+        /// Gemma 4 E4B stateful — Stage 2 port of the E2B Phase 1 + 2a
+        /// stateful path to the larger 4 B sibling. Built by
+        /// `conversion/build_gemma4_e2b_stateful_chunks.py --model gemma4-e4b`
+        /// (same script; chunk boundaries / hidden / HKV come from the HF
+        /// config). Shares the inner subdir name `gemma4_e2b_stateful_chunks`
+        /// with the E2B variants — Gemma4StatefulEngine reads
+        /// hidden_size / num_layers / per_layer_dim from `model_config.json`,
+        /// so E4B runs without engine code changes. Sideload-only to
+        /// `Documents/Models/gemma4-e4b-stateful/gemma4_e2b_stateful_chunks/`.
+        public static let gemma4e4bStateful = ModelInfo(
+            id: "gemma4-e4b-stateful",
+            name: "Gemma 4 E4B (stateful, MLState)", size: "5.6 GB",
+            downloadURL: "",
+            folderName: "gemma4-e4b-stateful")
+
+        /// Gemma 4 E4B stateful — Linear projections variant (cml9 PR #2577
+        /// `nn.Linear` form, ANE-equivalent placement). Same layout as
+        /// `gemma4e4bStateful`. The production HF `downloadURL` is left empty
+        /// until the iPhone 17 Pro A/B clears (Stage 2 closure step A6).
+        public static let gemma4e4bStatefulLinear = ModelInfo(
+            id: "gemma4-e4b-stateful-linear",
+            name: "Gemma 4 E4B (stateful, Linear projections)", size: "5.6 GB",
+            downloadURL: "",
+            folderName: "gemma4-e4b-stateful-linear")
+
         /// Visible in the UI picker. EAGLE-3 / LookAhead probe variants are
         /// hidden unless `LLM_SHOW_EXPERIMENTAL=1` is set (or the
         /// UserDefaults key `showExperimentalModels` is true). Keeps the
@@ -297,6 +322,8 @@ public final class ModelDownloader: NSObject {
                 list.insert(gemma4e2bEagle3, at: 3)
                 list.insert(gemma4e2bLookaheadProbe, at: 4)
                 list.insert(gemma4e2bStateful, at: 5)        // Conv2d variant
+                list.insert(gemma4e4bStateful, at: 6)        // E4B Stage 2 Conv2d
+                list.insert(gemma4e4bStatefulLinear, at: 7)  // E4B Stage 2 Linear
             }
             return list
         }
diff --git a/conversion/build_gemma4_e2b_stateful_3chunks.py b/conversion/build_gemma4_e2b_stateful_3chunks.py
index d2f31f0..14c55ef 100644
--- a/conversion/build_gemma4_e2b_stateful_3chunks.py
+++ b/conversion/build_gemma4_e2b_stateful_3chunks.py
@@ -1,25 +1,35 @@
 #!/usr/bin/env python3
-"""Build Gemma 4 E2B stateful 3-chunk variant (merged middle).
+"""Build Gemma 4 stateful 3-chunk variant (merged middle).
 
 Same as `build_gemma4_e2b_stateful_chunks.py` but emits 3 mlpackages
-instead of 4 — the middle chunk merges the 4-chunk's chunk_2 (own KV
-L8-14) and chunk_3 (KV-shared L15-24), keeping kv13/kv14 producer
-aliases internal. Final chunk_3 = old chunk_4 (KV-shared L25-34 +
-lm_head + argmax).
-
-Layout:
-  chunk_1.mlpackage  (L0-7,  own KV, computes PLE)        — same as 4-chunk
-  chunk_2.mlpackage  (L8-24, merged: own + shared inside) — NEW
-  chunk_3.mlpackage  (L25-34 + lm_head + argmax)          — = old chunk_4
+instead of 4 — the middle chunk merges the 4-chunk's chunk_2 (own KV)
+and chunk_3 (KV-shared), keeping kv13/kv14 producer aliases internal.
+Final chunk_3 = old chunk_4 (KV-shared tail + lm_head + argmax).
+
+Layout (E2B / E4B, derived from `compute_chunk_boundaries(config)`):
+  E2B (35 layers):
+    chunk_1  L0-7   own KV, computes PLE        — same as 4-chunk
+    chunk_2  L8-24  merged: own L8-14 + shared L15-24
+    chunk_3  L25-34 + lm_head + argmax          — = old chunk_4
+  E4B (42 layers):
+    chunk_1  L0-11  own KV, computes PLE
+    chunk_2  L12-32 merged: own L12-23 + shared L24-32
+    chunk_3  L33-41 + lm_head + argmax
 
 Multifunction `--prefill-batches "8"` adds a `prefill_b8` function to
 each chunk (sharing weights via coremltools save_multifunction).
 
 Usage:
     python conversion/build_gemma4_e2b_stateful_3chunks.py \
+        --model gemma4-e2b \
         --output /tmp/g4_3chunk/multi \
         --hf-dir /path/to/gemma4-e2b/hf_model \
         --ctx 2048 --linear-projections --prefill-batches "8"
+
+    python conversion/build_gemma4_e2b_stateful_3chunks.py \
+        --model gemma4-e4b \
+        --output /tmp/g4_3chunk_e4b \
+        --ctx 2048 --linear-projections --prefill-batches "8"
 """
 from __future__ import annotations
 
@@ -63,9 +73,13 @@
 fp16 = np.float16
 
 
-def convert_chunk2_merged(base, ctx, out_path, nbits, *, use_linear=False):
+def convert_chunk2_merged(base, ctx, out_path, nbits, *, use_linear=False,
+                          own_range=None, shared_range=None):
+    own = own_range or (8, 15)
+    shared = shared_range or (15, 25)
     print("\n" + "=" * 60)
-    print(f"CHUNK 2 MERGED (L8-24) — own KV L8-14 + KV-shared L15-24")
+    print(f"CHUNK 2 MERGED (L{own[0]}-{shared[1]-1}) — "
+          f"own KV L{own[0]}-{own[1]-1} + KV-shared L{shared[0]}-{shared[1]-1}")
     print("=" * 60)
     cfg = base.config
     hidden = cfg.hidden_size
@@ -77,7 +91,10 @@ def convert_chunk2_merged(base, ctx, out_path, nbits, *, use_linear=False):
     HKV = cfg.num_key_value_heads
 
     chunk = SWAStatefulMergedChunk23(base, ctx,
-                                       use_linear=use_linear).eval().to(MODEL_DTYPE)
+                                       use_linear=use_linear,
+                                       own_range=own_range,
+                                       shared_range=shared_range
+                                       ).eval().to(MODEL_DTYPE)
     ns, nf = max(chunk.num_sliding, 1), max(chunk.num_full, 1)
 
     sample = (
@@ -128,9 +145,12 @@ def convert_chunk2_merged(base, ctx, out_path, nbits, *, use_linear=False):
 
 
 def convert_chunk2_merged_prefill(base, ctx, T, out_path, nbits, *,
-                                    use_linear=False):
+                                    use_linear=False,
+                                    own_range=None, shared_range=None):
+    own = own_range or (8, 15)
+    shared = shared_range or (15, 25)
     print("\n" + "-" * 60)
-    print(f"CHUNK 2 MERGED PREFILL T={T} (L8-24)")
+    print(f"CHUNK 2 MERGED PREFILL T={T} (L{own[0]}-{shared[1]-1})")
     print("-" * 60)
     cfg = base.config
     hidden = cfg.hidden_size
@@ -142,7 +162,8 @@ def convert_chunk2_merged_prefill(base, ctx, T, out_path, nbits, *,
     HKV = cfg.num_key_value_heads
 
     chunk = SWAStatefulMergedChunk23Prefill(
-        base, ctx, use_linear=use_linear, T=T).eval().to(MODEL_DTYPE)
+        base, ctx, use_linear=use_linear, T=T,
+        own_range=own_range, shared_range=shared_range).eval().to(MODEL_DTYPE)
     ns, nf = max(chunk.num_sliding, 1), max(chunk.num_full, 1)
 
     sample = (
@@ -305,7 +326,8 @@ def convert_chunk1_prefill_single(base, c_start, c_end, ctx, T, out_path, nbits,
         chunk, sample, inputs, outputs, states, out_path, nbits)
 
 
-def convert_chunk2_merged_single(base, ctx, out_path, nbits, *, use_linear=False):
+def convert_chunk2_merged_single(base, ctx, out_path, nbits, *, use_linear=False,
+                                 own_range=None, shared_range=None):
     print("\n" + "=" * 60)
     print(f"CHUNK 2 MERGED SINGLE-BUFFER (L8-24)")
     print("=" * 60)
@@ -319,7 +341,10 @@ def convert_chunk2_merged_single(base, ctx, out_path, nbits, *, use_linear=False
     HKV = cfg.num_key_value_heads
 
     chunk = SWAStatefulMergedChunk23Single(base, ctx,
-                                             use_linear=use_linear).eval().to(MODEL_DTYPE)
+                                             use_linear=use_linear,
+                                             own_range=own_range,
+                                             shared_range=shared_range
+                                             ).eval().to(MODEL_DTYPE)
     no = max(chunk.num_own, 1)
     sample = (
         torch.zeros(1, 1, hidden, dtype=torch.float16),
@@ -364,7 +389,8 @@ def convert_chunk2_merged_single(base, ctx, out_path, nbits, *, use_linear=False
 
 
 def convert_chunk2_merged_prefill_single(base, ctx, T, out_path, nbits, *,
-                                            use_linear=False):
+                                            use_linear=False,
+                                            own_range=None, shared_range=None):
     print("\n" + "-" * 60)
     print(f"CHUNK 2 MERGED SINGLE-BUFFER PREFILL T={T} (L8-24)")
     print("-" * 60)
@@ -378,7 +404,8 @@ def convert_chunk2_merged_prefill_single(base, ctx, T, out_path, nbits, *,
     HKV = cfg.num_key_value_heads
 
     chunk = SWAStatefulMergedChunk23PrefillSingle(
-        base, ctx, use_linear=use_linear, T=T).eval().to(MODEL_DTYPE)
+        base, ctx, use_linear=use_linear, T=T,
+        own_range=own_range, shared_range=shared_range).eval().to(MODEL_DTYPE)
     no = max(chunk.num_own, 1)
     sample = (
         torch.zeros(1, T, hidden, dtype=torch.float16),
@@ -425,7 +452,11 @@ def convert_chunk2_merged_prefill_single(base, ctx, T, out_path, nbits, *,
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument("--model", default="gemma4-e2b",
-                    help="Model name (gemma4-e2b only for now)")
+                    help="Model name (gemma4-e2b or gemma4-e4b). Chunk "
+                         "boundaries are derived from the HF config via "
+                         "compute_chunk_boundaries(config); kv13/kv14 names "
+                         "are legacy aliases (sliding/full producer slots) "
+                         "shared across both models.")
     ap.add_argument("--output", required=True)
     ap.add_argument("--hf-dir", default=None)
     ap.add_argument("--ctx", type=int, default=None)
@@ -465,15 +496,22 @@ def main():
 
     cfg = base.config
     boundaries = compute_chunk_boundaries(cfg)
-    # 3-chunk: re-use boundaries[0] (chunk_1 L0-7) and boundaries[3]
-    # (= old chunk_4 L25-34, which becomes new chunk_3). The merged
-    # middle (= boundaries[1] start, boundaries[2] end = L8-25) is
-    # baked into SWAStatefulMergedChunk23.
+    # 3-chunk: re-use boundaries[0] (chunk_1) and boundaries[3] (= old
+    # chunk_4, which becomes new chunk_3). The merged middle spans
+    # boundaries[1] (own KV) → boundaries[2] (KV-shared), passed into
+    # SWAStatefulMergedChunk23 via own_range/shared_range so the same
+    # builder works for E2B (own=L8-14, shared=L15-24) and E4B
+    # (own=L12-23, shared=L24-32).
     chunk1_range = boundaries[0]
+    own_range = boundaries[1]
+    shared_range = boundaries[2]
     chunk3_range = boundaries[3]   # final chunk = old chunk_4
     print(f"\nctx={args.ctx}  W={cfg.sliding_window}  hidden={cfg.hidden_size}")
     print(f"3-chunk layout: c1=L{chunk1_range[0]}-{chunk1_range[1]-1}, "
-          f"c2_merged=L8-24, c3=L{chunk3_range[0]}-{chunk3_range[1]-1}")
+          f"c2_merged=L{own_range[0]}-{shared_range[1]-1} "
+          f"(own L{own_range[0]}-{own_range[1]-1} + "
+          f"shared L{shared_range[0]}-{shared_range[1]-1}), "
+          f"c3=L{chunk3_range[0]}-{chunk3_range[1]-1}")
     print(f"Quantize: int{args.nbits}" if args.nbits else "Quantize: fp16")
     if args.linear_projections:
         print(f"Projections: nn.Linear")
@@ -542,20 +580,26 @@ def _build_one(decode_fn, prefill_fn, final_name):
             _build_one(
                 lambda p: convert_chunk2_merged_single(base, args.ctx, p,
                                                          args.nbits,
-                                                         use_linear=use_linear),
+                                                         use_linear=use_linear,
+                                                         own_range=own_range,
+                                                         shared_range=shared_range),
                 lambda T, p: convert_chunk2_merged_prefill_single(
                     base, args.ctx, T, p, args.nbits,
-                    use_linear=use_linear),
+                    use_linear=use_linear,
+                    own_range=own_range, shared_range=shared_range),
                 "chunk_2",
             )
         else:
             _build_one(
                 lambda p: convert_chunk2_merged(base, args.ctx, p,
                                                   args.nbits,
-                                                  use_linear=use_linear),
+                                                  use_linear=use_linear,
+                                                  own_range=own_range,
+                                                  shared_range=shared_range),
                 lambda T, p: convert_chunk2_merged_prefill(
                     base, args.ctx, T, p, args.nbits,
-                    use_linear=use_linear),
+                    use_linear=use_linear,
+                    own_range=own_range, shared_range=shared_range),
                 "chunk_2",
             )
     if do(3):
diff --git a/conversion/models/gemma4_swa_stateful_chunks.py b/conversion/models/gemma4_swa_stateful_chunks.py
index 0747feb..3f42bd2 100644
--- a/conversion/models/gemma4_swa_stateful_chunks.py
+++ b/conversion/models/gemma4_swa_stateful_chunks.py
@@ -838,17 +838,27 @@ def forward(self, hidden_states, causal_mask_full, causal_mask_sliding,
 
 
 class SWAStatefulMergedChunk23(_StatefulChunkBase):
-    """Merged stateful chunk for L8-24. Owns KV state for L8-14, runs
-    L15-24 KV-shared internally. Eliminates the 4-chunk's chunk_2 →
-    chunk_3 hidden-state round-trip (~+5-10% Mac decode).
+    """Merged stateful chunk that owns the lower-half KV span and runs
+    the upper-half KV-shared internally. Eliminates the 4-chunk's
+    chunk_2 → chunk_3 hidden-state round-trip (~+5-10% Mac decode).
+
+    Boundaries default to E2B (own=L8-14, shared=L15-24). For E4B pass
+    own_range / shared_range derived from compute_chunk_boundaries(cfg)
+    (E4B: own=L12-23, shared=L24-32).
     """
-    START_OWN, END_OWN = 8, 15      # own-KV layers (= old chunk_2)
-    START_SHARED, END_SHARED = 15, 25  # KV-shared layers (= old chunk_3)
+    DEFAULT_OWN = (8, 15)       # E2B own-KV layers (= old chunk_2)
+    DEFAULT_SHARED = (15, 25)   # E2B KV-shared layers (= old chunk_3)
 
     def __init__(self, model: Gemma4Model, ctx: int = 2048,
-                 use_linear: bool = False):
+                 use_linear: bool = False,
+                 own_range: tuple[int, int] | None = None,
+                 shared_range: tuple[int, int] | None = None):
+        own = own_range if own_range is not None else self.DEFAULT_OWN
+        shared = shared_range if shared_range is not None else self.DEFAULT_SHARED
+        self.START_OWN, self.END_OWN = own
+        self.START_SHARED, self.END_SHARED = shared
         # Init base with the OWN-KV span so the kv_cache_* buffers size
-        # to L8-14 only. KV-shared layers don't need state slots.
+        # to the own-KV layers only. KV-shared layers don't need state slots.
         super().__init__(model, self.START_OWN, self.END_OWN, ctx)
         self.layers_shared = nn.ModuleList([
             model.layers[i] for i in range(self.START_SHARED, self.END_SHARED)
@@ -905,8 +915,11 @@ def forward(self, hidden_states, causal_mask_full, causal_mask_sliding,
 class SWAStatefulMergedChunk23Prefill(SWAStatefulMergedChunk23):
     """T=N prefill variant of the merged middle chunk."""
 
-    def __init__(self, model, ctx=2048, use_linear=False, T: int = 8):
-        super().__init__(model, ctx, use_linear=use_linear)
+    def __init__(self, model, ctx=2048, use_linear=False, T: int = 8,
+                 own_range: tuple[int, int] | None = None,
+                 shared_range: tuple[int, int] | None = None):
+        super().__init__(model, ctx, use_linear=use_linear,
+                         own_range=own_range, shared_range=shared_range)
         self.T = T
 
     def forward(self, hidden_states, causal_mask_full, causal_mask_sliding,
@@ -1351,13 +1364,23 @@ def forward(self, hidden_states, causal_mask_full, causal_mask_sliding,
 
 
 class SWAStatefulMergedChunk23Single(_StatefulSingleChunkBase):
-    """3-chunk merged middle (L8-24) with unified state buffer.
-    Owns L8-14 KV; runs L15-24 KV-shared internally. Emits kv13/kv14
-    aliases for the final chunk_3."""
-    START_OWN, END_OWN = 8, 15
-    START_SHARED, END_SHARED = 15, 25
-
-    def __init__(self, model, ctx=2048, use_linear=False):
+    """3-chunk merged middle with unified state buffer.
+    Owns the own-KV span (= old 4-chunk chunk_2); runs the KV-shared span
+    (= old chunk_3) internally. Emits kv13/kv14 aliases for the final chunk_3.
+
+    Boundaries default to E2B (own=L8-14, shared=L15-24). For E4B pass
+    own_range / shared_range from compute_chunk_boundaries(cfg)
+    (E4B: own=L12-23, shared=L24-32)."""
+    DEFAULT_OWN = (8, 15)
+    DEFAULT_SHARED = (15, 25)
+
+    def __init__(self, model, ctx=2048, use_linear=False,
+                 own_range: tuple[int, int] | None = None,
+                 shared_range: tuple[int, int] | None = None):
+        own = own_range if own_range is not None else self.DEFAULT_OWN
+        shared = shared_range if shared_range is not None else self.DEFAULT_SHARED
+        self.START_OWN, self.END_OWN = own
+        self.START_SHARED, self.END_SHARED = shared
         super().__init__(model, self.START_OWN, self.END_OWN, ctx)
         self.layers_shared = nn.ModuleList([
             model.layers[i] for i in range(self.START_SHARED, self.END_SHARED)
@@ -1406,8 +1429,11 @@ def forward(self, hidden_states, causal_mask_full, causal_mask_sliding,
 class SWAStatefulMergedChunk23PrefillSingle(SWAStatefulMergedChunk23Single):
     """T=N prefill variant of merged middle with unified state."""
 
-    def __init__(self, model, ctx=2048, use_linear=False, T: int = 8):
-        super().__init__(model, ctx, use_linear=use_linear)
+    def __init__(self, model, ctx=2048, use_linear=False, T: int = 8,
+                 own_range: tuple[int, int] | None = None,
+                 shared_range: tuple[int, int] | None = None):
+        super().__init__(model, ctx, use_linear=use_linear,
+                         own_range=own_range, shared_range=shared_range)
         self.T = T
 
     def forward(self, hidden_states, causal_mask_full, causal_mask_sliding,
diff --git a/conversion/sanity_stateful_chunks.py b/conversion/sanity_stateful_chunks.py
index b9d1f91..c80b65f 100644
--- a/conversion/sanity_stateful_chunks.py
+++ b/conversion/sanity_stateful_chunks.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Mac sanity check for /tmp/gemma4-e2b-stateful/ chunk_{1..4}.mlpackage.
+"""Mac sanity check for Gemma 4 stateful chunk_{1..4}.mlpackage bundles.
 
 Verifies:
   1. Each chunk loads on Mac CPU_AND_NE without error.
@@ -15,9 +15,15 @@
 This is NOT a numerical correctness test — inputs are zeros / synthetic
 RoPE — only a wiring sanity check. Real perf/correctness will be on
 iPhone after the Swift Generator is wired up.
+
+Usage:
+    python conversion/sanity_stateful_chunks.py             # E2B default
+    python conversion/sanity_stateful_chunks.py --model gemma4-e4b
+    python conversion/sanity_stateful_chunks.py --artifacts /tmp/foo
 """
 from __future__ import annotations
 
+import argparse
 import os
 import sys
 import time
@@ -26,19 +32,26 @@
 import numpy as np
 import coremltools as ct
 
-CTX = 512
-W = 512
-HIDDEN = 1536
-PLD = 256
-NLAYERS = 35
-HKV = 1
-HD_S = 256
-HD_F = 512
-VOCAB = 262_144
-ARTIFACTS = Path("/tmp/gemma4-e2b-stateful")
-
-# --- E2B chunk topology (must match build_gemma4_e2b_stateful_chunks.py) ---
-CHUNK_BOUNDARIES = [(0, 8), (8, 15), (15, 25), (25, 35)]
+# --- Per-model presets (must match build_gemma4_e2b_stateful_chunks.py) ---
+PRESETS = {
+    "gemma4-e2b": dict(
+        ctx=512, w=512, hidden=1536, pld=256, nlayers=35, hkv=1,
+        hd_s=256, hd_f=512, vocab=262_144,
+        artifacts="/tmp/gemma4-e2b-stateful",
+        boundaries=[(0, 8), (8, 15), (15, 25), (25, 35)],
+    ),
+    "gemma4-e4b": dict(
+        ctx=2048, w=512, hidden=2560, pld=256, nlayers=42, hkv=2,
+        hd_s=256, hd_f=512, vocab=262_144,
+        artifacts="/tmp/gemma4-e4b-stateful",
+        boundaries=[(0, 12), (12, 24), (24, 33), (33, 42)],
+    ),
+}
+
+# Defaults overwritten in main() once we read --model / --artifacts.
+CTX = W = HIDDEN = PLD = NLAYERS = HKV = HD_S = HD_F = VOCAB = 0
+ARTIFACTS: Path = Path("/tmp/gemma4-e2b-stateful")
+CHUNK_BOUNDARIES: list = []
 
 
 def _make_mask_full(pos: int) -> np.ndarray:
@@ -134,7 +147,32 @@ def shared_chunk_inputs(seed: int, hidden_in: np.ndarray, per_layer_combined: np
     }
 
 
+def _apply_preset(name: str, artifacts_override: str | None) -> None:
+    """Populate the module-level constants other functions read."""
+    if name not in PRESETS:
+        sys.exit(f"unknown preset {name!r}; choose from {list(PRESETS)}")
+    p = PRESETS[name]
+    g = globals()
+    g["CTX"], g["W"] = p["ctx"], p["w"]
+    g["HIDDEN"], g["PLD"] = p["hidden"], p["pld"]
+    g["NLAYERS"], g["HKV"] = p["nlayers"], p["hkv"]
+    g["HD_S"], g["HD_F"], g["VOCAB"] = p["hd_s"], p["hd_f"], p["vocab"]
+    g["ARTIFACTS"] = Path(artifacts_override or p["artifacts"])
+    g["CHUNK_BOUNDARIES"] = p["boundaries"]
+    print(f"[preset] {name}  artifacts={ARTIFACTS}")
+    print(f"  ctx={CTX} W={W} hidden={HIDDEN} layers={NLAYERS} HKV={HKV}")
+
+
 def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--model", default="gemma4-e2b",
+                    choices=list(PRESETS),
+                    help="Which Gemma 4 stateful preset to sanity-test")
+    ap.add_argument("--artifacts", default=None,
+                    help="Override artifacts dir (defaults to preset's path)")
+    args = ap.parse_args()
+    _apply_preset(args.model, args.artifacts)
+
     if not ARTIFACTS.is_dir():
         sys.exit(f"missing: {ARTIFACTS}")
 
diff --git a/scripts/assemble_gemma4_stateful_e4b.sh b/scripts/assemble_gemma4_stateful_e4b.sh
new file mode 100755
index 0000000..698f893
--- /dev/null
+++ b/scripts/assemble_gemma4_stateful_e4b.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# Assemble the Gemma 4 E4B stateful bundle for iPhone sideload.
+# Stage 2 sibling of assemble_gemma4_stateful_bundle.sh (which builds
+# the E2B variant). Layout matches what Gemma4StatefulEngine expects:
+#
+#   build/gemma4_stateful_e4b/
+#     gemma4_e2b_stateful_chunks/        # subdir name shared with E2B
+#       chunk_{1..4}.mlmodelc            (from /tmp/gemma4-e4b-stateful)
+#       embed_tokens_q8.bin              (E4B sidecars from output/)
+#       embed_tokens_scales.bin
+#       embed_tokens_per_layer_q8.bin
+#       embed_tokens_per_layer_scales.bin
+#       per_layer_projection.bin         (parity, not used by Engine)
+#       per_layer_norm_weight.bin
+#       cos_sliding.npy / sin_sliding.npy
+#       cos_full.npy    / sin_full.npy
+#       hf_model/                        (tokenizer files)
+#       model_config.json                (E4B: hidden=2560, layers=42, HKV=2)
+#
+# Push:
+#   xcrun devicectl device copy to --device <ID> \
+#     --domain-type appDataContainer \
+#     --domain-identifier com.example.CoreMLLLMChat \
+#     --source build/gemma4_stateful_e4b \
+#     --destination Documents/Models/gemma4-e4b-stateful
+#
+# Scheme: LLM_SHOW_EXPERIMENTAL=1 to reveal the picker entry.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+SRC_CHUNKS="${SRC_CHUNKS:-/tmp/gemma4-e4b-stateful}"
+# E4B sidecars: the existing legacy 4-chunk E4B bundle in the sibling
+# CoreML-LLM workspace already ships every sidecar we need (same names
+# the E2B staging-2k-fast-prefill dir uses). Override via env if you
+# moved the bundle.
+SIDECARS="${SIDECARS:-/Users/majimadaisuke/Downloads/workspace/CoreML-LLM/output/gemma4-e4b/bundle}"
+OUT_PARENT="${OUT_PARENT:-$ROOT/build/gemma4_stateful_e4b}"
+OUT="$OUT_PARENT/gemma4_e2b_stateful_chunks"
+
+for d in "$SRC_CHUNKS" "$SIDECARS"; do
+    if [[ ! -d "$d" ]]; then
+        echo "[error] missing $d" >&2
+        exit 1
+    fi
+done
+for c in chunk_1 chunk_2 chunk_3 chunk_4; do
+    if [[ ! -d "$SRC_CHUNKS/${c}.mlpackage" && ! -d "$SRC_CHUNKS/${c}.mlmodelc" ]]; then
+        echo "[error] $SRC_CHUNKS/${c}.{mlpackage,mlmodelc} missing — run build_gemma4_e2b_stateful_chunks.py --model gemma4-e4b first" >&2
+        exit 1
+    fi
+done
+
+rm -rf "$OUT_PARENT"
+mkdir -p "$OUT"
+
+# 1. Compile chunks .mlpackage → .mlmodelc into the bundle dir
+for c in chunk_1 chunk_2 chunk_3 chunk_4; do
+    echo "[compile] $c"
+    if [[ -d "$SRC_CHUNKS/${c}.mlmodelc" ]]; then
+        cp -R "$SRC_CHUNKS/${c}.mlmodelc" "$OUT/${c}.mlmodelc"
+    else
+        xcrun coremlcompiler compile \
+            "$SRC_CHUNKS/${c}.mlpackage" "$OUT/" 2>&1 | tail -2
+    fi
+done
+
+# 2. Copy sidecars from the E4B legacy bundle
+SIDE_ITEMS=(
+    "embed_tokens_q8.bin"
+    "embed_tokens_scales.bin"
+    "embed_tokens_per_layer_q8.bin"
+    "embed_tokens_per_layer_scales.bin"
+    "per_layer_projection.bin"
+    "per_layer_norm_weight.bin"
+    "cos_sliding.npy"
+    "sin_sliding.npy"
+    "cos_full.npy"
+    "sin_full.npy"
+    "hf_model"
+    "model_config.json"
+)
+for item in "${SIDE_ITEMS[@]}"; do
+    if [[ -e "$SIDECARS/$item" ]]; then
+        echo "[copy] $item"
+        cp -R "$SIDECARS/$item" "$OUT/"
+    else
+        echo "  [warn] missing $item"
+    fi
+done
+
+echo ""
+echo "=== assembled ==="
+du -sh "$OUT_PARENT"
+ls -la "$OUT/" | head -25
+
+echo ""
+echo "Push to iPhone:"
+echo "  DEVICE=A6F3E849-1947-5202-9AD1-9C881CA58EEF"
+echo "  xcrun devicectl device copy to --device \$DEVICE \\"
+echo "    --domain-type appDataContainer \\"
+echo "    --domain-identifier com.example.CoreMLLLMChat \\"
+echo "    --source $OUT_PARENT --destination Documents/Models/gemma4-e4b-stateful"

From e110752e5fff7f09aa795e8c44d237a633594df3 Mon Sep 17 00:00:00 2001
From: john-rocky <samuraibrothersmail@gmail.com>
Date: Tue, 28 Apr 2026 18:59:55 +0900
Subject: [PATCH 2/7] feat(gemma4): make singlefunc prefill builder E4B-aware

Stage 8 builder (PR #149) already used `compute_chunk_boundaries` for
chunk_1 / chunk_3 windows but called `convert_chunk2_merged_prefill`
without `own_range` / `shared_range`, so on E4B the merged middle
chunk silently used E2B's L8-14 / L15-24 layer ranges instead of
L12-23 / L24-32. After A3 made the converter parametric, plumb the
ranges through and refresh the docstring + the stale "we don't ship
E4B stateful yet" comment.
---
 ...uild_gemma4_stateful_singlefunc_prefill.py | 34 ++++++++++++-------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/conversion/build_gemma4_stateful_singlefunc_prefill.py b/conversion/build_gemma4_stateful_singlefunc_prefill.py
index 4fdac3b..7da6681 100644
--- a/conversion/build_gemma4_stateful_singlefunc_prefill.py
+++ b/conversion/build_gemma4_stateful_singlefunc_prefill.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
-"""Build Gemma 4 E2B stateful prefill chunks as **single-function**
-mlpackages (no multifunction merge).
+"""Build Gemma 4 stateful prefill chunks as **single-function**
+mlpackages (no multifunction merge). Supports E2B and E4B; chunk
+boundaries come from `compute_chunk_boundaries(config)`.
 
 Stage 8 / Stage 6.5 opt-in builder. Companion to
 `docs/HANDOFF_STAGE8_MLSTATE_MULTIMODAL.md` and the probe results in
@@ -26,11 +27,14 @@
 
 Layout produced (3-chunk merged variant, T=288 default):
 
-    chunk_1_prefill_T288.mlpackage      (L0-7, own KV)
-    chunk_2_3way_prefill_T288.mlpackage (L8-24 merged, own + shared)
-    chunk_3_prefill_T288.mlpackage      (L25-34 + lm_head + argmax,
-                                         structurally same as 4-chunk
-                                         chunk_4_prefill)
+    E2B (35 layers):
+      chunk_1_prefill_T288.mlpackage       (L0-7, own KV)
+      chunk_2_3way_prefill_T288.mlpackage  (L8-24 merged, own + shared)
+      chunk_3_prefill_T288.mlpackage       (L25-34 + lm_head + argmax)
+    E4B (42 layers):
+      chunk_1_prefill_T288.mlpackage       (L0-11, own KV)
+      chunk_2_3way_prefill_T288.mlpackage  (L12-32 merged, own + shared)
+      chunk_3_prefill_T288.mlpackage       (L33-41 + lm_head + argmax)
 
 Usage:
     python conversion/build_gemma4_stateful_singlefunc_prefill.py \\
@@ -125,12 +129,16 @@ def main():
     base = Gemma4Model.from_pretrained(hf_dir, context_length=args.ctx)
     base.eval()
 
-    # E2B layer split: chunk_1 = L0-7 (own KV), chunk_2_3way = L8-24
-    # (merged), chunk_3 = L25-34 (+head). E4B has different boundaries
-    # but we don't ship E4B stateful yet.
+    # Chunk layout (config-derived via compute_chunk_boundaries):
+    #   E2B: c1=L0-7,   own=L8-14,  shared=L15-24, c4=L25-34
+    #   E4B: c1=L0-11,  own=L12-23, shared=L24-32, c4=L33-41
+    # The merged prefill needs own_range + shared_range so it picks
+    # the right layer-index window for the kv13/kv14 producer aliases.
     boundaries = compute_chunk_boundaries(base.config)
-    c1_start, c1_end = boundaries[0]   # E2B (0, 8)
-    c4_start, c4_end = boundaries[3]   # E2B (25, 35)
+    c1_start, c1_end = boundaries[0]
+    own_range = boundaries[1]
+    shared_range = boundaries[2]
+    c4_start, c4_end = boundaries[3]
 
     paths = {
         "chunk1": os.path.join(args.output, f"chunk_1_prefill_T{args.t}.mlpackage"),
@@ -154,6 +162,8 @@ def main():
             out_path=paths["chunk2_3way"],
             nbits=args.nbits,
             use_linear=args.linear_projections,
+            own_range=own_range,
+            shared_range=shared_range,
         )
     if args.only in (None, "chunk3"):
         convert_chunk_shared_prefill(

From 000a292b79493c9680b8f0f8554561c353f1a977 Mon Sep 17 00:00:00 2001
From: john-rocky <samuraibrothersmail@gmail.com>
Date: Tue, 28 Apr 2026 19:50:41 +0900
Subject: [PATCH 3/7] docs(stage8): Phase B implementation plan for E4B + E2B
 multimodal stateful
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Captures the design intel + chosen architecture (Option A: separate
Gemma4StatefulMultimodalEngine class) so the next session can pick up
without re-deriving. Records:

  - Phase A scope (already shipped on this branch as 4665ab2 + 2655c17)
  - Phase B engine class layout (storage, public API, helper port list)
  - State bridge code path (probe-2-verified nested withMultiArray
    closures + memcpy)
  - Generate flow for image+text prompts (T=288 prefill → bridge → decode)
  - Bundle layout for new HF repos gemma-4-{E2B,E4B}-stateful-multimodal-coreml
  - Open questions (picker naming, default-swap timing, cross-turn KV
    with re-encoded image features)
  - Build commands for the Mac compile run
---
 docs/STAGE8_DESIGN.md | 324 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 324 insertions(+)
 create mode 100644 docs/STAGE8_DESIGN.md

diff --git a/docs/STAGE8_DESIGN.md b/docs/STAGE8_DESIGN.md
new file mode 100644
index 0000000..de52b3e
--- /dev/null
+++ b/docs/STAGE8_DESIGN.md
@@ -0,0 +1,324 @@
+# Stage 8: E4B + E2B Multimodal Stateful Engine — Implementation Plan
+
+**Status:** Phase A code complete (text-only E4B parity with E2B). Phase B (multimodal wiring) deferred to a follow-up session.
+
+**Branch:** `feat/e4b-optimize-multimodal`
+**Date:** 2026-04-28
+**Predecessor docs:** `docs/MLSTATE_MULTIMODAL_PROBE.md`, `docs/HANDOFF_STAGE8_MLSTATE_MULTIMODAL.md`, `docs/SESSION_2026_04_27_STAGE6_MULTIMODAL.md`
+
+---
+
+## Goal
+
+Ship Gemma 4 E4B and E2B with the stateful Linear decode path **and** vision + video + audio multimodal input on iPhone 17 Pro. Reach the fastest decode tok/s the architecture allows while keeping multimodal correctness intact.
+
+**Scope split chosen on 2026-04-28:**
+- **Option A** — separate Swift class `Gemma4StatefulMultimodalEngine`, leaving the legacy `Gemma4StatefulEngine` (text-only multifunction prefill_b8 path) untouched.
+- **E2B + E4B both get multimodal stateful** — same engine class drives both via `model_config.json`-derived dimensions.
+- **Existing HF repos preserved** — new `mlboydaisuke/gemma-4-{E2B,E4B}-stateful-multimodal-coreml` repos rather than mutating the existing stateful repos. Mirrors the dual-repo pattern.
+
+---
+
+## What's already shipped (Phase A — code only, builds + iPhone gates pending)
+
+Two commits on `feat/e4b-optimize-multimodal`:
+
+1. **`4665ab2`** — generalize 3-chunk + 4-chunk converters to E2B + E4B
+   - `SWAStatefulMergedChunk23{,Prefill,Single,PrefillSingle}` accept `own_range` / `shared_range`. Defaults E2B (own=L8-14, shared=L15-24); E4B passes (12,24)/(24,33).
+   - `build_gemma4_e2b_stateful_3chunks.py` --model gemma4-e4b now produces a 3-chunk merged bundle (chunk_1 L0-11 / chunk_2 L12-32 / chunk_3 L33-41).
+   - `sanity_stateful_chunks.py` model presets (--model gemma4-e2b / gemma4-e4b).
+   - `scripts/assemble_gemma4_stateful_e4b.sh` bundle assembler for iPhone sideload.
+   - `Sources/CoreMLLLM/ModelDownloader.swift` — `gemma4e4bStateful` + `gemma4e4bStatefulLinear` ModelInfo entries (slots 6/7 under `LLM_SHOW_EXPERIMENTAL=1`, sideload-only — `downloadURL: ""`).
+   - `Examples/.../LLMRunner.swift` — stateful detection comment now lists all four E2B+E4B folders.
+
+2. **`2655c17`** — single-function T=288 prefill builder accepts E4B
+   - `build_gemma4_stateful_singlefunc_prefill.py` plumbs `own_range` / `shared_range` through `convert_chunk2_merged_prefill`. Without this, on E4B the merged middle prefill chunk silently used E2B layer ranges.
+
+**Pending Phase A work (hardware-blocked):**
+- A4: Mac build (3-chunk decode + multifunction prefill_b8) — kicked off in background after this doc lands.
+- A4': Mac build (T=288 single-function prefill) for E2B + E4B — same session.
+- A5: iPhone 17 Pro A/B for E4B 3-chunk merged stateful Linear — needs device.
+- A6: HF upload `mlboydaisuke/gemma-4-E4B-stateful-coreml` once iPhone clears.
+
+---
+
+## Phase B — Stage 8 multimodal stateful engine
+
+### B1. T=288 single-function prefill mlpackages (DONE script-side, build pending)
+
+E2B and E4B variants of:
+- `chunk_1_prefill_T288.mlpackage` (own KV)
+- `chunk_2_3way_prefill_T288.mlpackage` (merged: own + shared internal)
+- `chunk_3_prefill_T288.mlpackage` (KV-shared + lm_head + argmax)
+
+T=288 = 256-token image span + ~32 text margin (BOS / turn markers). Drop to T=224 if 8 GB iPhone non-Pro rejects T=288 compile peak (probe required, see C1).
+
+**Single-function** (separate mlpackage per T) instead of multifunction merge — iPhone ANE 18 rejects multifunction T>1 + dual MLState with `ANECCompile FAILED 11`. Probe 2 verified single-function T=288 compiles in 7.3 s on iPhone 17 Pro A19 Pro.
+
+### B2. New Swift class `Gemma4StatefulMultimodalEngine`
+
+**Location:** `Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift` (new file).
+
+**Why a separate class (not inheritance / extension):**
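+- `Gemma4StatefulEngine` is a `public final class`, so it cannot be subclassed, and a Swift extension cannot add the stored properties (prefill models, prefill states) the multimodal path needs.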
+- The two engines have different prefill path topology (multifunction merged vs separate single-func mlpackages), different state lifecycle (decode-only state vs decode+prefill state with bridge), and different public API shape (`generate(prompt:)` vs `generate(prompt:images:audio:)`).
+- Keeps the Stage 3 stateful Linear 33.4 tok/s text-only path bit-identical for users who don't want multimodal.
+
+**Storage:**
+
+```swift
+@available(iOS 18.0, macOS 15.0, *)
+public final class Gemma4StatefulMultimodalEngine {
+    // Decode chunks (3-chunk merged stateful Linear)
+    private var decodeChunk1: MLModel?  // L0-7 / L0-11 (E4B) — own KV
+    private var decodeChunk2: MLModel?  // L8-24 / L12-32 — merged own+shared
+    private var decodeChunk3: MLModel?  // L25-34 / L33-41 + lm_head + argmax
+
+    // Prefill T=288 chunks (separate mlpackages, single-function)
+    private let prefillT: Int = 288
+    private var prefillChunk1: MLModel?
+    private var prefillChunk2: MLModel?
+    private var prefillChunk3: MLModel?  // identical structure to decodeChunk3
+
+    // Per-chunk MLState (chunk_3 is stateless — KV-shared from chunk_2)
+    private var decodeState1: MLState?
+    private var decodeState2: MLState?
+    private var prefillState1: MLState?
+    private var prefillState2: MLState?
+
+    // Multimodal encoders (lazy)
+    private var visionModel: MLModel?         // SigLIP still-image, 256 tokens
+    private var videoVisionModel: MLModel?    // pooled SigLIP, 64 tokens/frame
+    private var audioModel: MLModel?          // Conformer, ~50 tokens/2sec
+    private var audioProjection: ProjectionWeights?
+    private var melFilterbank: Data?
+
+    // Sidecars (same as legacy engine)
+    private var embedTokens: EmbeddingLookup?
+    private var embedTokensPerLayer: EmbeddingLookup?
+    private var cosSlidingTable: Data?  // mmap
+    private var sinSlidingTable: Data?
+    private var cosFullTable: Data?
+    private var sinFullTable: Data?
+
+    // Cross-turn state (Phase 2a — LCP match)
+    private var persistedInputIds: [Int32] = []
+    private var persistedPosition: Int = 0
+
+    // Reusable scratch (T=1 decode + T=288 prefill)
+    private var maskFullDecode, maskSlidingDecode: MLMultiArray!
+    private var maskFullPrefill288, maskSlidingPrefill288: MLMultiArray!
+    // ... batch hidden, per-layer raw, RoPE batched, etc.
+}
+```
+
+**Public API:**
+
+```swift
+public init(config: Config = Config())
+public func resetPersistedState()
+public func load(modelDirectory: URL) async throws
+
+public func generate(
+    prompt: String,
+    images: [CGImage] = [],
+    audioPCM16k: [Float]? = nil,
+    maxNewTokens: Int = 512,
+    eosTokenIds: Set<Int32> = [],
+    onToken: ((Int32) -> Void)? = nil
+) async throws -> [Int32]
+```
+
+Video is a series of CGImages produced by `VideoProcessor.extractFrames` — exposed at the LLMRunner layer, not the engine.
+
+### B3. Multimodal helpers — port from Stage 6 (`origin/stage6-multimodal-stateful` commit `02ac583`)
+
+The Stage 6 patch added 528 lines to the legacy engine. We port these into the new class with one structural change: feature splice happens **during prefill at T=288** instead of during multifunction prefill_b8.
+
+**Helpers to port (from Stage 6 commit `02ac583`):**
+
+| Helper | Purpose | Adaptation needed |
+|---|---|---|
+| `loadMultimodalEncoders` | Probe + load vision/video/audio mlmodelc + sidecars | Same layout, new file paths |
+| `processImage(_: CGImage)` | CGImage → 256-token feature MLMultiArray | None — same encoder |
+| `processVideoFrame` | Per-frame still vision encoding (64 tokens) | None |
+| `processAudio(_: [Float])` | PCM 16k → mel → Conformer → projection | None |
+| `computeVisionGroupIds` | Per-token group label (which image each token belongs to) | T=8 → T=288 generalization |
+| `fillBatchMasksVisionAware` | Bidirectional within-image, causal across | T=8 → T=288 generalization |
+| `multimodalSpliceT1` | Per-token feature splice at IMAGE/AUDIO_TOKEN_ID position | Reused for tail of prompt that doesn't fit T=288 |
+
+**Special token IDs (preserved from Stage 6):**
+- `IMAGE_TOKEN_ID = 258880`
+- `AUDIO_TOKEN_ID = 258881`
+- `VIDEO_TOKEN_ID = 258884`
+
+### B4. State bridge (probe 2 verified)
+
+After prefill completes, copy `kv_cache_sliding` and `kv_cache_full` from prefill MLState to decode MLState. Critical requirement: **nested closures** — the buffer pointer is only valid within `withMultiArray(for:)` scope.
+
+```swift
+private func bridgeKVState(from src: MLState, to dst: MLState) {
+    let names = ["kv_cache_sliding", "kv_cache_full"]
+    for name in names {
+        src.withMultiArray(for: name) { srcArr in
+            dst.withMultiArray(for: name) { dstArr in
+                let bytes = srcArr.count * MemoryLayout<UInt16>.stride  // fp16
+                memcpy(dstArr.dataPointer, srcArr.dataPointer, bytes)
+            }
+        }
+    }
+}
+```
+
+Called twice per generate(): once for chunk_1 state (sliding-only on E2B / sliding+full on E4B), once for chunk_2 state (sliding+full both).
+
+Pitfall: chunk_3 is **stateless** in the 3-chunk variant (KV-shared from chunk_2 outputs kv13/kv14). No state to bridge for chunk_3.
+
+### B5. Generate flow (single-image text+image example)
+
+```
+Input: prompt = "What's in this picture?", images = [oneImage]
+
+1. Build inputIds:
+     [BOS] <image_pad×256> "What's in this picture?" [EOT]
+     ≈ 1 + 256 + 8 + 1 = 266 tokens — fits in T=288.
+
+2. Preprocess image:
+     features = visionModel(processImage(oneImage))  // (1, 256, hiddenSize)
+
+3. Build prefill input:
+   - embed_lookup(inputIds) → hidden (1, 266, hidden)
+   - splice features[0..<256] into hidden[1..<257]
+   - zero per_layer_raw at image positions
+   - vision-aware mask: bidirectional within hidden[1..<257],
+     causal elsewhere
+
+4. Run prefill T=288 (pad inputIds to 288 with mask = -inf):
+   - prefillChunk1(hidden, masks, rope, pos=0..287, ringPos=0)
+       → updates prefillState1 (kv_cache_sliding[0..287])
+   - prefillChunk2(prefill1.hidden, ..., pos=0..287, ringPos=0)
+       → updates prefillState2; outputs kv13_k/v + kv14_k/v at last layer
+   - prefillChunk3(prefill2.hidden, kv13_*, kv14_*, ...)
+       → outputs token_id (last decode token)
+
+5. Bridge state:
+     bridgeKVState(prefillState1 → decodeState1)
+     bridgeKVState(prefillState2 → decodeState2)
+
+6. Decode loop (T=1, position=266, 267, ...):
+   - decodeChunk1(emb(token), masks, rope, pos, ringPos)
+     state: decodeState1
+   - decodeChunk2(...) state: decodeState2
+   - decodeChunk3(..., kv13, kv14) → next token
+   - emit, append to output, repeat until EOS or maxTokens
+```
+
+For prompts longer than T=288: **split into multiple T=288 prefill passes** (no overlap; each pass writes consecutive ring positions). Image span must NOT split across passes — push image to first pass and chunk text after.
+
+### B6. ModelDownloader bundle layout
+
+Mirror E2B's existing `gemma-4-E2B-stateful-coreml` layout but add a `prefill_T288/` subdir:
+
+```
+mlboydaisuke/gemma-4-{E2B,E4B}-stateful-multimodal-coreml/
+  gemma4_e2b_stateful_chunks/         # subdir kept for engine compat
+    chunk_1.mlmodelc                  # decode multifunction merged
+    chunk_2.mlmodelc
+    chunk_3.mlmodelc
+    prefill_T288/
+      chunk_1_prefill_T288.mlmodelc
+      chunk_2_3way_prefill_T288.mlmodelc
+      chunk_3_prefill_T288.mlmodelc
+    embed_tokens_q8.bin               # sidecars
+    embed_tokens_scales.bin
+    embed_tokens_per_layer_q8.bin
+    embed_tokens_per_layer_scales.bin
+    per_layer_projection.bin
+    per_layer_norm_weight.bin
+    cos_sliding.npy / sin_sliding.npy
+    cos_full.npy    / sin_full.npy
+    hf_model/                         # tokenizer
+    model_config.json
+    vision.mlmodelc                   # multimodal encoders
+    vision_video.mlmodelc
+    audio.mlmodelc
+    output_proj_weight.npy            # audio projection sidecars
+    output_proj_bias.npy
+    embed_proj_weight.npy
+```
+
+Total bundle size:
+- Decode chunks: ~1.15 GB (E2B) / ~1.6 GB (E4B)
+- T=288 prefill chunks: ~1.50 GB (E2B) / ~2.0 GB (E4B)
+- Encoders: ~0.99 GB (shared between models)
+- Sidecars + tokenizer: ~0.4 GB
+- **Total: ~4.0 GB (E2B) / ~5.0 GB (E4B)** download.
+
+`ModelDownloader.buildGemma4StatefulMultimodalE{2,4}BFileList()` enumerates all files. Mirror the existing E2B helpers' pattern.
+
+### B7-B8. iPhone tests + parity
+
+- B7: Real-device test — image+text, video+text, audio+text → correct output.
+- B8: Parity test — fixed image prompt through legacy 4-chunk prefill+decode vs new T=288 stateful prefill+bridge+decode. First 32 decode tokens must agree (top-1).
+
+### C1. 8 GB iPhone non-Pro probe
+
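+Probe 1 only validated 12 GB iPhone 17 Pro at T=288. iPhone 15 / 16 / 17 non-Pro have 8 GB RAM — chunk_2 prefill at T=288 may fail compile peak (chunk_2 is the largest at 21 layers for E4B). If 8 GB fails, fall back to T=224. Note that a 256-token image span does **not** fit in T=224 (the margin becomes ≈ −32 tokens), so the last ~32 image tokens and all trailing text would go through the T=1 prompt-tail fallback. This was judged acceptable because that fallback already exists, but it still needs reconciling with the B5 rule that an image span must not split across passes.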
+
+---
+
+## Open questions for next session
+
+1. **Picker entry naming.** "Gemma 4 E4B (multimodal stateful)" or "Gemma 4 E4B (stateful, vision+audio)"? UI clarity vs concision.
+2. **Default model swap.** When B is shipped, should `gemma4e2b3way` (current production multimodal) be deprecated in favor of `gemma4e2bStatefulMultimodal` (faster decode + multimodal)? Memory note says current E2B 3-chunk is the multimodal default; swapping requires a soft-deprecation cycle for users mid-download.
+3. **Cross-turn KV with vision.** Phase 2a LCP match assumes prefix invariance. If turn 1 has image and turn 2 reuses the same image, the image features may have been re-encoded — does the LCP match still hold? Stage 6 had this concern unresolved.
+
+---
+
+## Build commands (Phase A — kick off after this doc lands)
+
+```bash
+# Build 1: E4B 3-chunk merged decode + multifunction prefill_b8
+HF_DIR=/Users/majimadaisuke/Downloads/CoreML-LLM/output/gemma4-e4b/hf_model
+python conversion/build_gemma4_e2b_stateful_3chunks.py \
+    --model gemma4-e4b \
+    --hf-dir "$HF_DIR" \
+    --output /tmp/gemma4-e4b-stateful-3chunk \
+    --linear-projections \
+    --prefill-batches "8" \
+    --ctx 2048 \
+    --nbits 4
+
+# Build 2: E4B T=288 single-function prefill (Stage 8)
+python conversion/build_gemma4_stateful_singlefunc_prefill.py \
+    --model gemma4-e4b \
+    --hf-dir "$HF_DIR" \
+    --output /tmp/gemma4-e4b-singlefunc-prefill-T288 \
+    --t 288 \
+    --linear-projections \
+    --ctx 2048 \
+    --nbits 4
+
+# Sanity (chunk shape + chained 1→2→3 forward, CPU_AND_NE):
+python conversion/sanity_stateful_chunks.py \
+    --model gemma4-e4b \
+    --artifacts /tmp/gemma4-e4b-stateful-3chunk
+
+# Bundle assemble for sideload (assumes legacy E4B sidecars in
+# CoreML-LLM/output/gemma4-e4b/bundle):
+SIDECARS=/Users/majimadaisuke/Downloads/CoreML-LLM/output/gemma4-e4b/bundle \
+SRC_CHUNKS=/tmp/gemma4-e4b-stateful-3chunk \
+bash scripts/assemble_gemma4_stateful_e4b.sh
+```
+
+Both builds load the 15 GB E4B safetensors; estimated 60-120 min total. Run sequentially to avoid memory contention.
+
+---
+
+## Reference
+
+- `docs/MLSTATE_MULTIMODAL_PROBE.md` — probe 1 (T=288 chunk_1 compiles) + probe 2 (state bridge memcpy works).
+- `docs/HANDOFF_STAGE8_MLSTATE_MULTIMODAL.md` — original Stage 8 handoff with 5-step plan.
+- `docs/SESSION_2026_04_27_STAGE6_MULTIMODAL.md` — Stage 6 multimodal in legacy engine (in-place patch; ours is fresh class).
+- Stage 6 commits: `origin/stage6-multimodal-stateful` (`02ac583`, `2432995`, `987ad86`) — port these helpers verbatim into new class.
+- `Sources/CoreMLLLM/Gemma4StatefulEngine.swift` — legacy engine, reference for the patterns we duplicate (mask filling, RoPE lookup, EmbeddingLookup wiring, position scratch, etc.).

From 526a73d3edbb2a0faf1c80a6c6a81772d6d033b4 Mon Sep 17 00:00:00 2001
From: john-rocky <samuraibrothersmail@gmail.com>
Date: Tue, 28 Apr 2026 21:22:17 +0900
Subject: [PATCH 4/7] docs: macOS 26 coremltools 9.0 wheel workaround

PyPI wheel ships .so files referencing @rpath/lib*.dylib that aren't
included; on macOS 26 (Darwin 25 / Tahoe) this silently produces an
empty pybind11 module so every conversion script crashes at
"BlobWriter not loaded". Captures the fresh-venv + source-build steps
that get a working /tmp/ct_build_venv to unblock builds until upstream
ships fixed wheels.
---
 docs/MACOS_26_BUILD_ENV.md | 109 +++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 docs/MACOS_26_BUILD_ENV.md

diff --git a/docs/MACOS_26_BUILD_ENV.md b/docs/MACOS_26_BUILD_ENV.md
new file mode 100644
index 0000000..ca53583
--- /dev/null
+++ b/docs/MACOS_26_BUILD_ENV.md
@@ -0,0 +1,109 @@
+# macOS 26 (Tahoe / Darwin 25) — coremltools 9.0 build workaround
+
+The PyPI wheel `coremltools-9.0-*-macosx_11_0_arm64.whl` packages the
+C++ extensions as `.so` files but the install_name baked at link time
+references `@rpath/libmilstoragepython.dylib` etc. — the matching
+`.dylib` files are **NOT** included. On macOS 26, importing the
+extension then succeeds silently but registers no C++ classes, so
+`coremltools.libmilstoragepython._BlobStorageWriter` is undefined
+and every conversion fails with:
+
+```
+RuntimeError: BlobWriter not loaded
+```
+
+(Symptom: every conversion script in `conversion/` fails after model
+load and trace, before saving the mlpackage.)
+
+This was working on Apr 26, 2026; the symptom appeared after upgrading
+to macOS 26. Reproduces in every venv (Python 3.10 / 3.11 / 3.12 /
+3.14, coremltools 8.3.0 / 9.0).
+
+## Fix: build coremltools from source into a fresh venv
+
+```bash
+# 1. Toolchain
+brew install protobuf            # protoc 34.x
+xcode-select -p                  # confirm /Applications/Xcode.app/...
+which cmake                      # confirm /opt/homebrew/bin/cmake
+
+# 2. Fresh venv (Python 3.10 — the most stable target for coremltools 9.0)
+~/.pyenv/versions/3.10.13/bin/python3 -m venv /tmp/ct_build_venv
+/tmp/ct_build_venv/bin/pip install --upgrade pip wheel setuptools
+/tmp/ct_build_venv/bin/pip install pybind11 numpy
+
+# 3. Source build
+cd /tmp
+git clone --depth 1 https://github.com/apple/coremltools.git coremltools-src
+cd coremltools-src
+mkdir -p build && cd build
+
+xcrun --sdk macosx cmake \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_OSX_DEPLOYMENT_TARGET=12.3 \
+    -DPYTHON_EXECUTABLE:FILEPATH=/tmp/ct_build_venv/bin/python \
+    -DPYTHON_INCLUDE_DIR=/Users/$USER/.pyenv/versions/3.10.13/include/python3.10 \
+    -DPYTHON_LIBRARY=/Users/$USER/.pyenv/versions/3.10.13/lib/libpython3.10.dylib \
+    -DOVERWRITE_PB_SOURCE=0 \
+    /tmp/coremltools-src
+
+make -j$(sysctl -n hw.ncpu)
+cmake --build . --target dist           # produces build/dist/coremltools-*.whl
+
+# 4. Install the freshly built wheel + copy the dylibs alongside the .so files
+/tmp/ct_build_venv/bin/pip install /tmp/coremltools-src/build/dist/coremltools-*.whl
+cp /tmp/coremltools-src/build/lib*.dylib \
+   /tmp/ct_build_venv/lib/python3.10/site-packages/coremltools/
+install_name_tool -add_rpath @loader_path \
+   /tmp/ct_build_venv/lib/python3.10/site-packages/coremltools/libmilstoragepython.so
+install_name_tool -add_rpath @loader_path \
+   /tmp/ct_build_venv/lib/python3.10/site-packages/coremltools/libcoremlpython.so
+install_name_tool -add_rpath @loader_path \
+   /tmp/ct_build_venv/lib/python3.10/site-packages/coremltools/libmodelpackage.so
+
+# 5. Install conversion deps
+/tmp/ct_build_venv/bin/pip install --no-cache-dir \
+    torch transformers safetensors huggingface-hub scikit-learn
+
+# 6. Verify
+/tmp/ct_build_venv/bin/python -c "
+from coremltools.converters.mil.backend.mil.load import BlobWriter
+import coremltools as ct
+print('ct', ct.__version__, '— BlobWriter:', BlobWriter)
+"
+# Expected: ct 9.0 — BlobWriter: <class 'coremltools.libmilstoragepython._BlobStorageWriter'>
+```
+
+## Use the venv for conversion runs
+
+```bash
+PY=/tmp/ct_build_venv/bin/python
+$PY conversion/build_gemma4_e2b_stateful_3chunks.py \
+    --model gemma4-e4b \
+    --hf-dir /path/to/gemma4-e4b/hf_model \
+    --output /tmp/gemma4-e4b-stateful-3chunk \
+    --linear-projections \
+    --prefill-batches "8" \
+    --ctx 2048 --nbits 4
+```
+
+`/tmp/ct_build_venv` is the pinned env for all `conversion/` scripts on
+this machine until coremltools 9.1 (or newer) ships a wheel that bundles
+the dylibs alongside the .so files for macOS 26.
+
+## Why the symptom is silent
+
+The Python extension `.so` exports `_PyInit_libmilstoragepython` and
+loads cleanly under `dlopen`. PyInit registers the pybind11 module and
+attaches `_BlobStorageWriter` / `_BlobStorageReader` only if the
+matching `libmilstoragepython.dylib` is found and its C++ symbols
+resolve. When the dylib is missing, pybind11 silently skips class
+registration; the module loads with `dir(m) == ['__doc__', '__file__',
+'__loader__', '__name__', '__package__', '__spec__']` — no error, no
+warning, just an empty module.
+
+Confirm with:
+```bash
+$PY -c "import coremltools.libmilstoragepython as m; print(dir(m))"
+```
+A working install also lists `_BlobStorageReader` and `_BlobStorageWriter`.

From a540e395796666205ebf8656e9259b8820b3bb46 Mon Sep 17 00:00:00 2001
From: john-rocky <samuraibrothersmail@gmail.com>
Date: Sun, 3 May 2026 10:39:32 +0900
Subject: [PATCH 5/7] feat(gemma4-e4b): multimodal CoreML bundle
 (text+image+video+audio on iPhone)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Working configuration for iPhone 17 Pro at 15.7 tok/s decode + correct
output across all four input modalities. Validated 2026-05-03 on a clean
sandbox push of the assembled bundle.

Topology:
  decode  = Topology II (chunk1 legacy + chunk2_3way + chunk3_3way merged
            21-layer middle + final lm_head). Auto-detected by
            ChunkedEngine via chunk2_3way/chunk3_3way presence.
  prefill = legacy chunks 1/2/3/4 prefill_b8 multifunction. Vision-aware
            bidirectional mask within image span via the engine's
            existing fillBatchMasksVisionAware (works at T=8 batches).
  vision  = vision.ane.mlmodelc (E4B, output [1, 256, 2560]).
  audio   = audio.mlmodelc (E4B, output [1, 50, 1024]) + Swift two-stage
            projection 1024 -> 1536 -> 2560.

Changes:
- Sources/CoreMLLLM/AudioProcessor.swift: ProjectionWeights now derives
  inDim/outDim/finalDim from weight tensor sizes (was hard-coded for
  E2B's square 1536x1536 embed_proj). E4B's embed_proj is non-square
  (2560, 1536); the embed_proj sgemm now uses finalDim for the output
  dimension. Direct cause of the audio gibberish on E4B.
- conversion/models/gemma4_swa_merged.py: MergedChunk23 (the
  non-stateful merged chunk2+chunk3 used by Topology II) now accepts
  own_range / shared_range; defaults stay at E2B (L8-14 / L15-24).
  Mirrors the stateful generalisation from 4665ab2.
- conversion/build_gemma4_3way.py: thread compute_chunk_boundaries(cfg)
  through to MergedChunk23 so `--model gemma4-e4b` produces a 21-layer
  chunk2_3way (L12-23 own + L24-32 shared) instead of the E2B-hardcoded
  17-layer span.
- scripts/assemble_gemma4_e4b_multimodal.sh: reproducible bundle
  assembly script (compiles mlpackage->mlmodelc, copies sidecars +
  legacy chunks + E4B encoders).
- docs/E4B_MULTIMODAL_BUILD.md: build + sideload guide, including the
  rejected paths (prefill_chunk* multifunction, stateful) and the
  iPhone clean-sandbox requirement (devicectl never deletes orphans).

Out of scope (in this commit):
- Stateful Stage 8 engine — separate commit, Mac-only / iPhone-blocked.
- prefill_chunk{1..4}.mlmodelc multifunction path — built and tested
  but produces broken output on iPhone with E4B (Mac OK); not shipped.
- vision_video.mlmodelc — engine falls back to 2x2 pool of vision
  encoder; quality validated.
---
 Sources/CoreMLLLM/AudioProcessor.swift    |  43 ++++--
 conversion/build_gemma4_3way.py           |   8 +-
 conversion/models/gemma4_swa_merged.py    |  25 +--
 docs/E4B_MULTIMODAL_BUILD.md              | 138 +++++++++++++++++
 scripts/assemble_gemma4_e4b_multimodal.sh | 176 ++++++++++++++++++++++
 5 files changed, 363 insertions(+), 27 deletions(-)
 create mode 100644 docs/E4B_MULTIMODAL_BUILD.md
 create mode 100755 scripts/assemble_gemma4_e4b_multimodal.sh

diff --git a/Sources/CoreMLLLM/AudioProcessor.swift b/Sources/CoreMLLLM/AudioProcessor.swift
index 08b8d51..d88bc98 100644
--- a/Sources/CoreMLLLM/AudioProcessor.swift
+++ b/Sources/CoreMLLLM/AudioProcessor.swift
@@ -23,21 +23,32 @@ public enum AudioProcessor {
     // MARK: - Projection weights (loaded from .npy files)
 
     /// Loaded projection weights for Swift-side computation.
+    /// Two-stage projection: 1024 → outDim (output_proj) → finalDim (embed_proj).
+    /// E2B: outDim=1536, finalDim=1536 (square embed_proj).
+    /// E4B: outDim=1536, finalDim=2560 (non-square embed_proj — projects up to LM hidden).
     public struct ProjectionWeights {
-        let outputProjWeight: [Float]  // (1536, 1024) row-major
-        let outputProjBias: [Float]    // (1536,)
-        let embedProjWeight: [Float]   // (1536, 1536) row-major
+        let outputProjWeight: [Float]  // (outDim, 1024) row-major
+        let outputProjBias: [Float]    // (outDim,)
+        let embedProjWeight: [Float]   // (finalDim, outDim) row-major
         let inDim: Int                 // 1024
-        let outDim: Int                // 1536
+        let outDim: Int                // 1536 (audio_soft_token_size)
+        let finalDim: Int              // LM hidden size (1536 E2B / 2560 E4B)
 
         /// Load projection weights from .npy files in the model directory.
         public static func load(from directory: URL) throws -> ProjectionWeights {
             let opW = try loadNpyFloat16(directory.appendingPathComponent("output_proj_weight.npy"))
             let opB = try loadNpyFloat16(directory.appendingPathComponent("output_proj_bias.npy"))
             let epW = try loadNpyFloat16(directory.appendingPathComponent("embed_proj_weight.npy"))
+            // outDim = output_proj_bias length = audio_soft_token_size (1536).
+            // inDim  = output_proj_weight.count / outDim = 1024.
+            // finalDim = embed_proj_weight.count / outDim = LM hidden (E2B 1536, E4B 2560).
+            let outDim = opB.count
+            let inDim = opW.count / outDim
+            let finalDim = epW.count / outDim
             return ProjectionWeights(
                 outputProjWeight: opW, outputProjBias: opB,
-                embedProjWeight: epW, inDim: 1024, outDim: 1536)
+                embedProjWeight: epW,
+                inDim: inDim, outDim: outDim, finalDim: finalDim)
         }
 
         /// Load a float16 numpy file as [Float].
@@ -182,26 +193,28 @@ public enum AudioProcessor {
             }
         }
 
-        // embed_proj: (S, 1536) @ W^T(1536, 1536) → (S, 1536)
-        var features = [Float](repeating: 0, count: S * outDim)
+        // embed_proj: (S, outDim) @ W^T(finalDim, outDim) → (S, finalDim).
+        // E2B: finalDim==outDim==1536 (square). E4B: finalDim=2560 != outDim=1536.
+        let finalDim = proj.finalDim
+        var features = [Float](repeating: 0, count: S * finalDim)
         cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                    Int32(S), Int32(outDim), Int32(outDim),
+                    Int32(S), Int32(finalDim), Int32(outDim),
                     1.0, projected, Int32(outDim),
                     proj.embedProjWeight, Int32(outDim),
-                    0.0, &features, Int32(outDim))
+                    0.0, &features, Int32(finalDim))
 
         // fp32 → fp16 batch conversion via Accelerate
         let result = try! MLMultiArray(
-            shape: [1, NSNumber(value: S), NSNumber(value: outDim)],
+            shape: [1, NSNumber(value: S), NSNumber(value: finalDim)],
             dataType: .float16)
-        let rp = result.dataPointer.bindMemory(to: UInt16.self, capacity: S * outDim)
+        let rp = result.dataPointer.bindMemory(to: UInt16.self, capacity: S * finalDim)
         features.withUnsafeBufferPointer { src in
             var srcBuf = vImage_Buffer(data: UnsafeMutableRawPointer(mutating: src.baseAddress!),
-                                        height: 1, width: vImagePixelCount(S * outDim),
-                                        rowBytes: S * outDim * 4)
+                                        height: 1, width: vImagePixelCount(S * finalDim),
+                                        rowBytes: S * finalDim * 4)
             var dstBuf = vImage_Buffer(data: rp, height: 1,
-                                        width: vImagePixelCount(S * outDim),
-                                        rowBytes: S * outDim * 2)
+                                        width: vImagePixelCount(S * finalDim),
+                                        rowBytes: S * finalDim * 2)
             vImageConvert_PlanarFtoPlanar16F(&srcBuf, &dstBuf, 0)
         }
 
diff --git a/conversion/build_gemma4_3way.py b/conversion/build_gemma4_3way.py
index 27e4d4c..b39a6af 100644
--- a/conversion/build_gemma4_3way.py
+++ b/conversion/build_gemma4_3way.py
@@ -180,9 +180,13 @@ def build_chunk2_merged(base, ctx: int, out_pkg: str, *, quantize: bool) -> None
     max_hd = hd_f
     nkv = cfg.num_key_value_heads
 
-    mc = MergedChunk23(base).eval()
+    boundaries = compute_chunk_boundaries(cfg)
+    own_range = boundaries[1]
+    shared_range = boundaries[2]
+    mc = MergedChunk23(base, own_range=own_range, shared_range=shared_range).eval()
     ns, nf = mc.num_sliding, mc.num_full
-    print(f"\n=== chunk2_3way (L{mc.START_C2}-{mc.END_C3-1}, 17 layers) ===")
+    n_layers = (mc.END_C2 - mc.START_C2) + (mc.END_C3 - mc.START_C3)
+    print(f"\n=== chunk2_3way (L{mc.START_C2}-{mc.END_C3-1}, {n_layers} layers) ===")
     print(f"    own-KV: {ns} sliding + {nf} full")
 
     sample = (
diff --git a/conversion/models/gemma4_swa_merged.py b/conversion/models/gemma4_swa_merged.py
index a493ab3..bd54dad 100644
--- a/conversion/models/gemma4_swa_merged.py
+++ b/conversion/models/gemma4_swa_merged.py
@@ -20,21 +20,26 @@
 
 
 class MergedChunk23(nn.Module):
-    """Layers 8-24: chunk2 (L8-14) + chunk3 (L15-24) merged.
+    """Merged chunk2 + chunk3 (own KV + KV-shared). Boundaries default
+    to E2B (own=L8-14, shared=L15-24). For E4B pass own_range /
+    shared_range from `compute_chunk_boundaries(cfg)`
+    (E4B: own=L12-23, shared=L24-32).
 
-    Own KV: L8-14 (5 sliding + 2 full). Shared KV: L15-24 (all shared from L13/L14).
-    kv13/kv14 stay internal — never leave the ANE.
-
-    Outputs: hidden_states, K/V for L8-14, BUT NOT kv13/kv14 (internal).
-    chunk4 still needs kv14 → output it for chunk4.
+    kv13/kv14 feed the shared layers in-graph (no chunk2→chunk3 hand-off).
+    Outputs: hidden_states, K/V for own layers, and kv13/kv14 for chunk4.
     """
-    START_C2, END_C2 = 8, 15  # chunk2 layers
-    START_C3, END_C3 = 15, 25  # chunk3 layers
+    DEFAULT_OWN = (8, 15)       # E2B own-KV layers
+    DEFAULT_SHARED = (15, 25)   # E2B KV-shared layers
 
-    def __init__(self, model: Gemma4Model):
+    def __init__(self, model: Gemma4Model,
+                 own_range: tuple[int, int] | None = None,
+                 shared_range: tuple[int, int] | None = None):
         super().__init__()
         self.config = model.config
-        # All layers 8-24
+        own = own_range if own_range is not None else self.DEFAULT_OWN
+        shared = shared_range if shared_range is not None else self.DEFAULT_SHARED
+        self.START_C2, self.END_C2 = own
+        self.START_C3, self.END_C3 = shared
         self.layers_c2 = nn.ModuleList([model.layers[i] for i in range(self.START_C2, self.END_C2)])
         self.layers_c3 = nn.ModuleList([model.layers[i] for i in range(self.START_C3, self.END_C3)])
         self.sliding_map, self.full_map = _layer_kv_map(self.START_C2, self.END_C2, model.config)
diff --git a/docs/E4B_MULTIMODAL_BUILD.md b/docs/E4B_MULTIMODAL_BUILD.md
new file mode 100644
index 0000000..a9d0cd9
--- /dev/null
+++ b/docs/E4B_MULTIMODAL_BUILD.md
@@ -0,0 +1,138 @@
+# Gemma 4 E4B multimodal CoreML — build & sideload guide
+
+**Status:** Validated 2026-05-03 on iPhone 17 Pro. Text 15.7 tok/s + image / video / audio all functional.
+
+**Working bundle:** `gemma4-e4b-multimodal` (~7.6 GB).
+
+---
+
+## Bundle topology (what works on iPhone)
+
+| Component | File(s) | Source |
+|---|---|---|
+| Decode (Topology II 3-chunk) | `chunk1` (legacy) + `chunk2_3way` + `chunk3_3way` | legacy E4B HF + `build_gemma4_3way.py --model gemma4-e4b` |
+| Prefill (multifunction `prefill_b8`) | `chunk1` / `chunk2` / `chunk3` / `chunk4` (legacy) | legacy E4B HF (`mlboydaisuke/gemma-4-E4B-coreml`) |
+| Vision encoder | `vision.ane.mlmodelc` (output `[1, 256, 2560]`) | `convert_gemma4_multimodal.py --vision-ane --model-path <E4B HF>` |
+| Audio encoder | `audio.mlmodelc` (output `[1, 50, 1024]`) | `convert_audio.py --model-path <E4B HF>` |
+| Audio projection | `output_proj_*.npy` (1024→1536) + `embed_proj_weight.npy` (1536→2560) | from `convert_audio.py` |
+| Text sidecars | `embed_tokens_*`, RoPE tables, `model_config.json`, `hf_model/` | legacy E4B HF |
+
+`AudioProcessor.swift` `projectHiddenStates` runs the two-stage projection in Swift/Accelerate. `embed_proj` is now non-square aware (E4B `(2560, 1536)` vs E2B `(1536, 1536)`).
+
+---
+
+## What was tried and rejected
+
+### `prefill_chunk{1..4}.mlmodelc` separate-file multifunction (T=64/128/256/512)
+
+Built via `build_prefill_multifunction.py` (the production E2B `gemma4e2b3way` path).
+
+- **Mac**: works fine, 16.5 tok/s text + correct multimodal.
+- **iPhone**: text and image/audio prompts both produce degenerate output (e.g. `こんにちは` → `こんにちは。\n(同じトーンで)\nこんにちは。`).
+- Likely cause: int4 quantization noise on iPhone ANE 18 + E4B-specific graph (HKV=2, 21 merged layers in `chunk2_3way`) tips greedy argmax into a degenerate loop. E2B ships the same multifunction layout and works on iPhone.
+- Engine code is unchanged; the bundle ships **without `prefill_chunk*`** so the umbrella engine falls back to legacy `prefill_b8` multifunction. Vision-aware bidirectional mask within the image span still functions through `fillBatchMasksVisionAware` in `ChunkedEngine.swift`.
+
+### Stateful (MLState) E4B multimodal
+
+Engine class `Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift` builds and runs on Mac. iPhone ANE 18 fails to compile `chunk_2` with `std::bad_cast` in MIL→EIR translation when the producer layer's `kv13_k`/`kv14_k` alias slice is exposed as a chunk output. `.clone()` in PyTorch and 4-chunk decode split (each chunk smaller) both produce the same compile failure. Stateful path remains Mac-only / dev-only.
+
+### iPhone bundle pushes
+
+`xcrun devicectl device copy to` does **not** delete files that aren't in the source. Switching bundle layouts (e.g. multimodal → baseline) requires deleting and re-installing the app to clear the data container — otherwise orphan files (a leftover `prefill_chunk1.mlmodelc` is enough) silently override the new bundle's behaviour.
+
+---
+
+## Build steps
+
+Run on Mac with a working `coremltools 9.0` venv. macOS 26 needs the source-built wheel (see `docs/MACOS_26_BUILD_ENV.md`).
+
+```bash
+PY=/tmp/ct_build_venv/bin/python
+HF_DIR=/path/to/gemma4-e4b/hf_model     # local clone of google/gemma-4-E4B-it
+ROOT=$(pwd)
+
+# 1. 3-chunk decode (Topology II merged middle chunk).
+mkdir -p /tmp/gemma4-e4b-3way
+$PY conversion/build_gemma4_3way.py \
+    --model gemma4-e4b --hf-dir "$HF_DIR" \
+    --output /tmp/gemma4-e4b-3way --ctx 2048
+
+# 2. Vision encoder (ANE-targeted, square 48×48 grid → 256 soft tokens at LM hidden 2560).
+mkdir -p /tmp/gemma4-e4b-vision-ane
+$PY conversion/convert_gemma4_multimodal.py \
+    --model-path "$HF_DIR" \
+    --output /tmp/gemma4-e4b-vision-ane \
+    --quantize int4 \
+    --vision-ane
+
+# 3. Audio encoder (Conformer + Swift projection sidecars).
+mkdir -p /tmp/gemma4-e4b-audio
+$PY conversion/convert_audio.py \
+    --model-path "$HF_DIR" \
+    --output /tmp/gemma4-e4b-audio \
+    --quantize int4
+
+# 4. Assemble bundle (compiles mlpackage→mlmodelc, copies sidecars + legacy chunks).
+LEGACY=/path/to/gemma4-e4b-coreml-bundle bash scripts/assemble_gemma4_e4b_multimodal.sh
+# → build/gemma4-e4b-multimodal/   (~7.6 GB)
+```
+
+The assembler script accepts env vars `THREEWAY` / `VISION_ANE` / `AUDIO` / `LEGACY` / `MEL_FALLBACK` / `OUT` to override defaults. See the script header for the full layout description.
+
+---
+
+## Sideload to iPhone
+
+```bash
+DEVICE=$(xcrun devicectl list devices | awk '/iPhone 17 Pro/{print $3}')
+
+# 1. Delete the CoreMLLLMChat app on iPhone (long-press its icon → "Remove App"
+#    → "Delete App"). devicectl doesn't remove orphan files; switching from a
+#    previous bundle without a clean sandbox WILL produce broken output.
+# 2. In Xcode, Cmd+R to reinstall a fresh app. Launch once to create the
+#    Documents container.
+# 3. Force-quit the app (swipe up in app switcher) so devicectl can write.
+
+xcrun devicectl device copy to --device "$DEVICE" \
+    --domain-type appDataContainer \
+    --domain-identifier com.example.CoreMLLLMChat \
+    --source build/gemma4-e4b-multimodal \
+    --destination Documents/Models/gemma4-e4b
+
+# 4. Xcode scheme env vars:
+#      LLM_SHOW_EXPERIMENTAL=1  (already required for some pickers)
+#      LLM_VISION_FORCE_ANE=1   (route vision.ane.mlmodelc through ANE)
+# 5. Cmd+R, pick "Gemma 4 E4B" in the picker, test.
+```
+
+---
+
+## Verified iPhone 17 Pro results (2026-05-03)
+
+| Modality | Result |
+|---|---|
+| Text-only | 15.7 tok/s, baseline-quality response (matches Mac) |
+| Image + text | Coherent description, no gibberish |
+| Video + text | Coherent description |
+| Audio + text | Correct response (after `AudioProcessor` `embed_proj` non-square fix) |
+
+---
+
+## Files of interest
+
+| File | Role |
+|---|---|
+| `Sources/CoreMLLLM/AudioProcessor.swift` | Two-stage Swift projection. `ProjectionWeights` now derives `inDim` / `outDim` / `finalDim` from weight tensor sizes; embed_proj sgemm uses `finalDim` (E4B 2560) instead of hard-coded `outDim`. |
+| `conversion/models/gemma4_swa_merged.py` | `MergedChunk23` accepts `own_range` / `shared_range`; defaults E2B (L8-14 / L15-24); E4B passes (12,24)/(24,33). |
+| `conversion/build_gemma4_3way.py` | Threads `compute_chunk_boundaries(cfg)` into the merged chunk so `--model gemma4-e4b` produces correct 3-way decode. |
+| `scripts/assemble_gemma4_e4b_multimodal.sh` | Reproducible bundle assembly. |
+| `Sources/CoreMLLLM/ChunkedEngine.swift` | Auto-detects Topology II via `chunk2_3way` + `chunk3_3way` presence. Routes prefill via `prefill_b8` multifunction in legacy chunks 1-4 when `prefill_chunk1` is absent (our case). |
+
+---
+
+## What's NOT in this bundle (intentional)
+
+- **`prefill_chunk{1..4}.mlmodelc` (multifunction T=64/128/256/512)**: see "What was tried and rejected" above.
+- **`vision.mlmodelc` (GPU variant, output `[1, 280, hidden]`)**: not built for E4B. We ship `vision.ane.mlmodelc` only and rely on `LLM_VISION_FORCE_ANE=1`.
+- **`vision_video.mlmodelc`**: video runs through still-image vision with 2×2 pooling fallback in the engine. Adequate quality on validation.
+- **Stateful chunks**: `Gemma4StatefulMultimodalEngine` is Mac-only / dev-only.
diff --git a/scripts/assemble_gemma4_e4b_multimodal.sh b/scripts/assemble_gemma4_e4b_multimodal.sh
new file mode 100755
index 0000000..93abe54
--- /dev/null
+++ b/scripts/assemble_gemma4_e4b_multimodal.sh
@@ -0,0 +1,176 @@
+#!/bin/bash
+# Assemble the Gemma 4 E4B multimodal CoreML bundle for iPhone sideload
+# (or HF upload). Working configuration validated 2026-05-03 on iPhone 17
+# Pro: text 15.7 tok/s + image / video / audio all functional.
+#
+# Layout produced:
+#
+#   build/gemma4-e4b-multimodal/
+#     chunk1.mlmodelc          # legacy 4-chunk (also has prefill_b8 multifunction)
+#     chunk2.mlmodelc          # legacy chunk2 — used as prefill_b8 only
+#     chunk3.mlmodelc          # legacy chunk3 — used as prefill_b8 only
+#     chunk4.mlmodelc          # legacy chunk4 — used as prefill_b8 only
+#     chunk2_3way.mlmodelc     # Topology II decode (own L12-23 + shared L24-32 merged)
+#     chunk3_3way.mlmodelc     # Topology II decode (shared L33-41 + lm_head)
+#     vision.ane.mlmodelc      # E4B SigLIP encoder (output [1, 256, 2560])
+#     audio.mlmodelc           # E4B Conformer encoder (output [1, 50, 1024])
+#     audio_config.json
+#     mel_filterbank.bin
+#     output_proj_weight.npy   # Audio projection 1024 → 1536
+#     output_proj_bias.npy
+#     embed_proj_weight.npy    # Audio projection 1536 → 2560 (E4B-specific shape)
+#     embed_tokens_q8.bin
+#     embed_tokens_scales.bin
+#     embed_tokens_per_layer_q8.bin
+#     embed_tokens_per_layer_scales.bin
+#     per_layer_projection.bin
+#     per_layer_norm_weight.bin
+#     cos_sliding.npy / sin_sliding.npy / cos_full.npy / sin_full.npy
+#     hf_model/                (tokenizer)
+#     model_config.json
+#
+# Total bundle size: ~7.6 GB.
+#
+# Engine routing (CoreMLLLM umbrella in Sources/CoreMLLLM/):
+#   - decode = Topology II (chunk1 + chunk2_3way + chunk3_3way) — auto-detected
+#     when chunk2_3way/chunk3_3way are present.
+#   - prefill = legacy chunks 1/2/3/4 prefill_b8 multifunction. The newer
+#     prefill_chunk{1..4}.mlmodelc separate-file path is INTENTIONALLY
+#     omitted: it produces broken outputs on iPhone ANE 18 with E4B
+#     (likely int4 quantization noise). Mac decodes E4B prefill_chunk*
+#     fine — iPhone-specific issue.
+#   - vision = vision.ane.mlmodelc when LLM_VISION_FORCE_ANE=1, else GPU
+#     fallback. Built E4B-specific (output dim 2560 matches LM hidden).
+#   - audio = audio.mlmodelc + Swift-side projection (AudioProcessor.swift,
+#     embed_proj is non-square 1536 → 2560 for E4B).
+#
+# Usage:
+#   bash scripts/assemble_gemma4_e4b_multimodal.sh
+#
+# Required input directories (override via env if non-default):
+#   THREEWAY=/tmp/gemma4-e4b-3way               (build_gemma4_3way.py --model gemma4-e4b output)
+#   VISION_ANE=/tmp/gemma4-e4b-vision-ane       (convert_gemma4_multimodal.py --vision-ane on E4B HF)
+#   AUDIO=/tmp/gemma4-e4b-audio                 (convert_audio.py on E4B HF)
+#   LEGACY=.../output/gemma4-e4b/bundle         (legacy 4-chunk text-only bundle, e.g. from HF
+#                                                mlboydaisuke/gemma-4-E4B-coreml)
+#   MEL_FALLBACK=.../conversion/output/audio    (mel_filterbank.bin source if missing from AUDIO)
+#
+# Push to iPhone (clean sandbox required — see docs/E4B_MULTIMODAL_BUILD.md):
+#   xcrun devicectl device copy to --device <ID> \
+#       --domain-type appDataContainer \
+#       --domain-identifier com.example.CoreMLLLMChat \
+#       --source build/gemma4-e4b-multimodal \
+#       --destination Documents/Models/gemma4-e4b
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"  # repo root (this script lives in scripts/)
+LEGACY="${LEGACY:-$HOME/Downloads/CoreML-LLM/output/gemma4-e4b/bundle}"  # $HOME: safe under set -u even when USER is unset
+THREEWAY="${THREEWAY:-/tmp/gemma4-e4b-3way}"
+VISION_ANE="${VISION_ANE:-/tmp/gemma4-e4b-vision-ane}"
+AUDIO="${AUDIO:-/tmp/gemma4-e4b-audio}"
+MEL_FALLBACK="${MEL_FALLBACK:-$HOME/Downloads/CoreML-LLM/conversion/output/audio}"
+OUT="${OUT:-$ROOT/build/gemma4-e4b-multimodal}"
+
+# Sanity
+for d in "$LEGACY" "$THREEWAY" "$VISION_ANE" "$AUDIO"; do
+    if [[ ! -d "$d" ]]; then
+        echo "[error] missing input dir: $d" >&2
+        exit 1
+    fi
+done
+
+mkdir -p "$OUT"
+rm -rf "$OUT"/*
+
+echo "[$(date)] === decode (chunk1 legacy + chunk{2,3}_3way Topology II) ==="
+cp -R "$LEGACY/chunk1.mlmodelc" "$OUT/"
+for c in chunk2_3way chunk3_3way; do  # prefer a precompiled .mlmodelc, else compile the .mlpackage
+    if [[ -d "$THREEWAY/$c.mlmodelc" ]]; then
+        cp -R "$THREEWAY/$c.mlmodelc" "$OUT/"
+    elif [[ -d "$THREEWAY/$c.mlpackage" ]]; then
+        echo "  compile $c"
+        xcrun coremlcompiler compile "$THREEWAY/$c.mlpackage" "$OUT/" 2>&1 | tail -1
+    else
+        echo "[error] $c.{mlpackage,mlmodelc} missing in $THREEWAY" >&2
+        exit 1
+    fi
+done
+
+echo ""
+echo "[$(date)] === legacy chunks 2/3/4 (prefill_b8 multifunction) ==="
+for c in chunk2 chunk3 chunk4; do
+    cp -R "$LEGACY/$c.mlmodelc" "$OUT/"
+done
+
+echo ""
+echo "[$(date)] === text sidecars ==="
+SIDE_TEXT=(
+    embed_tokens_q8.bin embed_tokens_scales.bin
+    embed_tokens_per_layer_q8.bin embed_tokens_per_layer_scales.bin
+    per_layer_projection.bin per_layer_norm_weight.bin
+    cos_sliding.npy sin_sliding.npy cos_full.npy sin_full.npy
+    hf_model model_config.json
+)
+for f in "${SIDE_TEXT[@]}"; do
+    if [[ -e "$LEGACY/$f" ]]; then
+        cp -R "$LEGACY/$f" "$OUT/"
+    else
+        echo "  [warn] missing $f"
+    fi
+done
+
+echo ""
+echo "[$(date)] === E4B encoders + audio sidecars ==="
+# Vision (E4B-specific, output dim 2560 matches LM hidden)
+if [[ -d "$VISION_ANE/vision.ane.mlmodelc" ]]; then
+    cp -R "$VISION_ANE/vision.ane.mlmodelc" "$OUT/"
+elif [[ -d "$VISION_ANE/vision.ane.mlpackage" ]]; then
+    xcrun coremlcompiler compile "$VISION_ANE/vision.ane.mlpackage" "$OUT/" 2>&1 | tail -1
+else
+    echo "[error] vision.ane.{mlpackage,mlmodelc} missing in $VISION_ANE" >&2
+    exit 1
+fi
+# Audio (E4B-specific, output [1, 50, 1024])
+if [[ -d "$AUDIO/audio.mlmodelc" ]]; then
+    cp -R "$AUDIO/audio.mlmodelc" "$OUT/"
+elif [[ -d "$AUDIO/audio.mlpackage" ]]; then
+    xcrun coremlcompiler compile "$AUDIO/audio.mlpackage" "$OUT/" 2>&1 | tail -1
+else
+    echo "[error] audio.{mlpackage,mlmodelc} missing in $AUDIO" >&2
+    exit 1
+fi
+# Audio sidecars
+SIDE_AUDIO=(
+    audio_config.json
+    output_proj_weight.npy output_proj_bias.npy embed_proj_weight.npy
+)
+for f in "${SIDE_AUDIO[@]}"; do
+    if [[ -e "$AUDIO/$f" ]]; then
+        cp "$AUDIO/$f" "$OUT/"
+    else
+        echo "  [warn] missing $f"
+    fi
+done
+# mel_filterbank.bin (often shipped from a sibling audio build dir)
+if [[ -e "$AUDIO/mel_filterbank.bin" ]]; then
+    cp "$AUDIO/mel_filterbank.bin" "$OUT/"
+elif [[ -e "$MEL_FALLBACK/mel_filterbank.bin" ]]; then
+    cp "$MEL_FALLBACK/mel_filterbank.bin" "$OUT/"
+else
+    echo "  [warn] missing mel_filterbank.bin (audio path will fail at runtime)"
+fi
+
+echo ""
+echo "=== assembled ==="
+du -sh "$OUT"
+ls "$OUT"
+echo ""
+echo "Push to iPhone (CLEAN sandbox — delete + reinstall app first; devicectl"
+echo "doesn't remove orphan files from previous bundles):"
+echo "  xcrun devicectl device copy to --device <ID> \\"
+echo "      --domain-type appDataContainer \\"
+echo "      --domain-identifier com.example.CoreMLLLMChat \\"
+echo "      --source $OUT \\"
+echo "      --destination Documents/Models/gemma4-e4b"
+echo ""
+echo "Scheme env vars: LLM_VISION_FORCE_ANE=1 (route vision.ane via ANE)."

From fc52b8b41bb46052f6f86c4b3ae0ef7aeb6919c8 Mon Sep 17 00:00:00 2001
From: john-rocky <samuraibrothersmail@gmail.com>
Date: Sun, 3 May 2026 10:40:23 +0900
Subject: [PATCH 6/7] =?UTF-8?q?research(gemma4-stateful-mm):=20Stage=208?=
 =?UTF-8?q?=20multimodal=20stateful=20engine=20=E2=80=94=20Mac=20dev=20/?=
 =?UTF-8?q?=20iPhone=20blocked?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stage 8 follow-up to the stateful Linear shipment. Adds a parallel
engine that drives Gemma 4 stateful (3-chunk merged + Linear) with
T=288 single-function prefill chunks + the Stage 6 vision/audio
splice. The engine class works end-to-end on Mac (text decode
16.5 tok/s; assembled bundle drives image + audio splice through the
T=288 batched prefill with bidirectional within-image mask).

iPhone status: BLOCKED. Multiple converter paths attempted, all hit
the same iPhone ANE 18 MIL->EIR translation failure on chunk_2 (the
merged 21-layer middle chunk):
  - 3-chunk merged stateful with kv13/kv14 alias output: std::bad_cast
  - .clone() patch on the alias output assignment: same error
  - 4-chunk decode split (chunk_2_own + chunk_2_shared): same error,
    confirming the alias-slice-over-MLState pattern is the root cause
    rather than graph size.
The non-stateful 3-way merged chunk2_3way (same 21 layers, but K/V
flow as plain tensor inputs/outputs — no MLState alias) compiles and
runs on iPhone ANE 18 at 15.7 tok/s, confirming the diagnosis.

Code keeps the stateful path for Mac development and future revisits
(stateful + multifunction T=288 might unlock once iPhone ANE picks up
multifunction T>1 + dual MLState; not on iOS 18).

Files:
- Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift (NEW)
    ~1,200-line dimension-agnostic stateful engine. 3-chunk merged
    decode + 4-state MLState (decode/prefill x s1/s2) + bridgeKVState
    via withMultiArray nested closures + ported Stage 6 multimodal
    helpers (vision/video/audio splice + vision-aware bidir mask +
    cross-turn LCP-resume). Padding-replicate scheme keeps
    auto-emitted token at row T-1 valid even when validCount < T.
- Sources/CoreMLLLM/ModelDownloader.swift
    gemma4e2bStatefulMultimodal + gemma4e4bStatefulMultimodal
    ModelInfo entries (sideload-only, exposed under
    LLM_SHOW_EXPERIMENTAL=1).
- Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
    Detection (prefill_T288/ subdir presence) routes to the new
    engine; load + generate + image/audio caching mirror the existing
    gemma4Stateful pattern.
- Sources/gemma4mm-smoke/main.swift (NEW)
    Mac CLI smoke test for the stateful multimodal engine.
- Package.swift: gemma4mm-smoke executable target.
- scripts/assemble_gemma4_stateful_multimodal.sh (NEW)
    Reproducible bundle assembly (decode 3-chunk + prefill_T288/
    subdir + multimodal encoders).
- conversion/build_gemma4_stateful_singlefunc_prefill.py
    Adds --four-chunk variant (used during the chunk_2 split probe).
- conversion/models/gemma4_swa_stateful_chunks.py
    .clone() on the kv13/kv14 producer alias output (decode + prefill
    T=N variants). Materialises the slice over MLState into a fresh
    tensor; ineffective vs the iPhone ANE bug but not regressive.
---
 .../CoreMLLLMChat/LLMRunner.swift             |  218 +++
 Package.swift                                 |   12 +
 .../Gemma4StatefulMultimodalEngine.swift      | 1213 +++++++++++++++++
 Sources/CoreMLLLM/ModelDownloader.swift       |   30 +
 Sources/gemma4mm-smoke/main.swift             |   92 ++
 ...uild_gemma4_stateful_singlefunc_prefill.py |  119 +-
 .../models/gemma4_swa_stateful_chunks.py      |   39 +-
 .../assemble_gemma4_stateful_multimodal.sh    |  211 +++
 8 files changed, 1888 insertions(+), 46 deletions(-)
 create mode 100644 Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift
 create mode 100644 Sources/gemma4mm-smoke/main.swift
 create mode 100755 scripts/assemble_gemma4_stateful_multimodal.sh

diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
index 30f43bd..aa03fc0 100644
--- a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
+++ b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
@@ -59,6 +59,21 @@ final class LLMRunner {
     private var gemma4StatefulEngine: Gemma4StatefulEngine?
     private var gemma4StatefulTokenizer: (any Tokenizer)?
 
+    // Gemma 4 stateful + multimodal path (Stage 8): same 3-chunk merged
+    // Linear decode as the text-only stateful entry, plus a separate
+    // T=288 single-function prefill set under `prefill_T288/` and the
+    // vision/video/audio encoders. Selected when the bundle has both
+    // `chunk_{1..3}` and a `prefill_T288/` subdir alongside.
+    private var gemma4StatefulMultimodalEngine: Gemma4StatefulMultimodalEngine?
+    private var gemma4StatefulMultimodalTokenizer: (any Tokenizer)?
+    /// Cache the last image/audio features so a same-attachment follow-up
+    /// turn skips encoder cost (mirrors the legacy gemma4 multimodal path).
+    private var cachedGemma4MMImage: CGImage?
+    private var cachedGemma4MMImageFeatures: MLMultiArray?
+    private var cachedGemma4MMAudioSig: [Float]?
+    private var cachedGemma4MMAudioFeatures: MLMultiArray?
+    private var cachedGemma4MMAudioTokens: Int = 0
+
     // Qwen3-VL 2B path: separate generator + tokenizer, selected when
     // the downloaded folder contains `qwen3_vl_2b_decode_chunks/`.
     // Plain GQA architecture (not the Qwen3.5 hybrid SSM), so it gets
@@ -91,6 +106,7 @@ final class LLMRunner {
             || qwen3vl2bGenerator != nil
             || qwen3vl2bStatefulGenerator != nil
             || gemma4StatefulEngine != nil
+            || gemma4StatefulMultimodalEngine != nil
         {
             llm = nil
             qwen35Generator = nil
@@ -102,6 +118,13 @@ final class LLMRunner {
             qwen3vl2bVisionEncoder = nil
             gemma4StatefulEngine = nil
             gemma4StatefulTokenizer = nil
+            gemma4StatefulMultimodalEngine = nil
+            gemma4StatefulMultimodalTokenizer = nil
+            cachedGemma4MMImage = nil
+            cachedGemma4MMImageFeatures = nil
+            cachedGemma4MMAudioSig = nil
+            cachedGemma4MMAudioFeatures = nil
+            cachedGemma4MMAudioTokens = 0
             cachedVisionImage = nil
             cachedVisionFeatures = nil
             isLoaded = false
@@ -214,7 +237,26 @@ final class LLMRunner {
         let gemma4StatefulPresent = fm.fileExists(atPath:
             gemma4StatefulDir.appendingPathComponent("embed_tokens_q8.bin").path)
             && (hasChunks || has1Chunk)
+        // Stage 8 multimodal-stateful detection: same 3-chunk decode
+        // bundle plus a `prefill_T288/` subdir holding all three single-
+        // function prefill chunks (.mlpackage or .mlmodelc). Encoder
+        // presence is not checked here. Route to
+        // Gemma4StatefulMultimodalEngine when present — falls through to
+        // the text-only stateful path when only decode chunks are installed.
         if gemma4StatefulPresent {
+            let prefillT288Dir = gemma4StatefulDir.appendingPathComponent("prefill_T288")
+            let hasPrefillT288 = ["chunk_1_prefill_T288",
+                                  "chunk_2_3way_prefill_T288",
+                                  "chunk_3_prefill_T288"].allSatisfy { name in
+                fm.fileExists(atPath:
+                    prefillT288Dir.appendingPathComponent("\(name).mlpackage").path)
+                || fm.fileExists(atPath:
+                    prefillT288Dir.appendingPathComponent("\(name).mlmodelc").path)
+            }
+            if hasPrefillT288 {
+                try await loadGemma4StatefulMultimodal(folder: gemma4StatefulDir)
+                return
+            }
             try await loadGemma4Stateful(folder: gemma4StatefulDir)
             return
         }
@@ -299,6 +341,10 @@ final class LLMRunner {
             return try await generateQwen3VL2BStateful(
                 messages: messages, image: image)
         }
+        if gemma4StatefulMultimodalEngine != nil {
+            return try await generateGemma4StatefulMultimodal(
+                messages: messages, image: image, audio: audio)
+        }
         if gemma4StatefulEngine != nil {
             return try await generateGemma4Stateful(messages: messages)
         }
@@ -1151,6 +1197,178 @@ final class LLMRunner {
         }
     }
 
+    // MARK: - Gemma 4 stateful + multimodal (Stage 8)
+
+    private func loadGemma4StatefulMultimodal(folder: URL) async throws {
+        loadingStatus = "Loading Gemma 4 multimodal tokenizer..."
+        let hfDir = folder.appendingPathComponent("hf_model")
+        let tok = try await AutoTokenizer.from(modelFolder: hfDir)
+        loadingStatus = "Compiling Gemma 4 stateful multimodal chunks (first run only)..."
+        let engine = Gemma4StatefulMultimodalEngine()
+        try await engine.load(modelDirectory: folder)
+        gemma4StatefulMultimodalEngine = engine
+        gemma4StatefulMultimodalTokenizer = tok
+
+        let parent = folder.deletingLastPathComponent().lastPathComponent
+        let isE4B = parent.lowercased().contains("e4b")
+        modelName = isE4B
+            ? "Gemma 4 E4B (stateful, multimodal)"
+            : "Gemma 4 E2B (stateful, multimodal)"
+        hasVision = engine.hasVision
+        hasAudio = engine.hasAudio
+        isLoaded = true
+        loadingStatus = "Ready"
+        print("[LLMRunner] Gemma 4 stateful multimodal loaded — \(modelName) " +
+              "vision=\(hasVision) video=\(engine.hasVideoVision) audio=\(hasAudio)")
+    }
+
+    private func generateGemma4StatefulMultimodal(messages: [ChatMessage],
+                                                    image: CGImage?,
+                                                    audio: [Float]?
+    ) async throws -> AsyncStream<String> {
+        guard let engine = gemma4StatefulMultimodalEngine,
+              let tok = gemma4StatefulMultimodalTokenizer
+        else {
+            throw NSError(domain: "LLMRunner", code: 42,
+                userInfo: [NSLocalizedDescriptionKey:
+                    "Gemma 4 stateful multimodal not loaded"])
+        }
+        isGenerating = true
+        tokensPerSecond = 0
+
+        // Encode image once per distinct attachment. Cache hit (same
+        // CGImage instance) skips the ~30 s vision graph + lets the
+        // engine's cross-turn KV reuse hit the LCP fast path.
+        var imageFeatures: MLMultiArray? = nil
+        var imageNumTokens = 0
+        var imageChanged = false
+        if let img = image {
+            if cachedGemma4MMImage === img, let f = cachedGemma4MMImageFeatures {
+                imageFeatures = f
+                imageNumTokens = 256
+            } else {
+                imageFeatures = try engine.processImage(img)
+                imageNumTokens = 256
+                cachedGemma4MMImage = img
+                cachedGemma4MMImageFeatures = imageFeatures
+                imageChanged = true
+            }
+        } else if cachedGemma4MMImage != nil {
+            cachedGemma4MMImage = nil
+            cachedGemma4MMImageFeatures = nil
+            imageChanged = true
+        }
+
+        var audioFeatures: MLMultiArray? = nil
+        var audioNumTokens = 0
+        var audioChanged = false
+        if let pcm = audio {
+            // Cheap fingerprint: [count, first, last]. Re-encode on
+            // any mismatch.
+            let sig: [Float] = pcm.isEmpty
+                ? [0, 0, 0]
+                : [Float(pcm.count), pcm.first ?? 0, pcm.last ?? 0]
+            let sigMatches = (cachedGemma4MMAudioSig == sig)
+            if sigMatches, let f = cachedGemma4MMAudioFeatures {
+                audioFeatures = f
+                audioNumTokens = cachedGemma4MMAudioTokens
+            } else {
+                let (feat, n) = try engine.processAudio(pcm)
+                audioFeatures = feat
+                audioNumTokens = n
+                cachedGemma4MMAudioSig = sig
+                cachedGemma4MMAudioFeatures = feat
+                cachedGemma4MMAudioTokens = n
+                audioChanged = true
+            }
+        } else if cachedGemma4MMAudioFeatures != nil {
+            cachedGemma4MMAudioSig = nil
+            cachedGemma4MMAudioFeatures = nil
+            cachedGemma4MMAudioTokens = 0
+            audioChanged = true
+        }
+
+        // Attachment changed → drop persisted KV so the LCP match
+        // doesn't reuse stale image/audio rows from a prior turn.
+        if imageChanged || audioChanged { engine.resetPersistedState() }
+
+        // Build the Gemma 4 prompt. Image / audio blocks are pinned to
+        // the LAST user turn so cross-turn resume keeps the pad span at
+        // a fixed offset (same trick as the legacy gemma4 path).
+        let imageBlock = "<|image>"
+            + String(repeating: "<|image|>", count: 256)
+            + "<image|>"
+        let audioBlock = "<|audio>"
+            + String(repeating: "<|audio|>", count: audioNumTokens)
+            + "<audio|>"
+        let lastUserIdx = messages.lastIndex { $0.role == .user }
+        var prompt = "<bos>"
+        for (i, m) in messages.enumerated() {
+            switch m.role {
+            case .user:
+                let isLast = i == lastUserIdx
+                var mediaPrefix = ""
+                if imageFeatures != nil && isLast { mediaPrefix += imageBlock + "\n" }
+                if audioFeatures != nil && isLast && audioNumTokens > 0 {
+                    mediaPrefix += audioBlock + "\n"
+                }
+                prompt += "<|turn>user\n\(mediaPrefix)\(m.content)<turn|>\n"
+            case .assistant:
+                prompt += "<|turn>model\n\(m.content)<turn|>\n"
+            case .system:
+                continue
+            }
+        }
+        prompt += "<|turn>model\n"
+        let inputIds = tok.encode(text: prompt).map { Int32($0) }
+
+        var eosSet: Set<Int32> = [1, 106]
+        if let eid = tok.eosTokenId { eosSet.insert(Int32(eid)) }
+        let skipSet: Set<Int32> = [1, 105, 106]
+
+        let genStart = Date()
+        return AsyncStream { continuation in
+            Task { [weak self] in
+                defer { Task { @MainActor in self?.isGenerating = false } }
+                var accum: [Int] = []
+                var emittedString = ""
+                var totalEmitted = 0
+                do {
+                    _ = try await engine.generate(
+                        inputIds: inputIds,
+                        imageFeatures: imageFeatures,
+                        imageNumTokens: imageNumTokens,
+                        audioFeatures: audioFeatures,
+                        audioNumTokens: audioNumTokens,
+                        maxNewTokens: 256,
+                        eosTokenIds: eosSet,
+                        onToken: { tokenId in
+                            if skipSet.contains(tokenId) { return }
+                            accum.append(Int(tokenId))
+                            let current = tok.decode(tokens: accum)
+                            if current.count > emittedString.count {
+                                let delta = String(
+                                    current.suffix(current.count - emittedString.count))
+                                continuation.yield(delta)
+                                emittedString = current
+                            }
+                            totalEmitted += 1
+                        })
+                    let dt = Date().timeIntervalSince(genStart)
+                    if dt > 0 {
+                        let tps = Double(totalEmitted) / dt
+                        Task { @MainActor in
+                            self?.tokensPerSecond = tps
+                        }
+                    }
+                } catch {
+                    continuation.yield("[Error: \(error.localizedDescription)]")
+                }
+                continuation.finish()
+            }
+        }
+    }
+
     /// Build the token ID sequence for a vision-augmented Qwen3-VL 2B
     /// prompt. Emits the same prefix the HF processor would produce for
     /// `[{role:"user", content:[{type:"image"},{type:"text", text:...}]}]`
diff --git a/Package.swift b/Package.swift
index 50df101..9ec6f0c 100644
--- a/Package.swift
+++ b/Package.swift
@@ -16,6 +16,7 @@ let package = Package(
         .executable(name: "determinism-oracle", targets: ["DeterminismOracle"]),
         .executable(name: "verify-k8-probe", targets: ["VerifyK8Probe"]),
         .executable(name: "ane-residency-gate", targets: ["AneResidencyGate"]),
+        .executable(name: "gemma4mm-smoke", targets: ["Gemma4MMSmoke"]),
         // Standalone samples for the two Gemma-3-based models. These live in
         // the same package on purpose — a LocalAIKit-style wrapper can depend
         // on the `CoreMLLLM` library and use `FunctionGemma` / `EmbeddingGemma`
@@ -91,6 +92,17 @@ let package = Package(
             path: "Sources/verify-k8-probe",
             swiftSettings: [.swiftLanguageMode(.v5)]
         ),
+        // Mac smoke test for Gemma4StatefulMultimodalEngine — text-only
+        // generate to catch engine bugs without an iPhone trip.
+        .executableTarget(
+            name: "Gemma4MMSmoke",
+            dependencies: [
+                "CoreMLLLM",
+                .product(name: "Tokenizers", package: "swift-transformers"),
+            ],
+            path: "Sources/gemma4mm-smoke",
+            swiftSettings: [.swiftLanguageMode(.v5)]
+        ),
         // FunctionGemma-270M standalone CLI. Does NOT combine with Gemma 4 —
         // multi-model orchestration belongs in the LocalAIKit wrapper.
         .executableTarget(
diff --git a/Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift b/Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift
new file mode 100644
index 0000000..aac4c51
--- /dev/null
+++ b/Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift
@@ -0,0 +1,1213 @@
+// Gemma4StatefulMultimodalEngine — Stage 8 runtime that pairs the
+// 3-chunk merged stateful Linear decode path with single-function
+// T=288 prefill chunks and the Stage 6 multimodal feature splice.
+//
+// Sibling of (not subclass of) Gemma4StatefulEngine. The legacy class
+// keeps shipping the multifunction prefill_b8 path bit-identical for
+// users who don't want multimodal; this class layers on:
+//
+//   - 3 separate single-function prefill mlpackages (T=288):
+//       prefill_T288/chunk_1_prefill_T288.mlmodelc
+//       prefill_T288/chunk_2_3way_prefill_T288.mlmodelc
+//       prefill_T288/chunk_3_prefill_T288.mlmodelc
+//     iPhone ANE 18 rejects multifunction T>1 + dual MLState. Probe
+//     2 verified single-function T=288 compiles in 7.3 s on A19 Pro.
+//
+//   - 4 MLStates (decode_s1/s2 + prefill_s1/s2). After each prefill
+//     pass we memcpy kv_cache_sliding + kv_cache_full from prefill
+//     state into decode state via the NS_REFINED_FOR_SWIFT
+//     `withMultiArray(for:handler:)` bridge.
+//
+//   - vision (256/image), video (64/frame), audio (~188/2 sec)
+//     encoder splice at IMAGE/VIDEO/AUDIO_TOKEN_ID positions.
+//     Vision-aware mask preserves bidirectional within-image
+//     attention during prefill.
+//
+// Both Gemma 4 E2B (35 layers) and E4B (42 layers) drive through this
+// engine — chunk topology comes from the loaded mlpackages, layer
+// counts come from model_config.json. The engine itself is
+// dimension-agnostic.
+
+import Accelerate
+import CoreGraphics
+import CoreML
+import Foundation
+
+@available(iOS 18.0, macOS 15.0, *)
+public final class Gemma4StatefulMultimodalEngine {
+    // MARK: - Public surface
+
+    /// Construction-time options for the engine.
+    public struct Config {
+        /// Compute units requested for the LLM chunks; defaults to CPU + ANE.
+        public let computeUnits: MLComputeUnits
+        public init(computeUnits: MLComputeUnits = .cpuAndNeuralEngine) { self.computeUnits = computeUnits }
+    }
+
+    public private(set) var modelConfig: ModelConfig?
+    public var lastDecodeTokensPerSecond: Double = 0
+
+    // MARK: - Storage
+
+    private let cfg: Config
+    private var modelDir: URL?
+
+    // Decode chunks. Two layouts auto-detected at load():
+    //  - 3-chunk merged: chunk_1 (own) + chunk_2 (own + KV-shared
+    //    internal) + chunk_3 (KV-shared + lm_head). chunk_3 emits
+    //    token_id. Used for E2B (chunk_2 fits ANE budget).
+    //  - 4-chunk split: chunk_1 (own) + chunk_2 (own only) + chunk_3
+    //    (KV-shared, no lm_head) + chunk_4 (KV-shared + lm_head).
+    //    chunk_4 emits token_id. Used for E4B because the merged
+    //    21-layer chunk_2 trips iPhone ANE 18 MIL→EIR translation
+    //    (`std::bad_cast`); splitting keeps each subgraph small enough.
+    private var decodeChunk1: MLModel?
+    private var decodeChunk2: MLModel?
+    private var decodeChunk3: MLModel?
+    private var decodeChunk4: MLModel?
+    private var is4Chunk: Bool = false
+
+    // T=288 single-function prefill chunks (separate mlpackages).
+    private static let kPrefillT: Int = 288
+    private var prefillChunk1: MLModel?
+    private var prefillChunk2: MLModel?
+    private var prefillChunk3: MLModel?
+    private var prefillChunk4: MLModel?
+
+    /// True when every T=288 prefill chunk the detected layout needs is loaded.
+    public var hasT288Prefill: Bool {
+        let core = prefillChunk1 != nil && prefillChunk2 != nil
+            && prefillChunk3 != nil
+        return is4Chunk ? (core && prefillChunk4 != nil) : core
+    }
+
+    // Per-chunk MLStates. Decode + prefill paths each have their own
+    // since they bind to different MLModel instances. chunk_3 is
+    // stateless (KV-shared from chunk_2's kv13/kv14 outputs) for both
+    // decode and prefill.
+    private var decodeState1: MLState?
+    private var decodeState2: MLState?
+    private var prefillState1: MLState?
+    private var prefillState2: MLState?
+
+    // Sidecars (same as Gemma4StatefulEngine).
+    private var embedTokens: EmbeddingLookup?
+    private var embedTokensPerLayer: EmbeddingLookup?
+    private var cosSlidingTable: Data?
+    private var sinSlidingTable: Data?
+    private var cosFullTable: Data?
+    private var sinFullTable: Data?
+
+    // T=1 decode scratch.
+    private var maskFull: MLMultiArray!
+    private var maskSliding: MLMultiArray!
+    private var fvMaskFull: MLFeatureValue!
+    private var fvMaskSliding: MLFeatureValue!
+    private var posScratch: MLMultiArray!
+    private var ringScratch: MLMultiArray!
+    private var fvPos: MLFeatureValue!
+    private var fvRing: MLFeatureValue!
+
+    // T=288 prefill scratch (allocated once at load).
+    private var batchHidden: MLMultiArray?
+    private var batchPerLayerRaw: MLMultiArray?
+    private var batchMaskFull: MLMultiArray?
+    private var batchMaskSliding: MLMultiArray?
+    private var batchCosS: MLMultiArray?
+    private var batchSinS: MLMultiArray?
+    private var batchCosF: MLMultiArray?
+    private var batchSinF: MLMultiArray?
+
+    // Cross-turn KV reuse — only on decode states (prefill states are
+    // reusable scratch and get overwritten each generate()). The LCP
+    // match invariant: persistedInputIds is a strict prefix of the
+    // next prompt's inputIds. LLMRunner is responsible for calling
+    // resetPersistedState() on chat clear or attachment change.
+    private var persistedInputIds: [Int32] = []
+    private var persistedPosition: Int = 0
+
+    // MARK: - Multimodal (Stage 6 helpers, ported)
+
+    private static let IMAGE_TOKEN_ID: Int32 = 258880
+    private static let AUDIO_TOKEN_ID: Int32 = 258881
+    private static let VIDEO_TOKEN_ID: Int32 = 258884
+
+    private var visionModelURL: URL?
+    private var visionConfig: MLModelConfiguration?
+    private var visionModel: MLModel?
+    private var visionUsesANEBuild: Bool = false
+
+    private var videoVisionModelURL: URL?
+    private var videoVisionConfig: MLModelConfiguration?
+    private var videoVisionModel: MLModel?
+
+    private var audioModelURL: URL?
+    private var audioConfig: MLModelConfiguration?
+    private var audioModel: MLModel?
+    private var melFilterbank: [Float]?
+    private var audioProjection: AudioProcessor.ProjectionWeights?
+    private var audioMelFrames: Int = 200
+    private var audioNumTokensConfig: Int = 188
+    private var audioMelFloor: Float = 0.001
+
+    public var hasVision: Bool { visionModelURL != nil }
+    public var hasVideoVision: Bool { videoVisionModelURL != nil }
+    public var hasAudio: Bool { audioModelURL != nil }
+    public var defaultAudioNumTokens: Int { audioNumTokensConfig }
+
+    // Per-call multimodal binding.
+    private var mmImageFeatures: MLMultiArray?
+    private var mmImageNumTokens: Int = 0
+    private var mmAudioFeatures: MLMultiArray?
+    private var mmAudioNumTokens: Int = 0
+    private var mmImageIdx: Int = 0
+    private var mmAudioIdx: Int = 0
+    private var mmVisionGroupIds: [Int]?
+
+    // Reusable PLR=0 scratch for T=1 multimodal positions.
+    private var prlZerosT1: MLMultiArray?
+
+    // MARK: - Init / Load
+
+    /// Create an engine with no models loaded; `load(modelDirectory:)` fills it.
+    /// - Parameter config: Compute-unit preference used when opening chunks.
+    public init(config: Config = Config()) { self.cfg = config }
+
+    /// Forget all cross-turn state: the persisted prompt prefix plus the four
+    /// decode/prefill MLStates. Call when chat history clears, the vision/audio
+    /// prefix changes, or any other prompt-prefix invariant breaks.
+    public func resetPersistedState() {
+        persistedPosition = 0
+        persistedInputIds.removeAll()
+        prefillState2 = nil
+        prefillState1 = nil
+        decodeState2 = nil
+        decodeState1 = nil
+    }
+
+    /// Load decode + T=288 prefill chunks, the embedding / RoPE sidecars and
+    /// any multimodal encoders from `modelDirectory`. May be called again on
+    /// the same engine: 4-chunk members left by an earlier bundle are cleared
+    /// before the layout is re-detected.
+    /// - Throws: if the model config, a required chunk, an embedding sidecar
+    ///   or a scratch buffer cannot be created.
+    public func load(modelDirectory: URL) async throws {
+        resetPersistedState()
+        modelDir = modelDirectory
+        let mc = try ModelConfig.load(from: modelDirectory)
+        modelConfig = mc
+
+        let mcfg = MLModelConfiguration()
+        mcfg.computeUnits = cfg.computeUnits
+
+        decodeChunk1 = try openChunk("chunk_1", in: modelDirectory, cfg: mcfg)
+        decodeChunk2 = try openChunk("chunk_2", in: modelDirectory, cfg: mcfg)
+        decodeChunk3 = try openChunk("chunk_3", in: modelDirectory, cfg: mcfg)
+
+        // 4-chunk vs 3-chunk detection: chunk_4 present → 4-chunk; otherwise
+        // chunk_3 is the merged final chunk. Reset first so a reload from a
+        // 4-chunk bundle to a 3-chunk one does not keep the old layout.
+        is4Chunk = false
+        decodeChunk4 = nil
+        prefillChunk4 = nil
+        let has4 = ["chunk_4.mlmodelc", "chunk_4.mlpackage"].contains { file in
+            FileManager.default.fileExists(
+                atPath: modelDirectory.appendingPathComponent(file).path)
+        }
+        if has4 {
+            decodeChunk4 = try openChunk("chunk_4", in: modelDirectory, cfg: mcfg)
+            is4Chunk = true
+            print("[Gemma4MM] 4-chunk decode layout (chunk_2 own / chunk_3 shared / chunk_4 final)")
+        } else {
+            print("[Gemma4MM] 3-chunk merged decode layout (chunk_3 = final)")
+        }
+
+        // T=288 prefill chunks live under prefill_T288/ in the bundle
+        // layout. Failing to find them is fatal — this engine has no
+        // T=1 prefill fallback (the legacy engine handles that path).
+        let pfDir = modelDirectory.appendingPathComponent("prefill_T288")
+        prefillChunk1 = try openChunk("chunk_1_prefill_T288", in: pfDir, cfg: mcfg)
+        prefillChunk2 = try openChunk(
+            is4Chunk ? "chunk_2_prefill_T288" : "chunk_2_3way_prefill_T288",
+            in: pfDir, cfg: mcfg)
+        prefillChunk3 = try openChunk("chunk_3_prefill_T288", in: pfDir, cfg: mcfg)
+        if is4Chunk {
+            prefillChunk4 = try openChunk("chunk_4_prefill_T288", in: pfDir, cfg: mcfg)
+        }
+        print("[Gemma4MM] T=\(Self.kPrefillT) prefill chunks loaded "
+              + "(\(is4Chunk ? "4-chunk" : "3-chunk merged"))")
+
+        embedTokens = try EmbeddingLookup(
+            dataURL: modelDirectory.appendingPathComponent("embed_tokens_q8.bin"),
+            scalesURL: modelDirectory.appendingPathComponent("embed_tokens_scales.bin"),
+            vocabSize: mc.vocabSize, dim: mc.hiddenSize, scale: mc.embedScale)
+        embedTokensPerLayer = try EmbeddingLookup(
+            dataURL: modelDirectory.appendingPathComponent("embed_tokens_per_layer_q8.bin"),
+            scalesURL: modelDirectory.appendingPathComponent("embed_tokens_per_layer_scales.bin"),
+            vocabSize: mc.vocabSize,
+            dim: mc.numLayers * mc.perLayerDim,
+            scale: mc.perLayerEmbedScale)
+
+        // RoPE tables are best-effort: an unreadable file leaves the table nil.
+        let mapped = { (file: String) in
+            try? Data(contentsOf: modelDirectory.appendingPathComponent(file),
+                      options: .mappedIfSafe)
+        }
+        cosSlidingTable = mapped("cos_sliding.npy")
+        sinSlidingTable = mapped("sin_sliding.npy")
+        cosFullTable = mapped("cos_full.npy")
+        sinFullTable = mapped("sin_full.npy")
+
+        let ctx = mc.contextLength
+        let W = mc.slidingWindow
+        maskFull = try MLMultiArray(
+            shape: [1, 1, 1, NSNumber(value: ctx)], dataType: .float16)
+        maskSliding = try MLMultiArray(
+            shape: [1, 1, 1, NSNumber(value: W)], dataType: .float16)
+        posScratch = try MLMultiArray(shape: [1], dataType: .int32)
+        ringScratch = try MLMultiArray(shape: [1], dataType: .int32)
+        fvMaskFull = MLFeatureValue(multiArray: maskFull)
+        fvMaskSliding = MLFeatureValue(multiArray: maskSliding)
+        fvPos = MLFeatureValue(multiArray: posScratch)
+        fvRing = MLFeatureValue(multiArray: ringScratch)
+
+        try ensureBatchScratch(T: Self.kPrefillT)
+
+        loadMultimodalEncoders(modelDirectory: modelDirectory)
+    }
+
+    /// Open `<name>.mlmodelc` (or compile `<name>.mlpackage`) from `dir`.
+    /// Chunks named in `LLM_GEMMA4MM_FORCE_GPU=<chunk>[,<chunk>...]` load on
+    /// cpuAndGPU directly; any other chunk tries `cfg` first and is retried
+    /// on cpuAndGPU if that load throws.
+    private func openChunk(_ name: String, in dir: URL,
+                           cfg: MLModelConfiguration) throws -> MLModel {
+        let fm = FileManager.default
+        let compiled = dir.appendingPathComponent("\(name).mlmodelc")
+        let package = dir.appendingPathComponent("\(name).mlpackage")
+        let modelURL: URL
+        if fm.fileExists(atPath: compiled.path) {
+            modelURL = compiled
+        } else if fm.fileExists(atPath: package.path) {
+            modelURL = try MLModel.compileModel(at: package)
+        } else {
+            throw CoreMLLLMError.modelNotFound(
+                "\(name).mlmodelc/.mlpackage not found in \(dir.path)")
+        }
+
+        // iPhone ANE 18 has been observed to fail MIL→EIR translation on
+        // some merged chunks (`std::bad_cast` in
+        // `_ANECompiler::ANECCompile()`), hence the cpuAndGPU escape hatches.
+        let gpuFallback = MLModelConfiguration()
+        gpuFallback.computeUnits = .cpuAndGPU
+        let forcedNames = (ProcessInfo.processInfo
+            .environment["LLM_GEMMA4MM_FORCE_GPU"] ?? "")
+            .split(separator: ",")
+            .map { $0.trimmingCharacters(in: .whitespaces) }
+        guard !forcedNames.contains(name) else {
+            print("[Gemma4MM] \(name) — LLM_GEMMA4MM_FORCE_GPU forces cpuAndGPU")
+            return try MLModel(contentsOf: modelURL, configuration: gpuFallback)
+        }
+        do {
+            return try MLModel(contentsOf: modelURL, configuration: cfg)
+        } catch {
+            print("[Gemma4MM] \(name) load failed on \(cfg.computeUnits.rawValue): \(error). Retrying on cpuAndGPU.")
+            return try MLModel(contentsOf: modelURL, configuration: gpuFallback)
+        }
+    }
+
+    // MARK: - Multimodal encoder loading (ported from Stage 6 02ac583)
+
+    /// Discover the optional vision / video / audio encoders in
+    /// `modelDirectory` and record their URLs and configurations. Only the
+    /// non-ANE vision build is instantiated from here (via the background
+    /// prewarm); the rest load on the first `process*` call. Encoder state
+    /// from an earlier `load()` is cleared first so a reload cannot reuse a
+    /// stale model or a stale ANE/GPU selection.
+    private func loadMultimodalEncoders(modelDirectory: URL) {
+        let fm = FileManager.default
+        func firstExisting(_ names: [String]) -> URL? {
+            names.lazy.map { modelDirectory.appendingPathComponent($0) }
+                .first { fm.fileExists(atPath: $0.path) }
+        }
+
+        // Start from the property defaults. NOTE(review): a prewarm still in
+        // flight from an earlier load() can overwrite visionModel afterwards.
+        visionModelURL = nil
+        visionConfig = nil
+        visionModel = nil
+        visionUsesANEBuild = false
+        videoVisionModelURL = nil
+        videoVisionConfig = nil
+        videoVisionModel = nil
+        audioModelURL = nil
+        audioConfig = nil
+        audioModel = nil
+        melFilterbank = nil; audioProjection = nil
+        audioMelFrames = 200; audioNumTokensConfig = 188; audioMelFloor = 0.001
+
+        // Vision: with LLM_VISION_FORCE_ANE=1 the ANE builds win; otherwise
+        // the plain build is preferred and ANE builds are the fallback.
+        let aneBuilds = ["vision.ane.v2.mlmodelc", "vision.ane.mlmodelc", "vision.ane.mlpackage"]
+        let plainBuilds = ["vision.mlmodelc", "vision.mlpackage"]
+        let forceANE = ProcessInfo.processInfo.environment["LLM_VISION_FORCE_ANE"] == "1"
+        let visionOrder = (forceANE ? aneBuilds : []) + plainBuilds + aneBuilds
+        if let url = firstExisting(visionOrder) {
+            visionModelURL = url
+            visionUsesANEBuild = aneBuilds.contains(url.lastPathComponent)
+            let cfg = MLModelConfiguration()
+            cfg.computeUnits = visionUsesANEBuild ? .cpuAndNeuralEngine : .cpuAndGPU
+            visionConfig = cfg
+            print("[Gemma4MM/Vision] selected \(url.lastPathComponent) → \(visionUsesANEBuild ? "ANE" : "GPU")")
+            prewarmVisionInBackground()
+        }
+
+        if let url = firstExisting(["vision_video.mlmodelc", "vision_video.mlpackage"]) {
+            videoVisionModelURL = url
+            let cfg = MLModelConfiguration()
+            cfg.computeUnits = .cpuAndGPU
+            videoVisionConfig = cfg
+        }
+
+        if let url = firstExisting(["audio.mlmodelc", "audio.mlpackage"]) {
+            audioModelURL = url
+            let cfg = MLModelConfiguration()
+            cfg.computeUnits = .cpuAndGPU
+            audioConfig = cfg
+
+            let melURL = modelDirectory.appendingPathComponent("mel_filterbank.bin")
+            if FileManager.default.fileExists(atPath: melURL.path) {
+                melFilterbank = try? AudioProcessor.loadMelFilterbank(from: melURL)
+            }
+            let projURL = modelDirectory.appendingPathComponent("output_proj_weight.npy")
+            if FileManager.default.fileExists(atPath: projURL.path) {
+                audioProjection = try? AudioProcessor.ProjectionWeights.load(from: modelDirectory)
+            }
+            let audioConfURL = modelDirectory.appendingPathComponent("audio_config.json")
+            if let data = try? Data(contentsOf: audioConfURL),
+               let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] {
+                audioMelFrames = json["mel_frames"] as? Int ?? 200
+                audioNumTokensConfig = json["num_tokens"] as? Int ?? 188
+                if let mf = json["log_offset"] as? Double {
+                    audioMelFloor = Float(mf)
+                } else if let mf = json["mel_floor"] as? Double {
+                    audioMelFloor = Float(mf)
+                }
+            }
+        }
+
+        if hasVision || hasAudio || hasVideoVision {
+            print("[Gemma4MM] multimodal encoders: vision=\(hasVision) " +
+                  "video=\(hasVideoVision) audio=\(hasAudio)")
+        }
+    }
+
+    /// Load the non-ANE vision encoder on a utility queue and run one
+    /// throwaway prediction (48×48 patch grid padded to 2520 entries) so
+    /// `processImage` can find the model already stored in `visionModel`.
+    private func prewarmVisionInBackground() {
+        guard !visionUsesANEBuild,
+              let modelURL = visionModelURL, let config = visionConfig else { return }
+        DispatchQueue.global(qos: .utility).async { [weak self] in
+            do {
+                let started = CFAbsoluteTimeGetCurrent()
+                let model = try MLModel(contentsOf: modelURL, configuration: config)
+                // NOTE(review): assigned from a background queue with no
+                // synchronization against readers of `visionModel`.
+                self?.visionModel = model
+                let side = 48
+                let patchCount = 2520
+                let pixels = try MLMultiArray(
+                    shape: [1, NSNumber(value: patchCount), NSNumber(value: 16 * 16 * 3)],
+                    dataType: .float32)
+                let positions = try MLMultiArray(
+                    shape: [1, NSNumber(value: patchCount), 2], dataType: .int32)
+                let xy = positions.dataPointer.bindMemory(
+                    to: Int32.self, capacity: patchCount * 2)
+                // (x, y) in row-major order for real patches, (-1, -1) for padding.
+                for k in 0..<patchCount {
+                    let isReal = k < side * side
+                    xy[k * 2] = isReal ? Int32(k % side) : -1
+                    xy[k * 2 + 1] = isReal ? Int32(k / side) : -1
+                }
+                let input = try MLDictionaryFeatureProvider(dictionary: [
+                    "pixel_values": MLFeatureValue(multiArray: pixels),
+                    "pixel_position_ids": MLFeatureValue(multiArray: positions),
+                ])
+                // A failed prediction is swallowed; the timing line prints regardless.
+                _ = try? model.prediction(from: input)
+                let elapsed = CFAbsoluteTimeGetCurrent() - started
+                print("[Gemma4MM/Vision] prewarm done in \(String(format: "%.1f", elapsed))s")
+            } catch {
+                print("[Gemma4MM/Vision] prewarm failed: \(error)")
+            }
+        }
+    }
+
+    // MARK: - Multimodal feature extraction
+
+    /// Encode `image` with the lazily loaded vision model; throws `visionNotAvailable` if none.
+    public func processImage(_ image: CGImage) throws -> MLMultiArray {
+        if visionModel == nil, let url = visionModelURL, let cfg = visionConfig {
+            visionModel = try MLModel(contentsOf: url, configuration: cfg)
+        }
+        guard let encoder = visionModel else { throw CoreMLLLMError.visionNotAvailable }
+        if visionUsesANEBuild { return try ImageProcessor.processANE(image, with: encoder) }
+        return try ImageProcessor.process(image, with: encoder)
+    }
+
+    public func processVideoFrame(_ image: CGImage) throws -> MLMultiArray {  // one frame → video vision features
+        if videoVisionModel == nil, let url = videoVisionModelURL, let cfg = videoVisionConfig {
+            videoVisionModel = try MLModel(contentsOf: url, configuration: cfg)  // first-use load, then cached
+        }
+        guard let vm = videoVisionModel else { throw CoreMLLLMError.visionNotAvailable }  // no URL/config recorded
+        return try ImageProcessor.processVideoFrame(image, with: vm)
+    }
+
+    /// Encode raw audio `samples`; the audio model is loaded on first use.
+    /// - Returns: encoder features plus the token count the clip covers, capped at `audioNumTokensConfig`.
+    /// - Throws: `audioNotAvailable` without an audio model or mel filterbank.
+    public func processAudio(_ samples: [Float]) throws -> (MLMultiArray, Int) {
+        if audioModel == nil, let url = audioModelURL, let cfg = audioConfig {
+            audioModel = try MLModel(contentsOf: url, configuration: cfg)
+        }
+        guard let am = audioModel, let mel = melFilterbank else { throw CoreMLLLMError.audioNotAvailable }
+        // A clip shorter than one window has 0 frames. `(paddedLen - 321) / 160 + 1`
+        // alone would give 1 there because Swift `/` truncates toward zero.
+        let paddedLen = 160 + samples.count  // 160-sample left pad
+        let unfoldSize = 321
+        let actualMelFrames = paddedLen >= unfoldSize ? (paddedLen - unfoldSize) / 160 + 1 : 0
+        let actualTokens = min(((actualMelFrames + 1) / 2 + 1) / 2, audioNumTokensConfig)
+        let features = try AudioProcessor.process(
+            samples, with: am, melFilterbank: mel,
+            targetFrames: audioMelFrames, projection: audioProjection,
+            melFloor: audioMelFloor)
+        return (features, actualTokens)
+    }
+
+    // MARK: - Multimodal mask + splice helpers
+
+    /// Label each prompt position with the index of the contiguous run of
+    /// IMAGE/VIDEO placeholder tokens it belongs to; -1 for everything else.
+    /// Runs are numbered 0, 1, 2, … in prompt order; an image run directly
+    /// followed by a video run counts as a single group.
+    private func computeVisionGroupIds(inputIds: [Int32]) -> [Int] {
+        var group = -1
+        var insideRun = false
+        return inputIds.map { token -> Int in
+            let isVision = token == Self.IMAGE_TOKEN_ID || token == Self.VIDEO_TOKEN_ID
+            defer { insideRun = isVision }
+            guard isVision else { return -1 }
+            if !insideRun { group += 1 }
+            return group
+        }
+    }
+
+    /// Lazily created, cached all-zero fp16 [1, 1, numLayers * perLayerDim]
+    /// buffer. Throws `modelNotFound("no config")` while `modelConfig` is nil.
+    private func prlZerosT1Buffer() throws -> MLMultiArray {
+        if let cached = prlZerosT1 { return cached }
+        guard let mc = modelConfig else { throw CoreMLLLMError.modelNotFound("no config") }
+        let width = mc.numLayers * mc.perLayerDim
+        let zeros = try MLMultiArray(
+            shape: [1, 1, NSNumber(value: width)], dataType: .float16)
+        memset(zeros.dataPointer, 0, width * MemoryLayout<UInt16>.stride)
+        prlZerosT1 = zeros
+        return zeros
+    }
+
+    /// Next encoder feature row for a multimodal placeholder `token`, or nil
+    /// when `token` is ordinary text or the bound features are used up.
+    /// Advances the matching image/audio cursor when a row is returned.
+    private func multimodalSpliceT1(token: Int32) -> MLMultiArray? {
+        guard let hidden = modelConfig?.hiddenSize else { return nil }
+        switch token {
+        case Self.IMAGE_TOKEN_ID, Self.VIDEO_TOKEN_ID:
+            guard let img = mmImageFeatures, mmImageIdx < mmImageNumTokens else { return nil }
+            defer { mmImageIdx += 1 }
+            return ImageProcessor.sliceFeature(img, at: mmImageIdx, hiddenSize: hidden)
+        case Self.AUDIO_TOKEN_ID:
+            guard let aud = mmAudioFeatures, mmAudioIdx < mmAudioNumTokens else { return nil }
+            defer { mmAudioIdx += 1 }
+            return AudioProcessor.sliceFeature(aud, at: mmAudioIdx, hiddenSize: hidden)
+        default:
+            return nil
+        }
+    }
+
+    // MARK: - Mask + position helpers (T=1 decode)
+
+    /// T=1 full mask: slots up to `position` (clamped to the context) are 0, later ones -65504.
+    private func fillFullCausalMask(position: Int) {
+        let ctx = modelConfig!.contextLength
+        let lastOpen = min(max(position, 0), ctx - 1)
+        let slots = maskFull.dataPointer.bindMemory(to: UInt16.self, capacity: ctx)
+        for i in 0..<ctx { slots[i] = i > lastOpen ? Float16(-65504).bitPattern : 0 }
+    }
+
+    /// T=1 sliding-window mask: the first min(position + 1, W) slots are 0, the rest -65504.
+    private func fillSlidingCausalMask(position: Int) {
+        let W = modelConfig!.slidingWindow
+        let openCount = min(position + 1, W)
+        let slots = maskSliding.dataPointer.bindMemory(to: UInt16.self, capacity: W)
+        for i in 0..<W { slots[i] = i >= openCount ? Float16(-65504).bitPattern : 0 }
+    }
+
+    /// Like `fillFullCausalMask`, but when position `p` belongs to a vision
+    /// group every slot with the same group id is opened as well.
+    private func fillFullCausalMaskVisionAware(position p: Int, groupIds: [Int]) {
+        let ctx = modelConfig!.contextLength
+        let slots = maskFull.dataPointer.bindMemory(to: UInt16.self, capacity: ctx)
+        let ownGroup = (p < groupIds.count) ? groupIds[p] : -1
+        let lastCausal = min(max(p, 0), ctx - 1)
+        for i in 0..<ctx {
+            let inOwnGroup = ownGroup >= 0 && i < groupIds.count && groupIds[i] == ownGroup
+            slots[i] = (i <= lastCausal || inOwnGroup) ? 0 : Float16(-65504).bitPattern
+        }
+    }
+
+    /// Sliding-window counterpart of `fillFullCausalMaskVisionAware`. Once
+    /// `p >= W` the whole window is open; before that, slot i is compared
+    /// against `groupIds[i]` and opened when it is causal or in p's group.
+    private func fillSlidingCausalMaskVisionAware(position p: Int, groupIds: [Int]) {
+        let W = modelConfig!.slidingWindow
+        let slots = maskSliding.dataPointer.bindMemory(to: UInt16.self, capacity: W)
+        guard p < W else {
+            for i in 0..<W { slots[i] = 0 }
+            return
+        }
+        let ownGroup = (p < groupIds.count) ? groupIds[p] : -1
+        for i in 0..<W {
+            let inOwnGroup = ownGroup >= 0 && i < groupIds.count && groupIds[i] == ownGroup
+            slots[i] = (i <= p || inOwnGroup) ? 0 : Float16(-65504).bitPattern
+        }
+    }
+
+    private func setPos(_ pos: Int) {  // writes pos as Int32 into posScratch[0]
+        posScratch.dataPointer.bindMemory(to: Int32.self, capacity: 1)[0] = Int32(pos)
+    }
+
+    /// Write `pos % slidingWindow` as Int32 into `ringScratch[0]`.
+    private func setRing(_ pos: Int) {
+        ringScratch.dataPointer.bindMemory(to: Int32.self, capacity: 1)[0] = Int32(pos % modelConfig!.slidingWindow)
+    }
+
+    /// Copy the fp16 RoPE row for `position` out of an `.npy` table into a
+    /// fresh [1, 1, 1, dim] array. The row comes back all-zero when the table
+    /// is nil or shorter than the fixed 10-byte preamble, when `position` is
+    /// negative, or when the row would extend past the end of the table.
+    private func lookupRoPE(table: Data?, position: Int, dim: Int) throws -> MLMultiArray {
+        let result = try MLMultiArray(
+            shape: [1, 1, 1, NSNumber(value: dim)], dataType: .float16)
+        let rowBytes = dim * MemoryLayout<UInt16>.stride
+        // Zero first so every early-out below returns a defined row.
+        memset(result.dataPointer, 0, rowBytes)
+        // Bytes 8-9 hold the little-endian header length (NPY v1 layout).
+        guard let table, table.count >= 10, position >= 0 else { return result }
+        let lo = Int(table[table.startIndex + 8])
+        let hi = Int(table[table.startIndex + 9])
+        let offset = 10 + (lo | (hi << 8)) + position * rowBytes
+        guard offset + rowBytes <= table.count else { return result }
+        table.withUnsafeBytes { raw in
+            guard let base = raw.baseAddress else { return }
+            memcpy(result.dataPointer, base.advanced(by: offset), rowBytes)
+        }
+        return result
+    }
+
+    // MARK: - Batched (T=288) prefill scratch
+
+    /// Allocate the fp16 scratch inputs for a prefill batch of `T` rows.
+    /// Everything is (re)allocated together whenever `batchHidden` is missing
+    /// or sized for a different `T`; otherwise the call is a no-op. Also a
+    /// no-op while `modelConfig` is still nil.
+    /// - Throws: whatever `MLMultiArray(shape:dataType:)` throws.
+    private func ensureBatchScratch(T: Int) throws {
+        guard let mc = modelConfig else { return }
+        // Already sized for this T: keep the existing buffers.
+        if let existing = batchHidden, existing.shape[1].intValue == T { return }
+
+        let rows = NSNumber(value: T)
+        func fp16(_ shape: [NSNumber]) throws -> MLMultiArray {
+            try MLMultiArray(shape: shape, dataType: .float16)
+        }
+
+        // Token-major inputs: [1, T, hidden] and [1, T, numLayers * perLayerDim].
+        batchHidden = try fp16([1, rows, NSNumber(value: mc.hiddenSize)])
+        batchPerLayerRaw = try fp16(
+            [1, rows, NSNumber(value: mc.numLayers * mc.perLayerDim)])
+
+        // One mask row per batch row: [1, 1, T, ctx] and [1, 1, T, W].
+        batchMaskFull = try fp16([1, 1, rows, NSNumber(value: mc.contextLength)])
+        batchMaskSliding = try fp16([1, 1, rows, NSNumber(value: mc.slidingWindow)])
+
+        // RoPE rows: [1, 1, T, 256] for the sliding tables, [1, 1, T, 512]
+        // for the full ones.
+        // NOTE(review): both widths are literals here rather than values read
+        // from the model config — confirm they hold for every bundle.
+        let slidingDim = NSNumber(value: 256)
+        let fullDim = NSNumber(value: 512)
+        batchCosS = try fp16([1, 1, rows, slidingDim])
+        batchSinS = try fp16([1, 1, rows, slidingDim])
+        batchCosF = try fp16([1, 1, rows, fullDim])
+        batchSinF = try fp16([1, 1, rows, fullDim])
+    }
+
+    /// Fill the T-row full and sliding masks for a batch starting at
+    /// `startPos`. Rows at or beyond `validCount` repeat the mask of row
+    /// validCount-1, so the auto-emit at row T-1 is the model's prediction
+    /// for the LAST valid prompt token (= first post-prompt token). With
+    /// `groupIds`, slots in the row's own vision group are opened as well.
+    private func fillBatchMasks(startPos: Int, T: Int, validCount: Int,
+                                 groupIds: [Int]?) {
+        let ctx = modelConfig!.contextLength
+        let W = modelConfig!.slidingWindow
+        let blocked = Float16(-65504).bitPattern
+        let ids = groupIds ?? []
+        let fullMask = batchMaskFull!.dataPointer.bindMemory(
+            to: UInt16.self, capacity: T * ctx)
+        let slidingMask = batchMaskSliding!.dataPointer.bindMemory(
+            to: UInt16.self, capacity: T * W)
+        // Rows past the valid range reuse the last valid row's position.
+        let lastValidRow = max(validCount, 1) - 1
+        for t in 0..<T {
+            let p = startPos + min(t, lastValidRow)
+            let ownGroup = groupIds.map { p < $0.count ? $0[p] : -1 } ?? -1
+            func sharesGroup(_ i: Int) -> Bool {
+                ownGroup >= 0 && i < ids.count && ids[i] == ownGroup
+            }
+            // Full mask: causal up to p (clamped to the context) or same group.
+            let fullRow = t * ctx
+            let lastCausal = min(max(p, 0), ctx - 1)
+            for i in 0..<ctx {
+                fullMask[fullRow + i] = (i <= lastCausal || sharesGroup(i)) ? 0 : blocked
+            }
+            let slidingRow = t * W
+            guard p < W else {
+                // p >= W: every slot of the window is open.
+                for i in 0..<W { slidingMask[slidingRow + i] = 0 }
+                continue
+            }
+            for i in 0..<W {
+                slidingMask[slidingRow + i] = (i <= p || sharesGroup(i)) ? 0 : blocked
+            }
+        }
+    }
+
+    /// Padded rows (t >= validCount) duplicate row validCount-1 so the
+    /// chunk graph sees a coherent batch where the auto-emit at row
+    /// T-1 corresponds to the LAST valid prompt position.
+    private func fillBatchRoPE(table: Data?, dst: MLMultiArray,
+                                startPos: Int, T: Int, validCount: Int,
+                                dim: Int) {
+        let p = dst.dataPointer.bindMemory(to: UInt16.self, capacity: T * dim)
+        // A missing table, or one shorter than the 10-byte prefix read
+        // below, zero-fills every row instead of reading out of bounds.
+        guard let table, table.count >= 10 else { memset(p, 0, T * dim * 2); return }
+        let rowBytes = dim * MemoryLayout<UInt16>.stride
+        let lastValidRow = max(validCount, 1) - 1
+        table.withUnsafeBytes { raw in
+            let b = raw.baseAddress!.assumingMemoryBound(to: UInt8.self)
+            // Rows start after a 10-byte prefix plus a little-endian
+            // UInt16 length stored at bytes 8-9 (presumably .npy v1 — confirm).
+            let headerSize = 10 + (Int(b[8]) | (Int(b[9]) << 8))
+            for t in 0..<T {
+                let pos = startPos + min(t, lastValidRow)
+                let offset = headerSize + pos * rowBytes
+                let dstRow = p.advanced(by: t * dim)
+                if pos >= 0 && offset + rowBytes <= raw.count {
+                    memcpy(dstRow, b.advanced(by: offset), rowBytes)
+                } else {
+                    memset(dstRow, 0, rowBytes)
+                }
+            }
+        }
+    }
+
+    // MARK: - State bridge (prefill MLState → decode MLState)
+
+    /// Copies the `kv_cache_sliding` and `kv_cache_full` state buffers
+    /// from `src` into `dst`. The withMultiArray closure scope is the
+    /// only legal window to access the buffer pointer; nested closures
+    /// keep both pointers live for the memcpy.
+    private func bridgeKVState(from src: MLState, to dst: MLState) {
+        for name in ["kv_cache_sliding", "kv_cache_full"] {  // both names are requested unconditionally
+            src.withMultiArray(for: name) { srcArr in
+                dst.withMultiArray(for: name) { dstArr in
+                    guard srcArr.count == dstArr.count else { return }  // mismatch: buffer is silently skipped
+                    memcpy(dstArr.dataPointer, srcArr.dataPointer,
+                           srcArr.count * MemoryLayout<UInt16>.stride)  // 2 bytes/element — assumes Float16 storage (TODO confirm)
+                }
+            }
+        }
+    }
+
+    // MARK: - Reusable feature provider
+
+    private final class FeatureProvider: NSObject, MLFeatureProvider {  // dictionary-backed MLFeatureProvider
+        let map: [String: MLFeatureValue]  // feature name → value
+        let featureNames: Set<String>  // derived once from map's keys
+        init(_ map: [String: MLFeatureValue]) {
+            self.map = map
+            self.featureNames = Set(map.keys)
+        }
+        func featureValue(for name: String) -> MLFeatureValue? { map[name] }  // nil for names not in map
+    }
+
+    // MARK: - T=1 decode step (3 chunks)
+
+    /// Runs `token` at `position` through the T=1 decode chunks and returns the emitted token_id.
+    private func decodeStep(token: Int32, position: Int,
+                              opts: MLPredictionOptions) async throws -> Int32 {
+        guard let mc = modelConfig,
+              let c1 = decodeChunk1, let c2 = decodeChunk2, let c3 = decodeChunk3,
+              let s1 = decodeState1, let s2 = decodeState2 else {
+            throw CoreMLLLMError.modelNotFound("decode chunks/states not loaded")
+        }
+
+        let hidden: MLMultiArray
+        let perLayerRaw: MLMultiArray
+        if let mmRow = multimodalSpliceT1(token: token) {
+            hidden = mmRow
+            perLayerRaw = try prlZerosT1Buffer()
+        } else {
+            hidden = try embedTokens!.lookup(
+                Int(token), shape: [1, 1, NSNumber(value: mc.hiddenSize)])
+            perLayerRaw = try embedTokensPerLayer!.lookup(
+                Int(token),
+                shape: [1, 1, NSNumber(value: mc.numLayers * mc.perLayerDim)])
+        }
+
+        if let groupIds = mmVisionGroupIds {
+            fillFullCausalMaskVisionAware(position: position, groupIds: groupIds)
+            fillSlidingCausalMaskVisionAware(position: position, groupIds: groupIds)
+        } else {
+            fillFullCausalMask(position: position)
+            fillSlidingCausalMask(position: position)
+        }
+        setPos(position)
+        setRing(position)
+
+        let cosS = try lookupRoPE(table: cosSlidingTable, position: position, dim: 256)
+        let sinS = try lookupRoPE(table: sinSlidingTable, position: position, dim: 256)
+        let cosF = try lookupRoPE(table: cosFullTable,    position: position, dim: 512)
+        let sinF = try lookupRoPE(table: sinFullTable,    position: position, dim: 512)
+        // Wrap each RoPE row once; every chunk shares these feature values.
+        let fvCosS = MLFeatureValue(multiArray: cosS)
+        let fvSinS = MLFeatureValue(multiArray: sinS)
+        let fvCosF = MLFeatureValue(multiArray: cosF)
+        let fvSinF = MLFeatureValue(multiArray: sinF)
+
+        let p1 = FeatureProvider([
+            "hidden_states":      MLFeatureValue(multiArray: hidden),
+            "causal_mask_full":   fvMaskFull,
+            "causal_mask_sliding": fvMaskSliding,
+            "per_layer_raw":      MLFeatureValue(multiArray: perLayerRaw),
+            "cos_s": fvCosS, "sin_s": fvSinS,
+            "cos_f": fvCosF, "sin_f": fvSinF,
+            "current_pos": fvPos,
+            "ring_pos":    fvRing,
+        ])
+        let out1 = try await c1.prediction(from: p1, using: s1, options: opts)
+        guard let h1 = out1.featureValue(for: "hidden_states_out"),
+              let plc = out1.featureValue(for: "per_layer_combined_out")
+        else { throw CoreMLLLMError.modelNotFound("decode chunk_1 missing outputs") }
+
+        let p2 = FeatureProvider([
+            "hidden_states":      h1,
+            "causal_mask_full":   fvMaskFull,
+            "causal_mask_sliding": fvMaskSliding,
+            "per_layer_combined": plc,
+            "cos_s": fvCosS, "sin_s": fvSinS,
+            "cos_f": fvCosF, "sin_f": fvSinF,
+            "current_pos": fvPos,
+            "ring_pos":    fvRing,
+        ])
+        let out2 = try await c2.prediction(from: p2, using: s2, options: opts)
+        guard let h2 = out2.featureValue(for: "hidden_states_out"),
+              let kv13k = out2.featureValue(for: "kv13_k"),
+              let kv13v = out2.featureValue(for: "kv13_v"),
+              let kv14k = out2.featureValue(for: "kv14_k"),
+              let kv14v = out2.featureValue(for: "kv14_v")
+        else { throw CoreMLLLMError.modelNotFound("decode chunk_2 missing outputs") }
+
+        let sharedInputs: [String: MLFeatureValue] = [
+            "causal_mask_full":   fvMaskFull,
+            "causal_mask_sliding": fvMaskSliding,
+            "per_layer_combined": plc,
+            "cos_s": fvCosS, "sin_s": fvSinS,
+            "cos_f": fvCosF, "sin_f": fvSinF,
+            "kv13_k": kv13k, "kv13_v": kv13v,
+            "kv14_k": kv14k, "kv14_v": kv14v,
+        ]
+        var p3map = sharedInputs
+        p3map["hidden_states"] = h2
+        let out3 = try await c3.prediction(from: FeatureProvider(p3map), options: opts)
+        if !is4Chunk {
+            guard let tokFV = out3.featureValue(for: "token_id"),
+                  let tokArr = tokFV.multiArrayValue
+            else { throw CoreMLLLMError.modelNotFound("decode chunk_3 (3-chunk final) no token_id") }
+            return tokArr.dataPointer.bindMemory(to: Int32.self, capacity: 1)[0]
+        }
+        // 4-chunk: chunk_3 = KV-shared no lm_head; chunk_4 = KV-shared + lm_head.
+        guard let h3 = out3.featureValue(for: "hidden_states_out") else {
+            throw CoreMLLLMError.modelNotFound("decode chunk_3 missing hidden_states_out")
+        }
+        guard let c4 = decodeChunk4 else {
+            throw CoreMLLLMError.modelNotFound("decode chunk_4 not loaded")
+        }
+        var p4map = sharedInputs
+        p4map["hidden_states"] = h3
+        let out4 = try await c4.prediction(from: FeatureProvider(p4map), options: opts)
+        guard let tokFV = out4.featureValue(for: "token_id"),
+              let tokArr = tokFV.multiArrayValue
+        else { throw CoreMLLLMError.modelNotFound("decode chunk_4 no token_id") }
+        return tokArr.dataPointer.bindMemory(to: Int32.self, capacity: 1)[0]
+    }
+
+    // MARK: - T=288 single-function prefill pass
+
+    /// One prefill pass over inputIds[startBatch ..< startBatch+validCount]
+    /// at sequence positions [position, position+validCount). Rows past
+    /// validCount duplicate row validCount-1 (embedding, mask, RoPE).
+    /// Returns the final chunk's token_id (chunk_4 when is4Chunk).
+    private func prefillStepT288(inputIds: [Int32], startBatch: Int,
+                                  position: Int, validCount: Int,
+                                  opts: MLPredictionOptions) async throws -> Int32 {
+        guard let mc = modelConfig,
+              let c1 = prefillChunk1, let c2 = prefillChunk2, let c3 = prefillChunk3,
+              let s1 = prefillState1, let s2 = prefillState2,
+              let embed = embedTokens, let perLayer = embedTokensPerLayer
+        else { throw CoreMLLLMError.modelNotFound("prefill T=288 not loaded") }
+
+        let T = Self.kPrefillT
+        precondition(validCount > 0 && validCount <= T,
+                     "validCount=\(validCount) out of (0, \(T)]")
+        try ensureBatchScratch(T: T)
+        let H = mc.hiddenSize
+        let PL = mc.numLayers * mc.perLayerDim
+
+        let hPtr = batchHidden!.dataPointer.bindMemory(
+            to: UInt16.self, capacity: T * H)
+        let plPtr = batchPerLayerRaw!.dataPointer.bindMemory(
+            to: UInt16.self, capacity: T * PL)
+        let imgRowPtr = mmImageFeatures?.dataPointer.bindMemory(
+            to: UInt16.self, capacity: mmImageFeatures?.count ?? 0)
+        let audRowPtr = mmAudioFeatures?.dataPointer.bindMemory(
+            to: UInt16.self, capacity: mmAudioFeatures?.count ?? 0)
+
+        // Pack valid rows (real tokens); the tail is padded further below.
+        for t in 0..<validCount {
+            let tokInt32 = inputIds[startBatch + t]
+            let tok = Int(tokInt32)
+            if let imgPtr = imgRowPtr,
+               (tokInt32 == Self.IMAGE_TOKEN_ID || tokInt32 == Self.VIDEO_TOKEN_ID),
+               mmImageIdx < mmImageNumTokens {
+                memcpy(hPtr.advanced(by: t * H),
+                       imgPtr.advanced(by: mmImageIdx * H),
+                       H * MemoryLayout<UInt16>.stride)
+                memset(plPtr.advanced(by: t * PL), 0,
+                       PL * MemoryLayout<UInt16>.stride)
+                mmImageIdx += 1
+            } else if let audPtr = audRowPtr,
+                      tokInt32 == Self.AUDIO_TOKEN_ID,
+                      mmAudioIdx < mmAudioNumTokens {
+                memcpy(hPtr.advanced(by: t * H),
+                       audPtr.advanced(by: mmAudioIdx * H),
+                       H * MemoryLayout<UInt16>.stride)
+                memset(plPtr.advanced(by: t * PL), 0,
+                       PL * MemoryLayout<UInt16>.stride)
+                mmAudioIdx += 1
+            } else {
+                let row = try embed.lookup(tok, shape: [1, 1, NSNumber(value: H)])
+                memcpy(hPtr.advanced(by: t * H),
+                       row.dataPointer, H * MemoryLayout<UInt16>.stride)
+                let plRow = try perLayer.lookup(
+                    tok, shape: [1, 1, NSNumber(value: PL)])
+                memcpy(plPtr.advanced(by: t * PL),
+                       plRow.dataPointer, PL * MemoryLayout<UInt16>.stride)
+            }
+        }
+        // Pad rows [validCount..T-1] by duplicating row validCount-1.
+        // Same hidden + per_layer_raw — combined with mask/RoPE that
+        // pin padded rows to position validCount-1, the chunk graph
+        // computes row T-1's output identical to row validCount-1's,
+        // making the chunk_3 argmax at row T-1 a valid prediction
+        // of the first post-prompt token. Multimodal counters do NOT
+        // advance for padded rows (they already advanced for the
+        // validCount real-token rows above).
+        if validCount < T {  // validCount > 0 is guaranteed by the precondition
+            let srcRowH = hPtr.advanced(by: (validCount - 1) * H)
+            let srcRowPLR = plPtr.advanced(by: (validCount - 1) * PL)
+            for t in validCount..<T {
+                memcpy(hPtr.advanced(by: t * H), srcRowH,
+                       H * MemoryLayout<UInt16>.stride)
+                memcpy(plPtr.advanced(by: t * PL), srcRowPLR,
+                       PL * MemoryLayout<UInt16>.stride)
+            }
+        }
+
+        fillBatchMasks(startPos: position, T: T,
+                       validCount: validCount, groupIds: mmVisionGroupIds)
+        fillBatchRoPE(table: cosSlidingTable, dst: batchCosS!,
+                      startPos: position, T: T, validCount: validCount, dim: 256)
+        fillBatchRoPE(table: sinSlidingTable, dst: batchSinS!,
+                      startPos: position, T: T, validCount: validCount, dim: 256)
+        fillBatchRoPE(table: cosFullTable, dst: batchCosF!,
+                      startPos: position, T: T, validCount: validCount, dim: 512)
+        fillBatchRoPE(table: sinFullTable, dst: batchSinF!,
+                      startPos: position, T: T, validCount: validCount, dim: 512)
+        setPos(position)
+        setRing(position)
+
+        let fvHidden = MLFeatureValue(multiArray: batchHidden!)
+        let fvPLR = MLFeatureValue(multiArray: batchPerLayerRaw!)
+        let fvMF = MLFeatureValue(multiArray: batchMaskFull!)
+        let fvMS = MLFeatureValue(multiArray: batchMaskSliding!)
+        let fvCS = MLFeatureValue(multiArray: batchCosS!)
+        let fvSS = MLFeatureValue(multiArray: batchSinS!)
+        let fvCF = MLFeatureValue(multiArray: batchCosF!)
+        let fvSF = MLFeatureValue(multiArray: batchSinF!)
+
+        let p1 = FeatureProvider([
+            "hidden_states":      fvHidden,
+            "causal_mask_full":   fvMF,
+            "causal_mask_sliding": fvMS,
+            "per_layer_raw":      fvPLR,
+            "cos_s": fvCS, "sin_s": fvSS,
+            "cos_f": fvCF, "sin_f": fvSF,
+            "current_pos": fvPos, "ring_pos": fvRing,
+        ])
+        let out1 = try await c1.prediction(from: p1, using: s1, options: opts)
+        guard let h1 = out1.featureValue(for: "hidden_states_out"),
+              let plc = out1.featureValue(for: "per_layer_combined_out")
+        else { throw CoreMLLLMError.modelNotFound("prefill T=288 chunk_1 missing outputs") }
+
+        let p2 = FeatureProvider([
+            "hidden_states":      h1,
+            "causal_mask_full":   fvMF,
+            "causal_mask_sliding": fvMS,
+            "per_layer_combined": plc,
+            "cos_s": fvCS, "sin_s": fvSS,
+            "cos_f": fvCF, "sin_f": fvSF,
+            "current_pos": fvPos, "ring_pos": fvRing,
+        ])
+        let out2 = try await c2.prediction(from: p2, using: s2, options: opts)
+        guard let h2 = out2.featureValue(for: "hidden_states_out"),
+              let kv13k = out2.featureValue(for: "kv13_k"),
+              let kv13v = out2.featureValue(for: "kv13_v"),
+              let kv14k = out2.featureValue(for: "kv14_k"),
+              let kv14v = out2.featureValue(for: "kv14_v")
+        else { throw CoreMLLLMError.modelNotFound("prefill T=288 chunk_2 missing outputs") }
+
+        let sharedInputs: [String: MLFeatureValue] = [
+            "causal_mask_full":   fvMF,
+            "causal_mask_sliding": fvMS,
+            "per_layer_combined": plc,
+            "cos_s": fvCS, "sin_s": fvSS,
+            "cos_f": fvCF, "sin_f": fvSF,
+            "kv13_k": kv13k, "kv13_v": kv13v,
+            "kv14_k": kv14k, "kv14_v": kv14v,
+        ]
+        var p3map = sharedInputs
+        p3map["hidden_states"] = h2
+        let out3 = try await c3.prediction(from: FeatureProvider(p3map), options: opts)
+        if !is4Chunk {
+            guard let tokFV = out3.featureValue(for: "token_id"),
+                  let tokArr = tokFV.multiArrayValue
+            else { throw CoreMLLLMError.modelNotFound("prefill T=288 chunk_3 (3-chunk final) no token_id") }
+            return tokArr.dataPointer.bindMemory(to: Int32.self, capacity: 1)[0]
+        }
+        // 4-chunk: chunk_3 emits hidden_states_out only; chunk_4 emits token_id.
+        guard let h3 = out3.featureValue(for: "hidden_states_out") else {
+            throw CoreMLLLMError.modelNotFound("prefill T=288 chunk_3 missing hidden_states_out")
+        }
+        guard let c4 = prefillChunk4 else {
+            throw CoreMLLLMError.modelNotFound("prefill T=288 chunk_4 not loaded")
+        }
+        var p4map = sharedInputs
+        p4map["hidden_states"] = h3
+        let out4 = try await c4.prediction(from: FeatureProvider(p4map), options: opts)
+        guard let tokFV = out4.featureValue(for: "token_id"),
+              let tokArr = tokFV.multiArrayValue
+        else { throw CoreMLLLMError.modelNotFound("prefill T=288 chunk_4 no token_id") }
+        // chunk_4 emits argmax at batch row T-1. When validCount < T
+        // we replicate row validCount-1 across padded rows, so row T-1
+        // is functionally identical to row validCount-1 and the
+        // argmax is the valid first-post-prompt-token prediction.
+        return tokArr.dataPointer.bindMemory(to: Int32.self, capacity: 1)[0]
+    }
+
+    // MARK: - Generate (T=288 prefill + bridge + T=1 decode)
+
+    /// Run the prompt through T=288 prefill passes, bridge KV state
+    /// into the decode chunks, then T=1 decode up to maxNewTokens.
+    /// imageFeatures / audioFeatures are pre-encoded by the caller
+    /// (typically LLMRunner) via processImage / processAudio.
+    public func generate(inputIds: [Int32],
+                          imageFeatures: MLMultiArray? = nil,
+                          imageNumTokens: Int = 0,
+                          audioFeatures: MLMultiArray? = nil,
+                          audioNumTokens: Int = 0,
+                          maxNewTokens: Int = 512,
+                          eosTokenIds: Set<Int32> = [],
+                          onToken: ((Int32) -> Void)? = nil
+    ) async throws -> [Int32] {
+        guard let mc = modelConfig else {
+            throw CoreMLLLMError.modelNotFound("Gemma4MM: no config")
+        }
+        guard let c1 = decodeChunk1, let c2 = decodeChunk2,
+              decodeChunk3 != nil,
+              let pc1 = prefillChunk1, let pc2 = prefillChunk2,
+              prefillChunk3 != nil
+        else { throw CoreMLLLMError.modelNotFound("Gemma4MM: not loaded") }
+        if inputIds.isEmpty { return [] }
+        if inputIds.count >= mc.contextLength {
+            throw CoreMLLLMError.modelNotFound(
+                "prompt (\(inputIds.count) tokens) >= ctx (\(mc.contextLength))")
+        }
+
+        // Bind multimodal state for the duration.
+        mmImageFeatures = imageFeatures
+        mmImageNumTokens = imageNumTokens
+        mmAudioFeatures = audioFeatures
+        mmAudioNumTokens = audioNumTokens
+        mmImageIdx = 0
+        mmAudioIdx = 0
+        let hasMultimodal = imageFeatures != nil || audioFeatures != nil
+        mmVisionGroupIds = hasMultimodal ? computeVisionGroupIds(inputIds: inputIds) : nil
+        defer {
+            mmImageFeatures = nil
+            mmAudioFeatures = nil
+            mmImageNumTokens = 0
+            mmAudioNumTokens = 0
+            mmImageIdx = 0
+            mmAudioIdx = 0
+            mmVisionGroupIds = nil
+        }
+
+        // Cross-turn resume: persisted decode state is reusable; if
+        // persistedInputIds is a strict prefix of inputIds, skip the
+        // prefix and only T=288-prefill the suffix. Prefill states
+        // are scratch — rebuilt per call (seeded on resume, see below).
+        var resumeAt = 0
+        if decodeState1 != nil, decodeState2 != nil,
+           !persistedInputIds.isEmpty,
+           persistedInputIds.count < inputIds.count,
+           inputIds.starts(with: persistedInputIds) {
+            resumeAt = persistedInputIds.count
+        }
+
+        // Advance multimodal counters past resumed prefix.
+        if resumeAt > 0 && hasMultimodal {
+            for j in 0..<resumeAt {
+                let t = inputIds[j]
+                if t == Self.IMAGE_TOKEN_ID || t == Self.VIDEO_TOKEN_ID {
+                    mmImageIdx += 1
+                } else if t == Self.AUDIO_TOKEN_ID {
+                    mmAudioIdx += 1
+                }
+            }
+        }
+
+        if resumeAt == 0 {
+            // Fresh decode states.
+            decodeState1 = c1.makeState()
+            decodeState2 = c2.makeState()
+        } else {
+            print("[Gemma4MM] RESUME L=\(resumeAt) " +
+                  "(persisted=\(persistedInputIds.count), new=\(inputIds.count))")
+        }
+        // Invalidated until this call completes; re-persisted at the end.
+        persistedInputIds = []
+        persistedPosition = 0
+        let opts = MLPredictionOptions()
+
+        let suffixCount = inputIds.count - resumeAt
+        var position = resumeAt
+        var lastToken: Int32 = inputIds[max(resumeAt - 1, 0)]
+        var prefillPredicted: Int32 = 0
+        var passes = 0
+
+        let t0 = CFAbsoluteTimeGetCurrent()
+
+        if suffixCount > 0 {
+            // Fresh prefill states. On resume they are seeded with the
+            // decode states' KV: suffix rows must attend to the prefix,
+            // and the full-buffer bridge below would otherwise replace
+            // the persisted prefix KV with an empty cache.
+            let ps1 = pc1.makeState()
+            let ps2 = pc2.makeState()
+            if resumeAt > 0, let ds1 = decodeState1, let ds2 = decodeState2 {
+                bridgeKVState(from: ds1, to: ps1)
+                bridgeKVState(from: ds2, to: ps2)
+            }
+            prefillState1 = ps1
+            prefillState2 = ps2
+
+            let T = Self.kPrefillT
+            var i = resumeAt
+            while i < inputIds.count {
+                let remaining = inputIds.count - i
+                let validCount = min(remaining, T)
+                prefillPredicted = try await prefillStepT288(
+                    inputIds: inputIds, startBatch: i, position: position,
+                    validCount: validCount, opts: opts)
+                position += validCount
+                lastToken = inputIds[i + validCount - 1]
+                i += validCount
+                passes += 1
+            }
+
+            // Bridge prefill KV → decode KV (full buffer memcpy each).
+            if let ds1 = decodeState1 { bridgeKVState(from: ps1, to: ds1) }
+            if let ds2 = decodeState2 { bridgeKVState(from: ps2, to: ds2) }
+            // Drop prefill states — they're rebuilt next generate().
+            prefillState1 = nil
+            prefillState2 = nil
+        }
+        let prefillEnd = CFAbsoluteTimeGetCurrent()
+
+        // The last prefill pass always auto-emits a valid next token
+        // (full batch — padded rows duplicate row validCount-1, so
+        // row T-1's argmax is the first-post-prompt-token prediction).
+        var decoded: [Int32] = []
+        if maxNewTokens > 0 && suffixCount > 0 {
+            decoded.append(prefillPredicted)
+            onToken?(prefillPredicted)
+            lastToken = prefillPredicted
+        }
+        while decoded.count < maxNewTokens {
+            if eosTokenIds.contains(lastToken) { break }
+            if position >= mc.contextLength { break }
+            let next = try await decodeStep(
+                token: lastToken, position: position, opts: opts)
+            decoded.append(next)
+            onToken?(next)
+            lastToken = next
+            position += 1
+        }
+        let t1 = CFAbsoluteTimeGetCurrent()
+
+        // Persist consumed tokens for next-turn LCP match.
+        let consumed = decoded.dropLast()
+        var newPersisted = inputIds
+        newPersisted.append(contentsOf: consumed)
+        persistedInputIds = newPersisted
+        persistedPosition = newPersisted.count
+
+        let prefillMs = (prefillEnd - t0) * 1000
+        let decodeMs = (t1 - prefillEnd) * 1000
+        if decodeMs > 0 && decoded.count > 1 {
+            lastDecodeTokensPerSecond = Double(decoded.count - 1) / (decodeMs / 1000)
+        }
+        let resumeTag = resumeAt > 0 ? " [resumed L=\(resumeAt)]" : ""
+        print("[Gemma4MM] prefill \(suffixCount) tok in " +
+              String(format: "%.0fms (%.1f tok/s)%@ [T=288 passes=%d] | decode %d tok in %.0fms (%.1f tok/s)",
+                      prefillMs,
+                      Double(max(suffixCount, 1)) / max(prefillMs / 1000, 1e-3),
+                      resumeTag, passes,
+                      decoded.count, decodeMs, lastDecodeTokensPerSecond))
+        return decoded
+    }
+}
diff --git a/Sources/CoreMLLLM/ModelDownloader.swift b/Sources/CoreMLLLM/ModelDownloader.swift
index 7ac9200..f215012 100644
--- a/Sources/CoreMLLLM/ModelDownloader.swift
+++ b/Sources/CoreMLLLM/ModelDownloader.swift
@@ -288,6 +288,34 @@ public final class ModelDownloader: NSObject {
             downloadURL: "",
             folderName: "gemma4-e4b-stateful-linear")
 
+        /// Gemma 4 E2B (stateful Linear decode + T=288 single-function
+        /// prefill + vision/video/audio multimodal). Stage 8 candidate.
+        /// Decode reuses the 3-chunk merged Linear bundle from
+        /// `gemma4e2bStatefulLinear`; prefill is a separate set of three
+        /// T=288 single-function mlpackages under `prefill_T288/`. After
+        /// each prefill pass the engine memcpys kv_cache_sliding +
+        /// kv_cache_full from the prefill MLState into the decode
+        /// MLState (multifunction T>1 + dual MLState is rejected by
+        /// iPhone ANE 18 — single-function works). Bundle ships under
+        /// `gemma4_e2b_stateful_chunks/` so the engine layout matches
+        /// the existing stateful entries.
+        /// Sideload-only until iPhone 17 Pro Phase B validation closes.
+        public static let gemma4e2bStatefulMultimodal = ModelInfo(
+            id: "gemma4-e2b-stateful-multimodal",
+            name: "Gemma 4 E2B (stateful, multimodal)", size: "4.0 GB",
+            downloadURL: "",
+            folderName: "gemma4-e2b-stateful-multimodal")
+
+        /// Gemma 4 E4B (stateful Linear decode + T=288 prefill +
+        /// multimodal). Same engine class as the E2B variant; layer
+        /// counts come from `model_config.json` so the runtime is
+        /// dimension-agnostic. Sideload-only until iPhone validation.
+        public static let gemma4e4bStatefulMultimodal = ModelInfo(
+            id: "gemma4-e4b-stateful-multimodal",
+            name: "Gemma 4 E4B (stateful, multimodal)", size: "5.0 GB",
+            downloadURL: "",
+            folderName: "gemma4-e4b-stateful-multimodal")
+
         /// Visible in the UI picker. EAGLE-3 / LookAhead probe variants are
         /// hidden unless `LLM_SHOW_EXPERIMENTAL=1` is set (or the
         /// UserDefaults key `showExperimentalModels` is true). Keeps the
@@ -324,6 +352,8 @@ public final class ModelDownloader: NSObject {
                 list.insert(gemma4e2bStateful, at: 5)        // Conv2d variant
                 list.insert(gemma4e4bStateful, at: 6)        // E4B Stage 2 Conv2d
                 list.insert(gemma4e4bStatefulLinear, at: 7)  // E4B Stage 2 Linear
+                list.insert(gemma4e2bStatefulMultimodal, at: 8)  // Stage 8 E2B
+                list.insert(gemma4e4bStatefulMultimodal, at: 9)  // Stage 8 E4B
             }
             return list
         }
diff --git a/Sources/gemma4mm-smoke/main.swift b/Sources/gemma4mm-smoke/main.swift
new file mode 100644
index 0000000..260a558
--- /dev/null
+++ b/Sources/gemma4mm-smoke/main.swift
@@ -0,0 +1,92 @@
+// Mac smoke test for Gemma4StatefulMultimodalEngine.
+//
+// Usage:
+//   swift run -c release gemma4mm-smoke <bundle-dir> [prompt] [maxTokens]
+//
+// `bundle-dir` should be the directory containing chunk_{1,2,3}.mlmodelc
+// and a `prefill_T288/` subdir — i.e. the inner
+// `gemma4_e2b_stateful_chunks/` folder, NOT the outer parent.
+//
+// Produces text-only output (no image/audio attachment). Used to catch
+// engine bugs without needing an iPhone roundtrip — Mac ANE compiler is
+// more permissive than iPhone's, so chunk_2 is expected to load here
+// even when iPhone fails MIL→EIR translation.
+
+import CoreML
+import CoreMLLLM
+import Foundation
+import Tokenizers
+
+// Entry point is the call below the struct: main.swift is top-level code, where `@main` is rejected.
+struct Gemma4MMSmoke {
+    static func main() async {
+        let args = CommandLine.arguments
+        guard args.count >= 2 else {
+            fputs("usage: \(args[0]) <bundle-dir> [prompt] [maxTokens]\n", stderr)
+            exit(2)
+        }
+        let bundleDir = URL(fileURLWithPath: args[1])
+        let prompt = args.count >= 3
+            ? args[2]
+            : "Write three short sentences about the ocean."
+        let maxTokens = args.count >= 4 ? (Int(args[3]) ?? 64) : 64
+
+        do {
+            print("[smoke] bundle: \(bundleDir.path)")
+            let hfDir = bundleDir.appendingPathComponent("hf_model")
+            let tok = try await AutoTokenizer.from(modelFolder: hfDir)
+            print("[smoke] tokenizer loaded")
+
+            // Engine.
+            let engine = Gemma4StatefulMultimodalEngine()
+            let t0 = CFAbsoluteTimeGetCurrent()
+            try await engine.load(modelDirectory: bundleDir)
+            let loadDt = CFAbsoluteTimeGetCurrent() - t0
+            print(String(format: "[smoke] engine loaded in %.1fs", loadDt))
+            print("[smoke] hasVision=\(engine.hasVision) hasAudio=\(engine.hasAudio)")
+
+            // Build a Gemma 4 chat prompt (single user turn).
+            let promptStr = "<bos><|turn>user\n\(prompt)<turn|>\n<|turn>model\n"
+            let inputIds = tok.encode(text: promptStr).map { Int32($0) }
+            print("[smoke] input_ids = \(inputIds.count) tokens")
+            print("[smoke] prompt: \(prompt)")
+            print("[smoke] max_tokens=\(maxTokens)")
+
+            var eosSet: Set<Int32> = [1, 106]
+            if let eid = tok.eosTokenId { eosSet.insert(Int32(eid)) }
+            let skipSet: Set<Int32> = [1, 105, 106]
+
+            var accum: [Int] = []
+            var emittedString = ""
+            var totalEmitted = 0
+            let genStart = CFAbsoluteTimeGetCurrent()
+            _ = try await engine.generate(
+                inputIds: inputIds,
+                maxNewTokens: maxTokens,
+                eosTokenIds: eosSet,
+                onToken: { tokenId in
+                    if skipSet.contains(tokenId) { return }
+                    accum.append(Int(tokenId))
+                    let current = tok.decode(tokens: accum)  // re-decode all tokens, print only the new tail
+                    if current.count > emittedString.count {
+                        let delta = String(
+                            current.suffix(current.count - emittedString.count))
+                        FileHandle.standardOutput.write(Data(delta.utf8))
+                        emittedString = current
+                    }
+                    totalEmitted += 1
+                })
+            let dt = CFAbsoluteTimeGetCurrent() - genStart  // spans the whole generate() call
+            print("\n---")
+            print(String(format: "[smoke] decode wall = %.2fs", dt))
+            print(String(format: "[smoke] last decode tok/s = %.2f",
+                         engine.lastDecodeTokensPerSecond))
+            print("[smoke] tokens emitted (non-skipped): \(totalEmitted)")
+            exit(0)
+        } catch {
+            fputs("[smoke] error: \(error)\n", stderr)
+            exit(1)
+        }
+    }
+}
+await Gemma4MMSmoke.main()
diff --git a/conversion/build_gemma4_stateful_singlefunc_prefill.py b/conversion/build_gemma4_stateful_singlefunc_prefill.py
index 7da6681..c26580f 100644
--- a/conversion/build_gemma4_stateful_singlefunc_prefill.py
+++ b/conversion/build_gemma4_stateful_singlefunc_prefill.py
@@ -82,12 +82,16 @@
 from build_gemma4_e2b_stateful_chunks import (
     _resolve_hf_dir,
     convert_chunk1_prefill,
+    convert_chunk2_prefill,
     convert_chunk_shared_prefill,
 )
 from build_gemma4_e2b_stateful_3chunks import convert_chunk2_merged_prefill
 from models.gemma4 import Gemma4Model
 from models.gemma4_swa_chunks import compute_chunk_boundaries
-from models.gemma4_swa_stateful_chunks import SWAStatefulChunk4Prefill
+from models.gemma4_swa_stateful_chunks import (
+    SWAStatefulChunk3Prefill,
+    SWAStatefulChunk4Prefill,
+)
 
 
 def main():
@@ -108,9 +112,17 @@ def main():
     ap.add_argument("--linear-projections", action="store_true",
                     help="Plan 3 Linear projections (cml9 PR #2577) — "
                          "default on for Stage 3 / Stage 8 ship parity.")
-    ap.add_argument("--only", choices=("chunk1", "chunk2_3way", "chunk3"),
+    ap.add_argument("--only", choices=("chunk1", "chunk2_3way", "chunk3",
+                                        "chunk2_own", "chunk3_shared",
+                                        "chunk4_final"),
                     default=None,
-                    help="Build only one chunk (debug; default builds all 3).")
+                    help="Build only one chunk (debug). Default layout: chunk1/chunk2_3way/chunk3; with --four-chunk: chunk1/chunk2_own/chunk3_shared/chunk4_final.")
+    ap.add_argument("--four-chunk", action="store_true",
+                    help="Build 4-chunk variant: chunk_1, chunk_2 (own only), "
+                         "chunk_3 (KV-shared no lm_head), chunk_4 (KV-shared "
+                         "+ lm_head). Use when E4B chunk_2 merged graph is "
+                         "rejected by iPhone ANE 18 (std::bad_cast at "
+                         "MIL→EIR translation). Default off (3-chunk merged).")
     args = ap.parse_args()
 
     if args.output is None:
@@ -140,11 +152,19 @@ def main():
     shared_range = boundaries[2]
     c4_start, c4_end = boundaries[3]
 
-    paths = {
-        "chunk1": os.path.join(args.output, f"chunk_1_prefill_T{args.t}.mlpackage"),
-        "chunk2_3way": os.path.join(args.output, f"chunk_2_3way_prefill_T{args.t}.mlpackage"),
-        "chunk3": os.path.join(args.output, f"chunk_3_prefill_T{args.t}.mlpackage"),
-    }
+    if args.four_chunk:
+        paths = {
+            "chunk1": os.path.join(args.output, f"chunk_1_prefill_T{args.t}.mlpackage"),
+            "chunk2_own": os.path.join(args.output, f"chunk_2_prefill_T{args.t}.mlpackage"),
+            "chunk3_shared": os.path.join(args.output, f"chunk_3_prefill_T{args.t}.mlpackage"),
+            "chunk4_final": os.path.join(args.output, f"chunk_4_prefill_T{args.t}.mlpackage"),
+        }
+    else:
+        paths = {
+            "chunk1": os.path.join(args.output, f"chunk_1_prefill_T{args.t}.mlpackage"),
+            "chunk2_3way": os.path.join(args.output, f"chunk_2_3way_prefill_T{args.t}.mlpackage"),
+            "chunk3": os.path.join(args.output, f"chunk_3_prefill_T{args.t}.mlpackage"),
+        }
 
     t0 = time.time()
     if args.only in (None, "chunk1"):
@@ -156,28 +176,71 @@ def main():
             nbits=args.nbits,
             use_linear=args.linear_projections,
         )
-    if args.only in (None, "chunk2_3way"):
-        convert_chunk2_merged_prefill(
-            base=base, ctx=args.ctx, T=args.t,
-            out_path=paths["chunk2_3way"],
-            nbits=args.nbits,
-            use_linear=args.linear_projections,
-            own_range=own_range,
-            shared_range=shared_range,
-        )
-    if args.only in (None, "chunk3"):
-        convert_chunk_shared_prefill(
-            chunk_cls=SWAStatefulChunk4Prefill,
-            base=base,
-            c_start=c4_start, c_end=c4_end,
-            ctx=args.ctx, T=args.t,
-            out_path=paths["chunk3"],
-            nbits=args.nbits,
-            use_linear=args.linear_projections,
-        )
+    if args.four_chunk:
+        # 4-chunk path: chunk_2 = own only, chunk_3 = KV-shared (no lm_head),
+        # chunk_4 = KV-shared + lm_head. Splits the 3-chunk merged middle so
+        # each subgraph stays under iPhone ANE 18 compile budget.
+        own_start, own_end = own_range
+        shared_start, shared_end = shared_range
+        if args.only in (None, "chunk2_own"):
+            convert_chunk2_prefill(
+                base=base,
+                c_start=own_start, c_end=own_end,
+                ctx=args.ctx, T=args.t,
+                out_path=paths["chunk2_own"],
+                nbits=args.nbits,
+                use_linear=args.linear_projections,
+            )
+        if args.only in (None, "chunk3_shared"):
+            convert_chunk_shared_prefill(
+                chunk_cls=SWAStatefulChunk3Prefill,
+                base=base,
+                c_start=shared_start, c_end=shared_end,
+                ctx=args.ctx, T=args.t,
+                out_path=paths["chunk3_shared"],
+                nbits=args.nbits,
+                name="CHUNK 3 (KV-shared, no lm_head)",
+                with_lm_head=False,
+                use_linear=args.linear_projections,
+            )
+        if args.only in (None, "chunk4_final"):
+            convert_chunk_shared_prefill(
+                chunk_cls=SWAStatefulChunk4Prefill,
+                base=base,
+                c_start=c4_start, c_end=c4_end,
+                ctx=args.ctx, T=args.t,
+                out_path=paths["chunk4_final"],
+                nbits=args.nbits,
+                name="CHUNK 4 (KV-shared + lm_head)",
+                with_lm_head=True,
+                use_linear=args.linear_projections,
+            )
+    else:
+        if args.only in (None, "chunk2_3way"):
+            convert_chunk2_merged_prefill(
+                base=base, ctx=args.ctx, T=args.t,
+                out_path=paths["chunk2_3way"],
+                nbits=args.nbits,
+                use_linear=args.linear_projections,
+                own_range=own_range,
+                shared_range=shared_range,
+            )
+        if args.only in (None, "chunk3"):
+            convert_chunk_shared_prefill(
+                chunk_cls=SWAStatefulChunk4Prefill,
+                base=base,
+                c_start=c4_start, c_end=c4_end,
+                ctx=args.ctx, T=args.t,
+                out_path=paths["chunk3"],
+                nbits=args.nbits,
+                name="CHUNK 3 (final)",
+                with_lm_head=True,
+                use_linear=args.linear_projections,
+            )
     print(f"\n[build] DONE in {time.time()-t0:.0f}s")
     print("=" * 60)
-    print(f"3-chunk merged stateful single-function prefill (T={args.t}):")
+    layout = "4-chunk" if args.four_chunk else "3-chunk merged"
+    print(f"{layout} stateful single-function prefill (T={args.t}):")
     for label, path in paths.items():
         if not os.path.exists(path):
             continue
diff --git a/conversion/models/gemma4_swa_stateful_chunks.py b/conversion/models/gemma4_swa_stateful_chunks.py
index 3f42bd2..4e3dcf3 100644
--- a/conversion/models/gemma4_swa_stateful_chunks.py
+++ b/conversion/models/gemma4_swa_stateful_chunks.py
@@ -193,14 +193,17 @@ def _run_layer_swa_stateful(
             V_for_attn = V_sliding_slice
 
         # Producer alias outputs — same kv13/kv14 naming as the recurrent
-        # build so chunks 3/4 see no input-name change. These are slice
-        # views over the producer's just-updated state buffer.
+        # build so chunks 3/4 see no input-name change. .clone() forces
+        # a fresh tensor (rather than a slice-view over the just-updated
+        # state buffer): iPhone ANE 18 fails MIL→EIR translation with
+        # `std::bad_cast` when a state-slice is used as both an input
+        # to subsequent shared layers AND a public chunk output.
         if layer_idx == config.kv_sliding_producer:
-            kv_store_13_k = K_for_attn[..., :config.head_dim]
-            kv_store_13_v = V_for_attn[..., :config.head_dim]
+            kv_store_13_k = K_for_attn[..., :config.head_dim].clone()
+            kv_store_13_v = V_for_attn[..., :config.head_dim].clone()
         elif layer_idx == config.kv_full_producer:
-            kv_store_14_k = K_for_attn[..., :config.global_head_dim]
-            kv_store_14_v = V_for_attn[..., :config.global_head_dim]
+            kv_store_14_k = K_for_attn[..., :config.global_head_dim].clone()
+            kv_store_14_v = V_for_attn[..., :config.global_head_dim].clone()
     else:
         # Shared layer: read producer KV from the alias inputs.
         if is_full:
@@ -587,11 +590,11 @@ def _run_layer_swa_stateful_prefill(
             V_for_attn = V_sliding_slice
 
         if layer_idx == config.kv_sliding_producer:
-            kv_store_13_k = K_for_attn[..., :config.head_dim]
-            kv_store_13_v = V_for_attn[..., :config.head_dim]
+            kv_store_13_k = K_for_attn[..., :config.head_dim].clone()
+            kv_store_13_v = V_for_attn[..., :config.head_dim].clone()
         elif layer_idx == config.kv_full_producer:
-            kv_store_14_k = K_for_attn[..., :config.global_head_dim]
-            kv_store_14_v = V_for_attn[..., :config.global_head_dim]
+            kv_store_14_k = K_for_attn[..., :config.global_head_dim].clone()
+            kv_store_14_v = V_for_attn[..., :config.global_head_dim].clone()
     else:
         if is_full:
             K_for_attn = kv_store_14_k
@@ -1067,11 +1070,11 @@ def _run_layer_swa_stateful_single(
             V_for_attn = kv_cache_unified[2*oi+1:2*oi+2, :, :W, :hd]
 
         if layer_idx == config.kv_sliding_producer:
-            kv_store_13_k = K_for_attn[..., :config.head_dim]
-            kv_store_13_v = V_for_attn[..., :config.head_dim]
+            kv_store_13_k = K_for_attn[..., :config.head_dim].clone()
+            kv_store_13_v = V_for_attn[..., :config.head_dim].clone()
         elif layer_idx == config.kv_full_producer:
-            kv_store_14_k = K_for_attn[..., :config.global_head_dim]
-            kv_store_14_v = V_for_attn[..., :config.global_head_dim]
+            kv_store_14_k = K_for_attn[..., :config.global_head_dim].clone()
+            kv_store_14_v = V_for_attn[..., :config.global_head_dim].clone()
     else:
         if is_full:
             K_for_attn = kv_store_14_k
@@ -1182,11 +1185,11 @@ def _run_layer_swa_stateful_prefill_single(
             V_for_attn = kv_cache_unified[2*oi+1:2*oi+2, :, :W, :hd]
 
         if layer_idx == config.kv_sliding_producer:
-            kv_store_13_k = K_for_attn[..., :config.head_dim]
-            kv_store_13_v = V_for_attn[..., :config.head_dim]
+            kv_store_13_k = K_for_attn[..., :config.head_dim].clone()
+            kv_store_13_v = V_for_attn[..., :config.head_dim].clone()
         elif layer_idx == config.kv_full_producer:
-            kv_store_14_k = K_for_attn[..., :config.global_head_dim]
-            kv_store_14_v = V_for_attn[..., :config.global_head_dim]
+            kv_store_14_k = K_for_attn[..., :config.global_head_dim].clone()
+            kv_store_14_v = V_for_attn[..., :config.global_head_dim].clone()
     else:
         if is_full:
             K_for_attn = kv_store_14_k
diff --git a/scripts/assemble_gemma4_stateful_multimodal.sh b/scripts/assemble_gemma4_stateful_multimodal.sh
new file mode 100755
index 0000000..53e74ff
--- /dev/null
+++ b/scripts/assemble_gemma4_stateful_multimodal.sh
@@ -0,0 +1,211 @@
+#!/bin/bash
+# Assemble the Gemma 4 stateful + multimodal bundle for iPhone sideload.
+# Stage 8: 3-chunk merged Linear decode + T=288 single-function prefill +
+# vision / video / audio encoders. Drives Gemma4StatefulMultimodalEngine.
+#
+# Layout produced (matches LLMRunner detection — chunks + prefill_T288/
+# subdir under gemma4_e2b_stateful_chunks/):
+#
+#   build/gemma4_stateful_multimodal_e{2,4}b/
+#     gemma4_e2b_stateful_chunks/        # subdir name shared with E2B/E4B
+#       chunk_{1..3}.mlmodelc            (3-chunk merged decode; chunk_{1..4} when FOUR_CHUNK=1)
+#       prefill_T288/                    (FOUR_CHUNK=1: chunk_{1..4}_prefill_T288.mlmodelc instead)
+#         chunk_1_prefill_T288.mlmodelc
+#         chunk_2_3way_prefill_T288.mlmodelc
+#         chunk_3_prefill_T288.mlmodelc
+#       embed_tokens_q8.bin              (sidecars from legacy bundle)
+#       embed_tokens_scales.bin
+#       embed_tokens_per_layer_q8.bin
+#       embed_tokens_per_layer_scales.bin
+#       per_layer_projection.bin
+#       per_layer_norm_weight.bin
+#       cos_sliding.npy / sin_sliding.npy / cos_full.npy / sin_full.npy
+#       hf_model/                        (tokenizer files)
+#       model_config.json
+#       vision.mlmodelc                  (multimodal encoders, shared)
+#       vision_video.mlmodelc
+#       audio.mlmodelc
+#       mel_filterbank.bin               (audio sidecars)
+#       audio_config.json
+#       output_proj_weight.npy
+#       output_proj_bias.npy
+#       embed_proj_weight.npy
+#
+# Usage:
+#   MODEL=gemma4-e2b bash scripts/assemble_gemma4_stateful_multimodal.sh
+#   MODEL=gemma4-e4b bash scripts/assemble_gemma4_stateful_multimodal.sh
+#
+# Inputs (overridable via env):
+#   SRC_CHUNKS         /tmp/$MODEL-stateful-3chunk
+#   SRC_PREFILL_T288   /tmp/$MODEL-singlefunc-prefill-T288
+#   SIDECARS           legacy text-only bundle (embed/RoPE/tokenizer)
+#   ENCODERS           legacy multimodal bundle (vision/audio mlmodelc)
+#                       — vision/audio shared between E2B and E4B
+#
+# Push:
+#   DEVICE=<id-from-devicectl-list>
+#   xcrun devicectl device copy to --device $DEVICE \
+#     --domain-type appDataContainer \
+#     --domain-identifier com.example.CoreMLLLMChat \
+#     --source build/gemma4_stateful_multimodal_e4b \
+#     --destination Documents/Models/gemma4-e4b-stateful-multimodal
+#
+# Scheme: LLM_SHOW_EXPERIMENTAL=1 to reveal the picker entry.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+MODEL="${MODEL:-gemma4-e4b}"
+case "$MODEL" in
+    gemma4-e2b) SHORT=e2b ;;
+    gemma4-e4b) SHORT=e4b ;;
+    *) echo "[error] MODEL must be gemma4-e2b or gemma4-e4b" >&2; exit 1 ;;
+esac
+
+SRC_CHUNKS="${SRC_CHUNKS:-/tmp/$MODEL-stateful-3chunk}"
+SRC_PREFILL_T288="${SRC_PREFILL_T288:-/tmp/$MODEL-singlefunc-prefill-T288}"
+# Text-side sidecars (embed_tokens, RoPE, tokenizer, model_config). E2B
+# defaults to the iphone_8k staging dir; E4B defaults to its own legacy
+# bundle (text-only — vision/audio come from ENCODERS below).
+if [[ "$MODEL" == "gemma4-e2b" ]]; then
+    SIDECARS="${SIDECARS:-/Users/majimadaisuke/Downloads/CoreML-LLM/conversion/output/iphone_8k}"
+else
+    SIDECARS="${SIDECARS:-/Users/majimadaisuke/Downloads/CoreML-LLM/output/$MODEL/bundle}"
+fi
+# Multimodal encoders + audio sidecars. Shared between E2B and E4B (same
+# SigLIP + Conformer regardless of LM size).
+ENCODERS="${ENCODERS:-/Users/majimadaisuke/Downloads/CoreML-LLM/conversion/output/iphone_8k}"
+# mel_filterbank.bin lives in a separate dir in some build trees; the
+# script falls back to this path if it's not under ENCODERS.
+MEL_FALLBACK="${MEL_FALLBACK:-/Users/majimadaisuke/Downloads/CoreML-LLM/conversion/output/audio}"
+
+OUT_PARENT="${OUT_PARENT:-$ROOT/build/gemma4_stateful_multimodal_$SHORT}"
+OUT="$OUT_PARENT/gemma4_e2b_stateful_chunks"
+PREFILL_OUT="$OUT/prefill_T288"
+
+# Set FOUR_CHUNK=1 to assemble the 4-chunk decode + 4-chunk prefill_T288
+# variant (E4B fallback when 3-chunk merged trips iPhone ANE 18). Requires
+# `--four-chunk` builds: SRC_CHUNKS holds chunk_{1..4}.mlpackage and
+# SRC_PREFILL_T288 holds chunk_{1..4}_prefill_T288.mlpackage.
+FOUR_CHUNK="${FOUR_CHUNK:-0}"
+if [[ "$FOUR_CHUNK" == "1" ]]; then
+    DECODE_CHUNKS=(chunk_1 chunk_2 chunk_3 chunk_4)
+    PREFILL_CHUNKS=(chunk_1_prefill_T288 chunk_2_prefill_T288
+                    chunk_3_prefill_T288 chunk_4_prefill_T288)
+else
+    DECODE_CHUNKS=(chunk_1 chunk_2 chunk_3)
+    PREFILL_CHUNKS=(chunk_1_prefill_T288 chunk_2_3way_prefill_T288
+                    chunk_3_prefill_T288)
+fi
+
+# ---- Sanity: required inputs ----
+for d in "$SRC_CHUNKS" "$SRC_PREFILL_T288" "$SIDECARS" "$ENCODERS"; do
+    if [[ ! -d "$d" ]]; then
+        echo "[error] missing input dir: $d" >&2
+        exit 1
+    fi
+done
+for c in "${DECODE_CHUNKS[@]}"; do
+    if [[ ! -d "$SRC_CHUNKS/${c}.mlpackage" && ! -d "$SRC_CHUNKS/${c}.mlmodelc" ]]; then
+        echo "[error] $SRC_CHUNKS/${c}.{mlpackage,mlmodelc} missing — run build_gemma4_e2b_stateful_{,3}chunks.py first" >&2
+        exit 1
+    fi
+done
+for c in "${PREFILL_CHUNKS[@]}"; do
+    if [[ ! -d "$SRC_PREFILL_T288/${c}.mlpackage" && ! -d "$SRC_PREFILL_T288/${c}.mlmodelc" ]]; then
+        echo "[error] $SRC_PREFILL_T288/${c}.{mlpackage,mlmodelc} missing — run build_gemma4_stateful_singlefunc_prefill.py first" >&2
+        exit 1
+    fi
+done
+
+rm -rf "$OUT_PARENT"
+mkdir -p "$OUT" "$PREFILL_OUT"
+
+# ---- 1. Compile + place decode mlpackages ----
+for c in "${DECODE_CHUNKS[@]}"; do
+    echo "[compile decode] $c"
+    if [[ -d "$SRC_CHUNKS/${c}.mlmodelc" ]]; then
+        cp -R "$SRC_CHUNKS/${c}.mlmodelc" "$OUT/${c}.mlmodelc"
+    else
+        xcrun coremlcompiler compile \
+            "$SRC_CHUNKS/${c}.mlpackage" "$OUT/" 2>&1 | tail -2
+    fi
+done
+
+# ---- 2. Compile + place T=288 single-function prefill mlpackages ----
+for c in "${PREFILL_CHUNKS[@]}"; do
+    echo "[compile prefill_T288] $c"
+    if [[ -d "$SRC_PREFILL_T288/${c}.mlmodelc" ]]; then
+        cp -R "$SRC_PREFILL_T288/${c}.mlmodelc" "$PREFILL_OUT/${c}.mlmodelc"
+    else
+        xcrun coremlcompiler compile \
+            "$SRC_PREFILL_T288/${c}.mlpackage" "$PREFILL_OUT/" 2>&1 | tail -2
+    fi
+done
+
+# ---- 3. Copy text-side sidecars ----
+SIDE_ITEMS=(
+    "embed_tokens_q8.bin"
+    "embed_tokens_scales.bin"
+    "embed_tokens_per_layer_q8.bin"
+    "embed_tokens_per_layer_scales.bin"
+    "per_layer_projection.bin"
+    "per_layer_norm_weight.bin"
+    "cos_sliding.npy"
+    "sin_sliding.npy"
+    "cos_full.npy"
+    "sin_full.npy"
+    "hf_model"
+    "model_config.json"
+)
+for item in "${SIDE_ITEMS[@]}"; do
+    if [[ -e "$SIDECARS/$item" ]]; then
+        echo "[copy text sidecar] $item"
+        cp -R "$SIDECARS/$item" "$OUT/"
+    else
+        echo "  [warn] missing text sidecar $item"
+    fi
+done
+
+# ---- 4. Copy multimodal encoders + audio sidecars ----
+ENC_ITEMS=(
+    "vision.mlmodelc"
+    "vision_video.mlmodelc"
+    "audio.mlmodelc"
+    "mel_filterbank.bin"
+    "audio_config.json"
+    "output_proj_weight.npy"
+    "output_proj_bias.npy"
+    "embed_proj_weight.npy"
+)
+for item in "${ENC_ITEMS[@]}"; do
+    if [[ -e "$ENCODERS/$item" ]]; then
+        echo "[copy encoder] $item"
+        cp -R "$ENCODERS/$item" "$OUT/"
+    elif [[ "$item" == "mel_filterbank.bin" && -e "$MEL_FALLBACK/$item" ]]; then
+        echo "[copy encoder fallback] $item (from $MEL_FALLBACK)"
+        cp -R "$MEL_FALLBACK/$item" "$OUT/"
+    else
+        echo "  [warn] missing encoder $item (engine treats as optional)"
+    fi
+done
+
+echo ""
+echo "=== assembled ==="
+du -sh "$OUT_PARENT"
+echo ""
+echo "Top-level:"
+ls -la "$OUT/" | head -30
+echo ""
+echo "prefill_T288/:"
+ls -la "$PREFILL_OUT/"
+
+echo ""
+echo "Push to iPhone:"
+echo "  DEVICE=\$(xcrun devicectl list devices --quiet | awk 'NR==2{print \$3}')"
+echo "  xcrun devicectl device copy to --device \$DEVICE \\"
+echo "    --domain-type appDataContainer \\"
+echo "    --domain-identifier com.example.CoreMLLLMChat \\"
+echo "    --source $OUT_PARENT \\"
+echo "    --destination Documents/Models/$MODEL-stateful-multimodal"
+echo ""
+echo "Scheme: LLM_SHOW_EXPERIMENTAL=1 to reveal the picker entry."

From 5f5d71a59f8f694d5eeefb92db17509d407eb059 Mon Sep 17 00:00:00 2001
From: john-rocky <samuraibrothersmail@gmail.com>
Date: Sun, 3 May 2026 10:54:24 +0900
Subject: [PATCH 7/7] feat(picker): gemma4e4bMultimodal + LLM_VISION_FORCE_ANE
 default in shared scheme
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wires the HF-uploaded multimodal bundle into the in-app picker flow so
users can download `mlboydaisuke/gemma-4-E4B-multimodal-coreml` with one
tap (no sideload required).

ModelDownloader.swift:
- New `gemma4e4bMultimodal` ModelInfo entry (id `gemma4-e4b-multimodal`,
  size 7.6 GB, downloadURL points at the new HF repo). Shared
  `folderName: "gemma4-e4b"` with the legacy text-only entry mirrors
  the gemma4e2b3way / gemma4e2b pattern: chunks 1-4 are byte-identical
  in both repos, so users who switch between entries reuse the
  on-disk legacy chunks and only fetch the new files.
- `gemma4e4b` (text-only) renamed to "Gemma 4 E4B (text-only)" to
  disambiguate from the new multimodal entry in the picker.
- New `buildE4BMultimodalFileList()` enumerates 58 files matching the
  HF repo tree (decode chunks 1-4 + chunk2_3way + chunk3_3way +
  vision.ane.mlmodelc + audio.mlmodelc + audio sidecars + text
  sidecars). Entries are built by two local helpers: `legacyChunk`
  (no metadata.json) and `newerMlc` (with metadata.json) — the legacy
  chunks were built before the metadata.json convention.
- Defaults list inserts `gemma4e4bMultimodal` ahead of `gemma4e4b` so
  the picker presents multimodal as the primary E4B option.

CoreMLLLMChat.xcscheme:
- Add `LLM_VISION_FORCE_ANE=1` to the shared scheme. Safe to default —
  only affects models whose bundle ships a `vision.ane.mlmodelc` (the
  new E4B multimodal entry); other models silently fall through to
  their existing GPU `vision.mlmodelc`.
- Add `LLM_SHOW_EXPERIMENTAL=1`. Required to expose the experimental
  picker entries (already documented in `ModelDownloader.swift`'s
  `defaults`).
- `LLM_PROFILE_EVERY_STEP=1` is left out of the shared scheme (this
  patch only adds the two variables above); it is debug-only and
  belongs in a developer's local copy.
---
 .../xcschemes/CoreMLLLMChat.xcscheme          |  12 ++
 Sources/CoreMLLLM/ModelDownloader.swift       | 138 +++++++++++++++++-
 2 files changed, 148 insertions(+), 2 deletions(-)

diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat.xcodeproj/xcshareddata/xcschemes/CoreMLLLMChat.xcscheme b/Examples/CoreMLLLMChat/CoreMLLLMChat.xcodeproj/xcshareddata/xcschemes/CoreMLLLMChat.xcscheme
index ab73af6..7158d8a 100644
--- a/Examples/CoreMLLLMChat/CoreMLLLMChat.xcodeproj/xcshareddata/xcschemes/CoreMLLLMChat.xcscheme
+++ b/Examples/CoreMLLLMChat/CoreMLLLMChat.xcodeproj/xcshareddata/xcschemes/CoreMLLLMChat.xcscheme
@@ -50,6 +50,18 @@
             ReferencedContainer = "container:CoreMLLLMChat.xcodeproj">
          </BuildableReference>
       </BuildableProductRunnable>
+      <EnvironmentVariables>
+         <EnvironmentVariable
+            key = "LLM_SHOW_EXPERIMENTAL"
+            value = "1"
+            isEnabled = "YES">
+         </EnvironmentVariable>
+         <EnvironmentVariable
+            key = "LLM_VISION_FORCE_ANE"
+            value = "1"
+            isEnabled = "YES">
+         </EnvironmentVariable>
+      </EnvironmentVariables>
    </LaunchAction>
    <ProfileAction
       buildConfiguration = "Release"
diff --git a/Sources/CoreMLLLM/ModelDownloader.swift b/Sources/CoreMLLLM/ModelDownloader.swift
index f215012..5b76f78 100644
--- a/Sources/CoreMLLLM/ModelDownloader.swift
+++ b/Sources/CoreMLLLM/ModelDownloader.swift
@@ -165,10 +165,28 @@ public final class ModelDownloader: NSObject {
         /// + USB sideload to `Documents/Models/gemma4-e4b/` is also supported —
         /// the app treats the folder as "downloaded" once present.
         public static let gemma4e4b = ModelInfo(
-            id: "gemma4-e4b", name: "Gemma 4 E4B", size: "5.5 GB",
+            id: "gemma4-e4b", name: "Gemma 4 E4B (text-only)", size: "5.5 GB",
             downloadURL: "https://huggingface.co/mlboydaisuke/gemma-4-E4B-coreml/resolve/main",
             folderName: "gemma4-e4b")
 
+        /// Gemma 4 E4B — multimodal (text + image + video + audio).
+        /// Same legacy 4-chunk text decoder as `gemma4e4b` plus Topology II
+        /// `chunk2_3way` / `chunk3_3way` for 3-chunk decode (saves one ANE
+        /// dispatch per token), E4B-specific `vision.ane.mlmodelc`
+        /// (output `[1, 256, 2560]`), `audio.mlmodelc` + Swift two-stage
+        /// projection sidecars (`output_proj_*` 1024→1536, `embed_proj`
+        /// 1536→2560 — non-square, matches LM hidden). Validated 2026-05-03
+        /// on iPhone 17 Pro at 15.7 tok/s decode + correct outputs across
+        /// all four input modalities.
+        ///
+        /// Set `LLM_VISION_FORCE_ANE=1` (Xcode scheme env var) to route the
+        /// vision encoder through ANE.
+        public static let gemma4e4bMultimodal = ModelInfo(
+            id: "gemma4-e4b-multimodal",
+            name: "Gemma 4 E4B (multimodal)", size: "7.6 GB",
+            downloadURL: "https://huggingface.co/mlboydaisuke/gemma-4-E4B-multimodal-coreml/resolve/main",
+            folderName: "gemma4-e4b")
+
         /// Gemma 4 E2B Fashion — MB dress/casual theory vision advisor.
         /// Local PEFT LoRA (rank=16, alpha=32) fine-tune on 598 Unsplash/Pexels
         /// outfit photos labelled by Claude Vision, merged into the E2B base
@@ -341,7 +359,7 @@ public final class ModelDownloader: NSObject {
             //                         docs/SESSION_2026_04_27_STAGE6_MULTIMODAL.md).
             var list: [ModelInfo] = [
                 gemma4e2b3way, gemma4e2b, gemma4e2bStatefulLinear,
-                gemma4e4b, gemma4e2bFashion,
+                gemma4e4bMultimodal, gemma4e4b, gemma4e2bFashion,
                 qwen25_05b, qwen35_08b, qwen35_2b,
                 qwen3vl_2b, qwen3vl_2b_stateful,
                 lfm2_5_350m,
@@ -1087,6 +1105,10 @@ public final class ModelDownloader: NSObject {
             buildE4BFileList()
             return
         }
+        if model.id == "gemma4-e4b-multimodal" {
+            buildE4BMultimodalFileList()
+            return
+        }
         if model.id == "gemma4-e2b-stateful-linear" {
             buildGemma4StatefulLinearFileList()
             return
@@ -1639,6 +1661,118 @@ public final class ModelDownloader: NSObject {
         nextFileIndex = 0
     }
 
+    /// Populates the download queue for the `gemma4-e4b-multimodal` bundle.
+    ///
+    /// The list is a superset of the E4B legacy text-only bundle: legacy
+    /// `chunk1` through `chunk4`, the Topology II 3-chunk decode chunks
+    /// (`chunk2_3way`, `chunk3_3way`), the E4B-built vision encoder
+    /// (`vision.ane.mlmodelc`, output `[1, 256, 2560]`), the audio encoder
+    /// with its Swift projection sidecars (`output_proj_*.npy` 1024→1536,
+    /// `embed_proj_weight.npy` 1536→2560), and `mel_filterbank.bin`.
+    /// Sizes are from `mlboydaisuke/gemma-4-E4B-multimodal-coreml`.
+    ///
+    /// Files whose estimated size is at least 10_000_000 bytes are queued
+    /// first, largest first; the remaining files follow in list order.
+    /// `totalBytesForAllFiles` becomes the sum of the estimates, and
+    /// `completedBytes` / `nextFileIndex` are reset to 0.
+    private func buildE4BMultimodalFileList() {
+        // Every file uses the same relative path remotely and locally.
+        func entry(_ path: String, _ bytes: Int64) -> DownloadFile {
+            DownloadFile(remotePath: path, localPath: path, estimatedSize: bytes)
+        }
+
+        // Lists the member files of one compiled `.mlmodelc` directory in
+        // the order weights, coremldata, model.mil, [metadata], analytics.
+        // `metaSize` is nil for the legacy chunks, which don't ship
+        // metadata.json (only the newer 3-way and encoder chunks do).
+        func mlmodelc(_ name: String, weightSize: Int64, milSize: Int64,
+                      coreSize: Int64, metaSize: Int64?) -> [DownloadFile] {
+            var members = [
+                entry("\(name).mlmodelc/weights/weight.bin", weightSize),
+                entry("\(name).mlmodelc/coremldata.bin", coreSize),
+                entry("\(name).mlmodelc/model.mil", milSize),
+            ]
+            if let metaSize = metaSize {
+                members.append(entry("\(name).mlmodelc/metadata.json", metaSize))
+            }
+            members.append(entry("\(name).mlmodelc/analytics/coremldata.bin", 250))
+            return members
+        }
+
+        // Legacy chunks 1-4 (same byte content as gemma-4-E4B-coreml).
+        let legacyChunks: [(name: String, weights: Int64, mil: Int64)] = [
+            ("chunk1", 585_970_432, 1_288_448),
+            ("chunk2", 572_196_992, 1_277_032),
+            ("chunk3", 412_740_736, 597_340),
+            ("chunk4", 753_797_440, 608_413),
+        ]
+
+        // Newer chunks (3-way decode + encoders) include metadata.json.
+        let newerChunks: [(name: String, weights: Int64, mil: Int64, meta: Int64)] = [
+            ("chunk2_3way", 984_936_000, 917_977, 8_741),
+            ("chunk3_3way", 753_797_440, 303_969, 6_697),
+            ("vision.ane", 342_227_200, 709_941, 2_694),
+            ("audio", 146_087_488, 858_362, 2_342),
+        ]
+
+        // coremldata.bin is estimated at 1_500 bytes for the legacy chunks
+        // and at 1_000 bytes for the newer ones.
+        let legacyFiles = legacyChunks.flatMap {
+            mlmodelc($0.name, weightSize: $0.weights, milSize: $0.mil,
+                     coreSize: 1_500, metaSize: nil)
+        }
+        let newerFiles = newerChunks.flatMap {
+            mlmodelc($0.name, weightSize: $0.weights, milSize: $0.mil,
+                     coreSize: 1_000, metaSize: $0.meta)
+        }
+        let chunkFiles = legacyFiles + newerFiles
+
+        // Loose files outside the `.mlmodelc` directories, as (path, bytes).
+        let looseFiles: [(path: String, bytes: Int64)] = [
+            // Model config and the files under hf_model/.
+            ("model_config.json", 800),
+            ("hf_model/tokenizer.json", 32_169_626),
+            ("hf_model/tokenizer_config.json", 2_200),
+            ("hf_model/config.json", 5_200),
+            ("hf_model/generation_config.json", 300),
+            // embed_tokens* and per_layer_* binaries.
+            ("embed_tokens_q8.bin", 671_088_640),
+            ("embed_tokens_scales.bin", 524_288),
+            ("embed_tokens_per_layer_q8.bin", 2_818_572_288),
+            ("embed_tokens_per_layer_scales.bin", 524_288),
+            ("per_layer_projection.bin", 55_050_240),
+            ("per_layer_norm_weight.bin", 512),
+            // cos/sin tables, sliding and full variants.
+            ("cos_sliding.npy", 2_097_280),
+            ("sin_sliding.npy", 2_097_280),
+            ("cos_full.npy", 4_194_432),
+            ("sin_full.npy", 4_194_432),
+            // Audio sidecars (Swift two-stage projection).
+            ("audio_config.json", 400),
+            ("mel_filterbank.bin", 131_584),
+            ("output_proj_weight.npy", 3_145_856),
+            ("output_proj_bias.npy", 3_200),
+            ("embed_proj_weight.npy", 7_864_448),
+        ]
+        let extraFiles = looseFiles.map { entry($0.path, $0.bytes) }
+
+        // Large files go first, biggest first; small files keep list order.
+        // chunk4 and chunk3_3way have equal weight sizes, so their relative
+        // order is whatever the sort yields for ties.
+        let threshold: Int64 = 10_000_000
+        let allFiles = chunkFiles + extraFiles
+        let largeFiles = allFiles
+            .filter { $0.estimatedSize >= threshold }
+            .sorted { $0.estimatedSize > $1.estimatedSize }
+        let smallFiles = allFiles.filter { $0.estimatedSize < threshold }
+
+        // Install the queue and reset the progress bookkeeping.
+        pendingFiles = largeFiles + smallFiles
+        totalBytesForAllFiles = pendingFiles.map { $0.estimatedSize }.reduce(0, +)
+        completedBytes = 0
+        nextFileIndex = 0
+    }
+
     // MARK: - ZIP
 
     private func unzipFile(_ zipURL: URL, to destDir: URL) throws {