From 7468d213c9e4fac2cfe1eec82036104337493482 Mon Sep 17 00:00:00 2001 From: john-rocky Date: Tue, 28 Apr 2026 18:57:55 +0900 Subject: [PATCH 1/7] feat(gemma4): generalize stateful 3-chunk converter to E2B + E4B MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase A1-A3 of the E4B optimization stack. Brings the stage2-e4b 4-chunk foundation (Phase 1 stateful + Phase 2a cross-turn KV) onto current main and adds 3-chunk merged + multifunction prefill_bN support for E4B — the lever that gave E2B its 33.4 tok/s iPhone 17 Pro decode. Converter side - SWAStatefulMergedChunk23{,Prefill,Single,PrefillSingle} accept own_range / shared_range; defaults remain E2B (own=L8-14, shared= L15-24) for back-compat. E4B passes (12,24)/(24,33) derived from compute_chunk_boundaries(config) — kv13/kv14 names are kept as legacy aliases for the (sliding,full) producer slots. - build_gemma4_e2b_stateful_3chunks.py: drops the "E2B only" hardcoded help; --model gemma4-e4b now produces a 3-chunk merged bundle (chunk_1 L0-11 / chunk_2 L12-32 merged / chunk_3 L33-41 + lm_head). Chunk-2 layout printed dynamically. - sanity_stateful_chunks.py: from stage2-e4b — adds --model preset so /tmp/gemma4-{e2b,e4b}-stateful chunks share one verifier. Bundle side - scripts/assemble_gemma4_stateful_e4b.sh: from stage2-e4b — pulls chunk_*.mlmodelc + legacy E4B sidecars into the bundle layout Gemma4StatefulEngine expects (subdir gemma4_e2b_stateful_chunks/ is intentionally shared across E2B/E4B; engine reads hidden / layers / HKV from model_config.json). Runtime side (Swift) - ModelDownloader.swift: gemma4e4bStateful + gemma4e4bStatefulLinear ModelInfo entries (slots 6/7 under LLM_SHOW_EXPERIMENTAL=1). downloadURL is intentionally blank — A6 will fill in the new mlboydaisuke/gemma-4-E4B-stateful-coreml repo URL once iPhone 17 Pro A/B clears. Existing mlboydaisuke/gemma-4-E4B-coreml legacy repo is untouched, preserving the dual-repo pattern E2B uses. 
- LLMRunner.swift: stateful detection comment now lists all four folders that share the gemma4_e2b_stateful_chunks/ layout. Build artefacts (A4) and iPhone validation (A5) follow. --- .../CoreMLLLMChat/LLMRunner.swift | 16 +-- Sources/CoreMLLLM/ModelDownloader.swift | 27 +++++ .../build_gemma4_e2b_stateful_3chunks.py | 104 +++++++++++++----- .../models/gemma4_swa_stateful_chunks.py | 62 ++++++++--- conversion/sanity_stateful_chunks.py | 66 ++++++++--- scripts/assemble_gemma4_stateful_e4b.sh | 102 +++++++++++++++++ 6 files changed, 308 insertions(+), 69 deletions(-) create mode 100755 scripts/assemble_gemma4_stateful_e4b.sh diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift index ce75469..30f43bd 100644 --- a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift +++ b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift @@ -187,13 +187,15 @@ final class LLMRunner { return } - // Gemma 4 E2B STATEFUL detection: chunk_{1..4}.mlpackage/.mlmodelc - // + embed_tokens_q8.bin under gemma4_e2b_stateful_chunks/. Both - // the Conv2d wrapper variant (folder=gemma4-e2b-stateful) and the - // Linear variant (folder=gemma4-e2b-stateful-linear, Plan 3 A/B) - // share the same internal layout — Gemma4StatefulEngine handles - // both transparently because the only difference is the MIL graph - // inside each chunk_*.mlpackage. + // Gemma 4 STATEFUL detection: chunk_{1..4}.mlpackage/.mlmodelc + // + embed_tokens_q8.bin under gemma4_e2b_stateful_chunks/. The + // subdir name is shared across all four published variants — + // E2B: gemma4-e2b-stateful{,-linear} (Conv2d / Plan 3 Linear) + // E4B: gemma4-e4b-stateful{,-linear} (Stage 2 port) + // — because Gemma4StatefulEngine reads hidden_size / num_layers / + // num_kv_heads from model_config.json, so per-model differences + // (E2B 35 layers / HKV=1 vs E4B 42 layers / HKV=2) need no + // engine code change. 
// Require either: // - chunks 1-3 (3-chunk or 4-chunk bundle — chunk_4 optional) // - model.{mlpackage,mlmodelc} (1-chunk all-in-one) diff --git a/Sources/CoreMLLLM/ModelDownloader.swift b/Sources/CoreMLLLM/ModelDownloader.swift index 54c6fab..7ac9200 100644 --- a/Sources/CoreMLLLM/ModelDownloader.swift +++ b/Sources/CoreMLLLM/ModelDownloader.swift @@ -263,6 +263,31 @@ public final class ModelDownloader: NSObject { downloadURL: "https://huggingface.co/mlboydaisuke/gemma-4-E2B-stateful-coreml/resolve/main", folderName: "gemma4-e2b-stateful-linear") + /// Gemma 4 E4B stateful — Stage 2 port of the E2B Phase 1 + 2a + /// stateful path to the larger 4 B sibling. Built by + /// `conversion/build_gemma4_e2b_stateful_chunks.py --model gemma4-e4b` + /// (same script; chunk boundaries / hidden / HKV come from the HF + /// config). Shares the inner subdir name `gemma4_e2b_stateful_chunks` + /// with the E2B variants — Gemma4StatefulEngine reads + /// hidden_size / num_layers / per_layer_dim from `model_config.json`, + /// so E4B runs without engine code changes. Sideload-only to + /// `Documents/Models/gemma4-e4b-stateful/gemma4_e2b_stateful_chunks/`. + public static let gemma4e4bStateful = ModelInfo( + id: "gemma4-e4b-stateful", + name: "Gemma 4 E4B (stateful, MLState)", size: "5.6 GB", + downloadURL: "", + folderName: "gemma4-e4b-stateful") + + /// Gemma 4 E4B stateful — Linear projections variant (cml9 PR #2577 + /// `nn.Linear` form, ANE-equivalent placement). Same layout as + /// `gemma4e4bStateful`. Production HF download URL is filled in + /// once the iPhone 17 Pro A/B clears (Stage 2 closure step A6). + public static let gemma4e4bStatefulLinear = ModelInfo( + id: "gemma4-e4b-stateful-linear", + name: "Gemma 4 E4B (stateful, Linear projections)", size: "5.6 GB", + downloadURL: "", + folderName: "gemma4-e4b-stateful-linear") + /// Visible in the UI picker. 
EAGLE-3 / LookAhead probe variants are /// hidden unless `LLM_SHOW_EXPERIMENTAL=1` is set (or the /// UserDefaults key `showExperimentalModels` is true). Keeps the @@ -297,6 +322,8 @@ public final class ModelDownloader: NSObject { list.insert(gemma4e2bEagle3, at: 3) list.insert(gemma4e2bLookaheadProbe, at: 4) list.insert(gemma4e2bStateful, at: 5) // Conv2d variant + list.insert(gemma4e4bStateful, at: 6) // E4B Stage 2 Conv2d + list.insert(gemma4e4bStatefulLinear, at: 7) // E4B Stage 2 Linear } return list } diff --git a/conversion/build_gemma4_e2b_stateful_3chunks.py b/conversion/build_gemma4_e2b_stateful_3chunks.py index d2f31f0..14c55ef 100644 --- a/conversion/build_gemma4_e2b_stateful_3chunks.py +++ b/conversion/build_gemma4_e2b_stateful_3chunks.py @@ -1,25 +1,35 @@ #!/usr/bin/env python3 -"""Build Gemma 4 E2B stateful 3-chunk variant (merged middle). +"""Build Gemma 4 stateful 3-chunk variant (merged middle). Same as `build_gemma4_e2b_stateful_chunks.py` but emits 3 mlpackages -instead of 4 — the middle chunk merges the 4-chunk's chunk_2 (own KV -L8-14) and chunk_3 (KV-shared L15-24), keeping kv13/kv14 producer -aliases internal. Final chunk_3 = old chunk_4 (KV-shared L25-34 + -lm_head + argmax). - -Layout: - chunk_1.mlpackage (L0-7, own KV, computes PLE) — same as 4-chunk - chunk_2.mlpackage (L8-24, merged: own + shared inside) — NEW - chunk_3.mlpackage (L25-34 + lm_head + argmax) — = old chunk_4 +instead of 4 — the middle chunk merges the 4-chunk's chunk_2 (own KV) +and chunk_3 (KV-shared), keeping kv13/kv14 producer aliases internal. +Final chunk_3 = old chunk_4 (KV-shared tail + lm_head + argmax). 
+ +Layout (E2B / E4B, derived from `compute_chunk_boundaries(config)`): + E2B (35 layers): + chunk_1 L0-7 own KV, computes PLE — same as 4-chunk + chunk_2 L8-24 merged: own L8-14 + shared L15-24 + chunk_3 L25-34 + lm_head + argmax — = old chunk_4 + E4B (42 layers): + chunk_1 L0-11 own KV, computes PLE + chunk_2 L12-32 merged: own L12-23 + shared L24-32 + chunk_3 L33-41 + lm_head + argmax Multifunction `--prefill-batches "8"` adds a `prefill_b8` function to each chunk (sharing weights via coremltools save_multifunction). Usage: python conversion/build_gemma4_e2b_stateful_3chunks.py \ + --model gemma4-e2b \ --output /tmp/g4_3chunk/multi \ --hf-dir /path/to/gemma4-e2b/hf_model \ --ctx 2048 --linear-projections --prefill-batches "8" + + python conversion/build_gemma4_e2b_stateful_3chunks.py \ + --model gemma4-e4b \ + --output /tmp/g4_3chunk_e4b \ + --ctx 2048 --linear-projections --prefill-batches "8" """ from __future__ import annotations @@ -63,9 +73,13 @@ fp16 = np.float16 -def convert_chunk2_merged(base, ctx, out_path, nbits, *, use_linear=False): +def convert_chunk2_merged(base, ctx, out_path, nbits, *, use_linear=False, + own_range=None, shared_range=None): + own = own_range or (8, 15) + shared = shared_range or (15, 25) print("\n" + "=" * 60) - print(f"CHUNK 2 MERGED (L8-24) — own KV L8-14 + KV-shared L15-24") + print(f"CHUNK 2 MERGED (L{own[0]}-{shared[1]-1}) — " + f"own KV L{own[0]}-{own[1]-1} + KV-shared L{shared[0]}-{shared[1]-1}") print("=" * 60) cfg = base.config hidden = cfg.hidden_size @@ -77,7 +91,10 @@ def convert_chunk2_merged(base, ctx, out_path, nbits, *, use_linear=False): HKV = cfg.num_key_value_heads chunk = SWAStatefulMergedChunk23(base, ctx, - use_linear=use_linear).eval().to(MODEL_DTYPE) + use_linear=use_linear, + own_range=own_range, + shared_range=shared_range + ).eval().to(MODEL_DTYPE) ns, nf = max(chunk.num_sliding, 1), max(chunk.num_full, 1) sample = ( @@ -128,9 +145,12 @@ def convert_chunk2_merged(base, ctx, out_path, nbits, *, 
use_linear=False): def convert_chunk2_merged_prefill(base, ctx, T, out_path, nbits, *, - use_linear=False): + use_linear=False, + own_range=None, shared_range=None): + own = own_range or (8, 15) + shared = shared_range or (15, 25) print("\n" + "-" * 60) - print(f"CHUNK 2 MERGED PREFILL T={T} (L8-24)") + print(f"CHUNK 2 MERGED PREFILL T={T} (L{own[0]}-{shared[1]-1})") print("-" * 60) cfg = base.config hidden = cfg.hidden_size @@ -142,7 +162,8 @@ def convert_chunk2_merged_prefill(base, ctx, T, out_path, nbits, *, HKV = cfg.num_key_value_heads chunk = SWAStatefulMergedChunk23Prefill( - base, ctx, use_linear=use_linear, T=T).eval().to(MODEL_DTYPE) + base, ctx, use_linear=use_linear, T=T, + own_range=own_range, shared_range=shared_range).eval().to(MODEL_DTYPE) ns, nf = max(chunk.num_sliding, 1), max(chunk.num_full, 1) sample = ( @@ -305,7 +326,8 @@ def convert_chunk1_prefill_single(base, c_start, c_end, ctx, T, out_path, nbits, chunk, sample, inputs, outputs, states, out_path, nbits) -def convert_chunk2_merged_single(base, ctx, out_path, nbits, *, use_linear=False): +def convert_chunk2_merged_single(base, ctx, out_path, nbits, *, use_linear=False, + own_range=None, shared_range=None): print("\n" + "=" * 60) print(f"CHUNK 2 MERGED SINGLE-BUFFER (L8-24)") print("=" * 60) @@ -319,7 +341,10 @@ def convert_chunk2_merged_single(base, ctx, out_path, nbits, *, use_linear=False HKV = cfg.num_key_value_heads chunk = SWAStatefulMergedChunk23Single(base, ctx, - use_linear=use_linear).eval().to(MODEL_DTYPE) + use_linear=use_linear, + own_range=own_range, + shared_range=shared_range + ).eval().to(MODEL_DTYPE) no = max(chunk.num_own, 1) sample = ( torch.zeros(1, 1, hidden, dtype=torch.float16), @@ -364,7 +389,8 @@ def convert_chunk2_merged_single(base, ctx, out_path, nbits, *, use_linear=False def convert_chunk2_merged_prefill_single(base, ctx, T, out_path, nbits, *, - use_linear=False): + use_linear=False, + own_range=None, shared_range=None): print("\n" + "-" * 60) print(f"CHUNK 2 
MERGED SINGLE-BUFFER PREFILL T={T} (L8-24)") print("-" * 60) @@ -378,7 +404,8 @@ def convert_chunk2_merged_prefill_single(base, ctx, T, out_path, nbits, *, HKV = cfg.num_key_value_heads chunk = SWAStatefulMergedChunk23PrefillSingle( - base, ctx, use_linear=use_linear, T=T).eval().to(MODEL_DTYPE) + base, ctx, use_linear=use_linear, T=T, + own_range=own_range, shared_range=shared_range).eval().to(MODEL_DTYPE) no = max(chunk.num_own, 1) sample = ( torch.zeros(1, T, hidden, dtype=torch.float16), @@ -425,7 +452,11 @@ def convert_chunk2_merged_prefill_single(base, ctx, T, out_path, nbits, *, def main(): ap = argparse.ArgumentParser() ap.add_argument("--model", default="gemma4-e2b", - help="Model name (gemma4-e2b only for now)") + help="Model name (gemma4-e2b or gemma4-e4b). Chunk " + "boundaries are derived from the HF config via " + "compute_chunk_boundaries(config); kv13/kv14 names " + "are legacy aliases (sliding/full producer slots) " + "shared across both models.") ap.add_argument("--output", required=True) ap.add_argument("--hf-dir", default=None) ap.add_argument("--ctx", type=int, default=None) @@ -465,15 +496,22 @@ def main(): cfg = base.config boundaries = compute_chunk_boundaries(cfg) - # 3-chunk: re-use boundaries[0] (chunk_1 L0-7) and boundaries[3] - # (= old chunk_4 L25-34, which becomes new chunk_3). The merged - # middle (= boundaries[1] start, boundaries[2] end = L8-25) is - # baked into SWAStatefulMergedChunk23. + # 3-chunk: re-use boundaries[0] (chunk_1) and boundaries[3] (= old + # chunk_4, which becomes new chunk_3). The merged middle spans + # boundaries[1] (own KV) → boundaries[2] (KV-shared), passed into + # SWAStatefulMergedChunk23 via own_range/shared_range so the same + # builder works for E2B (own=L8-14, shared=L15-24) and E4B + # (own=L12-23, shared=L24-32). 
chunk1_range = boundaries[0] + own_range = boundaries[1] + shared_range = boundaries[2] chunk3_range = boundaries[3] # final chunk = old chunk_4 print(f"\nctx={args.ctx} W={cfg.sliding_window} hidden={cfg.hidden_size}") print(f"3-chunk layout: c1=L{chunk1_range[0]}-{chunk1_range[1]-1}, " - f"c2_merged=L8-24, c3=L{chunk3_range[0]}-{chunk3_range[1]-1}") + f"c2_merged=L{own_range[0]}-{shared_range[1]-1} " + f"(own L{own_range[0]}-{own_range[1]-1} + " + f"shared L{shared_range[0]}-{shared_range[1]-1}), " + f"c3=L{chunk3_range[0]}-{chunk3_range[1]-1}") print(f"Quantize: int{args.nbits}" if args.nbits else "Quantize: fp16") if args.linear_projections: print(f"Projections: nn.Linear") @@ -542,20 +580,26 @@ def _build_one(decode_fn, prefill_fn, final_name): _build_one( lambda p: convert_chunk2_merged_single(base, args.ctx, p, args.nbits, - use_linear=use_linear), + use_linear=use_linear, + own_range=own_range, + shared_range=shared_range), lambda T, p: convert_chunk2_merged_prefill_single( base, args.ctx, T, p, args.nbits, - use_linear=use_linear), + use_linear=use_linear, + own_range=own_range, shared_range=shared_range), "chunk_2", ) else: _build_one( lambda p: convert_chunk2_merged(base, args.ctx, p, args.nbits, - use_linear=use_linear), + use_linear=use_linear, + own_range=own_range, + shared_range=shared_range), lambda T, p: convert_chunk2_merged_prefill( base, args.ctx, T, p, args.nbits, - use_linear=use_linear), + use_linear=use_linear, + own_range=own_range, shared_range=shared_range), "chunk_2", ) if do(3): diff --git a/conversion/models/gemma4_swa_stateful_chunks.py b/conversion/models/gemma4_swa_stateful_chunks.py index 0747feb..3f42bd2 100644 --- a/conversion/models/gemma4_swa_stateful_chunks.py +++ b/conversion/models/gemma4_swa_stateful_chunks.py @@ -838,17 +838,27 @@ def forward(self, hidden_states, causal_mask_full, causal_mask_sliding, class SWAStatefulMergedChunk23(_StatefulChunkBase): - """Merged stateful chunk for L8-24. 
Owns KV state for L8-14, runs - L15-24 KV-shared internally. Eliminates the 4-chunk's chunk_2 → - chunk_3 hidden-state round-trip (~+5-10% Mac decode). + """Merged stateful chunk that owns the lower-half KV span and runs + the upper-half KV-shared internally. Eliminates the 4-chunk's + chunk_2 → chunk_3 hidden-state round-trip (~+5-10% Mac decode). + + Boundaries default to E2B (own=L8-14, shared=L15-24). For E4B pass + own_range / shared_range derived from compute_chunk_boundaries(cfg) + (E4B: own=L12-23, shared=L24-32). """ - START_OWN, END_OWN = 8, 15 # own-KV layers (= old chunk_2) - START_SHARED, END_SHARED = 15, 25 # KV-shared layers (= old chunk_3) + DEFAULT_OWN = (8, 15) # E2B own-KV layers (= old chunk_2) + DEFAULT_SHARED = (15, 25) # E2B KV-shared layers (= old chunk_3) def __init__(self, model: Gemma4Model, ctx: int = 2048, - use_linear: bool = False): + use_linear: bool = False, + own_range: tuple[int, int] | None = None, + shared_range: tuple[int, int] | None = None): + own = own_range if own_range is not None else self.DEFAULT_OWN + shared = shared_range if shared_range is not None else self.DEFAULT_SHARED + self.START_OWN, self.END_OWN = own + self.START_SHARED, self.END_SHARED = shared # Init base with the OWN-KV span so the kv_cache_* buffers size - # to L8-14 only. KV-shared layers don't need state slots. + # to chunk_2 only. KV-shared layers don't need state slots. 
super().__init__(model, self.START_OWN, self.END_OWN, ctx) self.layers_shared = nn.ModuleList([ model.layers[i] for i in range(self.START_SHARED, self.END_SHARED) @@ -905,8 +915,11 @@ def forward(self, hidden_states, causal_mask_full, causal_mask_sliding, class SWAStatefulMergedChunk23Prefill(SWAStatefulMergedChunk23): """T=N prefill variant of the merged middle chunk.""" - def __init__(self, model, ctx=2048, use_linear=False, T: int = 8): - super().__init__(model, ctx, use_linear=use_linear) + def __init__(self, model, ctx=2048, use_linear=False, T: int = 8, + own_range: tuple[int, int] | None = None, + shared_range: tuple[int, int] | None = None): + super().__init__(model, ctx, use_linear=use_linear, + own_range=own_range, shared_range=shared_range) self.T = T def forward(self, hidden_states, causal_mask_full, causal_mask_sliding, @@ -1351,13 +1364,23 @@ def forward(self, hidden_states, causal_mask_full, causal_mask_sliding, class SWAStatefulMergedChunk23Single(_StatefulSingleChunkBase): - """3-chunk merged middle (L8-24) with unified state buffer. - Owns L8-14 KV; runs L15-24 KV-shared internally. Emits kv13/kv14 - aliases for the final chunk_3.""" - START_OWN, END_OWN = 8, 15 - START_SHARED, END_SHARED = 15, 25 - - def __init__(self, model, ctx=2048, use_linear=False): + """3-chunk merged middle with unified state buffer. + Owns chunk_2 KV; runs chunk_3 KV-shared internally. Emits kv13/kv14 + aliases for the final chunk_3. + + Boundaries default to E2B (own=L8-14, shared=L15-24). 
For E4B pass + own_range / shared_range from compute_chunk_boundaries(cfg) + (E4B: own=L12-23, shared=L24-32).""" + DEFAULT_OWN = (8, 15) + DEFAULT_SHARED = (15, 25) + + def __init__(self, model, ctx=2048, use_linear=False, + own_range: tuple[int, int] | None = None, + shared_range: tuple[int, int] | None = None): + own = own_range if own_range is not None else self.DEFAULT_OWN + shared = shared_range if shared_range is not None else self.DEFAULT_SHARED + self.START_OWN, self.END_OWN = own + self.START_SHARED, self.END_SHARED = shared super().__init__(model, self.START_OWN, self.END_OWN, ctx) self.layers_shared = nn.ModuleList([ model.layers[i] for i in range(self.START_SHARED, self.END_SHARED) @@ -1406,8 +1429,11 @@ def forward(self, hidden_states, causal_mask_full, causal_mask_sliding, class SWAStatefulMergedChunk23PrefillSingle(SWAStatefulMergedChunk23Single): """T=N prefill variant of merged middle with unified state.""" - def __init__(self, model, ctx=2048, use_linear=False, T: int = 8): - super().__init__(model, ctx, use_linear=use_linear) + def __init__(self, model, ctx=2048, use_linear=False, T: int = 8, + own_range: tuple[int, int] | None = None, + shared_range: tuple[int, int] | None = None): + super().__init__(model, ctx, use_linear=use_linear, + own_range=own_range, shared_range=shared_range) self.T = T def forward(self, hidden_states, causal_mask_full, causal_mask_sliding, diff --git a/conversion/sanity_stateful_chunks.py b/conversion/sanity_stateful_chunks.py index b9d1f91..c80b65f 100644 --- a/conversion/sanity_stateful_chunks.py +++ b/conversion/sanity_stateful_chunks.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Mac sanity check for /tmp/gemma4-e2b-stateful/ chunk_{1..4}.mlpackage. +"""Mac sanity check for Gemma 4 stateful chunk_{1..4}.mlpackage bundles. Verifies: 1. Each chunk loads on Mac CPU_AND_NE without error. @@ -15,9 +15,15 @@ This is NOT a numerical correctness test — inputs are zeros / synthetic RoPE — only a wiring sanity check. 
Real perf/correctness will be on iPhone after the Swift Generator is wired up. + +Usage: + python conversion/sanity_stateful_chunks.py # E2B default + python conversion/sanity_stateful_chunks.py --model gemma4-e4b + python conversion/sanity_stateful_chunks.py --artifacts /tmp/foo """ from __future__ import annotations +import argparse import os import sys import time @@ -26,19 +32,26 @@ import numpy as np import coremltools as ct -CTX = 512 -W = 512 -HIDDEN = 1536 -PLD = 256 -NLAYERS = 35 -HKV = 1 -HD_S = 256 -HD_F = 512 -VOCAB = 262_144 -ARTIFACTS = Path("/tmp/gemma4-e2b-stateful") - -# --- E2B chunk topology (must match build_gemma4_e2b_stateful_chunks.py) --- -CHUNK_BOUNDARIES = [(0, 8), (8, 15), (15, 25), (25, 35)] +# --- Per-model presets (must match build_gemma4_e2b_stateful_chunks.py) --- +PRESETS = { + "gemma4-e2b": dict( + ctx=512, w=512, hidden=1536, pld=256, nlayers=35, hkv=1, + hd_s=256, hd_f=512, vocab=262_144, + artifacts="/tmp/gemma4-e2b-stateful", + boundaries=[(0, 8), (8, 15), (15, 25), (25, 35)], + ), + "gemma4-e4b": dict( + ctx=2048, w=512, hidden=2560, pld=256, nlayers=42, hkv=2, + hd_s=256, hd_f=512, vocab=262_144, + artifacts="/tmp/gemma4-e4b-stateful", + boundaries=[(0, 12), (12, 24), (24, 33), (33, 42)], + ), +} + +# Defaults overwritten in main() once we read --model / --artifacts. 
+CTX = W = HIDDEN = PLD = NLAYERS = HKV = HD_S = HD_F = VOCAB = 0 +ARTIFACTS: Path = Path("/tmp/gemma4-e2b-stateful") +CHUNK_BOUNDARIES: list = [] def _make_mask_full(pos: int) -> np.ndarray: @@ -134,7 +147,32 @@ def shared_chunk_inputs(seed: int, hidden_in: np.ndarray, per_layer_combined: np } +def _apply_preset(name: str, artifacts_override: str | None) -> None: + """Populate the module-level constants other functions read.""" + if name not in PRESETS: + sys.exit(f"unknown preset {name!r}; choose from {list(PRESETS)}") + p = PRESETS[name] + g = globals() + g["CTX"], g["W"] = p["ctx"], p["w"] + g["HIDDEN"], g["PLD"] = p["hidden"], p["pld"] + g["NLAYERS"], g["HKV"] = p["nlayers"], p["hkv"] + g["HD_S"], g["HD_F"], g["VOCAB"] = p["hd_s"], p["hd_f"], p["vocab"] + g["ARTIFACTS"] = Path(artifacts_override or p["artifacts"]) + g["CHUNK_BOUNDARIES"] = p["boundaries"] + print(f"[preset] {name} artifacts={ARTIFACTS}") + print(f" ctx={CTX} W={W} hidden={HIDDEN} layers={NLAYERS} HKV={HKV}") + + def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--model", default="gemma4-e2b", + choices=list(PRESETS), + help="Which Gemma 4 stateful preset to sanity-test") + ap.add_argument("--artifacts", default=None, + help="Override artifacts dir (defaults to preset's path)") + args = ap.parse_args() + _apply_preset(args.model, args.artifacts) + if not ARTIFACTS.is_dir(): sys.exit(f"missing: {ARTIFACTS}") diff --git a/scripts/assemble_gemma4_stateful_e4b.sh b/scripts/assemble_gemma4_stateful_e4b.sh new file mode 100755 index 0000000..698f893 --- /dev/null +++ b/scripts/assemble_gemma4_stateful_e4b.sh @@ -0,0 +1,102 @@ +#!/bin/bash +# Assemble the Gemma 4 E4B stateful bundle for iPhone sideload. +# Stage 2 sibling of assemble_gemma4_stateful_bundle.sh (which builds +# the E2B variant). 
Layout matches what Gemma4StatefulEngine expects: +# +# build/gemma4_stateful_e4b/ +# gemma4_e2b_stateful_chunks/ # subdir name shared with E2B +# chunk_{1..4}.mlmodelc (from /tmp/gemma4-e4b-stateful) +# embed_tokens_q8.bin (E4B sidecars from output/) +# embed_tokens_scales.bin +# embed_tokens_per_layer_q8.bin +# embed_tokens_per_layer_scales.bin +# per_layer_projection.bin (parity, not used by Engine) +# per_layer_norm_weight.bin +# cos_sliding.npy / sin_sliding.npy +# cos_full.npy / sin_full.npy +# hf_model/ (tokenizer files) +# model_config.json (E4B: hidden=2560, layers=42, HKV=2) +# +# Push: +# xcrun devicectl device copy to --device \ +# --domain-type appDataContainer \ +# --domain-identifier com.example.CoreMLLLMChat \ +# --source build/gemma4_stateful_e4b \ +# --destination Documents/Models/gemma4-e4b-stateful +# +# Scheme: LLM_SHOW_EXPERIMENTAL=1 to reveal the picker entry. +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +SRC_CHUNKS="${SRC_CHUNKS:-/tmp/gemma4-e4b-stateful}" +# E4B sidecars: the existing legacy 4-chunk E4B bundle in the sibling +# CoreML-LLM workspace already ships every sidecar we need (same names +# the E2B staging-2k-fast-prefill dir uses). Override via env if you +# moved the bundle. +SIDECARS="${SIDECARS:-/Users/majimadaisuke/Downloads/workspace/CoreML-LLM/output/gemma4-e4b/bundle}" +OUT_PARENT="${OUT_PARENT:-$ROOT/build/gemma4_stateful_e4b}" +OUT="$OUT_PARENT/gemma4_e2b_stateful_chunks" + +for d in "$SRC_CHUNKS" "$SIDECARS"; do + if [[ ! -d "$d" ]]; then + echo "[error] missing $d" >&2 + exit 1 + fi +done +for c in chunk_1 chunk_2 chunk_3 chunk_4; do + if [[ ! -d "$SRC_CHUNKS/${c}.mlpackage" && ! -d "$SRC_CHUNKS/${c}.mlmodelc" ]]; then + echo "[error] $SRC_CHUNKS/${c}.{mlpackage,mlmodelc} missing — run build_gemma4_e2b_stateful_chunks.py --model gemma4-e4b first" >&2 + exit 1 + fi +done + +rm -rf "$OUT_PARENT" +mkdir -p "$OUT" + +# 1. 
Compile chunks .mlpackage → .mlmodelc into the bundle dir +for c in chunk_1 chunk_2 chunk_3 chunk_4; do + echo "[compile] $c" + if [[ -d "$SRC_CHUNKS/${c}.mlmodelc" ]]; then + cp -R "$SRC_CHUNKS/${c}.mlmodelc" "$OUT/${c}.mlmodelc" + else + xcrun coremlcompiler compile \ + "$SRC_CHUNKS/${c}.mlpackage" "$OUT/" 2>&1 | tail -2 + fi +done + +# 2. Copy sidecars from the E4B legacy bundle +SIDE_ITEMS=( + "embed_tokens_q8.bin" + "embed_tokens_scales.bin" + "embed_tokens_per_layer_q8.bin" + "embed_tokens_per_layer_scales.bin" + "per_layer_projection.bin" + "per_layer_norm_weight.bin" + "cos_sliding.npy" + "sin_sliding.npy" + "cos_full.npy" + "sin_full.npy" + "hf_model" + "model_config.json" +) +for item in "${SIDE_ITEMS[@]}"; do + if [[ -e "$SIDECARS/$item" ]]; then + echo "[copy] $item" + cp -R "$SIDECARS/$item" "$OUT/" + else + echo " [warn] missing $item" + fi +done + +echo "" +echo "=== assembled ===" +du -sh "$OUT_PARENT" +ls -la "$OUT/" | head -25 + +echo "" +echo "Push to iPhone:" +echo " DEVICE=A6F3E849-1947-5202-9AD1-9C881CA58EEF" +echo " xcrun devicectl device copy to --device \$DEVICE \\" +echo " --domain-type appDataContainer \\" +echo " --domain-identifier com.example.CoreMLLLMChat \\" +echo " --source $OUT_PARENT --destination Documents/Models/gemma4-e4b-stateful" From e110752e5fff7f09aa795e8c44d237a633594df3 Mon Sep 17 00:00:00 2001 From: john-rocky Date: Tue, 28 Apr 2026 18:59:55 +0900 Subject: [PATCH 2/7] feat(gemma4): make singlefunc prefill builder E4B-aware Stage 8 builder (PR #149) already used `compute_chunk_boundaries` for chunk_1 / chunk_3 windows but called `convert_chunk2_merged_prefill` without `own_range` / `shared_range`, so on E4B the merged middle chunk silently used E2B's L8-14 / L15-24 layer ranges instead of L12-23 / L24-32. After A3 made the converter parametric, plumb the ranges through and refresh the docstring + the stale "we don't ship E4B stateful yet" comment. 
--- ...uild_gemma4_stateful_singlefunc_prefill.py | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/conversion/build_gemma4_stateful_singlefunc_prefill.py b/conversion/build_gemma4_stateful_singlefunc_prefill.py index 4fdac3b..7da6681 100644 --- a/conversion/build_gemma4_stateful_singlefunc_prefill.py +++ b/conversion/build_gemma4_stateful_singlefunc_prefill.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 -"""Build Gemma 4 E2B stateful prefill chunks as **single-function** -mlpackages (no multifunction merge). +"""Build Gemma 4 stateful prefill chunks as **single-function** +mlpackages (no multifunction merge). Supports E2B and E4B; chunk +boundaries come from `compute_chunk_boundaries(config)`. Stage 8 / Stage 6.5 opt-in builder. Companion to `docs/HANDOFF_STAGE8_MLSTATE_MULTIMODAL.md` and the probe results in @@ -26,11 +27,14 @@ Layout produced (3-chunk merged variant, T=288 default): - chunk_1_prefill_T288.mlpackage (L0-7, own KV) - chunk_2_3way_prefill_T288.mlpackage (L8-24 merged, own + shared) - chunk_3_prefill_T288.mlpackage (L25-34 + lm_head + argmax, - structurally same as 4-chunk - chunk_4_prefill) + E2B (35 layers): + chunk_1_prefill_T288.mlpackage (L0-7, own KV) + chunk_2_3way_prefill_T288.mlpackage (L8-24 merged, own + shared) + chunk_3_prefill_T288.mlpackage (L25-34 + lm_head + argmax) + E4B (42 layers): + chunk_1_prefill_T288.mlpackage (L0-11, own KV) + chunk_2_3way_prefill_T288.mlpackage (L12-32 merged, own + shared) + chunk_3_prefill_T288.mlpackage (L33-41 + lm_head + argmax) Usage: python conversion/build_gemma4_stateful_singlefunc_prefill.py \\ @@ -125,12 +129,16 @@ def main(): base = Gemma4Model.from_pretrained(hf_dir, context_length=args.ctx) base.eval() - # E2B layer split: chunk_1 = L0-7 (own KV), chunk_2_3way = L8-24 - # (merged), chunk_3 = L25-34 (+head). E4B has different boundaries - # but we don't ship E4B stateful yet. 
+ # Chunk layout (config-derived via compute_chunk_boundaries): + # E2B: c1=L0-7, own=L8-14, shared=L15-24, c4=L25-34 + # E4B: c1=L0-11, own=L12-23, shared=L24-32, c4=L33-41 + # The merged prefill needs own_range + shared_range so it picks + # the right layer-index window for the kv13/kv14 producer aliases. boundaries = compute_chunk_boundaries(base.config) - c1_start, c1_end = boundaries[0] # E2B (0, 8) - c4_start, c4_end = boundaries[3] # E2B (25, 35) + c1_start, c1_end = boundaries[0] + own_range = boundaries[1] + shared_range = boundaries[2] + c4_start, c4_end = boundaries[3] paths = { "chunk1": os.path.join(args.output, f"chunk_1_prefill_T{args.t}.mlpackage"), @@ -154,6 +162,8 @@ def main(): out_path=paths["chunk2_3way"], nbits=args.nbits, use_linear=args.linear_projections, + own_range=own_range, + shared_range=shared_range, ) if args.only in (None, "chunk3"): convert_chunk_shared_prefill( From 000a292b79493c9680b8f0f8554561c353f1a977 Mon Sep 17 00:00:00 2001 From: john-rocky Date: Tue, 28 Apr 2026 19:50:41 +0900 Subject: [PATCH 3/7] docs(stage8): Phase B implementation plan for E4B + E2B multimodal stateful MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures the design intel + chosen architecture (Option A: separate Gemma4StatefulMultimodalEngine class) so the next session can pick up without re-deriving. 
Records: - Phase A scope (already shipped on this branch as 4665ab2 + 2655c17) - Phase B engine class layout (storage, public API, helper port list) - State bridge code path (probe-2-verified nested withMultiArray closures + memcpy) - Generate flow for image+text prompts (T=288 prefill → bridge → decode) - Bundle layout for new HF repos gemma-4-{E2B,E4B}-stateful-multimodal-coreml - Open questions (picker naming, default-swap timing, cross-turn KV with re-encoded image features) - Build commands for the Mac compile run --- docs/STAGE8_DESIGN.md | 324 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 324 insertions(+) create mode 100644 docs/STAGE8_DESIGN.md diff --git a/docs/STAGE8_DESIGN.md b/docs/STAGE8_DESIGN.md new file mode 100644 index 0000000..de52b3e --- /dev/null +++ b/docs/STAGE8_DESIGN.md @@ -0,0 +1,324 @@ +# Stage 8: E4B + E2B Multimodal Stateful Engine — Implementation Plan + +**Status:** Phase A code complete (text-only E4B parity with E2B). Phase B (multimodal wiring) deferred to a follow-up session. + +**Branch:** `feat/e4b-optimize-multimodal` +**Date:** 2026-04-28 +**Predecessor docs:** `docs/MLSTATE_MULTIMODAL_PROBE.md`, `docs/HANDOFF_STAGE8_MLSTATE_MULTIMODAL.md`, `docs/SESSION_2026_04_27_STAGE6_MULTIMODAL.md` + +--- + +## Goal + +Ship Gemma 4 E4B and E2B with the stateful Linear decode path **and** vision + video + audio multimodal input on iPhone 17 Pro. Reach the fastest decode tok/s the architecture allows while keeping multimodal correctness intact. + +**Scope split chosen on 2026-04-28:** +- **Option A** — separate Swift class `Gemma4StatefulMultimodalEngine`, leaving the legacy `Gemma4StatefulEngine` (text-only multifunction prefill_b8 path) untouched. +- **E2B + E4B both get multimodal stateful** — same engine class drives both via `model_config.json`-derived dimensions. +- **Existing HF repos preserved** — new `mlboydaisuke/gemma-4-{E2B,E4B}-stateful-multimodal-coreml` repos rather than mutating the existing stateful repos. 
Mirrors the dual-repo pattern. + +--- + +## What's already shipped (Phase A — code only, builds + iPhone gates pending) + +Two commits on `feat/e4b-optimize-multimodal`: + +1. **`4665ab2`** — generalize 3-chunk + 4-chunk converters to E2B + E4B + - `SWAStatefulMergedChunk23{,Prefill,Single,PrefillSingle}` accept `own_range` / `shared_range`. Defaults E2B (own=L8-14, shared=L15-24); E4B passes (12,24)/(24,33). + - `build_gemma4_e2b_stateful_3chunks.py` --model gemma4-e4b now produces a 3-chunk merged bundle (chunk_1 L0-11 / chunk_2 L12-32 / chunk_3 L33-41). + - `sanity_stateful_chunks.py` model presets (--model gemma4-e2b / gemma4-e4b). + - `scripts/assemble_gemma4_stateful_e4b.sh` bundle assembler for iPhone sideload. + - `Sources/CoreMLLLM/ModelDownloader.swift` — `gemma4e4bStateful` + `gemma4e4bStatefulLinear` ModelInfo entries (slots 6/7 under `LLM_SHOW_EXPERIMENTAL=1`, sideload-only — `downloadURL: ""`). + - `Examples/.../LLMRunner.swift` — stateful detection comment now lists all four E2B+E4B folders. + +2. **`2655c17`** — single-function T=288 prefill builder accepts E4B + - `build_gemma4_stateful_singlefunc_prefill.py` plumbs `own_range` / `shared_range` through `convert_chunk2_merged_prefill`. Without this, on E4B the merged middle prefill chunk silently used E2B layer ranges. + +**Pending Phase A work (hardware-blocked):** +- A4: Mac build (3-chunk decode + multifunction prefill_b8) — kicked off in background after this doc lands. +- A4': Mac build (T=288 single-function prefill) for E2B + E4B — same session. +- A5: iPhone 17 Pro A/B for E4B 3-chunk merged stateful Linear — needs device. +- A6: HF upload `mlboydaisuke/gemma-4-E4B-stateful-coreml` once iPhone clears. + +--- + +## Phase B — Stage 8 multimodal stateful engine + +### B1. 
T=288 single-function prefill mlpackages (DONE script-side, build pending) + +E2B and E4B variants of: +- `chunk_1_prefill_T288.mlpackage` (own KV) +- `chunk_2_3way_prefill_T288.mlpackage` (merged: own + shared internal) +- `chunk_3_prefill_T288.mlpackage` (KV-shared + lm_head + argmax) + +T=288 = 256-token image span + ~32 text margin (BOS / turn markers). Drop to T=224 if 8 GB iPhone non-Pro rejects T=288 compile peak (probe required, see C1). + +**Single-function** (separate mlpackage per T) instead of multifunction merge — iPhone ANE 18 rejects multifunction T>1 + dual MLState with `ANECCompile FAILED 11`. Probe 2 verified single-function T=288 compiles in 7.3 s on iPhone 17 Pro A19 Pro. + +### B2. New Swift class `Gemma4StatefulMultimodalEngine` + +**Location:** `Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift` (new file). + +**Why a separate class (not inheritance / extension):** +- `Gemma4StatefulEngine` is `public final class` — not extensible. +- The two engines have different prefill path topology (multifunction merged vs separate single-func mlpackages), different state lifecycle (decode-only state vs decode+prefill state with bridge), and different public API shape (`generate(prompt:)` vs `generate(prompt:images:audio:)`). +- Keeps the Stage 3 stateful Linear 33.4 tok/s text-only path bit-identical for users who don't want multimodal. + +**Storage:** + +```swift +@available(iOS 18.0, macOS 15.0, *) +public final class Gemma4StatefulMultimodalEngine { + // Decode chunks (3-chunk merged stateful Linear) + private var decodeChunk1: MLModel? // L0-7 / L0-11 (E4B) — own KV + private var decodeChunk2: MLModel? // L8-24 / L12-32 — merged own+shared + private var decodeChunk3: MLModel? // L25-34 / L33-41 + lm_head + argmax + + // Prefill T=288 chunks (separate mlpackages, single-function) + private let prefillT: Int = 288 + private var prefillChunk1: MLModel? + private var prefillChunk2: MLModel? + private var prefillChunk3: MLModel? 
// identical structure to decodeChunk3 + + // Per-chunk MLState (chunk_3 is stateless — KV-shared from chunk_2) + private var decodeState1: MLState? + private var decodeState2: MLState? + private var prefillState1: MLState? + private var prefillState2: MLState? + + // Multimodal encoders (lazy) + private var visionModel: MLModel? // SigLIP still-image, 256 tokens + private var videoVisionModel: MLModel? // pooled SigLIP, 64 tokens/frame + private var audioModel: MLModel? // Conformer, ~50 tokens/2sec + private var audioProjection: ProjectionWeights? + private var melFilterbank: Data? + + // Sidecars (same as legacy engine) + private var embedTokens: EmbeddingLookup? + private var embedTokensPerLayer: EmbeddingLookup? + private var cosSlidingTable: Data? // mmap + private var sinSlidingTable: Data? + private var cosFullTable: Data? + private var sinFullTable: Data? + + // Cross-turn state (Phase 2a — LCP match) + private var persistedInputIds: [Int32] = [] + private var persistedPosition: Int = 0 + + // Reusable scratch (T=1 decode + T=288 prefill) + private var maskFullDecode, maskSlidingDecode: MLMultiArray! + private var maskFullPrefill288, maskSlidingPrefill288: MLMultiArray! + // ... batch hidden, per-layer raw, RoPE batched, etc. +} +``` + +**Public API:** + +```swift +public init(config: Config = Config()) +public func resetPersistedState() +public func load(modelDirectory: URL) async throws + +public func generate( + prompt: String, + images: [CGImage] = [], + audioPCM16k: [Float]? = nil, + maxNewTokens: Int = 512, + eosTokenIds: Set<Int32> = [], + onToken: ((Int32) -> Void)? = nil +) async throws -> [Int32] +``` + +Video is a series of CGImages produced by `VideoProcessor.extractFrames` — exposed at the LLMRunner layer, not the engine. + +### B3. Multimodal helpers — port from Stage 6 (`origin/stage6-multimodal-stateful` commit `02ac583`) + +The Stage 6 patch added 528 lines to the legacy engine. 
We port these into the new class with one structural change: feature splice happens **during prefill at T=288** instead of during multifunction prefill_b8. + +**Helpers to port (file:line references for `02ac583`):** + +| Helper | Purpose | Adaptation needed | +|---|---|---| +| `loadMultimodalEncoders` | Probe + load vision/video/audio mlmodelc + sidecars | Same layout, new file paths | +| `processImage(_: CGImage)` | CGImage → 256-token feature MLMultiArray | None — same encoder | +| `processVideoFrame` | Per-frame still vision encoding (64 tokens) | None | +| `processAudio(_: [Float])` | PCM 16k → mel → Conformer → projection | None | +| `computeVisionGroupIds` | Per-token group label (which image each token belongs to) | T=8 → T=288 generalization | +| `fillBatchMasksVisionAware` | Bidirectional within-image, causal across | T=8 → T=288 generalization | +| `multimodalSpliceT1` | Per-token feature splice at IMAGE/AUDIO_TOKEN_ID position | Reused for tail of prompt that doesn't fit T=288 | + +**Special token IDs (preserved from Stage 6):** +- `IMAGE_TOKEN_ID = 258880` +- `AUDIO_TOKEN_ID = 258881` +- `VIDEO_TOKEN_ID = 258884` + +### B4. State bridge (probe 2 verified) + +After prefill completes, copy `kv_cache_sliding` and `kv_cache_full` from prefill MLState to decode MLState. Critical requirement: **nested closures** — the buffer pointer is only valid within `withMultiArray(for:)` scope. + +```swift +private func bridgeKVState(from src: MLState, to dst: MLState) { + let names = ["kv_cache_sliding", "kv_cache_full"] + for name in names { + src.withMultiArray(for: name) { srcArr in + dst.withMultiArray(for: name) { dstArr in + let bytes = srcArr.count * MemoryLayout<UInt16>.stride // fp16 (2 bytes per element) + memcpy(dstArr.dataPointer, srcArr.dataPointer, bytes) + } + } + } +} +``` + +Called twice per generate(): once for chunk_1 state (sliding-only on E2B / sliding+full on E4B), once for chunk_2 state (sliding+full both). 
+ +Pitfall: chunk_3 is **stateless** in the 3-chunk variant (KV-shared from chunk_2 outputs kv13/kv14). No state to bridge for chunk_3. + +### B5. Generate flow (single-image text+image example) + +``` +Input: prompt = "What's in this picture?", images = [oneImage] + +1. Build inputIds: + [BOS] [IMAGE_TOKEN × 256] "What's in this picture?" [EOT] + ≈ 1 + 256 + 8 + 1 = 266 tokens — fits in T=288. + +2. Preprocess image: + features = visionModel(processImage(oneImage)) // (1, 256, hiddenSize) + +3. Build prefill input: + - embed_lookup(inputIds) → hidden (1, 266, hidden) + - splice features[0..<256] into hidden[1..<257] + - zero per_layer_raw at image positions + - vision-aware mask: bidirectional within hidden[1..<257], + causal elsewhere + +4. Run prefill T=288 (pad inputIds to 288 with mask = -inf): + - prefillChunk1(hidden, masks, rope, pos=0..287, ringPos=0) + → updates prefillState1 (kv_cache_sliding[0..287]) + - prefillChunk2(prefill1.hidden, ..., pos=0..287, ringPos=0) + → updates prefillState2; outputs kv13_k/v + kv14_k/v at last layer + - prefillChunk3(prefill2.hidden, kv13_*, kv14_*, ...) + → outputs token_id (last decode token) + +5. Bridge state: + bridgeKVState(prefillState1 → decodeState1) + bridgeKVState(prefillState2 → decodeState2) + +6. Decode loop (T=1, position=266, 267, ...): + - decodeChunk1(emb(token), masks, rope, pos, ringPos) + state: decodeState1 + - decodeChunk2(...) state: decodeState2 + - decodeChunk3(..., kv13, kv14) → next token + - emit, append to output, repeat until EOS or maxTokens +``` + +For prompts longer than T=288: **split into multiple T=288 prefill passes** (no overlap; each pass writes consecutive ring positions). Image span must NOT split across passes — push image to first pass and chunk text after. + +### B6. 
ModelDownloader bundle layout + +Mirror E2B's existing `gemma-4-E2B-stateful-coreml` layout but add a `prefill_T288/` subdir: + +``` +mlboydaisuke/gemma-4-{E2B,E4B}-stateful-multimodal-coreml/ + gemma4_e2b_stateful_chunks/ # subdir kept for engine compat + chunk_1.mlmodelc # decode multifunction merged + chunk_2.mlmodelc + chunk_3.mlmodelc + prefill_T288/ + chunk_1_prefill_T288.mlmodelc + chunk_2_3way_prefill_T288.mlmodelc + chunk_3_prefill_T288.mlmodelc + embed_tokens_q8.bin # sidecars + embed_tokens_scales.bin + embed_tokens_per_layer_q8.bin + embed_tokens_per_layer_scales.bin + per_layer_projection.bin + per_layer_norm_weight.bin + cos_sliding.npy / sin_sliding.npy + cos_full.npy / sin_full.npy + hf_model/ # tokenizer + model_config.json + vision.mlmodelc # multimodal encoders + vision_video.mlmodelc + audio.mlmodelc + output_proj_weight.npy # audio projection sidecars + output_proj_bias.npy + embed_proj_weight.npy +``` + +Total bundle size: +- Decode chunks: ~1.15 GB (E2B) / ~1.6 GB (E4B) +- T=288 prefill chunks: ~1.50 GB (E2B) / ~2.0 GB (E4B) +- Encoders: ~0.99 GB (shared between models) +- Sidecars + tokenizer: ~0.4 GB +- **Total: ~4.0 GB (E2B) / ~5.0 GB (E4B)** download. + +`ModelDownloader.buildGemma4StatefulMultimodalE{2,4}BFileList()` enumerates all files. Mirror the existing E2B helpers' pattern. + +### B7-B8. iPhone tests + parity + +- B7: Real-device test — image+text, video+text, audio+text → correct output. +- B8: Parity test — fixed image prompt through legacy 4-chunk prefill+decode vs new T=288 stateful prefill+bridge+decode. First 32 decode tokens must agree (top-1). + +### C1. 8 GB iPhone non-Pro probe + +Probe 1 only validated 12 GB iPhone 17 Pro at T=288. iPhone 15 / 16 / 17 non-Pro have 8 GB RAM — chunk_2 prefill at T=288 may fail compile peak (chunk_2 is the largest at 21 layers for E4B). If 8 GB fails, fall back to T=224 (the 256-token image span then overshoots a single pass by ~32 tokens, i.e. the text margin shrinks to ~−32 — considered acceptable since prompt-tail fallback to T=1 already exists; TODO confirm this is compatible with the B5 rule that an image span must not split across passes). 
+ +--- + +## Open questions for next session + +1. **Picker entry naming.** "Gemma 4 E4B (multimodal stateful)" or "Gemma 4 E4B (stateful, vision+audio)"? UI clarity vs concision. +2. **Default model swap.** When B is shipped, should `gemma4e2b3way` (current production multimodal) be deprecated in favor of `gemma4e2bStatefulMultimodal` (faster decode + multimodal)? Memory note says current E2B 3-chunk is the multimodal default; swapping requires a soft-deprecation cycle for users mid-download. +3. **Cross-turn KV with vision.** Phase 2a LCP match assumes prefix invariance. If turn 1 has image and turn 2 reuses the same image, the image features may have been re-encoded — does the LCP match still hold? Stage 6 had this concern unresolved. + +--- + +## Build commands (Phase A — kick off after this doc lands) + +```bash +# Build 1: E4B 3-chunk merged decode + multifunction prefill_b8 +HF_DIR=/Users/majimadaisuke/Downloads/CoreML-LLM/output/gemma4-e4b/hf_model +python conversion/build_gemma4_e2b_stateful_3chunks.py \ + --model gemma4-e4b \ + --hf-dir "$HF_DIR" \ + --output /tmp/gemma4-e4b-stateful-3chunk \ + --linear-projections \ + --prefill-batches "8" \ + --ctx 2048 \ + --nbits 4 + +# Build 2: E4B T=288 single-function prefill (Stage 8) +python conversion/build_gemma4_stateful_singlefunc_prefill.py \ + --model gemma4-e4b \ + --hf-dir "$HF_DIR" \ + --output /tmp/gemma4-e4b-singlefunc-prefill-T288 \ + --t 288 \ + --linear-projections \ + --ctx 2048 \ + --nbits 4 + +# Sanity (chunk shape + chained 1→2→3 forward, CPU_AND_NE): +python conversion/sanity_stateful_chunks.py \ + --model gemma4-e4b \ + --artifacts /tmp/gemma4-e4b-stateful-3chunk + +# Bundle assemble for sideload (assumes legacy E4B sidecars in +# CoreML-LLM/output/gemma4-e4b/bundle): +SIDECARS=/Users/majimadaisuke/Downloads/CoreML-LLM/output/gemma4-e4b/bundle \ +SRC_CHUNKS=/tmp/gemma4-e4b-stateful-3chunk \ +bash scripts/assemble_gemma4_stateful_e4b.sh +``` + +Both builds load the 15 GB E4B safetensors; 
estimated 60-120 min total. Run sequentially to avoid memory contention. + +--- + +## Reference + +- `docs/MLSTATE_MULTIMODAL_PROBE.md` — probe 1 (T=288 chunk_1 compiles) + probe 2 (state bridge memcpy works). +- `docs/HANDOFF_STAGE8_MLSTATE_MULTIMODAL.md` — original Stage 8 handoff with 5-step plan. +- `docs/SESSION_2026_04_27_STAGE6_MULTIMODAL.md` — Stage 6 multimodal in legacy engine (in-place patch; ours is fresh class). +- Stage 6 commits: `origin/stage6-multimodal-stateful` (`02ac583`, `2432995`, `987ad86`) — port these helpers verbatim into new class. +- `Sources/CoreMLLLM/Gemma4StatefulEngine.swift` — legacy engine, reference for the patterns we duplicate (mask filling, RoPE lookup, EmbeddingLookup wiring, position scratch, etc.). From 526a73d3edbb2a0faf1c80a6c6a81772d6d033b4 Mon Sep 17 00:00:00 2001 From: john-rocky Date: Tue, 28 Apr 2026 21:22:17 +0900 Subject: [PATCH 4/7] docs: macOS 26 coremltools 9.0 wheel workaround PyPI wheel ships .so files referencing @rpath/lib*.dylib that aren't included; on macOS 26 (Darwin 25 / Tahoe) this silently produces an empty pybind11 module so every conversion script crashes at "BlobWriter not loaded". Captures the fresh-venv + source-build steps that get a working /tmp/ct_build_venv to unblock builds until upstream ships fixed wheels. --- docs/MACOS_26_BUILD_ENV.md | 109 +++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 docs/MACOS_26_BUILD_ENV.md diff --git a/docs/MACOS_26_BUILD_ENV.md b/docs/MACOS_26_BUILD_ENV.md new file mode 100644 index 0000000..ca53583 --- /dev/null +++ b/docs/MACOS_26_BUILD_ENV.md @@ -0,0 +1,109 @@ +# macOS 26 (Tahoe / Darwin 25) — coremltools 9.0 build workaround + +The PyPI wheel `coremltools-9.0-*-macosx_11_0_arm64.whl` packages the +C++ extensions as `.so` files but the install_name baked at link time +references `@rpath/libmilstoragepython.dylib` etc. — the matching +`.dylib` files are **NOT** included. 
On macOS 26, this triggers an +import that loads the module silently without any C++ classes +registered, so `coremltools.libmilstoragepython._BlobStorageWriter` +ends up undefined and Apple Conversion stalls at: + +``` +RuntimeError: BlobWriter not loaded +``` + +(Symptom: every conversion script in `conversion/` fails after model +load and trace, before saving the mlpackage.) + +This was working on Apr 26, 2026; the symptom appeared after upgrading +to macOS 26. Reproduces in every venv (Python 3.10 / 3.11 / 3.12 / +3.14, coremltools 8.3.0 / 9.0). + +## Fix: build coremltools from source into a fresh venv + +```bash +# 1. Toolchain +brew install protobuf # protoc 34.x +xcode-select -p # confirm /Applications/Xcode.app/... +which cmake # confirm /opt/homebrew/bin/cmake + +# 2. Fresh venv (Python 3.10 — the most stable target for coremltools 9.0) +~/.pyenv/versions/3.10.13/bin/python3 -m venv /tmp/ct_build_venv +/tmp/ct_build_venv/bin/pip install --upgrade pip wheel setuptools +/tmp/ct_build_venv/bin/pip install pybind11 numpy + +# 3. Source build +cd /tmp +git clone --depth 1 https://github.com/apple/coremltools.git coremltools-src +cd coremltools-src +mkdir -p build && cd build + +xcrun --sdk macosx cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_OSX_DEPLOYMENT_TARGET=12.3 \ + -DPYTHON_EXECUTABLE:FILEPATH=/tmp/ct_build_venv/bin/python \ + -DPYTHON_INCLUDE_DIR=/Users/$USER/.pyenv/versions/3.10.13/include/python3.10 \ + -DPYTHON_LIBRARY=/Users/$USER/.pyenv/versions/3.10.13/lib/libpython3.10.dylib \ + -DOVERWRITE_PB_SOURCE=0 \ + /tmp/coremltools-src + +make -j$(sysctl -n hw.ncpu) +cmake --build . --target dist # produces build/dist/coremltools-*.whl + +# 4. 
Install the freshly built wheel + copy the dylibs alongside the .so files +/tmp/ct_build_venv/bin/pip install build/dist/coremltools-*.whl +cp build/lib*.dylib \ + /tmp/ct_build_venv/lib/python3.10/site-packages/coremltools/ +install_name_tool -add_rpath @loader_path \ + /tmp/ct_build_venv/lib/python3.10/site-packages/coremltools/libmilstoragepython.so +install_name_tool -add_rpath @loader_path \ + /tmp/ct_build_venv/lib/python3.10/site-packages/coremltools/libcoremlpython.so +install_name_tool -add_rpath @loader_path \ + /tmp/ct_build_venv/lib/python3.10/site-packages/coremltools/libmodelpackage.so + +# 5. Install conversion deps +/tmp/ct_build_venv/bin/pip install --no-cache-dir \ + torch transformers safetensors huggingface-hub scikit-learn + +# 6. Verify +/tmp/ct_build_venv/bin/python -c " +from coremltools.converters.mil.backend.mil.load import BlobWriter +import coremltools as ct +print('ct', ct.__version__, '— BlobWriter:', BlobWriter) +" +# Expected: ct 9.0 — BlobWriter: +``` + +## Use the venv for conversion runs + +```bash +PY=/tmp/ct_build_venv/bin/python +$PY conversion/build_gemma4_e2b_stateful_3chunks.py \ + --model gemma4-e4b \ + --hf-dir /path/to/gemma4-e4b/hf_model \ + --output /tmp/gemma4-e4b-stateful-3chunk \ + --linear-projections \ + --prefill-batches "8" \ + --ctx 2048 --nbits 4 +``` + +`/tmp/ct_build_venv` is the pinned env for all `conversion/` scripts on +this machine until coremltools 9.1 (or newer) ships a wheel that bundles +the dylibs alongside the .so files for macOS 26. + +## Why the symptom is silent + +The Python extension `.so` exports `_PyInit_libmilstoragepython` and +loads cleanly under `dlopen`. PyInit registers the pybind11 module and +attaches `_BlobStorageWriter` / `_BlobStorageReader` only if the +matching `libmilstoragepython.dylib` is found and its C++ symbols +resolve. 
When the dylib is missing, pybind11 silently skips class +registration; the module loads with `dir(m) == ['__doc__', '__file__', +'__loader__', '__name__', '__package__', '__spec__']` — no error, no +warning, just an empty module. + +Confirm with: +```bash +$PY -c "import coremltools.libmilstoragepython as m; print(dir(m))" +``` +A working install also lists `_BlobStorageReader` and `_BlobStorageWriter`. From a540e395796666205ebf8656e9259b8820b3bb46 Mon Sep 17 00:00:00 2001 From: john-rocky Date: Sun, 3 May 2026 10:39:32 +0900 Subject: [PATCH 5/7] feat(gemma4-e4b): multimodal CoreML bundle (text+image+video+audio on iPhone) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Working configuration for iPhone 17 Pro at 15.7 tok/s decode + correct output across all four input modalities. Validated 2026-05-03 on a clean sandbox push of the assembled bundle. Topology: decode = Topology II (chunk1 legacy + chunk2_3way + chunk3_3way merged 21-layer middle + final lm_head). Auto-detected by ChunkedEngine via chunk2_3way/chunk3_3way presence. prefill = legacy chunks 1/2/3/4 prefill_b8 multifunction. Vision-aware bidirectional mask within image span via the engine's existing fillBatchMasksVisionAware (works at T=8 batches). vision = vision.ane.mlmodelc (E4B, output [1, 256, 2560]). audio = audio.mlmodelc (E4B, output [1, 50, 1024]) + Swift two-stage projection 1024 -> 1536 -> 2560. Changes: - Sources/CoreMLLLM/AudioProcessor.swift: ProjectionWeights now derives inDim/outDim/finalDim from weight tensor sizes (was hard-coded for E2B's square 1536x1536 embed_proj). E4B's embed_proj is non-square (2560, 1536); the embed_proj sgemm now uses finalDim for the output dimension. Direct cause of the audio gibberish on E4B. - conversion/models/gemma4_swa_merged.py: MergedChunk23 (the non-stateful merged chunk2+chunk3 used by Topology II) now accepts own_range / shared_range; defaults stay at E2B (L8-14 / L15-24). 
Mirrors the stateful generalisation from 4665ab2. - conversion/build_gemma4_3way.py: thread compute_chunk_boundaries(cfg) through to MergedChunk23 so `--model gemma4-e4b` produces a 21-layer chunk2_3way (L12-23 own + L24-32 shared) instead of the E2B-hardcoded 17-layer span. - scripts/assemble_gemma4_e4b_multimodal.sh: reproducible bundle assembly script (compiles mlpackage->mlmodelc, copies sidecars + legacy chunks + E4B encoders). - docs/E4B_MULTIMODAL_BUILD.md: build + sideload guide, including the rejected paths (prefill_chunk* multifunction, stateful) and the iPhone clean-sandbox requirement (devicectl never deletes orphans). Out of scope (in this commit): - Stateful Stage 8 engine — separate commit, Mac-only / iPhone-blocked. - prefill_chunk{1..4}.mlmodelc multifunction path — built and tested but produces broken output on iPhone with E4B (Mac OK); not shipped. - vision_video.mlmodelc — engine falls back to 2x2 pool of vision encoder; quality validated. --- Sources/CoreMLLLM/AudioProcessor.swift | 43 ++++-- conversion/build_gemma4_3way.py | 8 +- conversion/models/gemma4_swa_merged.py | 25 +-- docs/E4B_MULTIMODAL_BUILD.md | 138 +++++++++++++++++ scripts/assemble_gemma4_e4b_multimodal.sh | 176 ++++++++++++++++++++++ 5 files changed, 363 insertions(+), 27 deletions(-) create mode 100644 docs/E4B_MULTIMODAL_BUILD.md create mode 100755 scripts/assemble_gemma4_e4b_multimodal.sh diff --git a/Sources/CoreMLLLM/AudioProcessor.swift b/Sources/CoreMLLLM/AudioProcessor.swift index 08b8d51..d88bc98 100644 --- a/Sources/CoreMLLLM/AudioProcessor.swift +++ b/Sources/CoreMLLLM/AudioProcessor.swift @@ -23,21 +23,32 @@ public enum AudioProcessor { // MARK: - Projection weights (loaded from .npy files) /// Loaded projection weights for Swift-side computation. + /// Two-stage projection: 1024 → outDim (output_proj) → finalDim (embed_proj). + /// E2B: outDim=1536, finalDim=1536 (square embed_proj). 
+ /// E4B: outDim=1536, finalDim=2560 (non-square embed_proj — projects up to LM hidden). public struct ProjectionWeights { - let outputProjWeight: [Float] // (1536, 1024) row-major - let outputProjBias: [Float] // (1536,) - let embedProjWeight: [Float] // (1536, 1536) row-major + let outputProjWeight: [Float] // (outDim, 1024) row-major + let outputProjBias: [Float] // (outDim,) + let embedProjWeight: [Float] // (finalDim, outDim) row-major let inDim: Int // 1024 - let outDim: Int // 1536 + let outDim: Int // 1536 (audio_soft_token_size) + let finalDim: Int // LM hidden size (1536 E2B / 2560 E4B) /// Load projection weights from .npy files in the model directory. public static func load(from directory: URL) throws -> ProjectionWeights { let opW = try loadNpyFloat16(directory.appendingPathComponent("output_proj_weight.npy")) let opB = try loadNpyFloat16(directory.appendingPathComponent("output_proj_bias.npy")) let epW = try loadNpyFloat16(directory.appendingPathComponent("embed_proj_weight.npy")) + // outDim = output_proj_bias length = audio_soft_token_size (1536). + // inDim = output_proj_weight.count / outDim = 1024. + // finalDim = embed_proj_weight.count / outDim = LM hidden (E2B 1536, E4B 2560). + let outDim = opB.count + let inDim = opW.count / outDim + let finalDim = epW.count / outDim return ProjectionWeights( outputProjWeight: opW, outputProjBias: opB, - embedProjWeight: epW, inDim: 1024, outDim: 1536) + embedProjWeight: epW, + inDim: inDim, outDim: outDim, finalDim: finalDim) } /// Load a float16 numpy file as [Float]. @@ -182,26 +193,28 @@ public enum AudioProcessor { } } - // embed_proj: (S, 1536) @ W^T(1536, 1536) → (S, 1536) - var features = [Float](repeating: 0, count: S * outDim) + // embed_proj: (S, outDim) @ W^T(finalDim, outDim) → (S, finalDim). + // E2B: finalDim==outDim==1536 (square). E4B: finalDim=2560 != outDim=1536. 
+ let finalDim = proj.finalDim + var features = [Float](repeating: 0, count: S * finalDim) cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - Int32(S), Int32(outDim), Int32(outDim), + Int32(S), Int32(finalDim), Int32(outDim), 1.0, projected, Int32(outDim), proj.embedProjWeight, Int32(outDim), - 0.0, &features, Int32(outDim)) + 0.0, &features, Int32(finalDim)) // fp32 → fp16 batch conversion via Accelerate let result = try! MLMultiArray( - shape: [1, NSNumber(value: S), NSNumber(value: outDim)], + shape: [1, NSNumber(value: S), NSNumber(value: finalDim)], dataType: .float16) - let rp = result.dataPointer.bindMemory(to: UInt16.self, capacity: S * outDim) + let rp = result.dataPointer.bindMemory(to: UInt16.self, capacity: S * finalDim) features.withUnsafeBufferPointer { src in var srcBuf = vImage_Buffer(data: UnsafeMutableRawPointer(mutating: src.baseAddress!), - height: 1, width: vImagePixelCount(S * outDim), - rowBytes: S * outDim * 4) + height: 1, width: vImagePixelCount(S * finalDim), + rowBytes: S * finalDim * 4) var dstBuf = vImage_Buffer(data: rp, height: 1, - width: vImagePixelCount(S * outDim), - rowBytes: S * outDim * 2) + width: vImagePixelCount(S * finalDim), + rowBytes: S * finalDim * 2) vImageConvert_PlanarFtoPlanar16F(&srcBuf, &dstBuf, 0) } diff --git a/conversion/build_gemma4_3way.py b/conversion/build_gemma4_3way.py index 27e4d4c..b39a6af 100644 --- a/conversion/build_gemma4_3way.py +++ b/conversion/build_gemma4_3way.py @@ -180,9 +180,13 @@ def build_chunk2_merged(base, ctx: int, out_pkg: str, *, quantize: bool) -> None max_hd = hd_f nkv = cfg.num_key_value_heads - mc = MergedChunk23(base).eval() + boundaries = compute_chunk_boundaries(cfg) + own_range = boundaries[1] + shared_range = boundaries[2] + mc = MergedChunk23(base, own_range=own_range, shared_range=shared_range).eval() ns, nf = mc.num_sliding, mc.num_full - print(f"\n=== chunk2_3way (L{mc.START_C2}-{mc.END_C3-1}, 17 layers) ===") + n_layers = (mc.END_C2 - mc.START_C2) + (mc.END_C3 - 
mc.START_C3) + print(f"\n=== chunk2_3way (L{mc.START_C2}-{mc.END_C3-1}, {n_layers} layers) ===") print(f" own-KV: {ns} sliding + {nf} full") sample = ( diff --git a/conversion/models/gemma4_swa_merged.py b/conversion/models/gemma4_swa_merged.py index a493ab3..bd54dad 100644 --- a/conversion/models/gemma4_swa_merged.py +++ b/conversion/models/gemma4_swa_merged.py @@ -20,21 +20,26 @@ class MergedChunk23(nn.Module): - """Layers 8-24: chunk2 (L8-14) + chunk3 (L15-24) merged. + """Merged chunk2 + chunk3 (own KV + KV-shared). Boundaries default + to E2B (own=L8-14, shared=L15-24). For E4B pass own_range / + shared_range from `compute_chunk_boundaries(cfg)` + (E4B: own=L12-23, shared=L24-32). - Own KV: L8-14 (5 sliding + 2 full). Shared KV: L15-24 (all shared from L13/L14). - kv13/kv14 stay internal — never leave the ANE. - - Outputs: hidden_states, K/V for L8-14, BUT NOT kv13/kv14 (internal). - chunk4 still needs kv14 → output it for chunk4. + kv13/kv14 stay internal — never leave the ANE. Outputs: hidden_states, + K/V for own layers, kv13/kv14 (chunk4 still needs them). 
""" - START_C2, END_C2 = 8, 15 # chunk2 layers - START_C3, END_C3 = 15, 25 # chunk3 layers + DEFAULT_OWN = (8, 15) # E2B own-KV layers + DEFAULT_SHARED = (15, 25) # E2B KV-shared layers - def __init__(self, model: Gemma4Model): + def __init__(self, model: Gemma4Model, + own_range: tuple[int, int] | None = None, + shared_range: tuple[int, int] | None = None): super().__init__() self.config = model.config - # All layers 8-24 + own = own_range if own_range is not None else self.DEFAULT_OWN + shared = shared_range if shared_range is not None else self.DEFAULT_SHARED + self.START_C2, self.END_C2 = own + self.START_C3, self.END_C3 = shared self.layers_c2 = nn.ModuleList([model.layers[i] for i in range(self.START_C2, self.END_C2)]) self.layers_c3 = nn.ModuleList([model.layers[i] for i in range(self.START_C3, self.END_C3)]) self.sliding_map, self.full_map = _layer_kv_map(self.START_C2, self.END_C2, model.config) diff --git a/docs/E4B_MULTIMODAL_BUILD.md b/docs/E4B_MULTIMODAL_BUILD.md new file mode 100644 index 0000000..a9d0cd9 --- /dev/null +++ b/docs/E4B_MULTIMODAL_BUILD.md @@ -0,0 +1,138 @@ +# Gemma 4 E4B multimodal CoreML — build & sideload guide + +**Status:** Validated 2026-05-03 on iPhone 17 Pro. Text 15.7 tok/s + image / video / audio all functional. + +**Working bundle:** `gemma4-e4b-multimodal` (~7.6 GB). 
+ +--- + +## Bundle topology (what works on iPhone) + +| Component | File(s) | Source | +|---|---|---| +| Decode (Topology II 3-chunk) | `chunk1` (legacy) + `chunk2_3way` + `chunk3_3way` | legacy E4B HF + `build_gemma4_3way.py --model gemma4-e4b` | +| Prefill (multifunction `prefill_b8`) | `chunk1` / `chunk2` / `chunk3` / `chunk4` (legacy) | legacy E4B HF (`mlboydaisuke/gemma-4-E4B-coreml`) | +| Vision encoder | `vision.ane.mlmodelc` (output `[1, 256, 2560]`) | `convert_gemma4_multimodal.py --vision-ane --model-path "$HF_DIR"` | +| Audio encoder | `audio.mlmodelc` (output `[1, 50, 1024]`) | `convert_audio.py --model-path "$HF_DIR"` | +| Audio projection | `output_proj_*.npy` (1024→1536) + `embed_proj_weight.npy` (1536→2560) | from `convert_audio.py` | +| Text sidecars | `embed_tokens_*`, RoPE tables, `model_config.json`, `hf_model/` | legacy E4B HF | + +`AudioProcessor.swift` `projectHiddenStates` runs the two-stage projection in Swift/Accelerate. `embed_proj` is now non-square aware (E4B `(2560, 1536)` vs E2B `(1536, 1536)`). + +--- + +## What was tried and rejected + +### `prefill_chunk{1..4}.mlmodelc` separate-file multifunction (T=64/128/256/512) + +Built via `build_prefill_multifunction.py` (the production E2B `gemma4e2b3way` path). + +- **Mac**: works fine, 16.5 tok/s text + correct multimodal. +- **iPhone**: text and image/audio prompts both produce degenerate output (e.g. `こんにちは` → `こんにちは。\n(同じトーンで)\nこんにちは。`). +- Likely cause: int4 quantization noise on iPhone ANE 18 + E4B-specific graph (HKV=2, 21 merged layers in `chunk2_3way`) tips greedy argmax into a degenerate loop. E2B ships the same multifunction layout and works on iPhone. +- Engine code is unchanged; the bundle ships **without `prefill_chunk*`** so the umbrella engine falls back to legacy `prefill_b8` multifunction. Vision-aware bidirectional mask within the image span still functions through `fillBatchMasksVisionAware` in `ChunkedEngine.swift`. 
+ +### Stateful (MLState) E4B multimodal + +Engine class `Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift` builds and runs on Mac. iPhone ANE 18 fails to compile `chunk_2` with `std::bad_cast` in MIL→EIR translation when the producer layer's `kv13_k`/`kv14_k` alias slice is exposed as a chunk output. `.clone()` in PyTorch and 4-chunk decode split (each chunk smaller) both produce the same compile failure. Stateful path remains Mac-only / dev-only. + +### iPhone bundle pushes + +`xcrun devicectl device copy to` does **not** delete files that aren't in the source. Switching bundle layouts (e.g. multimodal → baseline) requires deleting and re-installing the app to clear the data container — otherwise orphan files (a leftover `prefill_chunk1.mlmodelc` is enough) silently override the new bundle's behaviour. + +--- + +## Build steps + +Run on Mac with a working `coremltools 9.0` venv. macOS 26 needs the source-built wheel (see `docs/MACOS_26_BUILD_ENV.md`). + +```bash +PY=/tmp/ct_build_venv/bin/python +HF_DIR=/path/to/gemma4-e4b/hf_model # local clone of google/gemma-4-E4B-it +ROOT=$(pwd) + +# 1. 3-chunk decode (Topology II merged middle chunk). +mkdir -p /tmp/gemma4-e4b-3way +$PY conversion/build_gemma4_3way.py \ + --model gemma4-e4b --hf-dir "$HF_DIR" \ + --output /tmp/gemma4-e4b-3way --ctx 2048 + +# 2. Vision encoder (ANE-targeted, square 48×48 grid → 256 soft tokens at LM hidden 2560). +mkdir -p /tmp/gemma4-e4b-vision-ane +$PY ../CoreML-LLM/conversion/convert_gemma4_multimodal.py \ + --model-path "$HF_DIR" \ + --output /tmp/gemma4-e4b-vision-ane \ + --quantize int4 \ + --vision-ane + +# 3. Audio encoder (Conformer + Swift projection sidecars). +mkdir -p /tmp/gemma4-e4b-audio +$PY ../CoreML-LLM/conversion/convert_audio.py \ + --model-path "$HF_DIR" \ + --output /tmp/gemma4-e4b-audio \ + --quantize int4 + +# 4. Assemble bundle (compiles mlpackage→mlmodelc, copies sidecars + legacy chunks). 
+LEGACY=/path/to/gemma4-e4b-coreml-bundle bash scripts/assemble_gemma4_e4b_multimodal.sh +# → build/gemma4-e4b-multimodal/ (~7.6 GB) +``` + +The assembler script accepts env vars `THREEWAY` / `VISION_ANE` / `AUDIO` / `LEGACY` / `MEL_FALLBACK` / `OUT` to override defaults. See the script header for the full layout description. + +--- + +## Sideload to iPhone + +```bash +DEVICE=$(xcrun devicectl list devices | awk '/iPhone 17 Pro/{print $3}') + +# 1. Delete CoreMLLLMChat app on iPhone (long-press home icon → "Remove App" +# → "Delete App"). devicectl doesn't remove orphan files; switching from a +# previous bundle without a clean sandbox WILL produce broken output. +# 2. In Xcode, Cmd+R to reinstall a fresh app. Launch once to create the +# Documents container. +# 3. Force-quit the app (swipe up in app switcher) so devicectl can write. + +xcrun devicectl device copy to --device "$DEVICE" \ + --domain-type appDataContainer \ + --domain-identifier com.example.CoreMLLLMChat \ + --source build/gemma4-e4b-multimodal \ + --destination Documents/Models/gemma4-e4b + +# 4. Xcode scheme env vars: +# LLM_SHOW_EXPERIMENTAL=1 (already required for some pickers) +# LLM_VISION_FORCE_ANE=1 (route vision.ane.mlmodelc through ANE) +# 5. Cmd+R, pick "Gemma 4 E4B" in the picker, test. +``` + +--- + +## Verified iPhone 17 Pro results (2026-05-03) + +| Modality | Result | +|---|---| +| Text-only | 15.7 tok/s, baseline-quality response (matches Mac) | +| Image + text | Coherent description, no gibberish | +| Video + text | Coherent description | +| Audio + text | Correct response (after `AudioProcessor` `embed_proj` non-square fix) | + +--- + +## Files of interest + +| File | Role | +|---|---| +| `Sources/CoreMLLLM/AudioProcessor.swift` | Two-stage Swift projection. `ProjectionWeights` now derives `inDim` / `outDim` / `finalDim` from weight tensor sizes; embed_proj sgemm uses `finalDim` (E4B 2560) instead of hard-coded `outDim`. 
| +| `conversion/models/gemma4_swa_merged.py` | `MergedChunk23` accepts `own_range` / `shared_range`; defaults E2B (L8-14 / L15-24); E4B passes (12,24)/(24,33). | +| `conversion/build_gemma4_3way.py` | Threads `compute_chunk_boundaries(cfg)` into the merged chunk so `--model gemma4-e4b` produces correct 3-way decode. | +| `scripts/assemble_gemma4_e4b_multimodal.sh` | Reproducible bundle assembly. | +| `Sources/CoreMLLLM/ChunkedEngine.swift` | Auto-detects Topology II via `chunk2_3way` + `chunk3_3way` presence. Routes prefill via `prefill_b8` multifunction in legacy chunks 1-4 when `prefill_chunk1` is absent (our case). | + +--- + +## What's NOT in this bundle (intentional) + +- **`prefill_chunk{1..4}.mlmodelc` (multifunction T=64/128/256/512)**: see "What was tried and rejected" above. +- **`vision.mlmodelc` (GPU variant, output `[1, 280, hidden]`)**: not built for E4B. We ship `vision.ane.mlmodelc` only and rely on `LLM_VISION_FORCE_ANE=1`. +- **`vision_video.mlmodelc`**: video runs through still-image vision with 2×2 pooling fallback in the engine. Adequate quality on validation. +- **Stateful chunks**: `Gemma4StatefulMultimodalEngine` is Mac-only / dev-only. diff --git a/scripts/assemble_gemma4_e4b_multimodal.sh b/scripts/assemble_gemma4_e4b_multimodal.sh new file mode 100755 index 0000000..93abe54 --- /dev/null +++ b/scripts/assemble_gemma4_e4b_multimodal.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# Assemble the Gemma 4 E4B multimodal CoreML bundle for iPhone sideload +# (or HF upload). Working configuration validated 2026-05-03 on iPhone 17 +# Pro: text 15.7 tok/s + image / video / audio all functional. 
+# +# Layout produced: +# +# build/gemma4-e4b-multimodal/ +# chunk1.mlmodelc # legacy 4-chunk (also has prefill_b8 multifunction) +# chunk2.mlmodelc # legacy chunk2 — used as prefill_b8 only +# chunk3.mlmodelc # legacy chunk3 — used as prefill_b8 only +# chunk4.mlmodelc # legacy chunk4 — used as prefill_b8 only +# chunk2_3way.mlmodelc # Topology II decode (own L12-23 + shared L24-32 merged) +# chunk3_3way.mlmodelc # Topology II decode (shared L33-41 + lm_head) +# vision.ane.mlmodelc # E4B SigLIP encoder (output [1, 256, 2560]) +# audio.mlmodelc # E4B Conformer encoder (output [1, 50, 1024]) +# audio_config.json +# mel_filterbank.bin +# output_proj_weight.npy # Audio projection 1024 → 1536 +# output_proj_bias.npy +# embed_proj_weight.npy # Audio projection 1536 → 2560 (E4B-specific shape) +# embed_tokens_q8.bin +# embed_tokens_scales.bin +# embed_tokens_per_layer_q8.bin +# embed_tokens_per_layer_scales.bin +# per_layer_projection.bin +# per_layer_norm_weight.bin +# cos_sliding.npy / sin_sliding.npy / cos_full.npy / sin_full.npy +# hf_model/ (tokenizer) +# model_config.json +# +# Total bundle size: ~7.6 GB. +# +# Engine routing (CoreMLLLM umbrella in Sources/CoreMLLLM/): +# - decode = Topology II (chunk1 + chunk2_3way + chunk3_3way) — auto-detected +# when chunk2_3way/chunk3_3way are present. +# - prefill = legacy chunks 1/2/3/4 prefill_b8 multifunction. The newer +# prefill_chunk{1..4}.mlmodelc separate-file path is INTENTIONALLY +# omitted: it produces broken outputs on iPhone ANE 18 with E4B +# (likely int4 quantization noise). Mac decodes E4B prefill_chunk* +# fine — iPhone-specific issue. +# - vision = vision.ane.mlmodelc when LLM_VISION_FORCE_ANE=1, else GPU +# fallback. Built E4B-specific (output dim 2560 matches LM hidden). +# - audio = audio.mlmodelc + Swift-side projection (AudioProcessor.swift, +# embed_proj is non-square 1536 → 2560 for E4B). 
+# +# Usage: +# bash scripts/assemble_gemma4_e4b_multimodal.sh +# +# Required input directories (override via env if non-default): +# THREEWAY=/tmp/gemma4-e4b-3way (build_gemma4_3way.py --model gemma4-e4b output) +# VISION_ANE=/tmp/gemma4-e4b-vision-ane (convert_gemma4_multimodal.py --vision-ane on E4B HF) +# AUDIO=/tmp/gemma4-e4b-audio (convert_audio.py on E4B HF) +# LEGACY=.../output/gemma4-e4b/bundle (legacy 4-chunk text-only bundle, e.g. from HF +# mlboydaisuke/gemma-4-E4B-coreml) +# MEL_FALLBACK=.../conversion/output/audio (mel_filterbank.bin source if missing from AUDIO) +# +# Push to iPhone (clean sandbox required — see docs/E4B_MULTIMODAL_BUILD.md): +# xcrun devicectl device copy to --device \ +# --domain-type appDataContainer \ +# --domain-identifier com.example.CoreMLLLMChat \ +# --source build/gemma4-e4b-multimodal \ +# --destination Documents/Models/gemma4-e4b +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +LEGACY="${LEGACY:-/Users/$USER/Downloads/CoreML-LLM/output/gemma4-e4b/bundle}" +THREEWAY="${THREEWAY:-/tmp/gemma4-e4b-3way}" +VISION_ANE="${VISION_ANE:-/tmp/gemma4-e4b-vision-ane}" +AUDIO="${AUDIO:-/tmp/gemma4-e4b-audio}" +MEL_FALLBACK="${MEL_FALLBACK:-/Users/$USER/Downloads/CoreML-LLM/conversion/output/audio}" +OUT="${OUT:-$ROOT/build/gemma4-e4b-multimodal}" + +# Sanity +for d in "$LEGACY" "$THREEWAY" "$VISION_ANE" "$AUDIO"; do + if [[ ! 
-d "$d" ]]; then + echo "[error] missing input dir: $d" >&2 + exit 1 + fi +done + +mkdir -p "$OUT" +rm -rf "$OUT"/* + +echo "[$(date)] === decode (chunk1 legacy + chunk{2,3}_3way Topology II) ===" +cp -R "$LEGACY/chunk1.mlmodelc" "$OUT/" +for c in chunk2_3way chunk3_3way; do + if [[ -d "$THREEWAY/$c.mlmodelc" ]]; then + cp -R "$THREEWAY/$c.mlmodelc" "$OUT/" + elif [[ -d "$THREEWAY/$c.mlpackage" ]]; then + echo " compile $c" + xcrun coremlcompiler compile "$THREEWAY/$c.mlpackage" "$OUT/" 2>&1 | tail -1 + else + echo "[error] $c{,_3way}.{mlpackage,mlmodelc} missing in $THREEWAY" >&2 + exit 1 + fi +done + +echo "" +echo "[$(date)] === legacy chunks 2/3/4 (prefill_b8 multifunction) ===" +for c in chunk2 chunk3 chunk4; do + cp -R "$LEGACY/$c.mlmodelc" "$OUT/" +done + +echo "" +echo "[$(date)] === text sidecars ===" +SIDE_TEXT=( + embed_tokens_q8.bin embed_tokens_scales.bin + embed_tokens_per_layer_q8.bin embed_tokens_per_layer_scales.bin + per_layer_projection.bin per_layer_norm_weight.bin + cos_sliding.npy sin_sliding.npy cos_full.npy sin_full.npy + hf_model model_config.json +) +for f in "${SIDE_TEXT[@]}"; do + if [[ -e "$LEGACY/$f" ]]; then + cp -R "$LEGACY/$f" "$OUT/" + else + echo " [warn] missing $f" + fi +done + +echo "" +echo "[$(date)] === E4B encoders + audio sidecars ===" +# Vision (E4B-specific, output dim 2560 matches LM hidden) +if [[ -d "$VISION_ANE/vision.ane.mlmodelc" ]]; then + cp -R "$VISION_ANE/vision.ane.mlmodelc" "$OUT/" +elif [[ -d "$VISION_ANE/vision.ane.mlpackage" ]]; then + xcrun coremlcompiler compile "$VISION_ANE/vision.ane.mlpackage" "$OUT/" 2>&1 | tail -1 +else + echo "[error] vision.ane.{mlpackage,mlmodelc} missing in $VISION_ANE" >&2 + exit 1 +fi +# Audio (E4B-specific, output [1, 50, 1024]) +if [[ -d "$AUDIO/audio.mlmodelc" ]]; then + cp -R "$AUDIO/audio.mlmodelc" "$OUT/" +elif [[ -d "$AUDIO/audio.mlpackage" ]]; then + xcrun coremlcompiler compile "$AUDIO/audio.mlpackage" "$OUT/" 2>&1 | tail -1 +else + echo "[error] 
audio.{mlpackage,mlmodelc} missing in $AUDIO" >&2 + exit 1 +fi +# Audio sidecars +SIDE_AUDIO=( + audio_config.json + output_proj_weight.npy output_proj_bias.npy embed_proj_weight.npy +) +for f in "${SIDE_AUDIO[@]}"; do + if [[ -e "$AUDIO/$f" ]]; then + cp "$AUDIO/$f" "$OUT/" + else + echo " [warn] missing $f" + fi +done +# mel_filterbank.bin (often shipped from a sibling audio build dir) +if [[ -e "$AUDIO/mel_filterbank.bin" ]]; then + cp "$AUDIO/mel_filterbank.bin" "$OUT/" +elif [[ -e "$MEL_FALLBACK/mel_filterbank.bin" ]]; then + cp "$MEL_FALLBACK/mel_filterbank.bin" "$OUT/" +else + echo " [warn] missing mel_filterbank.bin (audio path will fail at runtime)" +fi + +echo "" +echo "=== assembled ===" +du -sh "$OUT" +ls "$OUT" +echo "" +echo "Push to iPhone (CLEAN sandbox — delete + reinstall app first; devicectl" +echo "doesn't remove orphan files from previous bundles):" +echo " xcrun devicectl device copy to --device \\" +echo " --domain-type appDataContainer \\" +echo " --domain-identifier com.example.CoreMLLLMChat \\" +echo " --source $OUT \\" +echo " --destination Documents/Models/gemma4-e4b" +echo "" +echo "Scheme env vars: LLM_VISION_FORCE_ANE=1 (route vision.ane via ANE)." From fc52b8b41bb46052f6f86c4b3ae0ef7aeb6919c8 Mon Sep 17 00:00:00 2001 From: john-rocky Date: Sun, 3 May 2026 10:40:23 +0900 Subject: [PATCH 6/7] =?UTF-8?q?research(gemma4-stateful-mm):=20Stage=208?= =?UTF-8?q?=20multimodal=20stateful=20engine=20=E2=80=94=20Mac=20dev=20/?= =?UTF-8?q?=20iPhone=20blocked?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stage 8 follow-up to the stateful Linear shipment. Adds a parallel engine that drives Gemma 4 stateful (3-chunk merged + Linear) with T=288 single-function prefill chunks + the Stage 6 vision/audio splice. The engine class works end-to-end on Mac (text decode 16.5 tok/s; assembled bundle drives image + audio splice through the T=288 batched prefill with bidirectional within-image mask). 
iPhone status: BLOCKED. Multiple converter paths were attempted; all hit the same iPhone ANE 18 MIL->EIR translation failure on chunk_2 (the merged 21-layer middle chunk): - 3-chunk merged stateful with kv13/kv14 alias output: std::bad_cast - .clone() patch on the alias output assignment: same error - 4-chunk decode split (chunk_2_own + chunk_2_shared): same error, confirming the alias-slice-over-MLState pattern is the root cause rather than graph size. The non-stateful 3-way merged chunk2_3way (same 21 layers, but K/V flow as plain tensor inputs/outputs — no MLState alias) compiles and runs on iPhone ANE 18 at 15.7 tok/s, confirming the diagnosis. Code keeps the stateful path for Mac development and future revisits (stateful + multifunction T=288 might unlock once iPhone ANE picks up multifunction T>1 + dual MLState; not on iOS 18). Files: - Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift (NEW) ~1,200-line dimension-agnostic stateful engine. 3-chunk merged decode + 4-state MLState (decode/prefill x s1/s2) + bridgeKVState via withMultiArray nested closures + ported Stage 6 multimodal helpers (vision/video/audio splice + vision-aware bidir mask + cross-turn LCP-resume). Padding-replicate scheme keeps auto-emitted token at row T-1 valid even when validCount < T. - Sources/CoreMLLLM/ModelDownloader.swift gemma4e2bStatefulMultimodal + gemma4e4bStatefulMultimodal ModelInfo entries (sideload-only, exposed under LLM_SHOW_EXPERIMENTAL=1). - Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift Detection (prefill_T288/ subdir presence) routes to the new engine; load + generate + image/audio caching mirror the existing gemma4Stateful pattern. - Sources/gemma4mm-smoke/main.swift (NEW) Mac CLI smoke test for the stateful multimodal engine. - Package.swift: gemma4mm-smoke executable target. - scripts/assemble_gemma4_stateful_multimodal.sh (NEW) Reproducible bundle assembly (decode 3-chunk + prefill_T288/ subdir + multimodal encoders). 
- conversion/build_gemma4_stateful_singlefunc_prefill.py Adds --four-chunk variant (used during the chunk_2 split probe). - conversion/models/gemma4_swa_stateful_chunks.py .clone() on the kv13/kv14 producer alias output (decode + prefill T=N variants). Materialises the slice over MLState into a fresh tensor; ineffective vs the iPhone ANE bug but not regressive. --- .../CoreMLLLMChat/LLMRunner.swift | 218 +++ Package.swift | 12 + .../Gemma4StatefulMultimodalEngine.swift | 1213 +++++++++++++++++ Sources/CoreMLLLM/ModelDownloader.swift | 30 + Sources/gemma4mm-smoke/main.swift | 92 ++ ...uild_gemma4_stateful_singlefunc_prefill.py | 119 +- .../models/gemma4_swa_stateful_chunks.py | 39 +- .../assemble_gemma4_stateful_multimodal.sh | 211 +++ 8 files changed, 1888 insertions(+), 46 deletions(-) create mode 100644 Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift create mode 100644 Sources/gemma4mm-smoke/main.swift create mode 100755 scripts/assemble_gemma4_stateful_multimodal.sh diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift index 30f43bd..aa03fc0 100644 --- a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift +++ b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift @@ -59,6 +59,21 @@ final class LLMRunner { private var gemma4StatefulEngine: Gemma4StatefulEngine? private var gemma4StatefulTokenizer: (any Tokenizer)? + // Gemma 4 stateful + multimodal path (Stage 8): same 3-chunk merged + // Linear decode as the text-only stateful entry, plus a separate + // T=288 single-function prefill set under `prefill_T288/` and the + // vision/video/audio encoders. Selected when the bundle has both + // `chunk_{1..3}` and a `prefill_T288/` subdir alongside. + private var gemma4StatefulMultimodalEngine: Gemma4StatefulMultimodalEngine? + private var gemma4StatefulMultimodalTokenizer: (any Tokenizer)? 
+ /// Cache the last image/audio features so a same-attachment follow-up + /// turn skips encoder cost (mirrors the legacy gemma4 multimodal path). + private var cachedGemma4MMImage: CGImage? + private var cachedGemma4MMImageFeatures: MLMultiArray? + private var cachedGemma4MMAudioSig: [Float]? + private var cachedGemma4MMAudioFeatures: MLMultiArray? + private var cachedGemma4MMAudioTokens: Int = 0 + // Qwen3-VL 2B path: separate generator + tokenizer, selected when // the downloaded folder contains `qwen3_vl_2b_decode_chunks/`. // Plain GQA architecture (not the Qwen3.5 hybrid SSM), so it gets @@ -91,6 +106,7 @@ final class LLMRunner { || qwen3vl2bGenerator != nil || qwen3vl2bStatefulGenerator != nil || gemma4StatefulEngine != nil + || gemma4StatefulMultimodalEngine != nil { llm = nil qwen35Generator = nil @@ -102,6 +118,13 @@ final class LLMRunner { qwen3vl2bVisionEncoder = nil gemma4StatefulEngine = nil gemma4StatefulTokenizer = nil + gemma4StatefulMultimodalEngine = nil + gemma4StatefulMultimodalTokenizer = nil + cachedGemma4MMImage = nil + cachedGemma4MMImageFeatures = nil + cachedGemma4MMAudioSig = nil + cachedGemma4MMAudioFeatures = nil + cachedGemma4MMAudioTokens = 0 cachedVisionImage = nil cachedVisionFeatures = nil isLoaded = false @@ -214,7 +237,26 @@ final class LLMRunner { let gemma4StatefulPresent = fm.fileExists(atPath: gemma4StatefulDir.appendingPathComponent("embed_tokens_q8.bin").path) && (hasChunks || has1Chunk) + // Stage 8 multimodal-stateful detection: same 3-chunk decode + // bundle plus a `prefill_T288/` subdir with the three single- + // function prefill mlpackages, plus at least one of + // vision/audio mlmodelc. Route to Gemma4StatefulMultimodalEngine + // when present — falls through to the text-only stateful path + // when only the decode chunks are installed. 
if gemma4StatefulPresent { + let prefillT288Dir = gemma4StatefulDir.appendingPathComponent("prefill_T288") + let hasPrefillT288 = ["chunk_1_prefill_T288", + "chunk_2_3way_prefill_T288", + "chunk_3_prefill_T288"].allSatisfy { name in + fm.fileExists(atPath: + prefillT288Dir.appendingPathComponent("\(name).mlpackage").path) + || fm.fileExists(atPath: + prefillT288Dir.appendingPathComponent("\(name).mlmodelc").path) + } + if hasPrefillT288 { + try await loadGemma4StatefulMultimodal(folder: gemma4StatefulDir) + return + } try await loadGemma4Stateful(folder: gemma4StatefulDir) return } @@ -299,6 +341,10 @@ final class LLMRunner { return try await generateQwen3VL2BStateful( messages: messages, image: image) } + if gemma4StatefulMultimodalEngine != nil { + return try await generateGemma4StatefulMultimodal( + messages: messages, image: image, audio: audio) + } if gemma4StatefulEngine != nil { return try await generateGemma4Stateful(messages: messages) } @@ -1151,6 +1197,178 @@ final class LLMRunner { } } + // MARK: - Gemma 4 stateful + multimodal (Stage 8) + + private func loadGemma4StatefulMultimodal(folder: URL) async throws { + loadingStatus = "Loading Gemma 4 multimodal tokenizer..." + let hfDir = folder.appendingPathComponent("hf_model") + let tok = try await AutoTokenizer.from(modelFolder: hfDir) + loadingStatus = "Compiling Gemma 4 stateful multimodal chunks (first run only)..." + let engine = Gemma4StatefulMultimodalEngine() + try await engine.load(modelDirectory: folder) + gemma4StatefulMultimodalEngine = engine + gemma4StatefulMultimodalTokenizer = tok + + let parent = folder.deletingLastPathComponent().lastPathComponent + let isE4B = parent.lowercased().contains("e4b") + modelName = isE4B + ? 
"Gemma 4 E4B (stateful, multimodal)" + : "Gemma 4 E2B (stateful, multimodal)" + hasVision = engine.hasVision + hasAudio = engine.hasAudio + isLoaded = true + loadingStatus = "Ready" + print("[LLMRunner] Gemma 4 stateful multimodal loaded — \(modelName) " + + "vision=\(hasVision) video=\(engine.hasVideoVision) audio=\(hasAudio)") + } + + private func generateGemma4StatefulMultimodal(messages: [ChatMessage], + image: CGImage?, + audio: [Float]? + ) async throws -> AsyncStream { + guard let engine = gemma4StatefulMultimodalEngine, + let tok = gemma4StatefulMultimodalTokenizer + else { + throw NSError(domain: "LLMRunner", code: 42, + userInfo: [NSLocalizedDescriptionKey: + "Gemma 4 stateful multimodal not loaded"]) + } + isGenerating = true + tokensPerSecond = 0 + + // Encode image once per distinct attachment. Cache hit (same + // CGImage instance) skips the ~30 s vision graph + lets the + // engine's cross-turn KV reuse hit the LCP fast path. + var imageFeatures: MLMultiArray? = nil + var imageNumTokens = 0 + var imageChanged = false + if let img = image { + if cachedGemma4MMImage === img, let f = cachedGemma4MMImageFeatures { + imageFeatures = f + imageNumTokens = 256 + } else { + imageFeatures = try engine.processImage(img) + imageNumTokens = 256 + cachedGemma4MMImage = img + cachedGemma4MMImageFeatures = imageFeatures + imageChanged = true + } + } else if cachedGemma4MMImage != nil { + cachedGemma4MMImage = nil + cachedGemma4MMImageFeatures = nil + imageChanged = true + } + + var audioFeatures: MLMultiArray? = nil + var audioNumTokens = 0 + var audioChanged = false + if let pcm = audio { + // Cheap fingerprint: [count, first, last]. Re-encode on + // any mismatch. + let sig: [Float] = pcm.isEmpty + ? [0, 0, 0] + : [Float(pcm.count), pcm.first ?? 0, pcm.last ?? 
0] + let sigMatches = (cachedGemma4MMAudioSig == sig) + if sigMatches, let f = cachedGemma4MMAudioFeatures { + audioFeatures = f + audioNumTokens = cachedGemma4MMAudioTokens + } else { + let (feat, n) = try engine.processAudio(pcm) + audioFeatures = feat + audioNumTokens = n + cachedGemma4MMAudioSig = sig + cachedGemma4MMAudioFeatures = feat + cachedGemma4MMAudioTokens = n + audioChanged = true + } + } else if cachedGemma4MMAudioFeatures != nil { + cachedGemma4MMAudioSig = nil + cachedGemma4MMAudioFeatures = nil + cachedGemma4MMAudioTokens = 0 + audioChanged = true + } + + // Attachment changed → drop persisted KV so the LCP match + // doesn't reuse stale image/audio rows from a prior turn. + if imageChanged || audioChanged { engine.resetPersistedState() } + + // Build the Gemma 4 prompt. Image / audio blocks are pinned to + // the LAST user turn so cross-turn resume keeps the pad span at + // a fixed offset (same trick as the legacy gemma4 path). + let imageBlock = "<|image>" + + String(repeating: "<|image|>", count: 256) + + "" + let audioBlock = "<|audio>" + + String(repeating: "<|audio|>", count: audioNumTokens) + + "" + let lastUserIdx = messages.lastIndex { $0.role == .user } + var prompt = "" + for (i, m) in messages.enumerated() { + switch m.role { + case .user: + let isLast = i == lastUserIdx + var mediaPrefix = "" + if imageFeatures != nil && isLast { mediaPrefix += imageBlock + "\n" } + if audioFeatures != nil && isLast && audioNumTokens > 0 { + mediaPrefix += audioBlock + "\n" + } + prompt += "<|turn>user\n\(mediaPrefix)\(m.content)\n" + case .assistant: + prompt += "<|turn>model\n\(m.content)\n" + case .system: + continue + } + } + prompt += "<|turn>model\n" + let inputIds = tok.encode(text: prompt).map { Int32($0) } + + var eosSet: Set = [1, 106] + if let eid = tok.eosTokenId { eosSet.insert(Int32(eid)) } + let skipSet: Set = [1, 105, 106] + + let genStart = Date() + return AsyncStream { continuation in + Task { [weak self] in + defer { Task { 
@MainActor in self?.isGenerating = false } } + var accum: [Int] = [] + var emittedString = "" + var totalEmitted = 0 + do { + _ = try await engine.generate( + inputIds: inputIds, + imageFeatures: imageFeatures, + imageNumTokens: imageNumTokens, + audioFeatures: audioFeatures, + audioNumTokens: audioNumTokens, + maxNewTokens: 256, + eosTokenIds: eosSet, + onToken: { tokenId in + if skipSet.contains(tokenId) { return } + accum.append(Int(tokenId)) + let current = tok.decode(tokens: accum) + if current.count > emittedString.count { + let delta = String( + current.suffix(current.count - emittedString.count)) + continuation.yield(delta) + emittedString = current + } + totalEmitted += 1 + }) + let dt = Date().timeIntervalSince(genStart) + if dt > 0 { + let tps = Double(totalEmitted) / dt + Task { @MainActor in + self?.tokensPerSecond = tps + } + } + } catch { + continuation.yield("[Error: \(error.localizedDescription)]") + } + continuation.finish() + } + } + } + /// Build the token ID sequence for a vision-augmented Qwen3-VL 2B /// prompt. Emits the same prefix the HF processor would produce for /// `[{role:"user", content:[{type:"image"},{type:"text", text:...}]}]` diff --git a/Package.swift b/Package.swift index 50df101..9ec6f0c 100644 --- a/Package.swift +++ b/Package.swift @@ -16,6 +16,7 @@ let package = Package( .executable(name: "determinism-oracle", targets: ["DeterminismOracle"]), .executable(name: "verify-k8-probe", targets: ["VerifyK8Probe"]), .executable(name: "ane-residency-gate", targets: ["AneResidencyGate"]), + .executable(name: "gemma4mm-smoke", targets: ["Gemma4MMSmoke"]), // Standalone samples for the two Gemma-3-based models. 
These live in // the same package on purpose — a LocalAIKit-style wrapper can depend // on the `CoreMLLLM` library and use `FunctionGemma` / `EmbeddingGemma` @@ -91,6 +92,17 @@ let package = Package( path: "Sources/verify-k8-probe", swiftSettings: [.swiftLanguageMode(.v5)] ), + // Mac smoke test for Gemma4StatefulMultimodalEngine — text-only + // generate to catch engine bugs without an iPhone trip. + .executableTarget( + name: "Gemma4MMSmoke", + dependencies: [ + "CoreMLLLM", + .product(name: "Tokenizers", package: "swift-transformers"), + ], + path: "Sources/gemma4mm-smoke", + swiftSettings: [.swiftLanguageMode(.v5)] + ), // FunctionGemma-270M standalone CLI. Does NOT combine with Gemma 4 — // multi-model orchestration belongs in the LocalAIKit wrapper. .executableTarget( diff --git a/Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift b/Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift new file mode 100644 index 0000000..aac4c51 --- /dev/null +++ b/Sources/CoreMLLLM/Gemma4StatefulMultimodalEngine.swift @@ -0,0 +1,1213 @@ +// Gemma4StatefulMultimodalEngine — Stage 8 runtime that pairs the +// 3-chunk merged stateful Linear decode path with single-function +// T=288 prefill chunks and the Stage 6 multimodal feature splice. +// +// Sibling of (not subclass of) Gemma4StatefulEngine. The legacy class +// keeps shipping the multifunction prefill_b8 path bit-identical for +// users who don't want multimodal; this class layers on: +// +// - 3 separate single-function prefill mlpackages (T=288): +// prefill_T288/chunk_1_prefill_T288.mlmodelc +// prefill_T288/chunk_2_3way_prefill_T288.mlmodelc +// prefill_T288/chunk_3_prefill_T288.mlmodelc +// iPhone ANE 18 rejects multifunction T>1 + dual MLState. Probe +// 2 verified single-function T=288 compiles in 7.3 s on A19 Pro. +// +// - 4 MLStates (decode_s1/s2 + prefill_s1/s2). 
After each prefill +// pass we memcpy kv_cache_sliding + kv_cache_full from prefill +// state into decode state via the NS_REFINED_FOR_SWIFT +// `withMultiArray(for:handler:)` bridge. +// +// - vision (256/image), video (64/frame), audio (~188/2 sec) +// encoder splice at IMAGE/VIDEO/AUDIO_TOKEN_ID positions. +// Vision-aware mask preserves bidirectional within-image +// attention during prefill. +// +// Both Gemma 4 E2B (35 layers) and E4B (42 layers) drive through this +// engine — chunk topology comes from the loaded mlpackages, layer +// counts come from model_config.json. The engine itself is +// dimension-agnostic. + +import Accelerate +import CoreGraphics +import CoreML +import Foundation + +@available(iOS 18.0, macOS 15.0, *) +public final class Gemma4StatefulMultimodalEngine { + // MARK: - Public surface + + public struct Config { + public let computeUnits: MLComputeUnits + public init(computeUnits: MLComputeUnits = .cpuAndNeuralEngine) { + self.computeUnits = computeUnits + } + } + + public private(set) var modelConfig: ModelConfig? + public var lastDecodeTokensPerSecond: Double = 0 + + // MARK: - Storage + + private let cfg: Config + private var modelDir: URL? + + // Decode chunks. Two layouts auto-detected at load(): + // - 3-chunk merged: chunk_1 (own) + chunk_2 (own + KV-shared + // internal) + chunk_3 (KV-shared + lm_head). chunk_3 emits + // token_id. Used for E2B (chunk_2 fits ANE budget). + // - 4-chunk split: chunk_1 (own) + chunk_2 (own only) + chunk_3 + // (KV-shared, no lm_head) + chunk_4 (KV-shared + lm_head). + // chunk_4 emits token_id. Used for E4B because the merged + // 21-layer chunk_2 trips iPhone ANE 18 MIL→EIR translation + // (`std::bad_cast`); splitting keeps each subgraph small enough. + private var decodeChunk1: MLModel? + private var decodeChunk2: MLModel? + private var decodeChunk3: MLModel? + private var decodeChunk4: MLModel? 
+ private var is4Chunk: Bool = false + + // T=288 single-function prefill chunks (separate mlpackages). + private static let kPrefillT: Int = 288 + private var prefillChunk1: MLModel? + private var prefillChunk2: MLModel? + private var prefillChunk3: MLModel? + private var prefillChunk4: MLModel? + + /// True when both decode and prefill chunk sets loaded successfully. + public var hasT288Prefill: Bool { + let core = prefillChunk1 != nil && prefillChunk2 != nil + && prefillChunk3 != nil + return is4Chunk ? (core && prefillChunk4 != nil) : core + } + + // Per-chunk MLStates. Decode + prefill paths each have their own + // since they bind to different MLModel instances. chunk_3 is + // stateless (KV-shared from chunk_2's kv13/kv14 outputs) for both + // decode and prefill. + private var decodeState1: MLState? + private var decodeState2: MLState? + private var prefillState1: MLState? + private var prefillState2: MLState? + + // Sidecars (same as Gemma4StatefulEngine). + private var embedTokens: EmbeddingLookup? + private var embedTokensPerLayer: EmbeddingLookup? + private var cosSlidingTable: Data? + private var sinSlidingTable: Data? + private var cosFullTable: Data? + private var sinFullTable: Data? + + // T=1 decode scratch. + private var maskFull: MLMultiArray! + private var maskSliding: MLMultiArray! + private var fvMaskFull: MLFeatureValue! + private var fvMaskSliding: MLFeatureValue! + private var posScratch: MLMultiArray! + private var ringScratch: MLMultiArray! + private var fvPos: MLFeatureValue! + private var fvRing: MLFeatureValue! + + // T=288 prefill scratch (allocated once at load). + private var batchHidden: MLMultiArray? + private var batchPerLayerRaw: MLMultiArray? + private var batchMaskFull: MLMultiArray? + private var batchMaskSliding: MLMultiArray? + private var batchCosS: MLMultiArray? + private var batchSinS: MLMultiArray? + private var batchCosF: MLMultiArray? + private var batchSinF: MLMultiArray? 
+ + // Cross-turn KV reuse — only on decode states (prefill states are + // reusable scratch and get overwritten each generate()). The LCP + // match invariant: persistedInputIds is a strict prefix of the + // next prompt's inputIds. LLMRunner is responsible for calling + // resetPersistedState() on chat clear or attachment change. + private var persistedInputIds: [Int32] = [] + private var persistedPosition: Int = 0 + + // MARK: - Multimodal (Stage 6 helpers, ported) + + private static let IMAGE_TOKEN_ID: Int32 = 258880 + private static let AUDIO_TOKEN_ID: Int32 = 258881 + private static let VIDEO_TOKEN_ID: Int32 = 258884 + + private var visionModelURL: URL? + private var visionConfig: MLModelConfiguration? + private var visionModel: MLModel? + private var visionUsesANEBuild: Bool = false + + private var videoVisionModelURL: URL? + private var videoVisionConfig: MLModelConfiguration? + private var videoVisionModel: MLModel? + + private var audioModelURL: URL? + private var audioConfig: MLModelConfiguration? + private var audioModel: MLModel? + private var melFilterbank: [Float]? + private var audioProjection: AudioProcessor.ProjectionWeights? + private var audioMelFrames: Int = 200 + private var audioNumTokensConfig: Int = 188 + private var audioMelFloor: Float = 0.001 + + public var hasVision: Bool { visionModelURL != nil } + public var hasVideoVision: Bool { videoVisionModelURL != nil } + public var hasAudio: Bool { audioModelURL != nil } + public var defaultAudioNumTokens: Int { audioNumTokensConfig } + + // Per-call multimodal binding. + private var mmImageFeatures: MLMultiArray? + private var mmImageNumTokens: Int = 0 + private var mmAudioFeatures: MLMultiArray? + private var mmAudioNumTokens: Int = 0 + private var mmImageIdx: Int = 0 + private var mmAudioIdx: Int = 0 + private var mmVisionGroupIds: [Int]? + + // Reusable PLR=0 scratch for T=1 multimodal positions. + private var prlZerosT1: MLMultiArray? 
+ + // MARK: - Init / Load + + public init(config: Config = Config()) { + self.cfg = config + } + + /// Drop the cross-turn KV cache. Call when chat history clears, + /// the vision/audio prefix changes, or any other prompt-prefix + /// invariant breaks. + public func resetPersistedState() { + decodeState1 = nil + decodeState2 = nil + prefillState1 = nil + prefillState2 = nil + persistedInputIds = [] + persistedPosition = 0 + } + + public func load(modelDirectory: URL) async throws { + resetPersistedState() + modelDir = modelDirectory + let mc = try ModelConfig.load(from: modelDirectory) + modelConfig = mc + + let mcfg = MLModelConfiguration() + mcfg.computeUnits = cfg.computeUnits + + decodeChunk1 = try openChunk("chunk_1", in: modelDirectory, cfg: mcfg) + decodeChunk2 = try openChunk("chunk_2", in: modelDirectory, cfg: mcfg) + decodeChunk3 = try openChunk("chunk_3", in: modelDirectory, cfg: mcfg) + + // 4-chunk vs 3-chunk detection: chunk_4 present → 4-chunk; + // chunk_3.token_id output → 3-chunk merged final. + let chunk4Mlc = modelDirectory.appendingPathComponent("chunk_4.mlmodelc") + let chunk4Pkg = modelDirectory.appendingPathComponent("chunk_4.mlpackage") + let has4 = FileManager.default.fileExists(atPath: chunk4Mlc.path) + || FileManager.default.fileExists(atPath: chunk4Pkg.path) + if has4 { + decodeChunk4 = try openChunk("chunk_4", in: modelDirectory, cfg: mcfg) + is4Chunk = true + print("[Gemma4MM] 4-chunk decode layout (chunk_2 own / chunk_3 shared / chunk_4 final)") + } else { + print("[Gemma4MM] 3-chunk merged decode layout (chunk_3 = final)") + } + + // T=288 prefill chunks live under prefill_T288/ in the bundle + // layout. Failing to find them is fatal — this engine has no + // T=1 prefill fallback (the legacy engine handles that path). 
+ let pfDir = modelDirectory.appendingPathComponent("prefill_T288") + prefillChunk1 = try openChunk( + "chunk_1_prefill_T288", in: pfDir, cfg: mcfg) + if is4Chunk { + prefillChunk2 = try openChunk( + "chunk_2_prefill_T288", in: pfDir, cfg: mcfg) + prefillChunk3 = try openChunk( + "chunk_3_prefill_T288", in: pfDir, cfg: mcfg) + prefillChunk4 = try openChunk( + "chunk_4_prefill_T288", in: pfDir, cfg: mcfg) + } else { + prefillChunk2 = try openChunk( + "chunk_2_3way_prefill_T288", in: pfDir, cfg: mcfg) + prefillChunk3 = try openChunk( + "chunk_3_prefill_T288", in: pfDir, cfg: mcfg) + } + print("[Gemma4MM] T=\(Self.kPrefillT) prefill chunks loaded " + + "(\(is4Chunk ? "4-chunk" : "3-chunk merged"))") + + embedTokens = try EmbeddingLookup( + dataURL: modelDirectory.appendingPathComponent("embed_tokens_q8.bin"), + scalesURL: modelDirectory.appendingPathComponent("embed_tokens_scales.bin"), + vocabSize: mc.vocabSize, dim: mc.hiddenSize, scale: mc.embedScale) + embedTokensPerLayer = try EmbeddingLookup( + dataURL: modelDirectory.appendingPathComponent("embed_tokens_per_layer_q8.bin"), + scalesURL: modelDirectory.appendingPathComponent("embed_tokens_per_layer_scales.bin"), + vocabSize: mc.vocabSize, + dim: mc.numLayers * mc.perLayerDim, + scale: mc.perLayerEmbedScale) + + cosSlidingTable = try? Data( + contentsOf: modelDirectory.appendingPathComponent("cos_sliding.npy"), + options: .mappedIfSafe) + sinSlidingTable = try? Data( + contentsOf: modelDirectory.appendingPathComponent("sin_sliding.npy"), + options: .mappedIfSafe) + cosFullTable = try? Data( + contentsOf: modelDirectory.appendingPathComponent("cos_full.npy"), + options: .mappedIfSafe) + sinFullTable = try? 
Data( + contentsOf: modelDirectory.appendingPathComponent("sin_full.npy"), + options: .mappedIfSafe) + + let ctx = mc.contextLength + let W = mc.slidingWindow + maskFull = try MLMultiArray( + shape: [1, 1, 1, NSNumber(value: ctx)], dataType: .float16) + maskSliding = try MLMultiArray( + shape: [1, 1, 1, NSNumber(value: W)], dataType: .float16) + posScratch = try MLMultiArray(shape: [1], dataType: .int32) + ringScratch = try MLMultiArray(shape: [1], dataType: .int32) + fvMaskFull = MLFeatureValue(multiArray: maskFull) + fvMaskSliding = MLFeatureValue(multiArray: maskSliding) + fvPos = MLFeatureValue(multiArray: posScratch) + fvRing = MLFeatureValue(multiArray: ringScratch) + + try ensureBatchScratch(T: Self.kPrefillT) + + loadMultimodalEncoders(modelDirectory: modelDirectory) + } + + private func openChunk(_ name: String, in dir: URL, + cfg: MLModelConfiguration) throws -> MLModel { + let mlc = dir.appendingPathComponent("\(name).mlmodelc") + let pkg = dir.appendingPathComponent("\(name).mlpackage") + + let url: URL + if FileManager.default.fileExists(atPath: mlc.path) { + url = mlc + } else if FileManager.default.fileExists(atPath: pkg.path) { + url = try MLModel.compileModel(at: pkg) + } else { + throw CoreMLLLMError.modelNotFound( + "\(name).mlmodelc/.mlpackage not found in \(dir.path)") + } + + // Try the requested compute units first. iPhone ANE 18 has been + // observed to fail MIL→EIR translation on some merged chunks + // (`std::bad_cast` in `_ANECompiler::ANECCompile()`); fall back to + // CPU+GPU so the engine still loads. Gates per-chunk via env + // var `LLM_GEMMA4MM_FORCE_GPU=<name>[,<name>...]`. + let envForceGPU = ProcessInfo.processInfo + .environment["LLM_GEMMA4MM_FORCE_GPU"] ?? 
"" + let forced = envForceGPU.split(separator: ",") + .map { String($0).trimmingCharacters(in: .whitespaces) } + if forced.contains(name) { + print("[Gemma4MM] \(name) — LLM_GEMMA4MM_FORCE_GPU forces cpuAndGPU") + let gpu = MLModelConfiguration() + gpu.computeUnits = .cpuAndGPU + return try MLModel(contentsOf: url, configuration: gpu) + } + do { + return try MLModel(contentsOf: url, configuration: cfg) + } catch { + print("[Gemma4MM] \(name) load failed on \(cfg.computeUnits.rawValue): \(error). Retrying on cpuAndGPU.") + let gpu = MLModelConfiguration() + gpu.computeUnits = .cpuAndGPU + return try MLModel(contentsOf: url, configuration: gpu) + } + } + + // MARK: - Multimodal encoder loading (ported from Stage 6 02ac583) + + private func loadMultimodalEncoders(modelDirectory: URL) { + let forceANE = ProcessInfo.processInfo.environment["LLM_VISION_FORCE_ANE"] == "1" + let visionANEv2Compiled = modelDirectory.appendingPathComponent("vision.ane.v2.mlmodelc") + let visionANECompiled = modelDirectory.appendingPathComponent("vision.ane.mlmodelc") + let visionANEPkg = modelDirectory.appendingPathComponent("vision.ane.mlpackage") + let visionCompiled = modelDirectory.appendingPathComponent("vision.mlmodelc") + let visionPkg = modelDirectory.appendingPathComponent("vision.mlpackage") + if forceANE, FileManager.default.fileExists(atPath: visionANEv2Compiled.path) { + visionModelURL = visionANEv2Compiled; visionUsesANEBuild = true + } else if forceANE, FileManager.default.fileExists(atPath: visionANECompiled.path) { + visionModelURL = visionANECompiled; visionUsesANEBuild = true + } else if forceANE, FileManager.default.fileExists(atPath: visionANEPkg.path) { + visionModelURL = visionANEPkg; visionUsesANEBuild = true + } else if FileManager.default.fileExists(atPath: visionCompiled.path) { + visionModelURL = visionCompiled + } else if FileManager.default.fileExists(atPath: visionPkg.path) { + visionModelURL = visionPkg + } else if FileManager.default.fileExists(atPath: 
visionANEv2Compiled.path) { + visionModelURL = visionANEv2Compiled; visionUsesANEBuild = true + } else if FileManager.default.fileExists(atPath: visionANECompiled.path) { + visionModelURL = visionANECompiled; visionUsesANEBuild = true + } else if FileManager.default.fileExists(atPath: visionANEPkg.path) { + visionModelURL = visionANEPkg; visionUsesANEBuild = true + } + if let url = visionModelURL { + let cfg = MLModelConfiguration() + cfg.computeUnits = visionUsesANEBuild ? .cpuAndNeuralEngine : .cpuAndGPU + visionConfig = cfg + print("[Gemma4MM/Vision] selected \(url.lastPathComponent) → \(visionUsesANEBuild ? "ANE" : "GPU")") + prewarmVisionInBackground() + } + + let videoVisionCompiled = modelDirectory.appendingPathComponent("vision_video.mlmodelc") + let videoVisionPkg = modelDirectory.appendingPathComponent("vision_video.mlpackage") + if FileManager.default.fileExists(atPath: videoVisionCompiled.path) { + videoVisionModelURL = videoVisionCompiled + } else if FileManager.default.fileExists(atPath: videoVisionPkg.path) { + videoVisionModelURL = videoVisionPkg + } + if videoVisionModelURL != nil { + let cfg = MLModelConfiguration() + cfg.computeUnits = .cpuAndGPU + videoVisionConfig = cfg + } + + let audioCompiled = modelDirectory.appendingPathComponent("audio.mlmodelc") + let audioPkg = modelDirectory.appendingPathComponent("audio.mlpackage") + if FileManager.default.fileExists(atPath: audioCompiled.path) { + audioModelURL = audioCompiled + } else if FileManager.default.fileExists(atPath: audioPkg.path) { + audioModelURL = audioPkg + } + if audioModelURL != nil { + let cfg = MLModelConfiguration() + cfg.computeUnits = .cpuAndGPU + audioConfig = cfg + + let melURL = modelDirectory.appendingPathComponent("mel_filterbank.bin") + if FileManager.default.fileExists(atPath: melURL.path) { + melFilterbank = try? 
AudioProcessor.loadMelFilterbank(from: melURL) + } + let projURL = modelDirectory.appendingPathComponent("output_proj_weight.npy") + if FileManager.default.fileExists(atPath: projURL.path) { + audioProjection = try? AudioProcessor.ProjectionWeights.load(from: modelDirectory) + } + let audioConfURL = modelDirectory.appendingPathComponent("audio_config.json") + if let data = try? Data(contentsOf: audioConfURL), + let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] { + audioMelFrames = json["mel_frames"] as? Int ?? 200 + audioNumTokensConfig = json["num_tokens"] as? Int ?? 188 + if let mf = json["log_offset"] as? Double { + audioMelFloor = Float(mf) + } else if let mf = json["mel_floor"] as? Double { + audioMelFloor = Float(mf) + } + } + } + + if hasVision || hasAudio || hasVideoVision { + print("[Gemma4MM] multimodal encoders: vision=\(hasVision) " + + "video=\(hasVideoVision) audio=\(hasAudio)") + } + } + + private func prewarmVisionInBackground() { + guard let url = visionModelURL, let cfg = visionConfig, + !visionUsesANEBuild else { return } + DispatchQueue.global(qos: .utility).async { [weak self] in + do { + let t0 = CFAbsoluteTimeGetCurrent() + let m = try MLModel(contentsOf: url, configuration: cfg) + self?.visionModel = m + let pd = 16 * 16 * 3 + let total = 2520 + let pv = try MLMultiArray( + shape: [1, NSNumber(value: total), NSNumber(value: pd)], + dataType: .float32) + let pid = try MLMultiArray( + shape: [1, NSNumber(value: total), 2], dataType: .int32) + let pidp = pid.dataPointer.bindMemory( + to: Int32.self, capacity: total * 2) + var k = 0 + for py in 0..<48 { + for px in 0..<48 { + pidp[k * 2] = Int32(px) + pidp[k * 2 + 1] = Int32(py) + k += 1 + } + } + for i in (48 * 48).. 
MLMultiArray { + if visionModel == nil, let url = visionModelURL, let cfg = visionConfig { + visionModel = try MLModel(contentsOf: url, configuration: cfg) + } + guard let vm = visionModel else { throw CoreMLLLMError.visionNotAvailable } + return visionUsesANEBuild + ? try ImageProcessor.processANE(image, with: vm) + : try ImageProcessor.process(image, with: vm) + } + + public func processVideoFrame(_ image: CGImage) throws -> MLMultiArray { + if videoVisionModel == nil, let url = videoVisionModelURL, let cfg = videoVisionConfig { + videoVisionModel = try MLModel(contentsOf: url, configuration: cfg) + } + guard let vm = videoVisionModel else { throw CoreMLLLMError.visionNotAvailable } + return try ImageProcessor.processVideoFrame(image, with: vm) + } + + public func processAudio(_ samples: [Float]) throws -> (MLMultiArray, Int) { + if audioModel == nil, let url = audioModelURL, let cfg = audioConfig { + audioModel = try MLModel(contentsOf: url, configuration: cfg) + } + guard let am = audioModel else { throw CoreMLLLMError.audioNotAvailable } + guard let mel = melFilterbank else { throw CoreMLLLMError.audioNotAvailable } + + let padLeft = 160 + let paddedLen = padLeft + samples.count + let unfoldSize = 321 + let actualMelFrames = max(0, (paddedLen - unfoldSize) / 160 + 1) + let afterConv1 = (actualMelFrames + 1) / 2 + let actualTokens = min((afterConv1 + 1) / 2, audioNumTokensConfig) + + let features = try AudioProcessor.process( + samples, with: am, melFilterbank: mel, + targetFrames: audioMelFrames, projection: audioProjection, + melFloor: audioMelFloor) + return (features, actualTokens) + } + + // MARK: - Multimodal mask + splice helpers + + private func computeVisionGroupIds(inputIds: [Int32]) -> [Int] { + var ids = [Int](repeating: -1, count: inputIds.count) + var current = -1 + var prev = false + for i in 0.. 
MLMultiArray { + if let buf = prlZerosT1 { return buf } + guard let mc = modelConfig else { + throw CoreMLLLMError.modelNotFound("no config") + } + let dim = mc.numLayers * mc.perLayerDim + let arr = try MLMultiArray( + shape: [1, 1, NSNumber(value: dim)], dataType: .float16) + memset(arr.dataPointer, 0, dim * MemoryLayout.stride) + prlZerosT1 = arr + return arr + } + + private func multimodalSpliceT1(token: Int32) -> MLMultiArray? { + guard let mc = modelConfig else { return nil } + if (token == Self.IMAGE_TOKEN_ID || token == Self.VIDEO_TOKEN_ID), + let img = mmImageFeatures, mmImageIdx < mmImageNumTokens { + let row = ImageProcessor.sliceFeature( + img, at: mmImageIdx, hiddenSize: mc.hiddenSize) + mmImageIdx += 1 + return row + } + if token == Self.AUDIO_TOKEN_ID, + let aud = mmAudioFeatures, mmAudioIdx < mmAudioNumTokens { + let row = AudioProcessor.sliceFeature( + aud, at: mmAudioIdx, hiddenSize: mc.hiddenSize) + mmAudioIdx += 1 + return row + } + return nil + } + + // MARK: - Mask + position helpers (T=1 decode) + + private func fillFullCausalMask(position: Int) { + let ctx = modelConfig!.contextLength + let dst = maskFull.dataPointer.bindMemory(to: UInt16.self, capacity: ctx) + let neg = Float16(-65504).bitPattern + let p = min(max(position, 0), ctx - 1) + for i in 0..= 0 && i < groupIds.count && groupIds[i] == pGroup + mp[i] = (causal || sameGroup) ? 0 : neg + } + } + + private func fillSlidingCausalMaskVisionAware(position p: Int, groupIds: [Int]) { + let W = modelConfig!.slidingWindow + let neg = Float16(-65504).bitPattern + let mp = maskSliding.dataPointer.bindMemory(to: UInt16.self, capacity: W) + if p >= W { + for i in 0..= 0 && i < groupIds.count && groupIds[i] == pGroup + mp[i] = (causal || sameGroup) ? 
0 : neg + } + } + + private func setPos(_ pos: Int) { + posScratch.dataPointer.bindMemory(to: Int32.self, capacity: 1)[0] = Int32(pos) + } + + private func setRing(_ pos: Int) { + let W = modelConfig!.slidingWindow + ringScratch.dataPointer.bindMemory(to: Int32.self, capacity: 1)[0] = Int32(pos % W) + } + + private func lookupRoPE(table: Data?, position: Int, dim: Int) throws -> MLMultiArray { + let result = try MLMultiArray( + shape: [1, 1, 1, NSNumber(value: dim)], dataType: .float16) + let dst = result.dataPointer.bindMemory(to: UInt16.self, capacity: dim) + guard let table else { + memset(dst, 0, dim * MemoryLayout.stride); return result + } + var headerSize = 128 + table.withUnsafeBytes { raw in + let b = raw.baseAddress!.assumingMemoryBound(to: UInt8.self) + headerSize = 10 + (Int(b[8]) | (Int(b[9]) << 8)) + } + let rowBytes = dim * MemoryLayout.stride + let offset = headerSize + position * rowBytes + guard offset + rowBytes <= table.count else { + memset(dst, 0, rowBytes); return result + } + _ = table.withUnsafeBytes { raw in + memcpy(dst, raw.baseAddress!.advanced(by: offset), rowBytes) + } + return result + } + + // MARK: - Batched (T=288) prefill scratch + + private func ensureBatchScratch(T: Int) throws { + guard let mc = modelConfig else { return } + let H = mc.hiddenSize + let PL = mc.numLayers * mc.perLayerDim + let ctx = mc.contextLength + let W = mc.slidingWindow + let hdS = 256 + let hdF = 512 + if batchHidden == nil || batchHidden!.shape[1].intValue != T { + batchHidden = try MLMultiArray( + shape: [1, NSNumber(value: T), NSNumber(value: H)], + dataType: .float16) + batchPerLayerRaw = try MLMultiArray( + shape: [1, NSNumber(value: T), NSNumber(value: PL)], + dataType: .float16) + batchMaskFull = try MLMultiArray( + shape: [1, 1, NSNumber(value: T), NSNumber(value: ctx)], + dataType: .float16) + batchMaskSliding = try MLMultiArray( + shape: [1, 1, NSNumber(value: T), NSNumber(value: W)], + dataType: .float16) + batchCosS = try MLMultiArray( + 
shape: [1, 1, NSNumber(value: T), NSNumber(value: hdS)], + dataType: .float16) + batchSinS = try MLMultiArray( + shape: [1, 1, NSNumber(value: T), NSNumber(value: hdS)], + dataType: .float16) + batchCosF = try MLMultiArray( + shape: [1, 1, NSNumber(value: T), NSNumber(value: hdF)], + dataType: .float16) + batchSinF = try MLMultiArray( + shape: [1, 1, NSNumber(value: T), NSNumber(value: hdF)], + dataType: .float16) + } + } + + /// Fill T-row causal masks for a contiguous batch starting at + /// `startPos`. Padded rows (t >= validCount) are filled as + /// duplicates of row validCount-1 so the auto-emit at row T-1 is + /// the model's prediction for the LAST valid prompt token (= + /// first post-prompt token). + private func fillBatchMasks(startPos: Int, T: Int, validCount: Int, + groupIds: [Int]?) { + let ctx = modelConfig!.contextLength + let W = modelConfig!.slidingWindow + let neg = Float16(-65504).bitPattern + let mf = batchMaskFull!.dataPointer.bindMemory( + to: UInt16.self, capacity: T * ctx) + let ms = batchMaskSliding!.dataPointer.bindMemory( + to: UInt16.self, capacity: T * W) + let effectiveT = max(validCount, 1) + for t in 0..= 0 && i < (groupIds?.count ?? 0) + && groupIds![i] == pGroup + mf[t * ctx + i] = (causal || sameGroup) ? 0 : neg + } + if p < W { + let valid = min(p + 1, W) + for i in 0..= 0 && i < (groupIds?.count ?? 0) + && groupIds![i] == pGroup + ms[t * W + i] = (causal || sameGroup) ? 0 : neg + } + } else { + for i in 0..= validCount) duplicate row validCount-1 so the + /// chunk graph sees a coherent batch where the auto-emit at row + /// T-1 corresponds to the LAST valid prompt position. 
+ private func fillBatchRoPE(table: Data?, dst: MLMultiArray, + startPos: Int, T: Int, validCount: Int, + dim: Int) { + let p = dst.dataPointer.bindMemory(to: UInt16.self, capacity: T * dim) + guard let table else { memset(p, 0, T * dim * 2); return } + var headerSize = 128 + table.withUnsafeBytes { raw in + let b = raw.baseAddress!.assumingMemoryBound(to: UInt8.self) + headerSize = 10 + (Int(b[8]) | (Int(b[9]) << 8)) + } + let rowBytes = dim * MemoryLayout.stride + let effectiveT = max(validCount, 1) + for t in 0...stride) + } + } + } + } + + // MARK: - Reusable feature provider + + private final class FeatureProvider: NSObject, MLFeatureProvider { + let map: [String: MLFeatureValue] + let featureNames: Set + init(_ map: [String: MLFeatureValue]) { + self.map = map + self.featureNames = Set(map.keys) + } + func featureValue(for name: String) -> MLFeatureValue? { map[name] } + } + + // MARK: - T=1 decode step (3 chunks) + + private func decodeStep(token: Int32, position: Int, + opts: MLPredictionOptions) async throws -> Int32 { + guard let mc = modelConfig, + let c1 = decodeChunk1, let c2 = decodeChunk2, let c3 = decodeChunk3, + let s1 = decodeState1, let s2 = decodeState2 else { + throw CoreMLLLMError.modelNotFound("decode chunks/states not loaded") + } + + let hidden: MLMultiArray + let perLayerRaw: MLMultiArray + if let mmRow = multimodalSpliceT1(token: token) { + hidden = mmRow + perLayerRaw = try prlZerosT1Buffer() + } else { + hidden = try embedTokens!.lookup( + Int(token), shape: [1, 1, NSNumber(value: mc.hiddenSize)]) + perLayerRaw = try embedTokensPerLayer!.lookup( + Int(token), + shape: [1, 1, NSNumber(value: mc.numLayers * mc.perLayerDim)]) + } + + if let groupIds = mmVisionGroupIds { + fillFullCausalMaskVisionAware(position: position, groupIds: groupIds) + fillSlidingCausalMaskVisionAware(position: position, groupIds: groupIds) + } else { + fillFullCausalMask(position: position) + fillSlidingCausalMask(position: position) + } + setPos(position) + 
setRing(position) + + let cosS = try lookupRoPE(table: cosSlidingTable, position: position, dim: 256) + let sinS = try lookupRoPE(table: sinSlidingTable, position: position, dim: 256) + let cosF = try lookupRoPE(table: cosFullTable, position: position, dim: 512) + let sinF = try lookupRoPE(table: sinFullTable, position: position, dim: 512) + + let p1 = FeatureProvider([ + "hidden_states": MLFeatureValue(multiArray: hidden), + "causal_mask_full": fvMaskFull, + "causal_mask_sliding": fvMaskSliding, + "per_layer_raw": MLFeatureValue(multiArray: perLayerRaw), + "cos_s": MLFeatureValue(multiArray: cosS), + "sin_s": MLFeatureValue(multiArray: sinS), + "cos_f": MLFeatureValue(multiArray: cosF), + "sin_f": MLFeatureValue(multiArray: sinF), + "current_pos": fvPos, + "ring_pos": fvRing, + ]) + let out1 = try await c1.prediction(from: p1, using: s1, options: opts) + guard let h1 = out1.featureValue(for: "hidden_states_out"), + let plc = out1.featureValue(for: "per_layer_combined_out") + else { throw CoreMLLLMError.modelNotFound("decode chunk_1 missing outputs") } + + let p2 = FeatureProvider([ + "hidden_states": h1, + "causal_mask_full": fvMaskFull, + "causal_mask_sliding": fvMaskSliding, + "per_layer_combined": plc, + "cos_s": MLFeatureValue(multiArray: cosS), + "sin_s": MLFeatureValue(multiArray: sinS), + "cos_f": MLFeatureValue(multiArray: cosF), + "sin_f": MLFeatureValue(multiArray: sinF), + "current_pos": fvPos, + "ring_pos": fvRing, + ]) + let out2 = try await c2.prediction(from: p2, using: s2, options: opts) + guard let h2 = out2.featureValue(for: "hidden_states_out"), + let kv13k = out2.featureValue(for: "kv13_k"), + let kv13v = out2.featureValue(for: "kv13_v"), + let kv14k = out2.featureValue(for: "kv14_k"), + let kv14v = out2.featureValue(for: "kv14_v") + else { throw CoreMLLLMError.modelNotFound("decode chunk_2 missing outputs") } + + var sharedInputs: [String: MLFeatureValue] = [ + "causal_mask_full": fvMaskFull, + "causal_mask_sliding": fvMaskSliding, + 
"per_layer_combined": plc, + "cos_s": MLFeatureValue(multiArray: cosS), + "sin_s": MLFeatureValue(multiArray: sinS), + "cos_f": MLFeatureValue(multiArray: cosF), + "sin_f": MLFeatureValue(multiArray: sinF), + "kv13_k": kv13k, "kv13_v": kv13v, + "kv14_k": kv14k, "kv14_v": kv14v, + ] + var p3map = sharedInputs + p3map["hidden_states"] = h2 + let out3 = try await c3.prediction(from: FeatureProvider(p3map), options: opts) + if !is4Chunk { + guard let tokFV = out3.featureValue(for: "token_id"), + let tokArr = tokFV.multiArrayValue + else { throw CoreMLLLMError.modelNotFound("decode chunk_3 (3-chunk final) no token_id") } + return tokArr.dataPointer.bindMemory(to: Int32.self, capacity: 1)[0] + } + // 4-chunk: chunk_3 = KV-shared no lm_head; chunk_4 = KV-shared + lm_head. + guard let h3 = out3.featureValue(for: "hidden_states_out") else { + throw CoreMLLLMError.modelNotFound("decode chunk_3 missing hidden_states_out") + } + guard let c4 = decodeChunk4 else { + throw CoreMLLLMError.modelNotFound("decode chunk_4 not loaded") + } + var p4map = sharedInputs + p4map["hidden_states"] = h3 + let out4 = try await c4.prediction(from: FeatureProvider(p4map), options: opts) + guard let tokFV = out4.featureValue(for: "token_id"), + let tokArr = tokFV.multiArrayValue + else { throw CoreMLLLMError.modelNotFound("decode chunk_4 no token_id") } + return tokArr.dataPointer.bindMemory(to: Int32.self, capacity: 1)[0] + } + + // MARK: - T=288 single-function prefill pass + + /// One prefill pass over inputIds[startBatch ..< startBatch+validCount] + /// at sequence positions [position, position+validCount). Padded to + /// T=288 with -inf source masks. Returns the next token (chunk_3 + /// argmax for batch row validCount-1). 
+ private func prefillStepT288(inputIds: [Int32], startBatch: Int, + position: Int, validCount: Int, + opts: MLPredictionOptions) async throws -> Int32 { + guard let mc = modelConfig, + let c1 = prefillChunk1, let c2 = prefillChunk2, let c3 = prefillChunk3, + let s1 = prefillState1, let s2 = prefillState2, + let embed = embedTokens, let perLayer = embedTokensPerLayer + else { throw CoreMLLLMError.modelNotFound("prefill T=288 not loaded") } + + let T = Self.kPrefillT + precondition(validCount > 0 && validCount <= T, + "validCount=\(validCount) out of (0, \(T)]") + try ensureBatchScratch(T: T) + let H = mc.hiddenSize + let PL = mc.numLayers * mc.perLayerDim + + let hPtr = batchHidden!.dataPointer.bindMemory( + to: UInt16.self, capacity: T * H) + let plPtr = batchPerLayerRaw!.dataPointer.bindMemory( + to: UInt16.self, capacity: T * PL) + let imgRowPtr = mmImageFeatures?.dataPointer.bindMemory( + to: UInt16.self, capacity: mmImageFeatures?.count ?? 0) + let audRowPtr = mmAudioFeatures?.dataPointer.bindMemory( + to: UInt16.self, capacity: mmAudioFeatures?.count ?? 0) + + // Pack valid rows (real tokens) + zero-pad the tail. + for t in 0...stride) + memset(plPtr.advanced(by: t * PL), 0, + PL * MemoryLayout.stride) + mmImageIdx += 1 + } else if let audPtr = audRowPtr, + tokInt32 == Self.AUDIO_TOKEN_ID, + mmAudioIdx < mmAudioNumTokens { + memcpy(hPtr.advanced(by: t * H), + audPtr.advanced(by: mmAudioIdx * H), + H * MemoryLayout.stride) + memset(plPtr.advanced(by: t * PL), 0, + PL * MemoryLayout.stride) + mmAudioIdx += 1 + } else { + let row = try embed.lookup(tok, shape: [1, 1, NSNumber(value: H)]) + memcpy(hPtr.advanced(by: t * H), + row.dataPointer, H * MemoryLayout.stride) + let plRow = try perLayer.lookup( + tok, shape: [1, 1, NSNumber(value: PL)]) + memcpy(plPtr.advanced(by: t * PL), + plRow.dataPointer, PL * MemoryLayout.stride) + } + } + // Pad rows [validCount..T-1] by duplicating row validCount-1. 
+ // Same hidden + per_layer_raw — combined with mask/RoPE that + // pin padded rows to position validCount-1, the chunk graph + // computes row T-1's output identical to row validCount-1's, + // making the chunk_3 argmax at row T-1 a valid prediction + // of the first post-prompt token. Multimodal counters do NOT + // advance for padded rows (they already advanced for the + // validCount real-token rows above). + if validCount < T && validCount > 0 { + let srcRowH = hPtr.advanced(by: (validCount - 1) * H) + let srcRowPLR = plPtr.advanced(by: (validCount - 1) * PL) + for t in validCount...stride) + memcpy(plPtr.advanced(by: t * PL), srcRowPLR, + PL * MemoryLayout.stride) + } + } + + fillBatchMasks(startPos: position, T: T, + validCount: validCount, groupIds: mmVisionGroupIds) + fillBatchRoPE(table: cosSlidingTable, dst: batchCosS!, + startPos: position, T: T, validCount: validCount, dim: 256) + fillBatchRoPE(table: sinSlidingTable, dst: batchSinS!, + startPos: position, T: T, validCount: validCount, dim: 256) + fillBatchRoPE(table: cosFullTable, dst: batchCosF!, + startPos: position, T: T, validCount: validCount, dim: 512) + fillBatchRoPE(table: sinFullTable, dst: batchSinF!, + startPos: position, T: T, validCount: validCount, dim: 512) + setPos(position) + setRing(position) + + let fvHidden = MLFeatureValue(multiArray: batchHidden!) + let fvPLR = MLFeatureValue(multiArray: batchPerLayerRaw!) + let fvMF = MLFeatureValue(multiArray: batchMaskFull!) + let fvMS = MLFeatureValue(multiArray: batchMaskSliding!) + let fvCS = MLFeatureValue(multiArray: batchCosS!) + let fvSS = MLFeatureValue(multiArray: batchSinS!) + let fvCF = MLFeatureValue(multiArray: batchCosF!) + let fvSF = MLFeatureValue(multiArray: batchSinF!) 
+ + let p1 = FeatureProvider([ + "hidden_states": fvHidden, + "causal_mask_full": fvMF, + "causal_mask_sliding": fvMS, + "per_layer_raw": fvPLR, + "cos_s": fvCS, "sin_s": fvSS, + "cos_f": fvCF, "sin_f": fvSF, + "current_pos": fvPos, "ring_pos": fvRing, + ]) + let out1 = try await c1.prediction(from: p1, using: s1, options: opts) + guard let h1 = out1.featureValue(for: "hidden_states_out"), + let plc = out1.featureValue(for: "per_layer_combined_out") + else { throw CoreMLLLMError.modelNotFound("prefill T=288 chunk_1 missing outputs") } + + let p2 = FeatureProvider([ + "hidden_states": h1, + "causal_mask_full": fvMF, + "causal_mask_sliding": fvMS, + "per_layer_combined": plc, + "cos_s": fvCS, "sin_s": fvSS, + "cos_f": fvCF, "sin_f": fvSF, + "current_pos": fvPos, "ring_pos": fvRing, + ]) + let out2 = try await c2.prediction(from: p2, using: s2, options: opts) + guard let h2 = out2.featureValue(for: "hidden_states_out"), + let kv13k = out2.featureValue(for: "kv13_k"), + let kv13v = out2.featureValue(for: "kv13_v"), + let kv14k = out2.featureValue(for: "kv14_k"), + let kv14v = out2.featureValue(for: "kv14_v") + else { throw CoreMLLLMError.modelNotFound("prefill T=288 chunk_2 missing outputs") } + + var sharedInputs: [String: MLFeatureValue] = [ + "causal_mask_full": fvMF, + "causal_mask_sliding": fvMS, + "per_layer_combined": plc, + "cos_s": fvCS, "sin_s": fvSS, + "cos_f": fvCF, "sin_f": fvSF, + "kv13_k": kv13k, "kv13_v": kv13v, + "kv14_k": kv14k, "kv14_v": kv14v, + ] + var p3map = sharedInputs + p3map["hidden_states"] = h2 + let out3 = try await c3.prediction(from: FeatureProvider(p3map), options: opts) + if !is4Chunk { + guard let tokFV = out3.featureValue(for: "token_id"), + let tokArr = tokFV.multiArrayValue + else { throw CoreMLLLMError.modelNotFound("prefill T=288 chunk_3 (3-chunk final) no token_id") } + return tokArr.dataPointer.bindMemory(to: Int32.self, capacity: 1)[0] + } + // 4-chunk: chunk_3 emits hidden_states_out only; chunk_4 emits token_id. 
+ guard let h3 = out3.featureValue(for: "hidden_states_out") else { + throw CoreMLLLMError.modelNotFound("prefill T=288 chunk_3 missing hidden_states_out") + } + guard let c4 = prefillChunk4 else { + throw CoreMLLLMError.modelNotFound("prefill T=288 chunk_4 not loaded") + } + var p4map = sharedInputs + p4map["hidden_states"] = h3 + let out4 = try await c4.prediction(from: FeatureProvider(p4map), options: opts) + guard let tokFV = out4.featureValue(for: "token_id"), + let tokArr = tokFV.multiArrayValue + else { throw CoreMLLLMError.modelNotFound("prefill T=288 chunk_4 no token_id") } + // chunk_4 emits argmax at batch row T-1. When validCount < T + // we replicate row validCount-1 across padded rows, so row T-1 + // is functionally identical to row validCount-1 and the + // argmax is the valid first-post-prompt-token prediction. + return tokArr.dataPointer.bindMemory(to: Int32.self, capacity: 1)[0] + } + + // MARK: - Generate (T=288 prefill + bridge + T=1 decode) + + /// Run the prompt through T=288 prefill passes, bridge KV state + /// into the decode chunks, then T=1 decode up to maxNewTokens. + /// imageFeatures / audioFeatures are pre-encoded by the caller + /// (typically LLMRunner) via processImage / processAudio. + public func generate(inputIds: [Int32], + imageFeatures: MLMultiArray? = nil, + imageNumTokens: Int = 0, + audioFeatures: MLMultiArray? = nil, + audioNumTokens: Int = 0, + maxNewTokens: Int = 512, + eosTokenIds: Set = [], + onToken: ((Int32) -> Void)? 
= nil + ) async throws -> [Int32] { + guard let mc = modelConfig else { + throw CoreMLLLMError.modelNotFound("Gemma4MM: no config") + } + guard let c1 = decodeChunk1, let c2 = decodeChunk2, + decodeChunk3 != nil, + let pc1 = prefillChunk1, let pc2 = prefillChunk2, + prefillChunk3 != nil + else { throw CoreMLLLMError.modelNotFound("Gemma4MM: not loaded") } + if inputIds.isEmpty { return [] } + if inputIds.count >= mc.contextLength { + throw CoreMLLLMError.modelNotFound( + "prompt (\(inputIds.count) tokens) >= ctx (\(mc.contextLength))") + } + + // Bind multimodal state for the duration. + mmImageFeatures = imageFeatures + mmImageNumTokens = imageNumTokens + mmAudioFeatures = audioFeatures + mmAudioNumTokens = audioNumTokens + mmImageIdx = 0 + mmAudioIdx = 0 + let hasMultimodal = imageFeatures != nil || audioFeatures != nil + mmVisionGroupIds = hasMultimodal ? computeVisionGroupIds(inputIds: inputIds) : nil + defer { + mmImageFeatures = nil + mmAudioFeatures = nil + mmImageNumTokens = 0 + mmAudioNumTokens = 0 + mmImageIdx = 0 + mmAudioIdx = 0 + mmVisionGroupIds = nil + } + + // Cross-turn resume: persisted decode state is reusable; if + // persistedInputIds is a strict prefix of inputIds, skip the + // prefix and only T=288-prefill the suffix. Prefill states + // are scratch — always rebuilt for the suffix. + var resumeAt = 0 + let canResume = decodeState1 != nil && decodeState2 != nil + && !persistedInputIds.isEmpty + if canResume { + let cap = min(persistedInputIds.count, inputIds.count) + var l = 0 + while l < cap && persistedInputIds[l] == inputIds[l] { l += 1 } + if l == persistedInputIds.count && l < inputIds.count && l > 0 { + resumeAt = l + } + } + + // Advance multimodal counters past resumed prefix. + if resumeAt > 0 && hasMultimodal { + for j in 0.. 0 { + // Always-fresh prefill states for this generate call. 
+ prefillState1 = pc1.makeState() + prefillState2 = pc2.makeState() + + let T = Self.kPrefillT + var i = resumeAt + while i < inputIds.count { + let remaining = inputIds.count - i + let validCount = min(remaining, T) + prefillPredicted = try await prefillStepT288( + inputIds: inputIds, startBatch: i, position: position, + validCount: validCount, opts: opts) + position += validCount + lastToken = inputIds[i + validCount - 1] + i += validCount + passes += 1 + } + + // Bridge prefill KV → decode KV (full buffer memcpy each). + if let ps1 = prefillState1, let ds1 = decodeState1 { + bridgeKVState(from: ps1, to: ds1) + } + if let ps2 = prefillState2, let ds2 = decodeState2 { + bridgeKVState(from: ps2, to: ds2) + } + // Drop prefill states — they're rebuilt next generate(). + prefillState1 = nil + prefillState2 = nil + } + let prefillEnd = CFAbsoluteTimeGetCurrent() + + // The last prefill pass always auto-emits a valid next token + // (full batch — padded rows duplicate row validCount-1, so + // row T-1's argmax is the first-post-prompt-token prediction). + var decoded: [Int32] = [] + if maxNewTokens > 0 && suffixCount > 0 { + decoded.append(prefillPredicted) + onToken?(prefillPredicted) + lastToken = prefillPredicted + } + while decoded.count < maxNewTokens { + if eosTokenIds.contains(lastToken) { break } + if position >= mc.contextLength { break } + let next = try await decodeStep( + token: lastToken, position: position, opts: opts) + decoded.append(next) + onToken?(next) + lastToken = next + position += 1 + } + let t1 = CFAbsoluteTimeGetCurrent() + + // Persist consumed tokens for next-turn LCP match. 
+ let consumed = decoded.dropLast() + var newPersisted = inputIds + newPersisted.append(contentsOf: consumed) + persistedInputIds = newPersisted + persistedPosition = newPersisted.count + + let prefillMs = (prefillEnd - t0) * 1000 + let decodeMs = (t1 - prefillEnd) * 1000 + if decodeMs > 0 && decoded.count > 1 { + lastDecodeTokensPerSecond = Double(decoded.count - 1) / (decodeMs / 1000) + } + let resumeTag = resumeAt > 0 ? " [resumed L=\(resumeAt)]" : "" + print("[Gemma4MM] prefill \(suffixCount) tok in " + + String(format: "%.0fms (%.1f tok/s)%@ [T=288 passes=%d] | decode %d tok in %.0fms (%.1f tok/s)", + prefillMs, + Double(max(suffixCount, 1)) / max(prefillMs / 1000, 1e-3), + resumeTag, passes, + decoded.count, decodeMs, lastDecodeTokensPerSecond)) + return decoded + } +} diff --git a/Sources/CoreMLLLM/ModelDownloader.swift b/Sources/CoreMLLLM/ModelDownloader.swift index 7ac9200..f215012 100644 --- a/Sources/CoreMLLLM/ModelDownloader.swift +++ b/Sources/CoreMLLLM/ModelDownloader.swift @@ -288,6 +288,34 @@ public final class ModelDownloader: NSObject { downloadURL: "", folderName: "gemma4-e4b-stateful-linear") + /// Gemma 4 E2B (stateful Linear decode + T=288 single-function + /// prefill + vision/video/audio multimodal). Stage 8 candidate. + /// Decode reuses the 3-chunk merged Linear bundle from + /// `gemma4e2bStatefulLinear`; prefill is a separate set of three + /// T=288 single-function mlpackages under `prefill_T288/`. After + /// each prefill pass the engine memcpys kv_cache_sliding + + /// kv_cache_full from the prefill MLState into the decode + /// MLState (multifunction T>1 + dual MLState is rejected by + /// iPhone ANE 18 — single-function works). Bundle ships under + /// `gemma4_e2b_stateful_chunks/` so the engine layout matches + /// the existing stateful entries. + /// Sideload-only until iPhone 17 Pro Phase B validation closes. 
+ public static let gemma4e2bStatefulMultimodal = ModelInfo( + id: "gemma4-e2b-stateful-multimodal", + name: "Gemma 4 E2B (stateful, multimodal)", size: "4.0 GB", + downloadURL: "", + folderName: "gemma4-e2b-stateful-multimodal") + + /// Gemma 4 E4B (stateful Linear decode + T=288 prefill + + /// multimodal). Same engine class as the E2B variant; layer + /// counts come from `model_config.json` so the runtime is + /// dimension-agnostic. Sideload-only until iPhone validation. + public static let gemma4e4bStatefulMultimodal = ModelInfo( + id: "gemma4-e4b-stateful-multimodal", + name: "Gemma 4 E4B (stateful, multimodal)", size: "5.0 GB", + downloadURL: "", + folderName: "gemma4-e4b-stateful-multimodal") + /// Visible in the UI picker. EAGLE-3 / LookAhead probe variants are /// hidden unless `LLM_SHOW_EXPERIMENTAL=1` is set (or the /// UserDefaults key `showExperimentalModels` is true). Keeps the @@ -324,6 +352,8 @@ public final class ModelDownloader: NSObject { list.insert(gemma4e2bStateful, at: 5) // Conv2d variant list.insert(gemma4e4bStateful, at: 6) // E4B Stage 2 Conv2d list.insert(gemma4e4bStatefulLinear, at: 7) // E4B Stage 2 Linear + list.insert(gemma4e2bStatefulMultimodal, at: 8) // Stage 8 E2B + list.insert(gemma4e4bStatefulMultimodal, at: 9) // Stage 8 E4B } return list } diff --git a/Sources/gemma4mm-smoke/main.swift b/Sources/gemma4mm-smoke/main.swift new file mode 100644 index 0000000..260a558 --- /dev/null +++ b/Sources/gemma4mm-smoke/main.swift @@ -0,0 +1,92 @@ +// Mac smoke test for Gemma4StatefulMultimodalEngine. +// +// Usage: +// swift run -c release gemma4mm-smoke [prompt] [maxTokens] +// +// `bundle-dir` should be the directory containing chunk_{1,2,3}.mlmodelc +// and a `prefill_T288/` subdir — i.e. the inner +// `gemma4_e2b_stateful_chunks/` folder, NOT the outer parent. +// +// Produces text-only output (no image/audio attachment). 
Used to catch +// engine bugs without needing an iPhone roundtrip — Mac ANE compiler is +// more permissive than iPhone's, so chunk_2 is expected to load here +// even when iPhone fails MIL→EIR translation. + +import CoreML +import CoreMLLLM +import Foundation +import Tokenizers + +@main +struct Gemma4MMSmoke { + static func main() async { + let args = CommandLine.arguments + guard args.count >= 2 else { + fputs("usage: \(args[0]) [prompt] [maxTokens]\n", stderr) + exit(2) + } + let bundleDir = URL(fileURLWithPath: args[1]) + let prompt = args.count >= 3 + ? args[2] + : "Write three short sentences about the ocean." + let maxTokens = args.count >= 4 ? (Int(args[3]) ?? 64) : 64 + + do { + print("[smoke] bundle: \(bundleDir.path)") + // Tokenizer. + let hfDir = bundleDir.appendingPathComponent("hf_model") + let tok = try await AutoTokenizer.from(modelFolder: hfDir) + print("[smoke] tokenizer loaded") + + // Engine. + let engine = Gemma4StatefulMultimodalEngine() + let t0 = CFAbsoluteTimeGetCurrent() + try await engine.load(modelDirectory: bundleDir) + let loadDt = CFAbsoluteTimeGetCurrent() - t0 + print(String(format: "[smoke] engine loaded in %.1fs", loadDt)) + print("[smoke] hasVision=\(engine.hasVision) hasAudio=\(engine.hasAudio)") + + // Build a Gemma 4 chat prompt (single user turn). 
+ let promptStr = "<|turn>user\n\(prompt)\n<|turn>model\n" + let inputIds = tok.encode(text: promptStr).map { Int32($0) } + print("[smoke] input_ids = \(inputIds.count) tokens") + print("[smoke] prompt: \(prompt)") + print("[smoke] max_tokens=\(maxTokens)") + + var eosSet: Set = [1, 106] + if let eid = tok.eosTokenId { eosSet.insert(Int32(eid)) } + let skipSet: Set = [1, 105, 106] + + var accum: [Int] = [] + var emittedString = "" + var totalEmitted = 0 + let genStart = CFAbsoluteTimeGetCurrent() + _ = try await engine.generate( + inputIds: inputIds, + maxNewTokens: maxTokens, + eosTokenIds: eosSet, + onToken: { tokenId in + if skipSet.contains(tokenId) { return } + accum.append(Int(tokenId)) + let current = tok.decode(tokens: accum) + if current.count > emittedString.count { + let delta = String( + current.suffix(current.count - emittedString.count)) + FileHandle.standardOutput.write(Data(delta.utf8)) + emittedString = current + } + totalEmitted += 1 + }) + let dt = CFAbsoluteTimeGetCurrent() - genStart + print("\n---") + print(String(format: "[smoke] decode wall = %.2fs", dt)) + print(String(format: "[smoke] last decode tok/s = %.2f", + engine.lastDecodeTokensPerSecond)) + print("[smoke] tokens emitted (non-skipped): \(totalEmitted)") + exit(0) + } catch { + fputs("[smoke] error: \(error)\n", stderr) + exit(1) + } + } +} diff --git a/conversion/build_gemma4_stateful_singlefunc_prefill.py b/conversion/build_gemma4_stateful_singlefunc_prefill.py index 7da6681..c26580f 100644 --- a/conversion/build_gemma4_stateful_singlefunc_prefill.py +++ b/conversion/build_gemma4_stateful_singlefunc_prefill.py @@ -82,12 +82,16 @@ from build_gemma4_e2b_stateful_chunks import ( _resolve_hf_dir, convert_chunk1_prefill, + convert_chunk2_prefill, convert_chunk_shared_prefill, ) from build_gemma4_e2b_stateful_3chunks import convert_chunk2_merged_prefill from models.gemma4 import Gemma4Model from models.gemma4_swa_chunks import compute_chunk_boundaries -from 
models.gemma4_swa_stateful_chunks import SWAStatefulChunk4Prefill +from models.gemma4_swa_stateful_chunks import ( + SWAStatefulChunk3Prefill, + SWAStatefulChunk4Prefill, +) def main(): @@ -108,9 +112,17 @@ def main(): ap.add_argument("--linear-projections", action="store_true", help="Plan 3 Linear projections (cml9 PR #2577) — " "default on for Stage 3 / Stage 8 ship parity.") - ap.add_argument("--only", choices=("chunk1", "chunk2_3way", "chunk3"), + ap.add_argument("--only", choices=("chunk1", "chunk2_3way", "chunk3", + "chunk2_own", "chunk3_shared", + "chunk4_final"), default=None, - help="Build only one chunk (debug; default builds all 3).") + help="Build only one chunk (debug).") + ap.add_argument("--four-chunk", action="store_true", + help="Build 4-chunk variant: chunk_1, chunk_2 (own only), " + "chunk_3 (KV-shared no lm_head), chunk_4 (KV-shared " + "+ lm_head). Use when E4B chunk_2 merged graph is " + "rejected by iPhone ANE 18 (std::bad_cast at " + "MIL→EIR translation). Default off (3-chunk merged).") args = ap.parse_args() if args.output is None: @@ -140,11 +152,19 @@ def main(): shared_range = boundaries[2] c4_start, c4_end = boundaries[3] - paths = { - "chunk1": os.path.join(args.output, f"chunk_1_prefill_T{args.t}.mlpackage"), - "chunk2_3way": os.path.join(args.output, f"chunk_2_3way_prefill_T{args.t}.mlpackage"), - "chunk3": os.path.join(args.output, f"chunk_3_prefill_T{args.t}.mlpackage"), - } + if args.four_chunk: + paths = { + "chunk1": os.path.join(args.output, f"chunk_1_prefill_T{args.t}.mlpackage"), + "chunk2_own": os.path.join(args.output, f"chunk_2_prefill_T{args.t}.mlpackage"), + "chunk3_shared": os.path.join(args.output, f"chunk_3_prefill_T{args.t}.mlpackage"), + "chunk4_final": os.path.join(args.output, f"chunk_4_prefill_T{args.t}.mlpackage"), + } + else: + paths = { + "chunk1": os.path.join(args.output, f"chunk_1_prefill_T{args.t}.mlpackage"), + "chunk2_3way": os.path.join(args.output, f"chunk_2_3way_prefill_T{args.t}.mlpackage"), + 
"chunk3": os.path.join(args.output, f"chunk_3_prefill_T{args.t}.mlpackage"), + } t0 = time.time() if args.only in (None, "chunk1"): @@ -156,28 +176,71 @@ def main(): nbits=args.nbits, use_linear=args.linear_projections, ) - if args.only in (None, "chunk2_3way"): - convert_chunk2_merged_prefill( - base=base, ctx=args.ctx, T=args.t, - out_path=paths["chunk2_3way"], - nbits=args.nbits, - use_linear=args.linear_projections, - own_range=own_range, - shared_range=shared_range, - ) - if args.only in (None, "chunk3"): - convert_chunk_shared_prefill( - chunk_cls=SWAStatefulChunk4Prefill, - base=base, - c_start=c4_start, c_end=c4_end, - ctx=args.ctx, T=args.t, - out_path=paths["chunk3"], - nbits=args.nbits, - use_linear=args.linear_projections, - ) + if args.four_chunk: + # 4-chunk path: chunk_2 = own only, chunk_3 = KV-shared (no lm_head), + # chunk_4 = KV-shared + lm_head. Splits the 3-chunk merged middle so + # each subgraph stays under iPhone ANE 18 compile budget. + own_start, own_end = own_range + shared_start, shared_end = shared_range + if args.only in (None, "chunk2_own"): + convert_chunk2_prefill( + base=base, + c_start=own_start, c_end=own_end, + ctx=args.ctx, T=args.t, + out_path=paths["chunk2_own"], + nbits=args.nbits, + use_linear=args.linear_projections, + ) + if args.only in (None, "chunk3_shared"): + convert_chunk_shared_prefill( + chunk_cls=SWAStatefulChunk3Prefill, + base=base, + c_start=shared_start, c_end=shared_end, + ctx=args.ctx, T=args.t, + out_path=paths["chunk3_shared"], + nbits=args.nbits, + name="CHUNK 3 (KV-shared, no lm_head)", + with_lm_head=False, + use_linear=args.linear_projections, + ) + if args.only in (None, "chunk4_final"): + convert_chunk_shared_prefill( + chunk_cls=SWAStatefulChunk4Prefill, + base=base, + c_start=c4_start, c_end=c4_end, + ctx=args.ctx, T=args.t, + out_path=paths["chunk4_final"], + nbits=args.nbits, + name="CHUNK 4 (KV-shared + lm_head)", + with_lm_head=True, + use_linear=args.linear_projections, + ) + else: + if 
args.only in (None, "chunk2_3way"): + convert_chunk2_merged_prefill( + base=base, ctx=args.ctx, T=args.t, + out_path=paths["chunk2_3way"], + nbits=args.nbits, + use_linear=args.linear_projections, + own_range=own_range, + shared_range=shared_range, + ) + if args.only in (None, "chunk3"): + convert_chunk_shared_prefill( + chunk_cls=SWAStatefulChunk4Prefill, + base=base, + c_start=c4_start, c_end=c4_end, + ctx=args.ctx, T=args.t, + out_path=paths["chunk3"], + nbits=args.nbits, + name="CHUNK 3 (final)", + with_lm_head=True, + use_linear=args.linear_projections, + ) print(f"\n[build] DONE in {time.time()-t0:.0f}s") print("=" * 60) - print(f"3-chunk merged stateful single-function prefill (T={args.t}):") + layout = "4-chunk" if args.four_chunk else "3-chunk merged" + print(f"{layout} stateful single-function prefill (T={args.t}):") for label, path in paths.items(): if not os.path.exists(path): continue diff --git a/conversion/models/gemma4_swa_stateful_chunks.py b/conversion/models/gemma4_swa_stateful_chunks.py index 3f42bd2..4e3dcf3 100644 --- a/conversion/models/gemma4_swa_stateful_chunks.py +++ b/conversion/models/gemma4_swa_stateful_chunks.py @@ -193,14 +193,17 @@ def _run_layer_swa_stateful( V_for_attn = V_sliding_slice # Producer alias outputs — same kv13/kv14 naming as the recurrent - # build so chunks 3/4 see no input-name change. These are slice - # views over the producer's just-updated state buffer. + # build so chunks 3/4 see no input-name change. .clone() forces + # a fresh tensor (rather than a slice-view over the just-updated + # state buffer): iPhone ANE 18 fails MIL→EIR translation with + # `std::bad_cast` when a state-slice is used as both an input + # to subsequent shared layers AND a public chunk output. 
if layer_idx == config.kv_sliding_producer: - kv_store_13_k = K_for_attn[..., :config.head_dim] - kv_store_13_v = V_for_attn[..., :config.head_dim] + kv_store_13_k = K_for_attn[..., :config.head_dim].clone() + kv_store_13_v = V_for_attn[..., :config.head_dim].clone() elif layer_idx == config.kv_full_producer: - kv_store_14_k = K_for_attn[..., :config.global_head_dim] - kv_store_14_v = V_for_attn[..., :config.global_head_dim] + kv_store_14_k = K_for_attn[..., :config.global_head_dim].clone() + kv_store_14_v = V_for_attn[..., :config.global_head_dim].clone() else: # Shared layer: read producer KV from the alias inputs. if is_full: @@ -587,11 +590,11 @@ def _run_layer_swa_stateful_prefill( V_for_attn = V_sliding_slice if layer_idx == config.kv_sliding_producer: - kv_store_13_k = K_for_attn[..., :config.head_dim] - kv_store_13_v = V_for_attn[..., :config.head_dim] + kv_store_13_k = K_for_attn[..., :config.head_dim].clone() + kv_store_13_v = V_for_attn[..., :config.head_dim].clone() elif layer_idx == config.kv_full_producer: - kv_store_14_k = K_for_attn[..., :config.global_head_dim] - kv_store_14_v = V_for_attn[..., :config.global_head_dim] + kv_store_14_k = K_for_attn[..., :config.global_head_dim].clone() + kv_store_14_v = V_for_attn[..., :config.global_head_dim].clone() else: if is_full: K_for_attn = kv_store_14_k @@ -1067,11 +1070,11 @@ def _run_layer_swa_stateful_single( V_for_attn = kv_cache_unified[2*oi+1:2*oi+2, :, :W, :hd] if layer_idx == config.kv_sliding_producer: - kv_store_13_k = K_for_attn[..., :config.head_dim] - kv_store_13_v = V_for_attn[..., :config.head_dim] + kv_store_13_k = K_for_attn[..., :config.head_dim].clone() + kv_store_13_v = V_for_attn[..., :config.head_dim].clone() elif layer_idx == config.kv_full_producer: - kv_store_14_k = K_for_attn[..., :config.global_head_dim] - kv_store_14_v = V_for_attn[..., :config.global_head_dim] + kv_store_14_k = K_for_attn[..., :config.global_head_dim].clone() + kv_store_14_v = V_for_attn[..., 
:config.global_head_dim].clone() else: if is_full: K_for_attn = kv_store_14_k @@ -1182,11 +1185,11 @@ def _run_layer_swa_stateful_prefill_single( V_for_attn = kv_cache_unified[2*oi+1:2*oi+2, :, :W, :hd] if layer_idx == config.kv_sliding_producer: - kv_store_13_k = K_for_attn[..., :config.head_dim] - kv_store_13_v = V_for_attn[..., :config.head_dim] + kv_store_13_k = K_for_attn[..., :config.head_dim].clone() + kv_store_13_v = V_for_attn[..., :config.head_dim].clone() elif layer_idx == config.kv_full_producer: - kv_store_14_k = K_for_attn[..., :config.global_head_dim] - kv_store_14_v = V_for_attn[..., :config.global_head_dim] + kv_store_14_k = K_for_attn[..., :config.global_head_dim].clone() + kv_store_14_v = V_for_attn[..., :config.global_head_dim].clone() else: if is_full: K_for_attn = kv_store_14_k diff --git a/scripts/assemble_gemma4_stateful_multimodal.sh b/scripts/assemble_gemma4_stateful_multimodal.sh new file mode 100755 index 0000000..53e74ff --- /dev/null +++ b/scripts/assemble_gemma4_stateful_multimodal.sh @@ -0,0 +1,211 @@ +#!/bin/bash +# Assemble the Gemma 4 stateful + multimodal bundle for iPhone sideload. +# Stage 8: 3-chunk merged Linear decode + T=288 single-function prefill + +# vision / video / audio encoders. Drives Gemma4StatefulMultimodalEngine. 
+# +# Layout produced (matches LLMRunner detection — chunks + prefill_T288/ +# subdir under gemma4_e2b_stateful_chunks/): +# +# build/gemma4_stateful_multimodal_e{2,4}b/ +# gemma4_e2b_stateful_chunks/ # subdir name shared with E2B/E4B +# chunk_{1..3}.mlmodelc (3-chunk merged decode) +# prefill_T288/ +# chunk_1_prefill_T288.mlmodelc +# chunk_2_3way_prefill_T288.mlmodelc +# chunk_3_prefill_T288.mlmodelc +# embed_tokens_q8.bin (sidecars from legacy bundle) +# embed_tokens_scales.bin +# embed_tokens_per_layer_q8.bin +# embed_tokens_per_layer_scales.bin +# per_layer_projection.bin +# per_layer_norm_weight.bin +# cos_sliding.npy / sin_sliding.npy / cos_full.npy / sin_full.npy +# hf_model/ (tokenizer files) +# model_config.json +# vision.mlmodelc (multimodal encoders, shared) +# vision_video.mlmodelc +# audio.mlmodelc +# mel_filterbank.bin (audio sidecars) +# audio_config.json +# output_proj_weight.npy +# output_proj_bias.npy +# embed_proj_weight.npy +# +# Usage: +# MODEL=gemma4-e2b bash scripts/assemble_gemma4_stateful_multimodal.sh +# MODEL=gemma4-e4b bash scripts/assemble_gemma4_stateful_multimodal.sh +# +# Inputs (overridable via env): +# SRC_CHUNKS /tmp/$MODEL-stateful-3chunk +# SRC_PREFILL_T288 /tmp/$MODEL-singlefunc-prefill-T288 +# SIDECARS legacy text-only bundle (embed/RoPE/tokenizer) +# ENCODERS legacy multimodal bundle (vision/audio mlmodelc) +# — vision/audio shared between E2B and E4B +# +# Push: +# DEVICE= +# xcrun devicectl device copy to --device $DEVICE \ +# --domain-type appDataContainer \ +# --domain-identifier com.example.CoreMLLLMChat \ +# --source build/gemma4_stateful_multimodal_e4b \ +# --destination Documents/Models/gemma4-e4b-stateful-multimodal +# +# Scheme: LLM_SHOW_EXPERIMENTAL=1 to reveal the picker entry. +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/.." 
&& pwd)" +MODEL="${MODEL:-gemma4-e4b}" +case "$MODEL" in + gemma4-e2b) SHORT=e2b ;; + gemma4-e4b) SHORT=e4b ;; + *) echo "[error] MODEL must be gemma4-e2b or gemma4-e4b" >&2; exit 1 ;; +esac + +SRC_CHUNKS="${SRC_CHUNKS:-/tmp/$MODEL-stateful-3chunk}" +SRC_PREFILL_T288="${SRC_PREFILL_T288:-/tmp/$MODEL-singlefunc-prefill-T288}" +# Text-side sidecars (embed_tokens, RoPE, tokenizer, model_config). E2B +# defaults to the iphone_8k staging dir; E4B defaults to its own legacy +# bundle (text-only — vision/audio come from ENCODERS below). +if [[ "$MODEL" == "gemma4-e2b" ]]; then + SIDECARS="${SIDECARS:-/Users/majimadaisuke/Downloads/CoreML-LLM/conversion/output/iphone_8k}" +else + SIDECARS="${SIDECARS:-/Users/majimadaisuke/Downloads/CoreML-LLM/output/$MODEL/bundle}" +fi +# Multimodal encoders + audio sidecars. Shared between E2B and E4B (same +# SigLIP + Conformer regardless of LM size). +ENCODERS="${ENCODERS:-/Users/majimadaisuke/Downloads/CoreML-LLM/conversion/output/iphone_8k}" +# mel_filterbank.bin lives in a separate dir in some build trees; the +# script falls back to this path if it's not under ENCODERS. +MEL_FALLBACK="${MEL_FALLBACK:-/Users/majimadaisuke/Downloads/CoreML-LLM/conversion/output/audio}" + +OUT_PARENT="${OUT_PARENT:-$ROOT/build/gemma4_stateful_multimodal_$SHORT}" +OUT="$OUT_PARENT/gemma4_e2b_stateful_chunks" +PREFILL_OUT="$OUT/prefill_T288" + +# Set FOUR_CHUNK=1 to assemble the 4-chunk decode + 4-chunk prefill_T288 +# variant (E4B fallback when 3-chunk merged trips iPhone ANE 18). Requires +# `--four-chunk` builds: SRC_CHUNKS holds chunk_{1..4}.mlpackage and +# SRC_PREFILL_T288 holds chunk_{1..4}_prefill_T288.mlpackage. 
+FOUR_CHUNK="${FOUR_CHUNK:-0}" +if [[ "$FOUR_CHUNK" == "1" ]]; then + DECODE_CHUNKS=(chunk_1 chunk_2 chunk_3 chunk_4) + PREFILL_CHUNKS=(chunk_1_prefill_T288 chunk_2_prefill_T288 + chunk_3_prefill_T288 chunk_4_prefill_T288) +else + DECODE_CHUNKS=(chunk_1 chunk_2 chunk_3) + PREFILL_CHUNKS=(chunk_1_prefill_T288 chunk_2_3way_prefill_T288 + chunk_3_prefill_T288) +fi + +# ---- Sanity: required inputs ---- +for d in "$SRC_CHUNKS" "$SRC_PREFILL_T288" "$SIDECARS" "$ENCODERS"; do + if [[ ! -d "$d" ]]; then + echo "[error] missing input dir: $d" >&2 + exit 1 + fi +done +for c in "${DECODE_CHUNKS[@]}"; do + if [[ ! -d "$SRC_CHUNKS/${c}.mlpackage" && ! -d "$SRC_CHUNKS/${c}.mlmodelc" ]]; then + echo "[error] $SRC_CHUNKS/${c}.{mlpackage,mlmodelc} missing — run build_gemma4_e2b_stateful_{,3}chunks.py first" >&2 + exit 1 + fi +done +for c in "${PREFILL_CHUNKS[@]}"; do + if [[ ! -d "$SRC_PREFILL_T288/${c}.mlpackage" && ! -d "$SRC_PREFILL_T288/${c}.mlmodelc" ]]; then + echo "[error] $SRC_PREFILL_T288/${c}.{mlpackage,mlmodelc} missing — run build_gemma4_stateful_singlefunc_prefill.py first" >&2 + exit 1 + fi +done + +rm -rf "$OUT_PARENT" +mkdir -p "$OUT" "$PREFILL_OUT" + +# ---- 1. Compile + place decode mlpackages ---- +for c in "${DECODE_CHUNKS[@]}"; do + echo "[compile decode] $c" + if [[ -d "$SRC_CHUNKS/${c}.mlmodelc" ]]; then + cp -R "$SRC_CHUNKS/${c}.mlmodelc" "$OUT/${c}.mlmodelc" + else + xcrun coremlcompiler compile \ + "$SRC_CHUNKS/${c}.mlpackage" "$OUT/" 2>&1 | tail -2 + fi +done + +# ---- 2. Compile + place T=288 single-function prefill mlpackages ---- +for c in "${PREFILL_CHUNKS[@]}"; do + echo "[compile prefill_T288] $c" + if [[ -d "$SRC_PREFILL_T288/${c}.mlmodelc" ]]; then + cp -R "$SRC_PREFILL_T288/${c}.mlmodelc" "$PREFILL_OUT/${c}.mlmodelc" + else + xcrun coremlcompiler compile \ + "$SRC_PREFILL_T288/${c}.mlpackage" "$PREFILL_OUT/" 2>&1 | tail -2 + fi +done + +# ---- 3. 
Copy text-side sidecars ---- +SIDE_ITEMS=( + "embed_tokens_q8.bin" + "embed_tokens_scales.bin" + "embed_tokens_per_layer_q8.bin" + "embed_tokens_per_layer_scales.bin" + "per_layer_projection.bin" + "per_layer_norm_weight.bin" + "cos_sliding.npy" + "sin_sliding.npy" + "cos_full.npy" + "sin_full.npy" + "hf_model" + "model_config.json" +) +for item in "${SIDE_ITEMS[@]}"; do + if [[ -e "$SIDECARS/$item" ]]; then + echo "[copy text sidecar] $item" + cp -R "$SIDECARS/$item" "$OUT/" + else + echo " [warn] missing text sidecar $item" + fi +done + +# ---- 4. Copy multimodal encoders + audio sidecars ---- +ENC_ITEMS=( + "vision.mlmodelc" + "vision_video.mlmodelc" + "audio.mlmodelc" + "mel_filterbank.bin" + "audio_config.json" + "output_proj_weight.npy" + "output_proj_bias.npy" + "embed_proj_weight.npy" +) +for item in "${ENC_ITEMS[@]}"; do + if [[ -e "$ENCODERS/$item" ]]; then + echo "[copy encoder] $item" + cp -R "$ENCODERS/$item" "$OUT/" + elif [[ "$item" == "mel_filterbank.bin" && -e "$MEL_FALLBACK/$item" ]]; then + echo "[copy encoder fallback] $item (from $MEL_FALLBACK)" + cp -R "$MEL_FALLBACK/$item" "$OUT/" + else + echo " [warn] missing encoder $item (engine treats as optional)" + fi +done + +echo "" +echo "=== assembled ===" +du -sh "$OUT_PARENT" +echo "" +echo "Top-level:" +ls -la "$OUT/" | head -30 +echo "" +echo "prefill_T288/:" +ls -la "$PREFILL_OUT/" + +echo "" +echo "Push to iPhone:" +echo " DEVICE=\$(xcrun devicectl list devices --quiet | awk 'NR==2{print \$3}')" +echo " xcrun devicectl device copy to --device \$DEVICE \\" +echo " --domain-type appDataContainer \\" +echo " --domain-identifier com.example.CoreMLLLMChat \\" +echo " --source $OUT_PARENT \\" +echo " --destination Documents/Models/$MODEL-stateful-multimodal" +echo "" +echo "Scheme: LLM_SHOW_EXPERIMENTAL=1 to reveal the picker entry." 
From 5f5d71a59f8f694d5eeefb92db17509d407eb059 Mon Sep 17 00:00:00 2001 From: john-rocky Date: Sun, 3 May 2026 10:54:24 +0900 Subject: [PATCH 7/7] feat(picker): gemma4e4bMultimodal + LLM_VISION_FORCE_ANE default in shared scheme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires the HF-uploaded multimodal bundle into the in-app picker flow so users can download `mlboydaisuke/gemma-4-E4B-multimodal-coreml` with one tap (no sideload required). ModelDownloader.swift: - New `gemma4e4bMultimodal` ModelInfo entry (id `gemma4-e4b-multimodal`, size 7.6 GB, downloadURL points at the new HF repo). Shared `folderName: "gemma4-e4b"` with the legacy text-only entry mirrors the gemma4e2b3way / gemma4e2b pattern: chunks 1-4 are byte-identical in both repos, so users who switch between entries reuse the on-disk legacy chunks and only fetch the new files. - `gemma4e4b` (text-only) renamed to "Gemma 4 E4B (text-only)" to disambiguate from the new multimodal entry in the picker. - New `buildE4BMultimodalFileList()` enumerates 58 files matching the HF repo tree (decode chunks 1-4 + chunk2_3way + chunk3_3way + vision.ane.mlmodelc + audio.mlmodelc + audio sidecars + text sidecars). Splits files into legacyChunk(no metadata.json) vs newerMlc(with metadata.json) helpers — the legacy chunks were built before the metadata.json convention. - Defaults list inserts `gemma4e4bMultimodal` ahead of `gemma4e4b` so the picker presents multimodal as the primary E4B option. CoreMLLLMChat.xcscheme: - Add `LLM_VISION_FORCE_ANE=1` to the shared scheme. Safe to default — only affects models whose bundle ships a `vision.ane.mlmodelc` (the new E4B multimodal entry); other models silently fall through to their existing GPU `vision.mlmodelc`. - Add `LLM_SHOW_EXPERIMENTAL=1`. Required to expose the experimental picker entries (already documented in `ModelDownloader.swift`'s `defaults`). 
- Drop `LLM_PROFILE_EVERY_STEP=1` from the shared scheme; debug-only, belongs in a developer's local copy. --- .../xcschemes/CoreMLLLMChat.xcscheme | 12 ++ Sources/CoreMLLLM/ModelDownloader.swift | 138 +++++++++++++++++- 2 files changed, 148 insertions(+), 2 deletions(-) diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat.xcodeproj/xcshareddata/xcschemes/CoreMLLLMChat.xcscheme b/Examples/CoreMLLLMChat/CoreMLLLMChat.xcodeproj/xcshareddata/xcschemes/CoreMLLLMChat.xcscheme index ab73af6..7158d8a 100644 --- a/Examples/CoreMLLLMChat/CoreMLLLMChat.xcodeproj/xcshareddata/xcschemes/CoreMLLLMChat.xcscheme +++ b/Examples/CoreMLLLMChat/CoreMLLLMChat.xcodeproj/xcshareddata/xcschemes/CoreMLLLMChat.xcscheme @@ -50,6 +50,18 @@ ReferencedContainer = "container:CoreMLLLMChat.xcodeproj"> + + + + + + [DownloadFile] { + [.init(remotePath: "\(name).mlmodelc/weights/weight.bin", + localPath: "\(name).mlmodelc/weights/weight.bin", estimatedSize: weightSize), + .init(remotePath: "\(name).mlmodelc/coremldata.bin", + localPath: "\(name).mlmodelc/coremldata.bin", estimatedSize: 1_500), + .init(remotePath: "\(name).mlmodelc/model.mil", + localPath: "\(name).mlmodelc/model.mil", estimatedSize: milSize), + .init(remotePath: "\(name).mlmodelc/analytics/coremldata.bin", + localPath: "\(name).mlmodelc/analytics/coremldata.bin", estimatedSize: 250)] + } + // Newer chunks (3-way decode + encoders) include metadata.json. 
+ func newerMlc(_ name: String, weightSize: Int64, + milSize: Int64, metaSize: Int64) -> [DownloadFile] { + [.init(remotePath: "\(name).mlmodelc/weights/weight.bin", + localPath: "\(name).mlmodelc/weights/weight.bin", estimatedSize: weightSize), + .init(remotePath: "\(name).mlmodelc/coremldata.bin", + localPath: "\(name).mlmodelc/coremldata.bin", estimatedSize: 1_000), + .init(remotePath: "\(name).mlmodelc/model.mil", + localPath: "\(name).mlmodelc/model.mil", estimatedSize: milSize), + .init(remotePath: "\(name).mlmodelc/metadata.json", + localPath: "\(name).mlmodelc/metadata.json", estimatedSize: metaSize), + .init(remotePath: "\(name).mlmodelc/analytics/coremldata.bin", + localPath: "\(name).mlmodelc/analytics/coremldata.bin", estimatedSize: 250)] + } + + let chunkFiles = + legacyChunk("chunk1", weightSize: 585_970_432, milSize: 1_288_448) + + legacyChunk("chunk2", weightSize: 572_196_992, milSize: 1_277_032) + + legacyChunk("chunk3", weightSize: 412_740_736, milSize: 597_340) + + legacyChunk("chunk4", weightSize: 753_797_440, milSize: 608_413) + + newerMlc("chunk2_3way", weightSize: 984_936_000, + milSize: 917_977, metaSize: 8_741) + + newerMlc("chunk3_3way", weightSize: 753_797_440, + milSize: 303_969, metaSize: 6_697) + + newerMlc("vision.ane", weightSize: 342_227_200, + milSize: 709_941, metaSize: 2_694) + + newerMlc("audio", weightSize: 146_087_488, + milSize: 858_362, metaSize: 2_342) + + let extraFiles: [DownloadFile] = [ + .init(remotePath: "model_config.json", + localPath: "model_config.json", estimatedSize: 800), + .init(remotePath: "hf_model/tokenizer.json", + localPath: "hf_model/tokenizer.json", estimatedSize: 32_169_626), + .init(remotePath: "hf_model/tokenizer_config.json", + localPath: "hf_model/tokenizer_config.json", estimatedSize: 2_200), + .init(remotePath: "hf_model/config.json", + localPath: "hf_model/config.json", estimatedSize: 5_200), + .init(remotePath: "hf_model/generation_config.json", + localPath: "hf_model/generation_config.json", 
estimatedSize: 300), + .init(remotePath: "embed_tokens_q8.bin", + localPath: "embed_tokens_q8.bin", estimatedSize: 671_088_640), + .init(remotePath: "embed_tokens_scales.bin", + localPath: "embed_tokens_scales.bin", estimatedSize: 524_288), + .init(remotePath: "embed_tokens_per_layer_q8.bin", + localPath: "embed_tokens_per_layer_q8.bin", estimatedSize: 2_818_572_288), + .init(remotePath: "embed_tokens_per_layer_scales.bin", + localPath: "embed_tokens_per_layer_scales.bin", estimatedSize: 524_288), + .init(remotePath: "per_layer_projection.bin", + localPath: "per_layer_projection.bin", estimatedSize: 55_050_240), + .init(remotePath: "per_layer_norm_weight.bin", + localPath: "per_layer_norm_weight.bin", estimatedSize: 512), + .init(remotePath: "cos_sliding.npy", + localPath: "cos_sliding.npy", estimatedSize: 2_097_280), + .init(remotePath: "sin_sliding.npy", + localPath: "sin_sliding.npy", estimatedSize: 2_097_280), + .init(remotePath: "cos_full.npy", + localPath: "cos_full.npy", estimatedSize: 4_194_432), + .init(remotePath: "sin_full.npy", + localPath: "sin_full.npy", estimatedSize: 4_194_432), + // Audio sidecars (Swift two-stage projection). 
+ .init(remotePath: "audio_config.json", + localPath: "audio_config.json", estimatedSize: 400), + .init(remotePath: "mel_filterbank.bin", + localPath: "mel_filterbank.bin", estimatedSize: 131_584), + .init(remotePath: "output_proj_weight.npy", + localPath: "output_proj_weight.npy", estimatedSize: 3_145_856), + .init(remotePath: "output_proj_bias.npy", + localPath: "output_proj_bias.npy", estimatedSize: 3_200), + .init(remotePath: "embed_proj_weight.npy", + localPath: "embed_proj_weight.npy", estimatedSize: 7_864_448), + ] + + var largeFiles: [DownloadFile] = [] + var smallFiles: [DownloadFile] = [] + let threshold: Int64 = 10_000_000 + for file in chunkFiles + extraFiles { + if file.estimatedSize >= threshold { + largeFiles.append(file) + } else { + smallFiles.append(file) + } + } + largeFiles.sort { $0.estimatedSize > $1.estimatedSize } + pendingFiles = largeFiles + smallFiles + totalBytesForAllFiles = pendingFiles.reduce(0) { $0 + $1.estimatedSize } + completedBytes = 0 + nextFileIndex = 0 + } + // MARK: - ZIP private func unzipFile(_ zipURL: URL, to destDir: URL) throws {