diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift b/Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift index 6529514..08db204 100644 --- a/Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift +++ b/Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift @@ -235,9 +235,11 @@ struct ChatView: View { if runner.isLoaded { ToolbarItem(placement: .topBarTrailing) { Menu("Bench") { - Button("5 min") { startBenchmark(minutes: 5) } - Button("10 min") { startBenchmark(minutes: 10) } - Button("30 min") { startBenchmark(minutes: 30) } + Button("2 min (speed)") { startBenchmark(minutes: 2) } + Button("5 min") { startBenchmark(minutes: 5) } + Button("15 min (power)") { startBenchmark(minutes: 15) } + Button("30 min") { startBenchmark(minutes: 30) } + Button("60 min") { startBenchmark(minutes: 60) } } .disabled(runner.isGenerating || benchmarkRunning) } @@ -565,6 +567,15 @@ struct ChatView: View { let logLines = result.batteryLog.map { entry in " \(String(format: "%5.0f", entry.0))s → \(Int(entry.1 * 100))%" }.joined(separator: "\n") + let thermalLines = result.thermalTrajectory.map { s in + " \(String(format: "%5.0f", s.t))s → \(LLMRunner.thermalString(s.state)) bat=\(s.batteryLevel >= 0 ? "\(Int(s.batteryLevel * 100))%" : "?")" + }.joined(separator: "\n") + let ttf = result.timeToFair.map { "\(Int($0))s" } ?? "never" + let tts = result.timeToSerious.map { "\(Int($0))s" } ?? "never" + let mJ = result.mJPerToken + let mJStr = mJ > 0 ? String(format: "%.1f mJ/tok", mJ) : "n/a (gauge noise, need >=10 min run)" + let csvPath = saveBenchmarkCSV(result) + let csvLine = csvPath.map { "CSV : \($0)" } ?? 
"CSV : (save failed)" let summary = """ [Benchmark RESULT] Duration : \(Int(result.duration))s (\(String(format: "%.1f", result.duration / 60.0)) min) @@ -572,9 +583,15 @@ struct ChatView: View { Total tokens : \(result.totalTokens) Avg tok/s : \(String(format: "%.2f", result.avgTokPerSec)) Battery : \(bs)% → \(be)% (Δ \(String(format: "%.2f", result.drainedPercent))%) - Drain rate : \(String(format: "%.3f", result.drainedPerMinute))%/min + Drain rate : \(String(format: "%.3f", result.drainedPerMinute))%/min (~\(String(format: "%.1f", result.drainedPerHour))%/hr) Tokens/%SoC : \(String(format: "%.0f", result.tokensPerPercent)) + Energy/token : \(mJStr) Thermal : \(LLMRunner.thermalString(result.thermalStart)) → \(LLMRunner.thermalString(result.thermalEnd))\(abortNote) + Time→fair : \(ttf) + Time→serious : \(tts) + \(csvLine) + Thermal trajectory: + \(thermalLines) Battery log: \(logLines) """ @@ -589,6 +606,21 @@ struct ChatView: View { } } + private func saveBenchmarkCSV(_ result: LLMRunner.BenchmarkResult) -> String? 
{ + let fm = FileManager.default + guard let docs = fm.urls(for: .documentDirectory, in: .userDomainMask).first else { return nil } + let ts = Int(Date().timeIntervalSince1970) + let url = docs.appendingPathComponent("bench-\(ts).csv") + do { + try result.csv().write(to: url, atomically: true, encoding: .utf8) + print("[Benchmark] CSV saved: \(url.path)") + return url.lastPathComponent + } catch { + print("[Benchmark] CSV save failed: \(error)") + return nil + } + } + private func verifyANE() { messages.append(ChatMessage(role: .system, content: "Checking MLComputePlan device placement...")) Task.detached(priority: .userInitiated) { diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift index 96826a5..ce75469 100644 --- a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift +++ b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift @@ -1221,6 +1221,12 @@ final class LLMRunner { var thermal: ProcessInfo.ThermalState } + struct ThermalSample { + var t: TimeInterval + var state: ProcessInfo.ThermalState + var batteryLevel: Float + } + struct BenchmarkResult { var duration: TimeInterval var totalTokens: Int @@ -1232,11 +1238,59 @@ final class LLMRunner { var thermalEnd: ProcessInfo.ThermalState var abortedThermal: Bool = false var batteryLog: [(TimeInterval, Float)] = [] + var thermalTrajectory: [LLMRunner.ThermalSample] = [] + + // iPhone 17 Pro nominal battery capacity. Override for other devices. + // Source: Apple spec sheet (14.03 Wh = 50508 J). + var batteryCapacityWh: Double = 14.03 var batteryDelta: Float { batteryStart - batteryEnd } var drainedPercent: Double { Double(batteryDelta) * 100.0 } var drainedPerMinute: Double { duration > 0 ? drainedPercent / (duration / 60.0) : 0 } + var drainedPerHour: Double { drainedPerMinute * 60.0 } var tokensPerPercent: Double { drainedPercent > 0 ? 
Double(totalTokens) / drainedPercent : 0 } + + /// Energy per decoded token in millijoules, derived from battery-gauge delta. + /// Coarse (1% gauge resolution); trust only for runs >= 10 min. + var mJPerToken: Double { + guard totalTokens > 0, drainedPercent > 0 else { return 0 } + let joules = drainedPercent / 100.0 * batteryCapacityWh * 3600.0 + return joules * 1000.0 / Double(totalTokens) + } + + var timeToFair: TimeInterval? { + thermalTrajectory.first { $0.state == .fair || $0.state == .serious || $0.state == .critical }?.t + } + var timeToSerious: TimeInterval? { + thermalTrajectory.first { $0.state == .serious || $0.state == .critical }?.t + } + + func csv() -> String { + var lines = ["t_seconds,battery_pct,thermal_state,source"] + for s in thermalTrajectory { + let pct = s.batteryLevel >= 0 ? Int(s.batteryLevel * 100) : -1 + lines.append("\(Int(s.t)),\(pct),\(LLMRunner.thermalString(s.state)),thermal") + } + for (t, lvl) in batteryLog { + let pct = lvl >= 0 ? Int(lvl * 100) : -1 + lines.append("\(Int(t)),\(pct),,battery") + } + lines.append("") + lines.append("# summary") + lines.append("# duration_s=\(Int(duration))") + lines.append("# total_tokens=\(totalTokens)") + lines.append("# avg_tok_per_sec=\(String(format: "%.2f", avgTokPerSec))") + lines.append("# drained_percent=\(String(format: "%.2f", drainedPercent))") + lines.append("# drained_per_hour=\(String(format: "%.2f", drainedPerHour))") + lines.append("# mJ_per_token=\(String(format: "%.2f", mJPerToken))") + lines.append("# time_to_fair_s=\(timeToFair.map { String(Int($0)) } ?? "never")") + lines.append("# time_to_serious_s=\(timeToSerious.map { String(Int($0)) } ?? 
"never")") + lines.append("# thermal_start=\(LLMRunner.thermalString(thermalStart))") + lines.append("# thermal_end=\(LLMRunner.thermalString(thermalEnd))") + lines.append("# aborted_thermal=\(abortedThermal)") + lines.append("# battery_capacity_wh=\(batteryCapacityWh)") + return lines.joined(separator: "\n") + } } private static let benchmarkPrompt = @@ -1258,6 +1312,10 @@ final class LLMRunner { var abortedThermal = false var batteryLog: [(TimeInterval, Float)] = [(0, startBat)] var lastLoggedLevel = startBat + var thermalTrajectory: [ThermalSample] = [ + ThermalSample(t: 0, state: startThermal, batteryLevel: startBat) + ] + var nextThermalSampleAt: TimeInterval = 30 let prompt = ChatMessage(role: .user, content: Self.benchmarkPrompt) func isThermalUnsafe() -> Bool { @@ -1277,6 +1335,13 @@ final class LLMRunner { batteryLog.append((elapsed, currentLevel)) lastLoggedLevel = currentLevel } + if elapsed >= nextThermalSampleAt { + thermalTrajectory.append(ThermalSample( + t: elapsed, + state: ProcessInfo.processInfo.thermalState, + batteryLevel: currentLevel)) + nextThermalSampleAt += 30 + } if totalTokens % 20 == 0 { onProgress(BenchmarkProgress( elapsed: elapsed, totalTokens: totalTokens, round: round, @@ -1293,14 +1358,17 @@ final class LLMRunner { let endTime = Date() let endBat = UIDevice.current.batteryLevel + let endThermal = ProcessInfo.processInfo.thermalState let dur = endTime.timeIntervalSince(startTime) batteryLog.append((dur, endBat)) + thermalTrajectory.append(ThermalSample(t: dur, state: endThermal, batteryLevel: endBat)) return BenchmarkResult( duration: dur, totalTokens: totalTokens, rounds: round, avgTokPerSec: dur > 0 ? 
Double(totalTokens) / dur : 0, batteryStart: startBat, batteryEnd: endBat, - thermalStart: startThermal, thermalEnd: ProcessInfo.processInfo.thermalState, - abortedThermal: abortedThermal, batteryLog: batteryLog) + thermalStart: startThermal, thermalEnd: endThermal, + abortedThermal: abortedThermal, batteryLog: batteryLog, + thermalTrajectory: thermalTrajectory) } #endif diff --git a/docs/BENCHMARKING.md b/docs/BENCHMARKING.md index cc49ab4..487acc9 100644 --- a/docs/BENCHMARKING.md +++ b/docs/BENCHMARKING.md @@ -72,7 +72,38 @@ We report **peak** in the README performance table because that matches how comp - Mid-decode: ~981 MB - Headroom (`os_proc_available`): ~5 GB -## Energy (`J/tok`) +## Energy (`mJ/tok`, `%/hour`, thermal trajectory) + +The sample app's **Bench** menu now exposes three presets aimed at +power reporting: + +- **2 min (speed)** — quick peak tok/s check +- **15 min (power)** — minimum duration for a defensible `mJ/tok` + number given the iOS battery gauge's 1 % resolution +- **60 min** — long-haul thermal profile, useful for "will this + throttle in a real session" questions + +After each run the app writes a CSV to `Documents/bench-{unix-ts}.csv` +with the per-30s thermal trajectory, battery log, and a `# summary` +block. The CSV filename is printed in the in-app result and to the +console. Retrieve via Files app (the target already has +document-sharing entitlements). + +`BenchmarkResult` exposes: + +- `mJPerToken` — `drainedPercent × batteryCapacityWh × 36000 / totalTokens`. + iPhone 17 Pro nominal capacity is 14.03 Wh; override + `batteryCapacityWh` for other devices. +- `drainedPerHour` — extrapolated from the run duration. +- `timeToFair`, `timeToSerious` — elapsed time of the first 30-second sample at which + `ProcessInfo.thermalState` was at least `fair` / `serious`. +- `thermalTrajectory` — array of `ThermalSample(t, state, batteryLevel)` + at 30-second intervals. 
+ +For the methodology, metric tiers, and head-to-head protocol against +other engines, see [POWER_BENCHMARK_PLAN.md](POWER_BENCHMARK_PLAN.md). + +## Energy (`J/tok`) — legacy derivation The ~0.07 J/tok figure in `docs/RESEARCH.md` is **derived**, not directly measured: diff --git a/docs/POWER_BENCHMARK_PLAN.md b/docs/POWER_BENCHMARK_PLAN.md new file mode 100644 index 0000000..ae8ddbf --- /dev/null +++ b/docs/POWER_BENCHMARK_PLAN.md @@ -0,0 +1,233 @@ +# Power & Energy Benchmark Plan + +**Goal:** publish a reproducible, defensible set of power/energy numbers +for Gemma 4 E2B on iPhone 17 Pro that competitors (LiteRT-LM iOS, +llama.cpp Metal, MLX) cannot easily match, and that makes the ANE +placement advantage visible as a *user-facing* metric — not just a +compute-unit placement percentage. + +Speed parity is the second-order goal (see +`docs/MOBILE_2K_COMPETITIVE_PLAN.md`). This doc covers the first-order +goal: prove that on a phone in a pocket, we draw less current and +stay cooler for the same work. + +--- + +## Why this matters competitively + +| Engine | Primary compute | Expected sustained behaviour | +|---|---|---| +| llama.cpp (Metal) | GPU | Thermal throttles within 3–5 min of continuous decode; iPhone chassis gets hot; drains battery fast | +| LiteRT-LM iOS | GPU + CPU hybrid | Better than pure-Metal but still GPU-heavy; 56 tok/s peak implies high wattage | +| MLX | GPU | Same class as llama.cpp Metal | +| **CoreMLLLM (ours)** | ANE (99.78 %) | ANE power envelope is ~1/3 to 1/5 of GPU for the same ops; no thermal impact on the rest of iOS | + +Nobody publishes *mWh/token* or *sustained tok/s at thermal=nominal for +15 minutes* for on-device LLMs. Whoever publishes first owns that +narrative. We are the only engine that can publish these numbers +honestly. + +--- + +## Metrics to publish + +### Tier 1 — must ship (v0.6 README) + +1. **Energy per token** — `mJ/tok` at ctx=2K and ctx=8K, decode only. +2. 
**Sustained tok/s (15 min)** — average over a 15-minute continuous + decode run, device starting at `thermal=nominal`, not charging. +3. **Thermal trajectory** — `ProcessInfo.thermalState` at t=0, 1, 3, 5, + 10, 15 min. Reported as "time to first `fair`" and "time to first + `serious`". +4. **Battery drain** — `%/hour` extrapolated from a 15-minute run, + corrected for battery capacity (`UIDevice.batteryLevel` delta × + declared Wh). + +### Tier 2 — if infra allows (v0.7) + +5. **Per-subsystem power** — ANE vs GPU vs CPU wattage breakdown, + sampled from `powermetrics` (macOS host, device tethered). +6. **mW during idle-after-decode** — cost of holding KV cache resident + vs releasing. +7. **Chassis surface temperature** — IR thermometer, 3 points (back + center, top, camera bump). Manual, one-shot. +8. **Energy per user turn** — prefill + decode, typical 512-token + prompt → 128-token response. + +### Tier 3 — research-grade (not blocking) + +9. **Joules per correct answer** — hook MMLU subset into Bench, measure + energy to produce each answer. +10. **Energy parity vs llama.cpp / LiteRT-LM** — head-to-head on the + same physical device, same prompt, same output length. + +--- + +## Measurement methodology + +### On-device (no host required) + +The sample app already has the scaffolding: + +- `LLMRunner.BenchmarkResult.drainedPercent` — from `UIDevice.batteryLevel` + delta across the run. Resolution is 1 % on iOS, so runs must be + ≥ 10 minutes to get < 10 % error. **Raise default Bench duration from + 120 s to 900 s (15 min)** and expose it in `ChatView.swift`. +- `thermalStart` / `thermalEnd` — already captured. **Add + thermal sampling every 30 s** into a `[ThermalSample]` array on + `BenchmarkResult`. This gives the "time to fair/serious" number. +- `drainedPerMinute` × iPhone 17 Pro nominal capacity (14.03 Wh) → + watts → mJ/tok. + +The existing `Energy (J/tok)` section in `docs/BENCHMARKING.md` +acknowledges this is derived, not measured. 
That's fine — lab-grade +per-rail power is not available without tethering. Be explicit in the +README. + +### Tethered (macOS host, `powermetrics`) + +For Tier 2, use `sudo powermetrics --samplers ane_power,gpu_power,cpu_power -i 1000` +on a **Mac connected to the iPhone via USB-C** running the +CoreMLLLM Bench. Note: `powermetrics` on macOS reports the **Mac's** +subsystems, not the iPhone's. For the iPhone, use **Instruments → +Energy Log** instead — it gives CPU/GPU/networking energy estimates +per process but **does not break out ANE power**. + +**Honest conclusion:** iOS does not expose per-rail ANE power to +third parties. Tier 2 #5 is a "best effort with disclosed limits" +metric, not a lab number. Publish the raw Instruments screenshots. + +### Lab-grade (optional, nice-to-have) + +- External USB-C power meter (e.g. ChargerLAB POWER-Z KM003C) between + charger and phone, measure Wh during a 15-min decode with phone at + exactly 50 % battery. Subtract idle baseline measured for 15 min + immediately before with the app backgrounded. +- This is the only way to get an end-to-end "wall-plug" number. It + includes screen, radios, and everything else, but with a clean + baseline-subtraction it is defensible. + +--- + +## Test matrix + +Run each configuration **three times** on a cold device (5 min rest +between runs). Report median. 
+ +| Ctx | Duration | Sampling | KV reset | Purpose | +|---|---|---|---|---| +| 2048 | 15 min | argmax | no | Tier-1 headline: sustained 2K mJ/tok | +| 8192 | 15 min | argmax | no | Long-ctx sustained — differentiator vs llama.cpp (they OOM or crawl) | +| 2048 | 15 min | argmax | every 256 tok | Shows steady-state with realistic turns, not a single long generation | +| 2048 | 5 min | argmax | no | Peak number for README (matches competitors' reporting) | +| 2048, bench-prefill | N/A | prefill only | N/A | mJ/tok for prefill (usually 3–10× cheaper per token than decode) | + +All runs: airplane mode ON, screen brightness at 50 % fixed (manual — +auto-brightness adds noise), Low Power Mode OFF, not charging, same +benchmark prompt as `LLMRunner.swift :: benchmarkPrompt`. + +--- + +## Head-to-head protocol (Tier 3 #10) + +To make an apples-to-apples claim against LiteRT-LM or llama.cpp: + +1. Same iPhone 17 Pro, same iOS version, same battery level (50 % start). +2. Same prompt, same max-tokens cap (e.g. 256 decoded tokens). +3. Same starting thermal state (`nominal`, 5 min rest between runs). +4. Airplane mode ON. Screen brightness fixed. +5. Measure: wall-clock duration, battery drain %, ending thermal state. +6. Derive: J/tok = (drain% × 14.03 Wh × 3600) / (tokens × 100). +7. Repeat each engine 3×, report median + min/max. + +**Risk:** LiteRT-LM iOS distribution may not be publicly installable. +If so, publish our number standalone and invite Google to respond — +that itself is a win narratively. + +--- + +## Implementation plan (code changes) + +Ordered by cost. Each step is standalone-shippable. 
+ +### Step 1 — extend `BenchmarkResult` (0.5 day) + +`Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift:116` + +Add: +```swift +struct ThermalSample { let t: TimeInterval; let state: ProcessInfo.ThermalState; let batteryLevel: Float } +var thermalTrajectory: [ThermalSample] +var mJPerToken: Double { /* drained% / 100 × 50508 J (14.03 Wh) × 1000 / tokens */ } +var timeToFair: TimeInterval? +var timeToSerious: TimeInterval? +``` + +Sample every 30 s from a `Task` running alongside decode. + +### Step 2 — raise default duration, add presets (0.5 day) + +`ChatView.swift:394` area — add a picker: `2 min` / `15 min` / `60 min`. +Default to 15 min for the "Power" tab, 2 min for the "Speed" tab. + +### Step 3 — CSV export (0.5 day) + +Add a "Share CSV" button on the Bench result sheet. Columns: +`t_seconds, tok_per_sec_window, battery_pct, thermal_state, phys_footprint_mb`. +Let users (and us) paste into spreadsheets. + +### Step 4 — README rewrite (0.5 day) + +Add a **"Power & Thermal"** table to README above the speed table. +Include mJ/tok at 2K and 8K, sustained tok/s, time-to-`fair`. Link to +this doc for methodology. + +### Step 5 — head-to-head blog post / gist (1 day) + +Run the protocol above against whichever competitor we can actually +install. Publish the CSVs. Do not editorialise — let numbers speak. + +--- + +## What we will *not* claim + +- **"X watts on the ANE"** — iOS does not give us this. We will not + fabricate a per-rail number. +- **"Zero GPU usage"** — the vision encoder runs on GPU by design. Any + multimodal turn has GPU energy in it. Text-only is clean. +- **"Better battery life than the OS baseline"** — untrue and not the + claim. The claim is "less energy per token than competing LLM + engines", which is narrower and defensible. +- **Lab-calibrated J/tok** — clearly label the headline number as + *derived from battery-gauge delta*, with error bars. 
+ +--- + +## Success criteria + +v0.6 README ships with: + +- mJ/tok at 2K decode, ±15 % error bar, methodology linked. +- Sustained 15-min tok/s at 2K, with thermal trajectory. +- At least one head-to-head comparison (even if only against a + hypothetical "GPU-based engine on same device" using published + llama.cpp Metal numbers — clearly marked as indirect). + +Stretch: + +- Instruments Energy Log screenshot showing our process's energy + score vs a Metal-based LLM on the same device. +- External USB-C power meter measurement with baseline subtraction. + +--- + +## Timeline + +| Week | Deliverable | +|---|---| +| 1 | Steps 1–3 (code changes), first internal 15-min runs logged | +| 2 | Step 4 (README), publish Tier 1 metrics | +| 3 | Tier 2 attempt (Instruments tethered), publish whatever we get | +| 4 | Head-to-head against one competitor, blog post | + +Total: ~4 weeks for 1 person, can run in parallel with speed work. diff --git a/docs/POWER_BENCH_RUNBOOK.md b/docs/POWER_BENCH_RUNBOOK.md new file mode 100644 index 0000000..204ac05 --- /dev/null +++ b/docs/POWER_BENCH_RUNBOOK.md @@ -0,0 +1,228 @@ +# Power Benchmark Runbook (shareable) + +This is a self-contained guide for running the 15-minute sustained +power benchmark on **any supported iPhone**. It's the practical +counterpart to `POWER_BENCHMARK_PLAN.md` (methodology). Hand this to +whoever is running the test. + +**Supported**: iPhone with A17 Pro or newer (iPhone 15 Pro, 15 Pro Max, +16, 16 Plus, 16 Pro, 16 Pro Max, 17, 17 Pro, 17 Pro Max). Older chips +lack the ANE headroom to hit reasonable tok/s and will skew numbers. + +**Time budget**: ~35 min end-to-end (5 min cool-down + 15 min bench + +~15 min setup + reporting). + +--- + +## 1. Device-specific setup (one-time) + +### 1a. Set the battery capacity for your device + +`mJ/token` is derived from battery drain × battery capacity. The code +defaults to iPhone 17 Pro (14.03 Wh). 
For other devices, edit +`Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift`, find: + +```swift +var batteryCapacityWh: Double = 14.03 +``` + +and replace with your device's nominal capacity: + +| Device | Battery (Wh) | +|---|---:| +| iPhone 15 Pro | 13.35 | +| iPhone 15 Pro Max | 17.11 | +| iPhone 16 | 13.63 | +| iPhone 16 Plus | 17.16 | +| iPhone 16 Pro | 13.88 | +| iPhone 16 Pro Max | 17.15 | +| iPhone 17 | 14.34 | +| iPhone 17 Pro | **14.03** (default) | +| iPhone 17 Pro Max | 17.20 | + +Source: Apple "environmental report" PDFs per device. If unsure, look +it up — a wrong value scales `mJ/token` linearly, so use the right one. + +### 1b. Build in Release + +1. Open `Examples/CoreMLLLMChat/CoreMLLLMChat.xcodeproj` in Xcode. +2. Product → Scheme → **Edit Scheme** → **Run** tab → **Build + Configuration: Release**. Close the dialog. +3. Select your physical iPhone as the run destination. +4. Set a Development Team (Signing & Capabilities → pick your team). +5. ⌘R to build & install. Once launched, Xcode can be disconnected. + +Debug builds are ~10–20 % slower and will under-report both tok/s and +energy efficiency. **Release is not optional.** + +--- + +## 2. Pre-run checklist (every run) + +Do this immediately before each benchmark run. Skipping any of these +biases the result. 
+ +| Check | Setting | Why | +|---|---|---| +| Battery ≥ 60 % | — | 15-min run drains ~3–8 %; under 60 % risks Low Power Mode auto-enabling | +| **Unplug charger** | Cable out | Charging masks drain measurement | +| Low Power Mode | **OFF** (Settings → Battery) | LPM throttles CPU/ANE and invalidates the number | +| Airplane Mode | **ON** | Radio traffic adds uncontrolled energy | +| Wi-Fi / Bluetooth | OFF (Control Center) | Same as above; verify even with airplane mode on | +| Auto-Brightness | **OFF** (Settings → Accessibility → Display & Text Size) | Brightness changes shift power draw mid-run | +| Brightness slider | **50 % exactly** | Reproducibility | +| Background apps | All killed (swipe up from each) | Other processes burn battery | +| Do Not Disturb | ON | Notifications wake the display | +| Device temperature | Cool to the touch | Start from `thermal = nominal` | +| **5-min rest** | Close app, set phone down, wait 5 min | Lets residual thermal clear | + +If any check fails and you run anyway, **note it in the report** — +the number is still useful, it's just not clean. + +--- + +## 3. Run steps + +1. Unlock the phone, open **CoreMLLLMChat**. +2. If no model is loaded: tap **Get Model** → pick Gemma 4 E2B → wait + for download + load (~2.7 GB on first run, ~15–30 s compile on + first launch after download, <1 s on subsequent). +3. Tap **ANE?** once. Confirm the result shows `TOTAL … (99 %)` or + higher. If it's lower, something is wrong — stop and investigate. +4. Put the phone **face-up on a flat surface** (table, not fabric — + fabric insulates and skews thermal). +5. **Don't touch it. Don't hold it. Don't charge it.** The body heat + from your hand changes the thermal profile. +6. Tap **Bench → 15 min (power)**. +7. A banner says `[Benchmark] Starting 15-minute sustained + generation…` — screen will stay on automatically + (`isIdleTimerDisabled`). +8. **Wait 15 minutes**, hands off. +9. When done, a `[Benchmark RESULT]` block appears in the chat. 
+ +If it aborts early with `Aborted: YES (thermal .serious)`, **keep +that result** — "device can only sustain N minutes before throttling" +is itself the thing we want to publish. + +--- + +## 4. What to send back + +Paste the full `[Benchmark RESULT]` block into a reply. It looks like: + +``` +[Benchmark RESULT] +Duration : 900s (15.0 min) +Rounds : 4 +Total tokens : 27431 +Avg tok/s : 30.48 +Battery : 82% → 76% (Δ 6.00%) +Drain rate : 0.400%/min (~24.0%/hr) +Tokens/%SoC : 4572 +Energy/token : 110.5 mJ/tok +Thermal : nominal → fair +Time→fair : 420s +Time→serious : never +CSV : bench-1744800000.csv +Thermal trajectory: + 0s → nominal bat=82% + 30s → nominal bat=82% + ... +Battery log: + 0s → 82% + 120s → 81% + ... +``` + +Also attach the CSV if possible (see §5). + +Plus, these side-channel facts: + +- **Device model** (Settings → General → About → Model Name) +- **iOS version** (Settings → General → About → iOS Version) +- Whether any pre-run check was skipped, and which +- Ambient room temperature rough estimate (cold room vs warm room + matters — ANE has ~5–10 °C thermal headroom before throttle) + +--- + +## 5. Getting the CSV off the device + +The CSV is saved to the app's `Documents/` folder as +`bench-{unix-ts}.csv`. Three ways to extract: + +**Easiest — Files app + AirDrop (no cable):** + +1. Open **Files** app on the iPhone → Browse → **On My iPhone** → + **CoreMLLLMChat**. +2. Long-press `bench-{unix-ts}.csv` → **Share** → **AirDrop** → send to + Mac. + +**Files + iCloud Drive (if you have iCloud):** +Copy the file from On My iPhone → CoreMLLLMChat into iCloud Drive, +retrieve on Mac. + +**Xcode (cable required, dev machine):** +Xcode → Window → **Devices and Simulators** → select iPhone → select +`CoreMLLLMChat` under Installed Apps → gear icon → **Download +Container** → right-click the `.xcappdata` → **Show Package Contents** +→ `AppData/Documents/`. + +--- + +## 6. Running more than once + +Recommended: run the full 15-min bench **3 times** and report median. 
+ +Between runs: +- Let the phone **cool for 10+ min** (not 5 — second run starts warmer + than first). +- Recheck §2 (Low Power Mode sometimes auto-enables when battery + drops). + +If you can only do one run, that's fine — just note "n=1" in the +report. + +--- + +## 7. Sanity ranges (so you know if something's wrong) + +On an A17 Pro or newer iPhone, expect roughly: + +| Number | Expected range | Red flag if… | +|---|---|---| +| Avg tok/s (15 min) | 25–35 tok/s | < 15: Debug build, or thermal throttle, or wrong compute unit | +| Drain rate | 0.3–0.6 %/min | > 1 %/min: another app is active, or radios are on | +| mJ/token | 50–150 | > 300: drain didn't register (too-short run), or wrong capacity Wh | +| Time → fair | 180–900 s (or `never`) | < 60 s: device started warm, or ambient too hot | +| Time → serious | `never` preferred | Happens: reportable data, not a failure | + +If numbers are **way** outside these, rerun after confirming §2, and +double-check step 1b (Release build). + +--- + +## 8. Troubleshooting + +**`Energy/token : n/a (gauge noise, need >=10 min run)`** +→ The battery gauge didn't change. Means the run was too short, or +the phone was charging, or drain was < 1 %. Rerun at 15 min with +charger unplugged. + +**`Aborted: YES (thermal .serious)`** +→ The device reached the `serious` thermal state before the 15 min mark. That IS the +result. Note `Time→serious` — that's the sustained-duration number. + +**`Avg tok/s` much lower than expected** +→ Check that `ANE?` showed ≥ 99 %. If it's lower, the model loaded on +GPU/CPU and numbers are meaningless. Force-quit and relaunch. + +**No CSV file in Files app** +→ Verify `Info.plist` has `UIFileSharingEnabled = YES` and +`LSSupportsOpeningDocumentsInPlace = YES` (already set in this repo, +but check if modified). The folder won't appear until the first +file is written, so run at least one bench first. + +**Battery delta is 0 or negative** +→ Charger was connected, or Low Power Mode kicked in mid-run. 
Both +ruin the measurement. Start over.