diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift b/Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift
index 6529514..08db204 100644
--- a/Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift
+++ b/Examples/CoreMLLLMChat/CoreMLLLMChat/ChatView.swift
@@ -235,9 +235,11 @@ struct ChatView: View {
                 if runner.isLoaded {
                     ToolbarItem(placement: .topBarTrailing) {
                         Menu("Bench") {
-                            Button("5 min")  { startBenchmark(minutes: 5) }
-                            Button("10 min") { startBenchmark(minutes: 10) }
-                            Button("30 min") { startBenchmark(minutes: 30) }
+                            Button("2 min (speed)")  { startBenchmark(minutes: 2) }
+                            Button("5 min")          { startBenchmark(minutes: 5) }
+                            Button("15 min (power)") { startBenchmark(minutes: 15) }
+                            Button("30 min")         { startBenchmark(minutes: 30) }
+                            Button("60 min")         { startBenchmark(minutes: 60) }
                         }
                         .disabled(runner.isGenerating || benchmarkRunning)
                     }
@@ -565,6 +567,15 @@ struct ChatView: View {
                 let logLines = result.batteryLog.map { entry in
                     "  \(String(format: "%5.0f", entry.0))s → \(Int(entry.1 * 100))%"
                 }.joined(separator: "\n")
+                let thermalLines = result.thermalTrajectory.map { s in
+                    "  \(String(format: "%5.0f", s.t))s → \(LLMRunner.thermalString(s.state))  bat=\(s.batteryLevel >= 0 ? "\(Int(s.batteryLevel * 100))%" : "?")"
+                }.joined(separator: "\n")
+                let ttf = result.timeToFair.map { "\(Int($0))s" } ?? "never"
+                let tts = result.timeToSerious.map { "\(Int($0))s" } ?? "never"
+                let mJ = result.mJPerToken
+                let mJStr = mJ > 0 ? String(format: "%.1f mJ/tok", mJ) : "n/a (gauge noise, need >=10 min run)"
+                let csvPath = saveBenchmarkCSV(result)
+                let csvLine = csvPath.map { "CSV           : \($0)" } ?? "CSV           : (save failed)"
                 let summary = """
                 [Benchmark RESULT]
                 Duration      : \(Int(result.duration))s (\(String(format: "%.1f", result.duration / 60.0)) min)
@@ -572,9 +583,15 @@ struct ChatView: View {
                 Total tokens  : \(result.totalTokens)
                 Avg tok/s     : \(String(format: "%.2f", result.avgTokPerSec))
                 Battery       : \(bs)% → \(be)%  (Δ \(String(format: "%.2f", result.drainedPercent))%)
-                Drain rate    : \(String(format: "%.3f", result.drainedPerMinute))%/min
+                Drain rate    : \(String(format: "%.3f", result.drainedPerMinute))%/min (~\(String(format: "%.1f", result.drainedPerHour))%/hr)
                 Tokens/%SoC   : \(String(format: "%.0f", result.tokensPerPercent))
+                Energy/token  : \(mJStr)
                 Thermal       : \(LLMRunner.thermalString(result.thermalStart)) → \(LLMRunner.thermalString(result.thermalEnd))\(abortNote)
+                Time→fair     : \(ttf)
+                Time→serious  : \(tts)
+                \(csvLine)
+                Thermal trajectory:
+                \(thermalLines)
                 Battery log:
                 \(logLines)
                 """
@@ -589,6 +606,21 @@ struct ChatView: View {
         }
     }
 
+    /// Saves `result.csv()` as `bench-<unix seconds>.csv` in Documents; returns the file name, or nil on failure.
+    private func saveBenchmarkCSV(_ result: LLMRunner.BenchmarkResult) -> String? {
+        guard let documents = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first else { return nil }
+        let ts = Int(Date().timeIntervalSince1970)
+        let url = documents.appendingPathComponent("bench-\(ts).csv")
+        do {
+            try result.csv().write(to: url, atomically: true, encoding: .utf8)
+        } catch {
+            print("[Benchmark] CSV save failed: \(error)")
+            return nil  // write failed; the error has been logged above
+        }
+        print("[Benchmark] CSV saved: \(url.path)")
+        return url.lastPathComponent  // file name only, not the full path
+    }
+
     private func verifyANE() {
         messages.append(ChatMessage(role: .system, content: "Checking MLComputePlan device placement..."))
         Task.detached(priority: .userInitiated) {
diff --git a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
index 96826a5..ce75469 100644
--- a/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
+++ b/Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift
@@ -1221,6 +1221,12 @@ final class LLMRunner {
         var thermal: ProcessInfo.ThermalState
     }
 
+    struct ThermalSample {  // One timestamped thermal/battery reading taken during a benchmark run.
+        var t: TimeInterval  // Seconds elapsed since the benchmark started (0 for the initial sample).
+        var state: ProcessInfo.ThermalState  // Thermal state read at time `t`.
+        var batteryLevel: Float  // Battery level as a fraction at time `t`; formatters treat negative values as unknown.
+    }
+
     struct BenchmarkResult {
         var duration: TimeInterval
         var totalTokens: Int
@@ -1232,11 +1238,59 @@ final class LLMRunner {
         var thermalEnd: ProcessInfo.ThermalState
         var abortedThermal: Bool = false
         var batteryLog: [(TimeInterval, Float)] = []
+        var thermalTrajectory: [LLMRunner.ThermalSample] = []
+
+        // iPhone 17 Pro nominal battery capacity. Override for other devices.
+        // Source: Apple spec sheet (14.03 Wh = 50508 J).
+        var batteryCapacityWh: Double = 14.03
 
         var batteryDelta: Float { batteryStart - batteryEnd }
         var drainedPercent: Double { Double(batteryDelta) * 100.0 }
         var drainedPerMinute: Double { duration > 0 ? drainedPercent / (duration / 60.0) : 0 }
+        var drainedPerHour: Double { drainedPerMinute * 60.0 }
         var tokensPerPercent: Double { drainedPercent > 0 ? Double(totalTokens) / drainedPercent : 0 }
+
+        /// Estimated energy per decoded token in millijoules, computed from the battery-gauge drop
+        /// and `batteryCapacityWh`. Coarse (1% gauge resolution); trust only for runs >= 10 min.
+        var mJPerToken: Double {
+            if !(totalTokens > 0 && drainedPercent > 0) { return 0 }  // no tokens or no measured drain
+            let consumedJoules = drainedPercent / 100.0 * batteryCapacityWh * 3600.0  // Wh -> J
+            return consumedJoules * 1000.0 / Double(totalTokens)  // J -> mJ, per token
+        }
+
+        var timeToFair: TimeInterval? {  // elapsed time of the first sample at `fair` or hotter; nil if none
+            thermalTrajectory.first(where: { [ProcessInfo.ThermalState.fair, .serious, .critical].contains($0.state) })?.t
+        }
+        var timeToSerious: TimeInterval? {  // elapsed time of the first sample at `serious` or hotter; nil if none
+            thermalTrajectory.first(where: { [ProcessInfo.ThermalState.serious, .critical].contains($0.state) })?.t
+        }
+
+        /// Renders the run as CSV text: one `thermal` row per trajectory sample, one `battery` row
+        /// per battery-log entry, then a blank line and a block of `# key=value` summary lines.
+        func csv() -> String {
+            // Round rather than truncate: Float(0.59) * 100 is 58.999996, which Int() would turn into 58.
+            // Negative levels (presumably "level unknown" — confirm against the caller) are written as -1.
+            func percent(_ level: Float) -> Int { level >= 0 ? Int((level * 100).rounded()) : -1 }
+            var lines = ["t_seconds,battery_pct,thermal_state,source"]
+            lines += thermalTrajectory.map { "\(Int($0.t)),\(percent($0.batteryLevel)),\(LLMRunner.thermalString($0.state)),thermal" }
+            lines += batteryLog.map { "\(Int($0.0)),\(percent($0.1)),,battery" }  // no thermal_state for battery rows
+            // Trailing summary block; these lines are not part of the tabular data.
+            lines.append("")
+            lines.append("# summary")
+            lines.append("# duration_s=\(Int(duration))")
+            lines.append("# total_tokens=\(totalTokens)")
+            lines.append("# avg_tok_per_sec=\(String(format: "%.2f", avgTokPerSec))")
+            lines.append("# drained_percent=\(String(format: "%.2f", drainedPercent))")
+            lines.append("# drained_per_hour=\(String(format: "%.2f", drainedPerHour))")
+            lines.append("# mJ_per_token=\(String(format: "%.2f", mJPerToken))")
+            lines.append("# time_to_fair_s=\(timeToFair.map { String(Int($0)) } ?? "never")")
+            lines.append("# time_to_serious_s=\(timeToSerious.map { String(Int($0)) } ?? "never")")
+            lines.append("# thermal_start=\(LLMRunner.thermalString(thermalStart))")
+            lines.append("# thermal_end=\(LLMRunner.thermalString(thermalEnd))")
+            lines.append("# aborted_thermal=\(abortedThermal)")
+            lines.append("# battery_capacity_wh=\(batteryCapacityWh)")
+            return lines.joined(separator: "\n")
+        }
     }
 
     private static let benchmarkPrompt =
@@ -1258,6 +1312,10 @@ final class LLMRunner {
         var abortedThermal = false
         var batteryLog: [(TimeInterval, Float)] = [(0, startBat)]
         var lastLoggedLevel = startBat
+        var thermalTrajectory: [ThermalSample] = [
+            ThermalSample(t: 0, state: startThermal, batteryLevel: startBat)
+        ]
+        var nextThermalSampleAt: TimeInterval = 30
         let prompt = ChatMessage(role: .user, content: Self.benchmarkPrompt)
 
         func isThermalUnsafe() -> Bool {
@@ -1277,6 +1335,13 @@ final class LLMRunner {
                     batteryLog.append((elapsed, currentLevel))
                     lastLoggedLevel = currentLevel
                 }
+                if elapsed >= nextThermalSampleAt {
+                    thermalTrajectory.append(ThermalSample(
+                        t: elapsed,
+                        state: ProcessInfo.processInfo.thermalState,
+                        batteryLevel: currentLevel))
+                    nextThermalSampleAt += 30
+                }
                 if totalTokens % 20 == 0 {
                     onProgress(BenchmarkProgress(
                         elapsed: elapsed, totalTokens: totalTokens, round: round,
@@ -1293,14 +1358,17 @@ final class LLMRunner {
 
         let endTime = Date()
         let endBat = UIDevice.current.batteryLevel
+        let endThermal = ProcessInfo.processInfo.thermalState
         let dur = endTime.timeIntervalSince(startTime)
         batteryLog.append((dur, endBat))
+        thermalTrajectory.append(ThermalSample(t: dur, state: endThermal, batteryLevel: endBat))
         return BenchmarkResult(
             duration: dur, totalTokens: totalTokens, rounds: round,
             avgTokPerSec: dur > 0 ? Double(totalTokens) / dur : 0,
             batteryStart: startBat, batteryEnd: endBat,
-            thermalStart: startThermal, thermalEnd: ProcessInfo.processInfo.thermalState,
-            abortedThermal: abortedThermal, batteryLog: batteryLog)
+            thermalStart: startThermal, thermalEnd: endThermal,
+            abortedThermal: abortedThermal, batteryLog: batteryLog,
+            thermalTrajectory: thermalTrajectory)
     }
     #endif
 
diff --git a/docs/BENCHMARKING.md b/docs/BENCHMARKING.md
index cc49ab4..487acc9 100644
--- a/docs/BENCHMARKING.md
+++ b/docs/BENCHMARKING.md
@@ -72,7 +72,38 @@ We report **peak** in the README performance table because that matches how comp
   - Mid-decode: ~981 MB
   - Headroom (`os_proc_available`): ~5 GB
 
-## Energy (`J/tok`)
+## Energy (`mJ/tok`, `%/hour`, thermal trajectory)
+
+The sample app's **Bench** menu now offers five durations (2, 5, 15, 30
+and 60 min); three of them are the presets used for reporting:
+
+- **2 min (speed)** — quick peak tok/s check
+- **15 min (power)** — minimum duration for a defensible `mJ/tok`
+  number given the iOS battery gauge's 1 % resolution
+- **60 min** — long-haul thermal profile, useful for "will this
+  throttle in a real session" questions
+
+After each run the app writes a CSV to `Documents/bench-<unix_ts>.csv`
+with the per-30s thermal trajectory, battery log, and a `# summary`
+block. The CSV filename is printed in the in-app result and to the
+console. Retrieve via the Files app (the target's `Info.plist` already
+enables file sharing).
+
+`BenchmarkResult` exposes:
+
+- `mJPerToken` — `drainedPercent × batteryCapacityWh × 36000 / totalTokens`.
+  iPhone 17 Pro nominal capacity is 14.03 Wh; override
+  `batteryCapacityWh` for other devices.
+- `drainedPerHour` — extrapolated from the run duration.
+- `timeToFair`, `timeToSerious` — elapsed seconds at the first trajectory
+  sample whose `ProcessInfo.thermalState` was at or above that level (30 s resolution).
+- `thermalTrajectory` — array of `ThermalSample(t, state, batteryLevel)`
+  at 30-second intervals.
+
+For the methodology, metric tiers, and head-to-head protocol against
+other engines, see [POWER_BENCHMARK_PLAN.md](POWER_BENCHMARK_PLAN.md).
+
+## Energy (`J/tok`) — legacy derivation
 
 The ~0.07 J/tok figure in `docs/RESEARCH.md` is **derived**, not directly measured:
 
diff --git a/docs/POWER_BENCHMARK_PLAN.md b/docs/POWER_BENCHMARK_PLAN.md
new file mode 100644
index 0000000..ae8ddbf
--- /dev/null
+++ b/docs/POWER_BENCHMARK_PLAN.md
@@ -0,0 +1,233 @@
+# Power & Energy Benchmark Plan
+
+**Goal:** publish a reproducible, defensible set of power/energy numbers
+for Gemma 4 E2B on iPhone 17 Pro that competitors (LiteRT-LM iOS,
+llama.cpp Metal, MLX) cannot easily match, and that makes the ANE
+placement advantage visible as a *user-facing* metric — not just a
+compute-unit placement percentage.
+
+Speed parity is the second-order goal (see
+`docs/MOBILE_2K_COMPETITIVE_PLAN.md`). This doc covers the first-order
+goal: prove that on a phone in a pocket, we draw less current and
+stay cooler for the same work.
+
+---
+
+## Why this matters competitively
+
+| Engine | Primary compute | Expected sustained behaviour |
+|---|---|---|
+| llama.cpp (Metal) | GPU | Thermal throttles within 3–5 min of continuous decode; iPhone chassis gets hot; drains battery fast |
+| LiteRT-LM iOS | GPU + CPU hybrid | Better than pure-Metal but still GPU-heavy; 56 tok/s peak implies high wattage |
+| MLX | GPU | Same class as llama.cpp Metal |
+| **CoreMLLLM (ours)** | ANE (99.78 %) | ANE power envelope is ~1/3 to 1/5 of GPU for the same ops; no thermal impact on the rest of iOS |
+
+Nobody publishes *mWh/token* or *sustained tok/s at thermal=nominal for
+15 minutes* for on-device LLMs. Whoever publishes first owns that
+narrative. We are the only engine that can publish these numbers
+honestly.
+
+---
+
+## Metrics to publish
+
+### Tier 1 — must ship (v0.6 README)
+
+1. **Energy per token** — `mJ/tok` at ctx=2K and ctx=8K, decode only.
+2. **Sustained tok/s (15 min)** — average over a 15-minute continuous
+   decode run, device starting at `thermal=nominal`, not charging.
+3. **Thermal trajectory** — `ProcessInfo.thermalState` at t=0, 1, 3, 5,
+   10, 15 min. Reported as "time to first `fair`" and "time to first
+   `serious`".
+4. **Battery drain** — `%/hour` extrapolated from a 15-minute run,
+   corrected for battery capacity (`UIDevice.batteryLevel` delta ×
+   declared Wh).
+
+### Tier 2 — if infra allows (v0.7)
+
+5. **Per-subsystem power** — ANE vs GPU vs CPU wattage breakdown,
+   sampled from `powermetrics` (macOS host, device tethered).
+6. **mW during idle-after-decode** — cost of holding KV cache resident
+   vs releasing.
+7. **Chassis surface temperature** — IR thermometer, 3 points (back
+   center, top, camera bump). Manual, one-shot.
+8. **Energy per user turn** — prefill + decode, typical 512-token
+   prompt → 128-token response.
+
+### Tier 3 — research-grade (not blocking)
+
+9. **Joules per correct answer** — hook MMLU subset into Bench, measure
+   energy to produce each answer.
+10. **Energy parity vs llama.cpp / LiteRT-LM** — head-to-head on the
+    same physical device, same prompt, same output length.
+
+---
+
+## Measurement methodology
+
+### On-device (no host required)
+
+The sample app already has the scaffolding:
+
+- `LLMRunner.BenchmarkResult.drainedPercent` — from `UIDevice.batteryLevel`
+  delta across the run. Resolution is 1 % on iOS, so runs must be
+  ≥ 10 minutes to get < 10 % error. **Raise default Bench duration from
+  120 s to 900 s (15 min)** and expose it in `ChatView.swift`.
+- `thermalStart` / `thermalEnd` — already captured. **Add
+  thermal sampling every 30 s** into a `[ThermalSample]` array on
+  `BenchmarkResult`. This gives the "time to fair/serious" number.
+- `drainedPerMinute` × iPhone 17 Pro nominal capacity (14.03 Wh) →
+  watts → mJ/tok.
+
+The existing `Energy (J/tok)` section in `docs/BENCHMARKING.md`
+acknowledges this is derived, not measured. That's fine — lab-grade
+per-rail power is not available without tethering. Be explicit in the
+README.
+
+### Tethered (macOS host, `powermetrics`)
+
+For Tier 2, `sudo powermetrics --samplers ane_power,gpu_power,cpu_power -i 1000`
+on a **Mac connected to the iPhone via USB-C** cannot be used to measure
+the CoreMLLLM Bench: `powermetrics` on macOS reports the **Mac's**
+subsystems, not the iPhone's. For the iPhone, use **Instruments →
+Energy Log** instead — it gives CPU/GPU/networking energy estimates
+per process but **does not break out ANE power**.
+
+**Honest conclusion:** iOS does not expose per-rail ANE power to
+third parties. Tier 2 #5 is a "best effort with disclosed limits"
+metric, not a lab number. Publish the raw Instruments screenshots.
+
+### Lab-grade (optional, nice-to-have)
+
+- External USB-C power meter (e.g. ChargerLAB POWER-Z KM003C) between
+  charger and phone, measure Wh during a 15-min decode with phone at
+  exactly 50 % battery. Subtract idle baseline measured for 15 min
+  immediately before with the app backgrounded.
+- This is the only way to get an end-to-end "wall-plug" number. It
+  includes screen, radios, and everything else, but with a clean
+  baseline-subtraction it is defensible.
+
+---
+
+## Test matrix
+
+Run each configuration **three times** on a cold device (5 min rest
+between runs). Report median.
+
+| Ctx | Duration | Sampling | KV reset | Purpose |
+|---|---|---|---|---|
+| 2048 | 15 min | argmax | no | Tier-1 headline: sustained 2K mJ/tok |
+| 8192 | 15 min | argmax | no | Long-ctx sustained — differentiator vs llama.cpp (they OOM or crawl) |
+| 2048 | 15 min | argmax | every 256 tok | Shows steady-state with realistic turns, not a single long generation |
+| 2048 | 5 min | argmax | no | Peak number for README (matches competitors' reporting) |
+| 2048, bench-prefill | N/A | prefill only | N/A | mJ/tok for prefill (usually 3–10× cheaper per token than decode) |
+
+All runs: airplane mode ON, screen brightness at 50 % fixed (manual —
+auto-brightness adds noise), Low Power Mode OFF, not charging, same
+benchmark prompt as `LLMRunner.swift :: benchmarkPrompt`.
+
+---
+
+## Head-to-head protocol (Tier 3 #10)
+
+To make an apples-to-apples claim against LiteRT-LM or llama.cpp:
+
+1. Same iPhone 17 Pro, same iOS version, same battery level (50 % start).
+2. Same prompt, same max-tokens cap (e.g. 256 decoded tokens).
+3. Same starting thermal state (`nominal`, 5 min rest between runs).
+4. Airplane mode ON. Screen brightness fixed.
+5. Measure: wall-clock duration, battery drain %, ending thermal state.
+6. Derive: J/tok = (drain% × 14.03 Wh × 3600) / (tokens × 100).
+7. Repeat each engine 3×, report median + min/max.
+
+**Risk:** LiteRT-LM iOS distribution may not be publicly installable.
+If so, publish our number standalone and invite Google to respond —
+that itself is a win narratively.
+
+---
+
+## Implementation plan (code changes)
+
+Ordered by cost. Each step is standalone-shippable.
+
+### Step 1 — extend `BenchmarkResult` (0.5 day)
+
+`Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift:116`
+
+Add:
+```swift
+struct ThermalSample { let t: TimeInterval; let state: ProcessInfo.ThermalState; let batteryLevel: Float }
+var thermalTrajectory: [ThermalSample]
+var mJPerToken: Double { /* drained% / 100 × 50508 J (14.03 Wh) × 1000 / tokens */ }
+var timeToFair: TimeInterval?
+var timeToSerious: TimeInterval?
+```
+
+Sample every 30 s from a `Task` running alongside decode.
+
+### Step 2 — raise default duration, add presets (0.5 day)
+
+`ChatView.swift:394` area — add a picker: `2 min` / `15 min` / `60 min`.
+Default to 15 min for the "Power" tab, 2 min for the "Speed" tab.
+
+### Step 3 — CSV export (0.5 day)
+
+Add a "Share CSV" button on the Bench result sheet. Columns:
+`t_seconds, tok_per_sec_window, battery_pct, thermal_state, phys_footprint_mb`.
+Let users (and us) paste into spreadsheets.
+
+### Step 4 — README rewrite (0.5 day)
+
+Add a **"Power & Thermal"** table to README above the speed table.
+Include mJ/tok at 2K and 8K, sustained tok/s, time-to-`fair`. Link to
+this doc for methodology.
+
+### Step 5 — head-to-head blog post / gist (1 day)
+
+Run the protocol above against whichever competitor we can actually
+install. Publish the CSVs. Do not editorialise — let numbers speak.
+
+---
+
+## What we will *not* claim
+
+- **"X watts on the ANE"** — iOS does not give us this. We will not
+  fabricate a per-rail number.
+- **"Zero GPU usage"** — the vision encoder runs on GPU by design. Any
+  multimodal turn has GPU energy in it. Text-only is clean.
+- **"Better battery life than the OS baseline"** — untrue and not the
+  claim. The claim is "less energy per token than competing LLM
+  engines", which is narrower and defensible.
+- **Lab-calibrated J/tok** — clearly label the headline number as
+  *derived from battery-gauge delta*, with error bars.
+
+---
+
+## Success criteria
+
+v0.6 README ships with:
+
+- mJ/tok at 2K decode, ±15 % error bar, methodology linked.
+- Sustained 15-min tok/s at 2K, with thermal trajectory.
+- At least one head-to-head comparison (even if only against a
+  hypothetical "GPU-based engine on same device" using published
+  llama.cpp Metal numbers — clearly marked as indirect).
+
+Stretch:
+
+- Instruments Energy Log screenshot showing our process's energy
+  score vs a Metal-based LLM on the same device.
+- External USB-C power meter measurement with baseline subtraction.
+
+---
+
+## Timeline
+
+| Week | Deliverable |
+|---|---|
+| 1 | Steps 1–3 (code changes), first internal 15-min runs logged |
+| 2 | Step 4 (README), publish Tier 1 metrics |
+| 3 | Tier 2 attempt (Instruments tethered), publish whatever we get |
+| 4 | Head-to-head against one competitor, blog post |
+
+Total: ~4 weeks for 1 person, can run in parallel with speed work.
diff --git a/docs/POWER_BENCH_RUNBOOK.md b/docs/POWER_BENCH_RUNBOOK.md
new file mode 100644
index 0000000..204ac05
--- /dev/null
+++ b/docs/POWER_BENCH_RUNBOOK.md
@@ -0,0 +1,228 @@
+# Power Benchmark Runbook (shareable)
+
+This is a self-contained guide for running the 15-minute sustained
+power benchmark on **any supported iPhone**. It's the practical
+counterpart to `POWER_BENCHMARK_PLAN.md` (methodology). Hand this to
+whoever is running the test.
+
+**Supported**: iPhone with A17 Pro or newer (iPhone 15 Pro, 15 Pro Max,
+16, 16 Plus, 16 Pro, 16 Pro Max, 17, 17 Pro, 17 Pro Max). Older chips
+lack the ANE headroom to hit reasonable tok/s and will skew numbers.
+
+**Time budget**: ~35 min end-to-end (5 min cool-down + 15 min bench +
+~15 min setup + reporting).
+
+---
+
+## 1. Device-specific setup (one-time)
+
+### 1a. Set the battery capacity for your device
+
+`mJ/token` is derived from battery drain × battery capacity. The code
+defaults to iPhone 17 Pro (14.03 Wh). For other devices, edit
+`Examples/CoreMLLLMChat/CoreMLLLMChat/LLMRunner.swift`, find:
+
+```swift
+var batteryCapacityWh: Double = 14.03
+```
+
+and replace with your device's nominal capacity:
+
+| Device | Battery (Wh) |
+|---|---:|
+| iPhone 15 Pro | 13.35 |
+| iPhone 15 Pro Max | 17.11 |
+| iPhone 16 | 13.63 |
+| iPhone 16 Plus | 17.16 |
+| iPhone 16 Pro | 13.88 |
+| iPhone 16 Pro Max | 17.15 |
+| iPhone 17 | 14.34 |
+| iPhone 17 Pro | **14.03** (default) |
+| iPhone 17 Pro Max | 17.20 |
+
+Source: Apple "environmental report" PDFs per device. If unsure, look
+it up — a wrong value scales `mJ/token` linearly, so use the right one.
+
+### 1b. Build in Release
+
+1. Open `Examples/CoreMLLLMChat/CoreMLLLMChat.xcodeproj` in Xcode.
+2. Product → Scheme → **Edit Scheme** → **Run** tab → **Build
+   Configuration: Release**. Close the dialog.
+3. Select your physical iPhone as the run destination.
+4. Set a Development Team (Signing & Capabilities → pick your team).
+5. ⌘R to build & install. Once launched, Xcode can be disconnected.
+
+Debug builds are ~10–20 % slower and will under-report both tok/s and
+energy efficiency. **Release is not optional.**
+
+---
+
+## 2. Pre-run checklist (every run)
+
+Do this immediately before each benchmark run. Skipping any of these
+biases the result.
+
+| Check | Setting | Why |
+|---|---|---|
+| Battery ≥ 60 % | — | 15-min run drains ~3–8 %; under 60 % risks Low Power Mode auto-enabling |
+| **Unplug charger** | Cable out | Charging masks drain measurement |
+| Low Power Mode | **OFF** (Settings → Battery) | LPM throttles CPU/ANE and invalidates the number |
+| Airplane Mode | **ON** | Radio traffic adds uncontrolled energy |
+| Wi-Fi / Bluetooth | OFF (Control Center) | Same as above; verify even with airplane mode on |
+| Auto-Brightness | **OFF** (Settings → Accessibility → Display & Text Size) | Brightness changes shift power draw mid-run |
+| Brightness slider | **50 % exactly** | Reproducibility |
+| Background apps | All killed (swipe up from each) | Other processes burn battery |
+| Do Not Disturb | ON | Notifications wake the display |
+| Device temperature | Cool to the touch | Start from `thermal = nominal` |
+| **5-min rest** | Close app, set phone down, wait 5 min | Lets residual thermal clear |
+
+If any check fails and you run anyway, **note it in the report** —
+the number is still useful, it's just not clean.
+
+---
+
+## 3. Run steps
+
+1. Unlock the phone, open **CoreMLLLMChat**.
+2. If no model is loaded: tap **Get Model** → pick Gemma 4 E2B → wait
+   for download + load (~2.7 GB on first run, ~15–30 s compile on
+   first launch after download, <1 s on subsequent).
+3. Tap **ANE?** once. Confirm the result shows `TOTAL … (99 %)` or
+   higher. If it's lower, something is wrong — stop and investigate.
+4. Put the phone **face-up on a flat surface** (table, not fabric —
+   fabric insulates and skews thermal).
+5. **Don't touch it. Don't hold it. Don't charge it.** The body heat
+   from your hand changes the thermal profile.
+6. Tap **Bench → 15 min (power)**.
+7. A banner says `[Benchmark] Starting 15-minute sustained
+   generation…` — screen will stay on automatically
+   (`isIdleTimerDisabled`).
+8. **Wait 15 minutes**, hands off.
+9. When done, a `[Benchmark RESULT]` block appears in the chat.
+
+If it aborts early with `Aborted: YES (thermal .serious)`, **keep
+that result** — "device can only sustain N minutes before throttling"
+is itself the thing we want to publish.
+
+---
+
+## 4. What to send back
+
+Paste the full `[Benchmark RESULT]` block into a reply. It looks like:
+
+```
+[Benchmark RESULT]
+Duration      : 900s (15.0 min)
+Rounds        : 4
+Total tokens  : 27431
+Avg tok/s     : 30.48
+Battery       : 82% → 76%  (Δ 6.00%)
+Drain rate    : 0.400%/min (~24.0%/hr)
+Tokens/%SoC   : 4572
+Energy/token  : 110.5 mJ/tok
+Thermal       : nominal → fair
+Time→fair     : 420s
+Time→serious  : never
+CSV           : bench-1744800000.csv
+Thermal trajectory:
+    0s → nominal  bat=82%
+   30s → nominal  bat=82%
+   ...
+Battery log:
+    0s → 82%
+  120s → 81%
+  ...
+```
+
+Also attach the CSV if possible (see §5).
+
+Plus, these side-channel facts:
+
+- **Device model** (Settings → General → About → Model Name)
+- **iOS version** (Settings → General → About → iOS Version)
+- Whether any pre-run check was skipped, and which
+- Ambient room temperature rough estimate (cold room vs warm room
+  matters — ANE has ~5–10 °C thermal headroom before throttle)
+
+---
+
+## 5. Getting the CSV off the device
+
+The CSV is saved to the app's `Documents/` folder as
+`bench-<unix_timestamp>.csv`. Three ways to extract:
+
+**Easiest — Files app + AirDrop (no cable):**
+
+1. Open **Files** app on the iPhone → Browse → **On My iPhone** →
+   **CoreMLLLMChat**.
+2. Long-press `bench-<ts>.csv` → **Share** → **AirDrop** → send to
+   Mac.
+
+**Files + iCloud Drive (if you have iCloud):**
+Copy the file from On My iPhone → CoreMLLLMChat into iCloud Drive,
+retrieve on Mac.
+
+**Xcode (cable required, dev machine):**
+Xcode → Window → **Devices and Simulators** → select iPhone → select
+`CoreMLLLMChat` under Installed Apps → gear icon → **Download
+Container** → right-click the `.xcappdata` → **Show Package Contents**
+→ `AppData/Documents/`.
+
+---
+
+## 6. Running more than once
+
+Recommended: run the full 15-min bench **3 times** and report median.
+
+Between runs:
+- Let the phone **cool for 10+ min** (not 5 — second run starts warmer
+  than first).
+- Recheck §2 (Low Power Mode sometimes auto-enables when battery
+  drops).
+
+If you can only do one run, that's fine — just note "n=1" in the
+report.
+
+---
+
+## 7. Sanity ranges (so you know if something's wrong)
+
+On an A17 Pro or newer iPhone, expect roughly:
+
+| Number | Expected range | Red flag if… |
+|---|---|---|
+| Avg tok/s (15 min) | 25–35 tok/s | < 15: Debug build, or thermal throttle, or wrong compute unit |
+| Drain rate | 0.3–0.6 %/min | > 1 %/min: another app is active, or radios are on |
+| mJ/token | 50–150 | > 300: run too short (a single 1 % gauge tick dominates), or wrong capacity Wh |
+| Time → fair | 180–900 s (or `never`) | < 60 s: device started warm, or ambient too hot |
+| Time → serious | `never` preferred | Happens: reportable data, not a failure |
+
+If numbers are **way** outside these, rerun after confirming §2, and
+double-check step 1b (Release build).
+
+---
+
+## 8. Troubleshooting
+
+**`Energy/token  : n/a (gauge noise, need >=10 min run)`**
+→ The battery gauge didn't change. Means the run was too short, or
+the phone was charging, or drain was < 1 %. Rerun at 15 min with
+charger unplugged.
+
+**`Aborted: YES (thermal .serious)`**
+→ Device threw thermal state before the 15 min mark. That IS the
+result. Note `Time→serious` — that's the sustained-duration number.
+
+**`Avg tok/s` much lower than expected**
+→ Check that `ANE?` showed ≥ 99 %. If it's lower, the model loaded on
+GPU/CPU and numbers are meaningless. Force-quit and relaunch.
+
+**No CSV file in Files app**
+→ Verify `Info.plist` has `UIFileSharingEnabled = YES` and
+`LSSupportsOpeningDocumentsInPlace = YES` (already set in this repo,
+but check if modified). The folder won't appear until the first
+file is written, so run at least one bench first.
+
+**Battery delta is 0 or negative**
+→ Charger was connected, or Low Power Mode kicked in mid-run. Both
+ruin the measurement. Start over.