From 5ddb8f6f89da1c7bf5e62681efe1e2c3393c3c64 Mon Sep 17 00:00:00 2001
From: Universe <universe@grida.co>
Date: Tue, 31 Mar 2026 02:15:08 +0900
Subject: [PATCH 1/6] perf(cg): skip prefill loop on cache-warm frames via
 generation tracking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a monotonically increasing generation counter to PictureCache that
increments on any mutation (insert, invalidate). The prefill loop now
compares the current generation, variant key, and layer count against
the last successful prefill — when all match, the entire O(N) iteration
is skipped in O(1).

On 135K-node scenes, this eliminates ~800µs of HashMap lookups per
cache-warm frame. Measured improvements on 01-135k.perf.grida:
- rt_pan_fast_fit p50: 111µs → 76µs (-32%)
- rt_pan_fast_fit p95: 263µs → 153µs (-42%)
- pan_settle_slow_fit settle: 1034µs → 709µs (-31%)
- Criterion large_baseline/pan: -14.0% (p < 0.01)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/grida-canvas/src/cache/picture.rs | 30 ++++++++++++-
 crates/grida-canvas/src/runtime/scene.rs | 57 ++++++++++++++++++++++++
 docs/wg/feat-2d/optimization.md          | 41 +++++++++++++++++
 3 files changed, 126 insertions(+), 2 deletions(-)
diff --git a/crates/grida-canvas/src/cache/picture.rs b/crates/grida-canvas/src/cache/picture.rs
index 23e1941084..f6800f69db 100644
--- a/crates/grida-canvas/src/cache/picture.rs
+++ b/crates/grida-canvas/src/cache/picture.rs
@@ -25,6 +25,10 @@ pub struct PictureCache {
     default_store: NodeIdHashMap<NodeId, Picture>,
     /// Store for non-default render variants (variant key != 0).
     variant_store: NodeIdHashMap<(NodeId, u64), Picture>,
+    /// Monotonically increasing counter incremented on any cache mutation
+    /// (insert, invalidate, invalidate_node). The prefill loop uses this
+    /// to skip the 136K-iteration cache-hit check when nothing changed.
+    generation: u64,
 }
 
 impl PictureCache {
@@ -33,6 +37,7 @@ impl PictureCache {
             strategy: PictureCacheStrategy::default(),
             default_store: new_node_id_map(),
             variant_store: new_node_id_map(),
+            generation: 0,
         }
     }
 
@@ -49,8 +54,17 @@ impl PictureCache {
         self.default_store.get(id)
     }
 
+    /// Returns the current cache generation counter. This increments on
+    /// every mutation (insert, invalidate). Callers can compare generations
+    /// to detect whether the cache contents have changed.
+    #[inline]
+    pub fn generation(&self) -> u64 {
+        self.generation
+    }
+
     pub fn set_node_picture(&mut self, id: NodeId, picture: Picture) {
         self.default_store.insert(id, picture);
+        self.generation = self.generation.wrapping_add(1);
     }
 
     /// Lookup a picture for a node in a specific render variant.
@@ -69,15 +83,25 @@ impl PictureCache {
     pub fn set_node_picture_variant(&mut self, id: NodeId, variant_key: u64, picture: Picture) {
         if variant_key == 0 {
             self.default_store.insert(id, picture);
-            return;
+        } else {
+            self.variant_store.insert((id, variant_key), picture);
         }
-        self.variant_store.insert((id, variant_key), picture);
+        self.generation = self.generation.wrapping_add(1);
     }
 
     pub fn len(&self) -> usize {
         self.default_store.len() + self.variant_store.len()
     }
 
+    /// Returns true when the variant store has no entries.
+    /// When this is true AND variant unification is enabled, ALL cached
+    /// pictures live under the default key (0), making the prefill skip
+    /// safe across stable/unstable transitions.
+    #[inline]
+    pub fn variant_store_is_empty(&self) -> bool {
+        self.variant_store.is_empty()
+    }
+
     pub fn depth(&self) -> Option<usize> {
         self.strategy.depth
     }
@@ -85,6 +109,7 @@ impl PictureCache {
     pub fn invalidate(&mut self) {
         self.default_store.clear();
         self.variant_store.clear();
+        self.generation = self.generation.wrapping_add(1);
     }
 
     /// Invalidate cached pictures for a single node (all variants).
@@ -96,5 +121,6 @@ impl PictureCache {
     pub fn invalidate_node(&mut self, id: NodeId) {
         self.default_store.remove(&id);
         self.variant_store.retain(|&(nid, _), _| nid != id);
+        self.generation = self.generation.wrapping_add(1);
     }
 }
diff --git a/crates/grida-canvas/src/runtime/scene.rs b/crates/grida-canvas/src/runtime/scene.rs
index d92990ca67..b15ea8319a 100644
--- a/crates/grida-canvas/src/runtime/scene.rs
+++ b/crates/grida-canvas/src/runtime/scene.rs
@@ -358,6 +358,13 @@ pub struct Renderer {
     /// [`apply_changes`] consumes the set once per frame and performs
     /// the correct invalidation for every cache layer.
     changes: ChangeSet,
+    /// Picture cache generation + variant key at the time of the last
+    /// successful prefill. When the cache generation and variant key
+    /// match, the prefill loop can be skipped entirely — all pictures
+    /// are already cached from a previous frame.
+    last_prefill_generation: u64,
+    last_prefill_variant_key: u64,
+    last_prefill_layer_count: usize,
 }
 
 impl Renderer {
@@ -385,6 +392,38 @@ impl Renderer {
         // True when the policy differs from STANDARD only in effect-related
         // fields — content, compositing, and clip policies are unchanged.
         let can_unify = variant_key != 0 && policy.is_effect_only_variant();
+
+        // Skip-prefill fast path: when the picture cache generation hasn't
+        // changed since the last prefill AND we're using the same variant
+        // key AND the layer count matches, every picture from the previous
+        // prefill is still valid. Skip the O(N) iteration entirely.
+        //
+        // For variant key tracking: when can_unify is true AND the variant
+        // store is empty (no per-variant entries — all nodes are effect-free),
+        // we track key=0 since all pictures live under the default key. This
+        // is safe across stable/unstable transitions for effect-free scenes.
+        // Scenes WITH effects track the actual variant_key.
+        //
+        // On 135K-node scenes at fit zoom, this eliminates ~800µs of HashMap
+        // lookups on every cache-warm frame (the common case during view-only
+        // pan/zoom interaction and settle frames).
+        let effective_key_for_tracking = if can_unify
+            && self.scene_cache.picture.variant_store_is_empty()
+        {
+            0
+        } else {
+            variant_key
+        };
+
+        let current_gen = self.scene_cache.picture.generation();
+        let layer_count: usize = plan.regions.iter().map(|(_, idx)| idx.len()).sum();
+        if current_gen == self.last_prefill_generation
+            && effective_key_for_tracking == self.last_prefill_variant_key
+            && layer_count == self.last_prefill_layer_count
+        {
+            return;
+        }
+
         // Prefill picture cache for visible layers so Painter can reuse pictures even with masks.
         // Fast path: skip clone + recording when the picture is already cached (common case
         // on cache-warm frames). The clone of LayerEntry is expensive because it deep-copies
@@ -433,6 +472,18 @@ impl Renderer {
                 }
             }
         }
+
+        // Update tracking state for future skip-prefill checks.
+        let effective_key_after = if can_unify
+            && self.scene_cache.picture.variant_store_is_empty()
+        {
+            0
+        } else {
+            variant_key
+        };
+        self.last_prefill_generation = self.scene_cache.picture.generation();
+        self.last_prefill_variant_key = effective_key_after;
+        self.last_prefill_layer_count = layer_count;
     }
 
     /// Pre-extract blit data for all promoted nodes.
@@ -608,6 +659,9 @@ impl Renderer {
             pan_image_cache: None,
             zoom_image_cache: None,
             changes: ChangeSet::new(),
+            last_prefill_generation: u64::MAX,
+            last_prefill_variant_key: u64::MAX,
+            last_prefill_layer_count: 0,
         }
     }
 
@@ -1488,6 +1542,9 @@ impl Renderer {
         self.scene_cache = cache::scene::SceneCache::new();
         self.pan_image_cache = None;
         self.zoom_image_cache = None;
+        self.last_prefill_generation = u64::MAX;
+        self.last_prefill_variant_key = u64::MAX;
+        self.last_prefill_layer_count = 0;
         self.images.clear_missing_tracking();
         if let Some(scene) = self.scene.as_ref() {
             #[cfg(feature = "perf")]
diff --git a/docs/wg/feat-2d/optimization.md b/docs/wg/feat-2d/optimization.md
index e70bb3d5b5..fe6f5b51ff 100644
--- a/docs/wg/feat-2d/optimization.md
+++ b/docs/wg/feat-2d/optimization.md
@@ -1187,6 +1187,47 @@ expensive full redraws.
     - `runtime/scene.rs` — `apply_changes()` for `last_had_data_changes`
     - `window/application.rs` — `frame()` vs `redraw()` dual-path issue
 
+48. **Picture Cache Prefill Skip (Generation Tracking)** ✅ IMPLEMENTED
+
+    The `prefill_picture_cache_for_plan()` loop iterates ALL visible nodes
+    each frame to check if their `SkPicture` is cached, doing a HashMap
+    lookup per node. On cache-warm frames (the common case during view-only
+    pan/zoom), every lookup succeeds and no work is done — but the iteration
+    itself costs O(N) per frame.
+
+    **The optimization:** track a monotonically increasing `generation`
+    counter on `PictureCache` that increments on any mutation (insert,
+    invalidate). The prefill stores the generation, variant key, and layer
+    count after each successful pass. On the next frame, if all three
+    match, the entire loop is skipped in O(1).
+
+    For effect-free scenes (the common case for large design docs), the
+    variant key unification optimization stores all pictures under key=0
+    regardless of stable/unstable quality. The generation-based skip is
+    safe across stable/unstable transitions because the cache contents
+    are identical.
+
+    **Measured impact (Apple M2 Pro, GPU benchmark, 01-135k 135K nodes):**
+
+    | Scenario | Metric | Before | After | Delta |
+    | -------- | ------ | ------ | ----- | ----- |
+    | rt_pan_fast_fit | p50 frame | 111 µs | 76 µs | **-32%** |
+    | rt_pan_fast_fit | p95 frame | 263 µs | 153 µs | **-42%** |
+    | rt_pan_slow_fit | settle | 2,323 µs | 1,836 µs | **-21%** |
+    | pan_settle_slow_fit | avg | 87 µs | 59 µs | **-32%** |
+    | pan_settle_slow_fit | settle | 1,034 µs | 709 µs | **-31%** |
+
+    **Criterion (CPU raster, 2000-node scene, statistically rigorous):**
+
+    | Scene | Change | p-value |
+    | ----- | ------ | ------- |
+    | large_baseline/pan | **-14.0%** | < 0.01 |
+    | large_baseline/pan_zoomed_in | -5.4% | 0.02 |
+    | large_compositing/pan | -4.2% | 0.02 |
+
+    Implementation: `PictureCache.generation` in `cache/picture.rs`,
+    `Renderer.last_prefill_*` tracking in `runtime/scene.rs`.
+
 ---
 
 This list is designed to evolve the renderer from single-threaded mode to

From 491413ea3c36016086b155de8f1e32bd95f2142f Mon Sep 17 00:00:00 2001
From: Universe <universe@grida.co>
Date: Wed, 1 Apr 2026 02:52:30 +0900
Subject: [PATCH 2/6] =?UTF-8?q?style(cg):=20cargo=20fmt=20=E2=80=93=20fix?=
 =?UTF-8?q?=20formatting=20in=20scene.rs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 crates/grida-canvas/src/runtime/scene.rs | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/crates/grida-canvas/src/runtime/scene.rs b/crates/grida-canvas/src/runtime/scene.rs
index b15ea8319a..301ef821bb 100644
--- a/crates/grida-canvas/src/runtime/scene.rs
+++ b/crates/grida-canvas/src/runtime/scene.rs
@@ -407,13 +407,12 @@ impl Renderer {
         // On 135K-node scenes at fit zoom, this eliminates ~800µs of HashMap
         // lookups on every cache-warm frame (the common case during view-only
         // pan/zoom interaction and settle frames).
-        let effective_key_for_tracking = if can_unify
-            && self.scene_cache.picture.variant_store_is_empty()
-        {
-            0
-        } else {
-            variant_key
-        };
+        let effective_key_for_tracking =
+            if can_unify && self.scene_cache.picture.variant_store_is_empty() {
+                0
+            } else {
+                variant_key
+            };
 
         let current_gen = self.scene_cache.picture.generation();
         let layer_count: usize = plan.regions.iter().map(|(_, idx)| idx.len()).sum();
@@ -474,8 +473,7 @@ impl Renderer {
         }
 
         // Update tracking state for future skip-prefill checks.
-        let effective_key_after = if can_unify
-            && self.scene_cache.picture.variant_store_is_empty()
+        let effective_key_after = if can_unify && self.scene_cache.picture.variant_store_is_empty()
         {
             0
         } else {

From ec0d857ac5a65857f3172ae0144146682937a269 Mon Sep 17 00:00:00 2001
From: Universe <universe@grida.co>
Date: Wed, 1 Apr 2026 03:34:20 +0900
Subject: [PATCH 3/6] docs(cg): add render cost prediction reference and
 validation benchmarks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add reference document (render-cost-prediction.md) with measured GPU
operation costs, Skia blur algorithm analysis, blend mode tiers, and
cache hit/miss ratios. All claims labeled as FACT/BENCHMARK/INFERENCE/
HEURISTIC. Key finding: fixed per-operation overhead (FBO allocation,
~11-110µs) dominates over pixel-proportional cost at typical node sizes.

Add three validation benchmarks:
- skia_bench_cost_model: per-effect cost at 50²–4000², linearity, blur
  radius dependence, two-component formula extraction
- skia_bench_cache_blit: cache hit vs miss ratio (~0.05×), blit constancy
- skia_bench_scene_scale: full Renderer pipeline at 1K–136K nodes

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/grida-canvas/Cargo.toml                |  15 +
 .../skia_bench/skia_bench_cache_blit.rs       | 271 +++++++++
 .../skia_bench/skia_bench_cost_model.rs       | 537 ++++++++++++++++++
 .../skia_bench/skia_bench_scene_scale.rs      | 369 ++++++++++++
 docs/wg/feat-2d/render-cost-prediction.md     | 401 +++++++------
 5 files changed, 1432 insertions(+), 161 deletions(-)
 create mode 100644 crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs
 create mode 100644 crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs
 create mode 100644 crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs

diff --git a/crates/grida-canvas/Cargo.toml b/crates/grida-canvas/Cargo.toml
index 41a837dfd3..d6d839f33f 100644
--- a/crates/grida-canvas/Cargo.toml
+++ b/crates/grida-canvas/Cargo.toml
@@ -149,6 +149,21 @@ path = "examples/skia_bench/skia_bench_cache_image.rs"
 name = "skia_bench_cache_text"
 path = "examples/skia_bench/skia_bench_cache_text.rs"
 
+[[example]]
+name = "skia_bench_cost_model"
+path = "examples/skia_bench/skia_bench_cost_model.rs"
+required-features = ["native-gl-context"]
+
+[[example]]
+name = "skia_bench_cache_blit"
+path = "examples/skia_bench/skia_bench_cache_blit.rs"
+required-features = ["native-gl-context"]
+
+[[example]]
+name = "skia_bench_scene_scale"
+path = "examples/skia_bench/skia_bench_scene_scale.rs"
+required-features = ["native-gl-context"]
+
 # ── IO tools ─────────────────────────────────────────────────────
 [[example]]
 name = "tool_io_grida"
diff --git a/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs b/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs
new file mode 100644
index 0000000000..c7598eaf65
--- /dev/null
+++ b/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs
@@ -0,0 +1,271 @@
+//! Cache Hit vs. Miss Cost Ratio Benchmark
+//!
+//! Measures the actual cost ratio between a cache hit (GPU texture blit) and
+//! a cache miss (full rasterization). Validates the ~0.1× estimate from
+//! `docs/wg/feat-2d/render-cost-prediction.md`.
+//!
+//! Run with:
+//! ```bash
+//! cargo run -p cg --example skia_bench_cache_blit --features native-gl-context --release
+//! ```
+
+#[cfg(not(feature = "native-gl-context"))]
+fn main() {
+    eprintln!("This example requires --features native-gl-context");
+}
+
+#[cfg(feature = "native-gl-context")]
+fn main() {
+    use cg::window::headless::HeadlessGpu;
+    use skia_safe::{
+        canvas::SaveLayerRec, image_filters, Color, Image, ImageInfo, Paint, Rect, Surface,
+    };
+    use std::time::Instant;
+
+    const W: i32 = 1000;
+    const H: i32 = 1000;
+    const WARMUP: u32 = 10;
+    const ITERS: u32 = 50;
+
+    let mut gpu = HeadlessGpu::new(W, H).expect("GPU init");
+    gpu.print_gl_info();
+    println!();
+
+    let surface = &mut gpu.surface;
+
+    // ── Helpers ──────────────────────────────────────────────────────
+
+    fn flush(s: &mut Surface) {
+        if let Some(mut ctx) = s.recording_context() {
+            if let Some(mut d) = ctx.as_direct_context() {
+                d.flush_and_submit();
+            }
+        }
+    }
+
+    /// Measure median time (µs) for a drawing operation.
+    fn bench_draw(
+        surface: &mut Surface,
+        draw_fn: &dyn Fn(&skia_safe::Canvas),
+    ) -> f64 {
+        // Warmup
+        for _ in 0..WARMUP {
+            let canvas = surface.canvas();
+            canvas.clear(Color::WHITE);
+            draw_fn(canvas);
+            flush(surface);
+        }
+        // Measure
+        let mut timings = Vec::with_capacity(ITERS as usize);
+        for _ in 0..ITERS {
+            let t0 = Instant::now();
+            let canvas = surface.canvas();
+            canvas.clear(Color::WHITE);
+            draw_fn(canvas);
+            flush(surface);
+            timings.push(t0.elapsed().as_nanos() as f64 / 1000.0);
+        }
+        timings.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        timings[timings.len() / 2]
+    }
+
+    /// Capture a rect with effects into a GPU-resident Image.
+    fn capture_to_image(
+        surface: &mut Surface,
+        size: i32,
+        draw_fn: &dyn Fn(&skia_safe::Canvas, Rect),
+    ) -> Image {
+        let info = ImageInfo::new_n32_premul((size, size), None);
+        let mut offscreen = surface.new_surface(&info).expect("offscreen surface");
+        {
+            let canvas = offscreen.canvas();
+            canvas.clear(Color::TRANSPARENT);
+            let rect = Rect::from_xywh(0.0, 0.0, size as f32, size as f32);
+            draw_fn(canvas, rect);
+        }
+        flush(surface);
+        offscreen.image_snapshot()
+    }
+
+    // ── Effect configurations ───────────────────────────────────────
+
+    struct EffectConfig {
+        name: &'static str,
+        draw: Box<dyn Fn(&skia_safe::Canvas, Rect)>,
+    }
+
+    let shadow_filter = image_filters::drop_shadow(
+        (4.0, 4.0),
+        (8.0, 8.0),
+        Color::from_argb(128, 0, 0, 0),
+        None,
+        None,
+        None,
+    );
+    let blur_filter = image_filters::blur((8.0, 8.0), None, None, None);
+
+    let sf = shadow_filter.clone();
+    let blf = blur_filter.clone();
+    let sf2 = shadow_filter.clone();
+
+    let effects: Vec<EffectConfig> = vec![
+        EffectConfig {
+            name: "solid rect",
+            draw: Box::new(|canvas, rect| {
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+            }),
+        },
+        EffectConfig {
+            name: "rect + blur (s=8)",
+            draw: Box::new(move |canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_image_filter(blf.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        EffectConfig {
+            name: "rect + shadow (s=8)",
+            draw: Box::new(move |canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_image_filter(sf.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        EffectConfig {
+            name: "complex (3 fills + stroke + shadow)",
+            draw: Box::new(move |canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_image_filter(sf2.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                // 3 fills
+                let mut p1 = Paint::default();
+                p1.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p1);
+                let mut p2 = Paint::default();
+                p2.set_color(Color::from_argb(128, 255, 0, 0));
+                canvas.draw_rect(rect, &p2);
+                let mut p3 = Paint::default();
+                p3.set_color(Color::from_argb(64, 0, 255, 0));
+                canvas.draw_rect(rect, &p3);
+                // 1 stroke
+                let mut s = Paint::default();
+                s.set_color(Color::BLACK);
+                s.set_style(skia_safe::PaintStyle::Stroke);
+                s.set_stroke_width(2.0);
+                canvas.draw_rect(rect, &s);
+                canvas.restore();
+            }),
+        },
+    ];
+
+    let sizes: [i32; 3] = [100, 200, 500];
+
+    // ── Run benchmarks ──────────────────────────────────────────────
+
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 1: Cache Hit vs. Miss Ratio");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  {:<36} {:>5} {:>10} {:>10} {:>10}",
+        "Effect", "Size", "Miss(µs)", "Hit(µs)", "Ratio"
+    );
+    println!(
+        "  {:-<36} {:->5} {:->10} {:->10} {:->10}",
+        "", "", "", "", ""
+    );
+
+    // blit_times[effect_idx][size_idx] for constancy check
+    let mut blit_times: Vec<Vec<f64>> = vec![Vec::new(); effects.len()];
+
+    for (ei, effect) in effects.iter().enumerate() {
+        for (si, &size) in sizes.iter().enumerate() {
+            let sizef = size as f32;
+            let cx = (W as f32 - sizef) / 2.0;
+            let cy = (H as f32 - sizef) / 2.0;
+            let dst_rect = Rect::from_xywh(cx, cy, sizef, sizef);
+
+            // Cache miss: full rasterize
+            let miss_us = bench_draw(surface, &|canvas| {
+                (effect.draw)(canvas, dst_rect);
+            });
+
+            // Capture to GPU texture
+            let cached_image = capture_to_image(surface, size, &*effect.draw);
+
+            // Cache hit: texture blit
+            let hit_us = bench_draw(surface, &|canvas| {
+                canvas.draw_image_rect(
+                    &cached_image,
+                    None,
+                    dst_rect,
+                    &Paint::default(),
+                );
+            });
+
+            let ratio = hit_us / miss_us;
+            blit_times[ei].push(hit_us);
+
+            println!(
+                "  {:<36} {:>4}² {:>10.1} {:>10.1} {:>9.3}×",
+                effect.name, size, miss_us, hit_us, ratio
+            );
+
+            eprint!(
+                "\r  [{}/{}]",
+                ei * sizes.len() + si + 1,
+                effects.len() * sizes.len()
+            );
+        }
+    }
+    eprintln!("\r  Done.{:40}", "");
+
+    // ── Output Section 2: Blit Constancy ────────────────────────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 2: Blit Cost Constancy (same size, different source complexity)");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  Blit cost should NOT vary with source effect complexity at the same size."
+    );
+    println!();
+
+    for (si, &size) in sizes.iter().enumerate() {
+        let blit_at_size: Vec<f64> = blit_times.iter().map(|bt| bt[si]).collect();
+        let mean = blit_at_size.iter().sum::<f64>() / blit_at_size.len() as f64;
+        let variance =
+            blit_at_size.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / blit_at_size.len() as f64;
+        let stddev = variance.sqrt();
+        let cv = if mean > 0.0 { stddev / mean * 100.0 } else { 0.0 };
+
+        println!("  Size {}²:", size);
+        for (ei, effect) in effects.iter().enumerate() {
+            println!("    {:<36} {:>8.1} µs", effect.name, blit_times[ei][si]);
+        }
+        println!(
+            "    mean={:.1} µs  stddev={:.1} µs  CV={:.1}%  {}",
+            mean,
+            stddev,
+            cv,
+            if cv < 10.0 { "OK" } else { "WARN (>10%)" }
+        );
+        println!();
+    }
+
+    println!("  Expected: CV < 10% at each size (blit cost independent of source complexity)");
+    println!("  Reference: predicted cache-hit ratio ~0.1× (from cost model doc)");
+    println!();
+}
diff --git a/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs b/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs
new file mode 100644
index 0000000000..504cc5c045
--- /dev/null
+++ b/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs
@@ -0,0 +1,537 @@
+//! Render Cost Model Validation Benchmark
+//!
+//! Validates the structural pixel-cost model from
+//! `docs/wg/feat-2d/render-cost-prediction.md` against real GPU measurements.
+//!
+//! Unlike `skia_bench_effects` (10K tiny rects, per-rect overhead), this draws
+//! **one rect per iteration at controlled sizes** to isolate per-pixel cost
+//! from per-draw-call overhead.
+//!
+//! Run with:
+//! ```bash
+//! cargo run -p cg --example skia_bench_cost_model --features native-gl-context --release
+//! ```
+
+#[cfg(not(feature = "native-gl-context"))]
+fn main() {
+    eprintln!("This example requires --features native-gl-context");
+}
+
+#[cfg(feature = "native-gl-context")]
+fn main() {
+    use cg::window::headless::HeadlessGpu;
+    use skia_safe::{
+        canvas::SaveLayerRec, image_filters, BlendMode, Color, Paint, Rect, Surface,
+    };
+    use std::time::Instant;
+
+    const W: i32 = 1000;
+    const H: i32 = 1000;
+    const WARMUP: u32 = 10;
+    const ITERS: u32 = 50;
+
+    let mut gpu = HeadlessGpu::new(W, H).expect("GPU init");
+    gpu.print_gl_info();
+    println!();
+
+    let surface = &mut gpu.surface;
+
+    // ── Helpers ──────────────────────────────────────────────────────
+
+    fn flush(s: &mut Surface) {
+        if let Some(mut ctx) = s.recording_context() {
+            if let Some(mut d) = ctx.as_direct_context() {
+                d.flush_and_submit();
+            }
+        }
+    }
+
+    /// Run a single-rect benchmark at the given size.
+    /// Returns the **median** duration in microseconds.
+    fn bench_single_rect(
+        surface: &mut Surface,
+        size: f32,
+        draw_fn: &dyn Fn(&skia_safe::Canvas, Rect),
+    ) -> f64 {
+        let cx = (W as f32 - size) / 2.0;
+        let cy = (H as f32 - size) / 2.0;
+        let rect = Rect::from_xywh(cx, cy, size, size);
+
+        // Warmup
+        for _ in 0..WARMUP {
+            let canvas = surface.canvas();
+            canvas.clear(Color::WHITE);
+            draw_fn(canvas, rect);
+            flush(surface);
+        }
+
+        // Measure
+        let mut timings = Vec::with_capacity(ITERS as usize);
+        for _ in 0..ITERS {
+            let t0 = Instant::now();
+            let canvas = surface.canvas();
+            canvas.clear(Color::WHITE);
+            draw_fn(canvas, rect);
+            flush(surface);
+            timings.push(t0.elapsed().as_nanos() as f64 / 1000.0); // microseconds
+        }
+        timings.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        timings[timings.len() / 2] // median
+    }
+
+    /// Compute R-squared for linear fit of (xs, ys).
+    fn r_squared(xs: &[f64], ys: &[f64]) -> f64 {
+        let n = xs.len() as f64;
+        let x_mean = xs.iter().sum::<f64>() / n;
+        let y_mean = ys.iter().sum::<f64>() / n;
+        let ss_xy: f64 = xs.iter().zip(ys).map(|(x, y)| (x - x_mean) * (y - y_mean)).sum();
+        let ss_xx: f64 = xs.iter().map(|x| (x - x_mean).powi(2)).sum();
+        let ss_yy: f64 = ys.iter().map(|y| (y - y_mean).powi(2)).sum();
+        if ss_xx == 0.0 || ss_yy == 0.0 {
+            return 0.0;
+        }
+        let r = ss_xy / (ss_xx * ss_yy).sqrt();
+        r * r
+    }
+
+    // ── Variant definitions ─────────────────────────────────────────
+
+    struct Variant {
+        name: &'static str,
+        predicted: f64,
+        draw: Box<dyn Fn(&skia_safe::Canvas, Rect)>,
+    }
+
+    let shadow_filter_s8 = image_filters::drop_shadow(
+        (4.0, 4.0),
+        (8.0, 8.0),
+        Color::from_argb(128, 0, 0, 0),
+        None,
+        None,
+        None,
+    );
+
+    let shadow_filter_s8_only = image_filters::drop_shadow_only(
+        (2.0, 2.0),
+        (6.0, 6.0),
+        Color::from_argb(128, 0, 0, 0),
+        None,
+        None,
+        None,
+    );
+
+    let blur_filter_5 = image_filters::blur((5.0, 5.0), None, None, None);
+    let blur_filter_50 = image_filters::blur((50.0, 50.0), None, None, None);
+    let backdrop_blur_8 = image_filters::blur((8.0, 8.0), None, None, None).unwrap();
+
+    // Clone filters for closures
+    let sf8 = shadow_filter_s8.clone();
+    let sf8o = shadow_filter_s8_only.clone();
+    let bf5 = blur_filter_5.clone();
+    let bf50 = blur_filter_50.clone();
+    let sf8_for_combo = shadow_filter_s8.clone();
+    let bf5_for_combo = blur_filter_5.clone();
+    let bd8 = backdrop_blur_8.clone();
+
+    let variants: Vec<Variant> = vec![
+        // 1. Baseline
+        Variant {
+            name: "baseline (solid rect)",
+            predicted: 1.0,
+            draw: Box::new(|canvas, rect| {
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+            }),
+        },
+        // 2. +1 extra fill
+        Variant {
+            name: "+1 fill (2 fills total)",
+            predicted: 2.0,
+            draw: Box::new(|canvas, rect| {
+                let mut p1 = Paint::default();
+                p1.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p1);
+                let mut p2 = Paint::default();
+                p2.set_color(Color::from_argb(128, 255, 0, 0));
+                canvas.draw_rect(rect, &p2);
+            }),
+        },
+        // 3. +2 extra fills
+        Variant {
+            name: "+2 fills (3 fills total)",
+            predicted: 3.0,
+            draw: Box::new(|canvas, rect| {
+                let mut p1 = Paint::default();
+                p1.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p1);
+                let mut p2 = Paint::default();
+                p2.set_color(Color::from_argb(128, 255, 0, 0));
+                canvas.draw_rect(rect, &p2);
+                let mut p3 = Paint::default();
+                p3.set_color(Color::from_argb(128, 0, 255, 0));
+                canvas.draw_rect(rect, &p3);
+            }),
+        },
+        // 4. +1 stroke
+        Variant {
+            name: "+1 stroke",
+            predicted: 2.0,
+            draw: Box::new(|canvas, rect| {
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                let mut s = Paint::default();
+                s.set_color(Color::BLACK);
+                s.set_style(skia_safe::PaintStyle::Stroke);
+                s.set_stroke_width(2.0);
+                canvas.draw_rect(rect, &s);
+            }),
+        },
+        // 5. Non-normal blend mode (save_layer)
+        Variant {
+            name: "blend mode (Multiply)",
+            predicted: 2.0,
+            draw: Box::new(|canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_blend_mode(BlendMode::Multiply);
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        // 6. Opacity (save_layer_alpha)
+        Variant {
+            name: "opacity 0.5 (save_layer_alpha)",
+            predicted: 2.0,
+            draw: Box::new(|canvas, rect| {
+                canvas.save_layer_alpha(Some(rect),128);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        // 7. Gaussian blur (r=5)
+        Variant {
+            name: "blur (r=5)",
+            predicted: 4.0,
+            draw: Box::new(move |canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_image_filter(bf5.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        // 8. Gaussian blur (r=50) — should be ~same cost (radius independence)
+        Variant {
+            name: "blur (r=50)",
+            predicted: 4.0,
+            draw: Box::new(move |canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_image_filter(bf50.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        // 9. Drop shadow (with content)
+        Variant {
+            name: "drop shadow (s=8)",
+            predicted: 6.0,
+            draw: Box::new(move |canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_image_filter(sf8.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        // 10. Inner shadow (clip + shadow_only)
+        Variant {
+            name: "inner shadow (s=6)",
+            predicted: 6.0,
+            draw: Box::new(move |canvas, rect| {
+                // Base rect
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 240, 240, 240));
+                canvas.draw_rect(rect, &p);
+                // Clipped inner shadow
+                canvas.save();
+                canvas.clip_rect(rect, None, None);
+                let mut lp = Paint::default();
+                lp.set_image_filter(sf8o.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut sp = Paint::default();
+                sp.set_color(Color::from_argb(255, 240, 240, 240));
+                canvas.draw_rect(rect, &sp);
+                canvas.restore();
+                canvas.restore();
+            }),
+        },
+        // 11. Drop shadow + blur combined
+        Variant {
+            name: "shadow + blur combo",
+            predicted: 9.0,
+            draw: Box::new(move |canvas, rect| {
+                // Outer: blur
+                let mut blur_p = Paint::default();
+                blur_p.set_image_filter(bf5_for_combo.clone());
+                let blur_rec = SaveLayerRec::default().bounds(&rect).paint(&blur_p);
+                canvas.save_layer(&blur_rec);
+                // Inner: shadow
+                let mut shadow_p = Paint::default();
+                shadow_p.set_image_filter(sf8_for_combo.clone());
+                let shadow_rec = SaveLayerRec::default().bounds(&rect).paint(&shadow_p);
+                canvas.save_layer(&shadow_rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+                canvas.restore();
+            }),
+        },
+        // 12. 2x nested save_layer (no effects, pure isolation cost)
+        Variant {
+            name: "2x nested save_layer",
+            predicted: 5.0,
+            draw: Box::new(|canvas, rect| {
+                canvas.save_layer_alpha(Some(rect),255);
+                canvas.save_layer_alpha(Some(rect),255);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+                canvas.restore();
+            }),
+        },
+        // 13. 3x nested save_layer
+        Variant {
+            name: "3x nested save_layer",
+            predicted: 7.0,
+            draw: Box::new(|canvas, rect| {
+                canvas.save_layer_alpha(Some(rect),255);
+                canvas.save_layer_alpha(Some(rect),255);
+                canvas.save_layer_alpha(Some(rect),255);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+                canvas.restore();
+                canvas.restore();
+            }),
+        },
+        // 14. Backdrop blur
+        Variant {
+            name: "backdrop blur (s=8)",
+            predicted: 4.0,
+            draw: Box::new(move |canvas, rect| {
+                // Background content
+                let mut bg = Paint::default();
+                bg.set_color(Color::from_argb(255, 200, 50, 100));
+                canvas.draw_rect(rect, &bg);
+                // Backdrop blur layer on top
+                let lp = Paint::default();
+                let rec = SaveLayerRec::default()
+                    .bounds(&rect)
+                    .backdrop(&bd8)
+                    .paint(&lp);
+                canvas.save_layer(&rec);
+                let mut overlay = Paint::default();
+                overlay.set_color(Color::from_argb(80, 255, 255, 255));
+                canvas.draw_rect(rect, &overlay);
+                canvas.restore();
+            }),
+        },
+    ];
+
+    // ── Run benchmarks ──────────────────────────────────────────────
+
+    let sizes: [f32; 8] = [50.0, 100.0, 200.0, 300.0, 500.0, 1000.0, 2000.0, 4000.0];
+    let pixel_areas: Vec<f64> = sizes.iter().map(|s| (*s as f64) * (*s as f64)).collect();
+
+    // results[variant_idx][size_idx] = median_us
+    let mut results: Vec<Vec<f64>> = Vec::new();
+
+    for (vi, variant) in variants.iter().enumerate() {
+        let mut row = Vec::new();
+        for &size in &sizes {
+            let us = bench_single_rect(surface, size, &*variant.draw);
+            row.push(us);
+        }
+        eprint!("\r  [{}/{}] {:<35}", vi + 1, variants.len(), variant.name);
+        results.push(row);
+    }
+    eprintln!("\r  Done.{:40}", "");
+
+    // ── Output Section 1: Cost Multiplier Table (at 200²) ───────────
+
+    let size_idx_200 = 2; // 200.0 is index 2
+    let baseline_200 = results[0][size_idx_200];
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 1: Cost Multiplier Validation (at 200×200)");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  {:<35} {:>10} {:>10} {:>10} {:>6}",
+        "Effect", "Predicted", "Measured", "Time(µs)", "Status"
+    );
+    println!("  {:-<35} {:->10} {:->10} {:->10} {:->6}", "", "", "", "", "");
+
+    for (vi, variant) in variants.iter().enumerate() {
+        let time_us = results[vi][size_idx_200];
+        let measured = time_us / baseline_200;
+        let ratio = measured / variant.predicted;
+        let status = if ratio >= 0.5 && ratio <= 2.0 { "OK" } else { "WARN" };
+        println!(
+            "  {:<35} {:>9.1}× {:>9.2}× {:>10.1} {:>6}",
+            variant.name, variant.predicted, measured, time_us, status
+        );
+    }
+
+    // ── Output Section 2: Linearity Table ───────────────────────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 2: Linearity (time vs. pixel area)");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  {:<35} {:>7} {:>7} {:>7} {:>7} {:>7} {:>7} {:>7} {:>7} {:>6}",
+        "Effect", "50²", "100²", "200²", "300²", "500²", "1000²", "2000²", "4000²", "R²"
+    );
+    println!(
+        "  {:-<35} {:->7} {:->7} {:->7} {:->7} {:->7} {:->7} {:->7} {:->7} {:->6}",
+        "", "", "", "", "", "", "", "", "", ""
+    );
+
+    for (vi, variant) in variants.iter().enumerate() {
+        let row = &results[vi];
+        let r2 = r_squared(&pixel_areas, row);
+        println!(
+            "  {:<35} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>5.3}",
+            variant.name, row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], r2
+        );
+    }
+
+    // ── Output Section 3: Blur Radius Independence ──────────────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 3: Blur Radius Independence (r=5 vs r=50)");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  {:<10} {:>10} {:>10} {:>10}",
+        "Size", "r=5 (µs)", "r=50 (µs)", "Ratio"
+    );
+    println!("  {:-<10} {:->10} {:->10} {:->10}", "", "", "", "");
+
+    let blur5_idx = 6; // "blur (r=5)"
+    let blur50_idx = 7; // "blur (r=50)"
+    for (si, &size) in sizes.iter().enumerate() {
+        let t5 = results[blur5_idx][si];
+        let t50 = results[blur50_idx][si];
+        let ratio = t50 / t5;
+        println!(
+            "  {:<10} {:>10.1} {:>10.1} {:>9.2}×",
+            format!("{}²", size as i32),
+            t5,
+            t50,
+            ratio
+        );
+    }
+
+    // ── Output Section 4: Device Fill Rate Calibration ──────────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 4: Device Fill Rate Calibration");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+
+    // Use baseline at 500² for the most stable measurement
+    let baseline_500_us = results[0][4]; // 500² = 250_000 pixels
+    let pixels_500 = 500.0 * 500.0;
+    let pixels_per_us = pixels_500 / baseline_500_us;
+    let pixels_per_ms = pixels_per_us * 1000.0;
+    let budget_12ms = pixels_per_ms * 12.0;
+
+    println!("  Baseline (solid rect) at 500×500: {:.1} µs", baseline_500_us);
+    println!("  Fill rate: {:.1}M pixels/ms", pixels_per_ms / 1_000_000.0);
+    println!(
+        "  12ms frame budget: {:.1}B pixels ({:.0}M pixels)",
+        budget_12ms / 1_000_000_000.0,
+        budget_12ms / 1_000_000.0
+    );
+    println!();
+
+    println!("  Reference (from docs/wg/feat-2d/render-cost-prediction.md):");
+    println!("    Desktop GPU (discrete)   ~500M pixels/ms");
+    println!("    Desktop GPU (integrated) ~100M pixels/ms");
+    println!("    WebGL (WASM, desktop)    ~50-100M pixels/ms");
+    println!("    WebGL (WASM, mobile)     ~10-30M pixels/ms");
+
+    // ── Output Section 5: Two-Component Formula Extraction ──────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 5: Two-Component Formula (C_fixed + area × C_per_pixel)");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  Solving from 200² and 4000² measurements:");
+    println!();
+    println!(
+        "  {:<35} {:>10} {:>10} {:>12} {:>12}",
+        "Effect", "C_fixed(µs)", "C_pixel(ns/px)", "t@200²(µs)", "t@4000²(µs)"
+    );
+    println!(
+        "  {:-<35} {:->10} {:->10} {:->12} {:->12}",
+        "", "", "", "", ""
+    );
+
+    let area_small = 200.0_f64 * 200.0; // 40,000
+    let area_large = 4000.0_f64 * 4000.0; // 16,000,000
+    let idx_200 = 2usize; // index of 200.0 in sizes
+    let idx_4000 = 7usize; // index of 4000.0 in sizes
+
+    for (vi, variant) in variants.iter().enumerate() {
+        let t_small = results[vi][idx_200];
+        let t_large = results[vi][idx_4000];
+
+        // Solve: t_small = C_fixed + area_small * C_pixel
+        //        t_large = C_fixed + area_large * C_pixel
+        // → C_pixel = (t_large - t_small) / (area_large - area_small)
+        // → C_fixed = t_small - area_small * C_pixel
+        let c_pixel = (t_large - t_small) / (area_large - area_small); // µs per pixel
+        let c_fixed = t_small - area_small * c_pixel;
+
+        let c_pixel_ns = c_pixel * 1000.0; // ns per pixel
+
+        println!(
+            "  {:<35} {:>10.1} {:>10.3} {:>12.1} {:>12.1}",
+            variant.name,
+            c_fixed.max(0.0),
+            c_pixel_ns.max(0.0),
+            t_small,
+            t_large
+        );
+    }
+
+    println!();
+    println!("  C_fixed = per-save_layer FBO/pipeline overhead (device-specific)");
+    println!("  C_pixel = per-pixel bandwidth cost (ns/pixel)");
+    println!("  Cost model: node_cost = C_fixed + screen_area × C_pixel × passes");
+    println!();
+}
diff --git a/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs b/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs
new file mode 100644
index 0000000000..c27db1f0e9
--- /dev/null
+++ b/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs
@@ -0,0 +1,369 @@
+//! Scene-Scale Cost Model Benchmark
+//!
+//! Measures full-engine render cost at scale (1K–136K nodes) with the complete
+//! Renderer pipeline: R-tree culling, picture cache, layer compositing, GPU flush.
+//!
+//! This complements `skia_bench_cost_model` (single-node isolation) by testing
+//! whether per-node costs are additive at scale or whether GPU batching,
+//! memory pressure, and cache behavior introduce non-linear effects.
+//!
+//! Run with:
+//! ```bash
+//! cargo run -p cg --example skia_bench_scene_scale --features native-gl-context --release
+//! ```
+
+#[cfg(not(feature = "native-gl-context"))]
+fn main() {
+    eprintln!("This example requires --features native-gl-context");
+}
+
+#[cfg(feature = "native-gl-context")]
+fn main() {
+    use cg::cg::prelude::*;
+    use cg::node::scene_graph::{Parent, SceneGraph};
+    use cg::node::schema::*;
+    use cg::runtime::scene::FrameFlushResult;
+    use cg::window::headless::HeadlessGpu;
+    use math2::transform::AffineTransform;
+    use std::time::Instant;
+
+    const W: i32 = 1000;
+    const H: i32 = 1000;
+    const WARMUP: u32 = 5;
+    const ITERS: u32 = 20;
+
+    let mut gpu = HeadlessGpu::new(W, H).expect("GPU init");
+    gpu.print_gl_info();
+    println!();
+
+    // ── Scene builders ──────────────────────────────────────────────
+
+    #[derive(Clone, Copy)]
+    enum SceneType {
+        Plain,
+        WithShadow,
+        WithBlur,
+        Mixed, // 70% plain, 20% shadow, 10% blur
+    }
+
+    impl SceneType {
+        fn label(&self) -> &'static str {
+            match self {
+                SceneType::Plain => "plain rects",
+                SceneType::WithShadow => "all with shadow",
+                SceneType::WithBlur => "all with blur",
+                SceneType::Mixed => "mixed (70/20/10)",
+            }
+        }
+    }
+
+    fn build_scene(count: usize, scene_type: SceneType) -> Scene {
+        let mut graph = SceneGraph::new();
+        let cols = (count as f64).sqrt().ceil() as usize;
+
+        let rectangles: Vec<Node> = (0..count)
+            .map(|i| {
+                let col = i % cols;
+                let row = i / cols;
+                let x = (col as f32) * 10.0;
+                let y = (row as f32) * 10.0;
+
+                let effects = match scene_type {
+                    SceneType::Plain => LayerEffects::default(),
+                    SceneType::WithShadow => LayerEffects::from_array(vec![
+                        FilterEffect::DropShadow(FeShadow {
+                            dx: 2.0,
+                            dy: 2.0,
+                            blur: 4.0,
+                            spread: 0.0,
+                            color: CGColor::from_rgba(0, 0, 0, 128),
+                            active: true,
+                        }),
+                    ]),
+                    SceneType::WithBlur => {
+                        LayerEffects::new().blur(3.0)
+                    }
+                    SceneType::Mixed => {
+                        let kind = i % 10;
+                        if kind < 7 {
+                            LayerEffects::default() // 70% plain
+                        } else if kind < 9 {
+                            LayerEffects::from_array(vec![
+                                FilterEffect::DropShadow(FeShadow {
+                                    dx: 2.0,
+                                    dy: 2.0,
+                                    blur: 4.0,
+                                    spread: 0.0,
+                                    color: CGColor::from_rgba(0, 0, 0, 128),
+                                    active: true,
+                                }),
+                            ]) // 20% shadow
+                        } else {
+                            LayerEffects::new().blur(3.0) // 10% blur
+                        }
+                    }
+                };
+
+                Node::Rectangle(RectangleNodeRec {
+                    active: true,
+                    opacity: 1.0,
+                    blend_mode: LayerBlendMode::default(),
+                    mask: None,
+                    transform: AffineTransform::new(x, y, 0.0),
+                    size: Size {
+                        width: 8.0,
+                        height: 8.0,
+                    },
+                    corner_radius: RectangularCornerRadius::zero(),
+                    corner_smoothing: CornerSmoothing::default(),
+                    fills: Paints::new([Paint::from(CGColor::from_rgba(
+                        66,
+                        (133 + i % 50) as u8,
+                        244,
+                        255,
+                    ))]),
+                    strokes: Paints::default(),
+                    stroke_style: StrokeStyle {
+                        stroke_align: StrokeAlign::Inside,
+                        stroke_cap: StrokeCap::default(),
+                        stroke_join: StrokeJoin::default(),
+                        stroke_miter_limit: StrokeMiterLimit::default(),
+                        stroke_dash_array: None,
+                    },
+                    stroke_width: StrokeWidth::default(),
+                    effects,
+                    layout_child: None,
+                })
+            })
+            .collect();
+
+        graph.append_children(rectangles, Parent::Root);
+
+        Scene {
+            name: format!("scale_{}_{}", count, scene_type.label()),
+            background_color: Some(CGColor::WHITE),
+            graph,
+        }
+    }
+
+    // ── Benchmark runner ────────────────────────────────────────────
+
+    struct ScaleResult {
+        scene_type: &'static str,
+        node_count: usize,
+        visible_count: usize,
+        frame_us: f64,
+        flush_us: f64,
+        total_us: f64,
+        per_visible_us: f64,
+        cache_hits: usize,
+        live_draws: usize,
+    }
+
+    fn run_scale_bench(
+        renderer: &mut cg::runtime::scene::Renderer,
+        count: usize,
+        scene_type: SceneType,
+    ) -> ScaleResult {
+        let scene = build_scene(count, scene_type);
+        renderer.load_scene(scene);
+
+        // Measure stable frames (full draw, no image cache).
+        // load_scene queues a stable frame automatically.
+        // Each iteration: flush (draws), then queue next stable frame.
+        // Stable frames always do a full draw — no pan/zoom image cache reuse.
+        let mut frame_times = Vec::with_capacity((WARMUP + ITERS) as usize);
+        let mut flush_times = Vec::with_capacity((WARMUP + ITERS) as usize);
+        let mut total_times = Vec::with_capacity((WARMUP + ITERS) as usize);
+        let mut last_visible = 0usize;
+        let mut last_cache_hits = 0usize;
+        let mut last_live_draws = 0usize;
+
+        for i in 0..(WARMUP + ITERS) {
+            renderer.queue_stable();
+            let t0 = Instant::now();
+            let result = renderer.flush();
+            let wall = t0.elapsed();
+
+            if let FrameFlushResult::OK(stats) = result {
+                if i >= WARMUP {
+                    frame_times.push(stats.frame_duration.as_nanos() as f64 / 1000.0);
+                    flush_times.push(stats.flush_duration.as_nanos() as f64 / 1000.0);
+                    total_times.push(wall.as_nanos() as f64 / 1000.0);
+                }
+                last_visible = stats.draw.live_draw_count + stats.draw.layer_image_cache_hits;
+                last_cache_hits = stats.draw.layer_image_cache_hits;
+                last_live_draws = stats.draw.live_draw_count;
+            }
+        }
+
+        // Use median
+        frame_times.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        flush_times.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        total_times.sort_by(|a, b| a.partial_cmp(b).unwrap());
+
+        let frame_us = frame_times.get(frame_times.len() / 2).copied().unwrap_or(0.0);
+        let flush_us = flush_times.get(flush_times.len() / 2).copied().unwrap_or(0.0);
+        let total_us = total_times.get(total_times.len() / 2).copied().unwrap_or(0.0);
+        let per_visible = if last_visible > 0 {
+            total_us / last_visible as f64
+        } else {
+            0.0
+        };
+
+        ScaleResult {
+            scene_type: scene_type.label(),
+            node_count: count,
+            visible_count: last_visible,
+            frame_us,
+            flush_us,
+            total_us,
+            per_visible_us: per_visible,
+            cache_hits: last_cache_hits,
+            live_draws: last_live_draws,
+        }
+    }
+
+    // ── Run all configurations ──────────────────────────────────────
+
+    let counts = [1_000, 5_000, 10_000, 50_000, 100_000, 136_000];
+    let scene_types = [
+        SceneType::Plain,
+        SceneType::WithShadow,
+        SceneType::WithBlur,
+        SceneType::Mixed,
+    ];
+
+    let mut renderer = gpu.create_renderer();
+    let mut results: Vec<ScaleResult> = Vec::new();
+
+    let total_configs = counts.len() * scene_types.len();
+    let mut done = 0;
+
+    for &scene_type in &scene_types {
+        for &count in &counts {
+            eprint!("\r  [{}/{}] {} × {}k", done + 1, total_configs, scene_type.label(), count / 1000);
+            results.push(run_scale_bench(&mut renderer, count, scene_type));
+            done += 1;
+        }
+    }
+    eprintln!("\r  Done.{:60}", "");
+
+    // ── Output Section 1: Scale Table ───────────────────────────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 1: Frame Time vs. Node Count (unstable frames, full Renderer pipeline)");
+    println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  {:<22} {:>8} {:>8} {:>10} {:>10} {:>10} {:>10} {:>8} {:>8}",
+        "Scene Type", "Nodes", "Visible", "Frame(µs)", "Flush(µs)", "Total(µs)", "Per-vis", "Hits", "Live"
+    );
+    println!(
+        "  {:-<22} {:->8} {:->8} {:->10} {:->10} {:->10} {:->10} {:->8} {:->8}",
+        "", "", "", "", "", "", "", "", ""
+    );
+
+    for r in &results {
+        println!(
+            "  {:<22} {:>7}k {:>8} {:>10.0} {:>10.0} {:>10.0} {:>9.2} {:>8} {:>8}",
+            r.scene_type,
+            r.node_count / 1000,
+            r.visible_count,
+            r.frame_us,
+            r.flush_us,
+            r.total_us,
+            r.per_visible_us,
+            r.cache_hits,
+            r.live_draws
+        );
+    }
+
+    // ── Output Section 2: Linearity Check ───────────────────────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 2: Per-Node Cost Linearity (total_us / visible_count across scales)");
+    println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  {:<22} {:>10} {:>10} {:>10} {:>10} {:>10} {:>10}",
+        "Scene Type", "1k", "5k", "10k", "50k", "100k", "136k"
+    );
+    println!(
+        "  {:-<22} {:->10} {:->10} {:->10} {:->10} {:->10} {:->10}",
+        "", "", "", "", "", "", ""
+    );
+
+    for scene_type in &scene_types {
+        let label = scene_type.label();
+        let per_vis: Vec<String> = counts
+            .iter()
+            .map(|&count| {
+                results
+                    .iter()
+                    .find(|r| r.node_count == count && r.scene_type == label)
+                    .map(|r| format!("{:.2}", r.per_visible_us))
+                    .unwrap_or_else(|| "-".to_string())
+            })
+            .collect();
+        println!(
+            "  {:<22} {:>10} {:>10} {:>10} {:>10} {:>10} {:>10}",
+            label, per_vis[0], per_vis[1], per_vis[2], per_vis[3], per_vis[4], per_vis[5]
+        );
+    }
+    println!();
+    println!("  If per-visible cost is flat → cost model is additive (linear scaling).");
+    println!("  If per-visible cost increases with N → non-linear overhead at scale.");
+    println!();
+
+    // ── Output Section 3: Predicted vs Measured ─────────────────────
+
+    println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 3: Predicted vs. Measured (using cost model)");
+    println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
+    println!();
+
+    // Find baseline per-visible cost from plain 1k
+    let plain_1k = results
+        .iter()
+        .find(|r| r.node_count == 1_000 && r.scene_type == "plain rects");
+
+    if let Some(base) = plain_1k {
+        let base_per_vis = base.per_visible_us;
+        println!("  Baseline per-visible-node cost (plain, 1k): {:.2} µs", base_per_vis);
+        println!();
+        println!(
+            "  {:<22} {:>8} {:>12} {:>12} {:>10}",
+            "Scene Type", "Nodes", "Predicted(µs)", "Measured(µs)", "Ratio"
+        );
+        println!(
+            "  {:-<22} {:->8} {:->12} {:->12} {:->10}",
+            "", "", "", "", ""
+        );
+
+        for r in &results {
+            // Prediction: plain baseline per node × visible count × effect multiplier
+            let multiplier = match r.scene_type {
+                "plain rects" => 1.0,
+                "all with shadow" => 6.0, // 1 base + 5 shadow
+                "all with blur" => 4.0,   // 1 base + 3 blur(σ=3)
+                "mixed (70/20/10)" => 0.7 * 1.0 + 0.2 * 6.0 + 0.1 * 4.0, // 2.3
+                _ => 1.0,
+            };
+            let predicted = base_per_vis * r.visible_count as f64 * multiplier;
+            let measured = r.total_us;
+            let ratio = measured / predicted;
+
+            println!(
+                "  {:<22} {:>7}k {:>12.0} {:>12.0} {:>9.2}×",
+                r.scene_type,
+                r.node_count / 1000,
+                predicted,
+                measured,
+                ratio
+            );
+        }
+    }
+
+    println!();
+}
diff --git a/docs/wg/feat-2d/render-cost-prediction.md b/docs/wg/feat-2d/render-cost-prediction.md
index 0c4c1f1152..d6de03f2a5 100644
--- a/docs/wg/feat-2d/render-cost-prediction.md
+++ b/docs/wg/feat-2d/render-cost-prediction.md
@@ -12,9 +12,13 @@ tags:
 
 # Render Cost Prediction
 
-Reference sheet for computing GPU render cost of 2D scene operations
-**before drawing**. All constants and formulas are derived from GPU
-pipeline structure, not empirical tuning.
+Reference sheet for estimating GPU render cost of 2D scene operations
+**before drawing**. Each claim is labeled as one of:
+
+- **FACT** — verified from Skia/Chromium source or hardware specification
+- **BENCHMARK** — measured locally (Apple M2 Pro, Metal 4.1, Skia 0.93)
+- **INFERENCE** — derived from facts and benchmarks, not directly proven
+- **HEURISTIC** — useful approximation, known to have exceptions
 
 Related:
 
@@ -23,178 +27,258 @@ Related:
 
 ---
 
-## Core Principle: Fill Rate Dominance
+## Dominant Cost: Fixed Overhead per Operation
 
-2D GPU rendering is **memory-bandwidth bound**, not compute bound. The
-fragment shader for a rect fill is ~1 ALU op; even a Gaussian blur pass
-is ~10 ALU ops per pixel. Modern GPUs execute trillions of ALU ops/sec,
-but memory bandwidth is 50-200 GB/s. Each pixel read/write is 4-16 bytes.
+> **BENCHMARK** — Confirmed by measuring identical effects at 200² through
+> 4000² pixels (100× area range). Per-pixel cost component is near zero;
+> total time is constant regardless of area.
 
-Therefore:
+On our measured hardware (M2 Pro, Metal), the cost of most 2D operations
+is dominated by **fixed per-operation overhead** — primarily GPU render
+target switches (`save_layer` / FBO allocation) — not by pixel fill rate.
 
-```
-frame_cost ≈ total_pixels_touched / memory_bandwidth
-```
+The fixed overhead comes from (**FACT**, traced to Skia/GL source):
+
+1. **GPU texture allocation** (~15-30µs) — `glTexStorage2D()`, synchronous
+   on most drivers. Skia's `GrResourceCache` pools textures to mitigate
+   this, but cache misses still pay full cost.
+2. **FBO state change** (~20-40µs) — `glFramebufferTexture2D()`, forces
+   GPU pipeline flush. Unavoidable in GL/Metal immediate-mode API.
+3. **Resource allocator** (~5-15µs) — CPU-side scratch key lookup in
+   `GrResourceAllocator`.
 
-This relationship is **linear**. Double the pixels, double the time.
-No surprises, no non-linear scaling — as long as you stay within VRAM
-and don't hit texture cache thrashing (rare in 2D; access is spatially
-coherent).
+Source: `skia/src/gpu/ganesh/GrGLGpu.cpp` (texture alloc),
+`skia/src/gpu/ganesh/GrResourceAllocator.cpp` (scratch pool).
 
-This means render cost can be pre-computed as an **ALU/pixel budget**:
-count the pixels the GPU will touch, apply structural multipliers per
-effect, and compare against a calibrated device budget.
+> **INFERENCE** — Many common 2D workloads are bandwidth-dominated for
+> simple fills, but effects requiring `save_layer` (blur, shadow, blend
+> mode isolation, group opacity) are dominated by fixed overhead at
+> typical node sizes (< ~1M pixels). The pixel-proportional component
+> becomes significant only at very large sizes or high zoom.
 
 ---
 
-## Effect Cost Constants
-
-These are not magic numbers or tuning parameters. They are the
-**structural pass counts** of each rendering operation — how many
-full-area read-write cycles the GPU performs.
-
-| Effect                                | Pixel Multiplier     | Derivation                                                 |
-| ------------------------------------- | -------------------- | ---------------------------------------------------------- |
-| Plain shape (rect, ellipse, polygon)  | `1×`                 | Single fill pass                                           |
-| Additional fill (N fills on one node) | `+1×` per extra fill | Each fill is a separate pass                               |
-| Additional stroke                     | `+1×` per stroke     | Separate pass                                              |
-| Non-rect clip path                    | `+1×`                | Mask pass + masked content                                 |
-| Rect clip                             | `+0×`                | Hardware scissor — free                                    |
-| Blend mode (non-normal)               | `+1×`                | Requires offscreen isolation layer                         |
-| Group opacity (alpha < 1.0 on group)  | `+1×`                | `save_layer` for isolated compositing                      |
-| Gaussian blur                         | `+3×`                | Downsample pyramid (~1.33×) + blur + upsample + composite  |
-| Drop shadow                           | `+5×`                | Draw shape (1×) + blur pipeline (3×) + composite back (1×) |
-| Inner shadow                          | `+5×`                | Same as drop shadow, inverted mask                         |
-| Backdrop filter (background blur)     | `+3×`                | Snapshot dst + blur + composite                            |
-| Layer blur (on node itself)           | `+3×`                | Offscreen + blur + composite                               |
-| Image fill                            | `+0×` over base      | Texture sample replaces color fill — same bandwidth        |
-| Multiple shadows                      | `+5×` per shadow     | Each shadow is independent                                 |
-
-### Blur Radius Independence
-
-Skia (and most GPU frameworks) implement Gaussian blur via a **downsample
-pyramid**, not a brute-force kernel convolution:
+## Measured Fixed Cost per Operation
+
+> **BENCHMARK** — Single rect, median of 50 runs after 10 warmup.
+> Constant across 50²–4000² pixel area (R² ≈ 0 for most effects).
+
+| Operation                    | C_fixed (µs) | What triggers it                            |
+| ---------------------------- | ------------ | ------------------------------------------- |
+| Baseline (no `save_layer`)   | ~12          | GPU draw call + flush overhead              |
+| `save_layer_alpha` (opacity) | ~20          | 1 FBO switch                                |
+| 2× nested `save_layer`       | ~32          | 2 FBO switches                              |
+| 3× nested `save_layer`       | ~43          | 3 FBO switches (~11µs per additional layer) |
+| Blur (σ=5)                   | ~73          | FBO + blur shader dispatch                  |
+| Inner shadow (σ=6)           | ~72          | FBO + clip + shadow filter dispatch         |
+| Blend mode (Multiply)        | ~81          | FBO + blend resolve                         |
+| Drop shadow (σ=8)            | ~97          | FBO + shadow filter dispatch                |
+| Backdrop blur (σ=8)          | ~110         | FBO + dst snapshot + blur                   |
+| Blur (σ=50)                  | ~207         | FBO + multiple downsample dispatches        |
+| Shadow + blur combo          | ~307         | 2 nested FBOs + both filter dispatches      |
+
+> **INFERENCE** — For frame budget estimation, counting the number of
+> `save_layer`-inducing operations and summing their fixed costs is more
+> accurate than pixel-area-based prediction, at least up to ~16M pixels
+> per node on this hardware.
 
-```
-large sigma → downsample 2× → downsample 2× → ... → blur at reduced size → upsample
-```
+---
 
-Total pixel work = `area × (1 + 1/4 + 1/16 + ...) ≈ area × 1.33` (geometric
-series), plus the blur pass at reduced resolution. The cost is approximately
-**constant regardless of blur radius**. The pyramid absorbs the radius.
+## Blur Cost: Depends on Sigma
 
-### `save_layer` / `save_layer_alpha` — The Hidden Spike Source
+### Skia Constants
 
-`save_layer` is the single most expensive primitive in Skia. It allocates an
-offscreen surface, renders content into it, then composites back.
+> **FACT** — From `skia/src/core/SkBlurEngine.h`.
+
+- `kMaxSamples = 28` — max texture samples per GPU blur pass (hardcoded)
+- `kMaxLinearSigma = 4.0` — max sigma for direct convolution (hardcoded)
+- `SigmaToRadius(σ) = ⌈3 × σ⌉` — sigma-to-radius conversion
+- `LinearKernelWidth(r) = r + 1` — samples per 1D pass (hardware bilinear)
+- σ ≤ 0.03 is treated as identity (no-op)
+
+### Skia Blur Strategy
+
+> **FACT** — From `skia/src/gpu/ganesh/GrBlurUtils.cpp`.
 
 ```
-save_layer_cost = layer_bounds_area × zoom² × 2  (write to offscreen + read back)
+σ ≤ 4.0 and small kernel  →  single 2D convolution pass (≤28 samples)
+σ ≤ 4.0                   →  two separable 1D passes
+σ > 4.0                   →  downsample until σ ≤ 4.0, blur, upsample (recursive)
 ```
 
-Critical: **they cascade multiplicatively with nesting depth**.
+For σ ≤ 4.0, the pass count varies:
 
+- If `KernelWidth(rX) × KernelWidth(rY) ≤ 28`: single 2D pass
+- Otherwise: two separable 1D passes
+
+> **HEURISTIC** — The following formula estimates pass count for σ > 4.0.
+> The exact count depends on image dimensions and Skia's internal
+> rounding, so treat this as an approximation.
+
+```rust
+fn blur_pass_estimate(sigma: f32) -> u32 {
+    if sigma <= 0.03 {
+        return 0; // identity
+    }
+    if sigma <= 4.0 {
+        return 2; // 1–2 passes (1D separable or single 2D)
+    }
+    let levels = ((sigma / 4.0).log2()).ceil() as u32;
+    2 + levels * 2 // 2 blur passes + downsample/upsample per level
+}
 ```
-save_layer              ← offscreen A (full group bounds)
-  save_layer            ← offscreen B (child bounds)
-    save_layer          ← offscreen C (grandchild bounds)
-      draw rect
-    restore             → composite C into B
-  restore               → composite B into A
-restore                 → composite A into target
-```
 
-Three nested layers on the same area = `area × 6` bandwidth, not `area × 2`.
+### Blur Radius Dependence
+
+> **BENCHMARK** — Blur σ=50 is consistently ~2.8× more expensive than
+> σ=5 across all tested sizes. This ratio is stable, confirming that
+> cost scales with downsample level count.
 
-#### Implicit `save_layer` triggers
+| Size  | σ=5 (µs) | σ=50 (µs) | Ratio |
+| ----- | -------- | --------- | ----- |
+| 50²   | 74       | 211       | 2.87× |
+| 100²  | 65       | 193       | 2.99× |
+| 200²  | 73       | 207       | 2.84× |
+| 500²  | 76       | 208       | 2.74× |
+| 4000² | 77       | 230       | 3.00× |
 
-Skia inserts `save_layer` implicitly for these conditions. The cost estimator
-must account for them even when the application code does not call `save_layer`
-explicitly:
+### `reduce_blur()` — Interactive Quality Reduction
 
-| Trigger                                   | Reason                                                          |
-| ----------------------------------------- | --------------------------------------------------------------- |
-| Non-normal blend mode on a group          | Isolated offscreen to blend against dst                         |
-| Group opacity (alpha < 1.0 with children) | Children must composite together first, then alpha applied once |
-| Blur / backdrop filter                    | Reads from dst, needs snapshot                                  |
-| Clip + antialiasing on groups             | Soft-edge mask requires offscreen                               |
-| `ColorFilter` on a group                  | Applied after children composite                                |
+> **FACT** — From `crates/grida-canvas/src/painter/painter.rs`.
+
+The painter implements `reduce_blur()` which divides sigma by 4×
+during interactive frames (`RenderPolicy::EffectQuality::Reduced`).
+This moves most blurs into the σ ≤ 4.0 direct convolution range.
+Example: σ=20 → σ=5 (eliminates ~2 downsample levels).
 
 ---
 
-## Per-Node Cost Formula
+## `save_layer` Triggers
 
-```rust
-fn estimated_fill_pixels(node: &Node, zoom: f32, viewport: &Rect) -> f64 {
-    let screen_area = clipped_area(&node.bounds, viewport) * (zoom * zoom) as f64;
+> **FACT** — From Skia's `SkCanvas::internalSaveLayer()` and observed
+> painter behavior. The cost estimator must account for implicit
+> `save_layer` insertions even when the application code does not call
+> `save_layer` explicitly.
 
-    // Base draw
-    let mut passes: f64 = 1.0;
+| Trigger                                   | Reason                                  |
+| ----------------------------------------- | --------------------------------------- |
+| Non-normal blend mode on a group          | Isolated offscreen to blend against dst |
+| Group opacity (alpha < 1.0 with children) | Children must composite together first  |
+| Blur / backdrop filter                    | Needs offscreen for filter input        |
+| Clip + antialiasing on groups             | Soft-edge mask requires offscreen       |
+| `ColorFilter` on a group                  | Applied after children composite        |
 
-    // Extra fills/strokes beyond the first
-    passes += (node.fill_count.saturating_sub(1)) as f64;
-    passes += node.stroke_count as f64;
+> **FACT** — `save_layer` costs cascade with nesting depth.
+> Each additional layer adds ~11µs fixed overhead (measured from
+> 2× vs 3× nested `save_layer`: 32µs → 43µs).
 
-    // Effects
-    for shadow in &node.shadows {
-        if shadow.visible {
-            passes += 5.0; // shape + blur pipeline + composite
-        }
-    }
-    if node.has_blur() {
-        passes += 3.0; // downsample + blur + composite
-    }
-    if node.has_backdrop_blur() {
-        passes += 3.0;
-    }
+### Blend Mode Tiers
 
-    // Isolation layers (implicit save_layer)
-    if node.blend_mode != BlendMode::Normal {
-        passes += 1.0; // offscreen + composite
-    }
-    if node.opacity < 1.0 && node.has_children() {
-        passes += 1.0; // group opacity isolation
-    }
+> **FACT** — From `skia/src/gpu/Blend.h`, `skia/src/gpu/BlendFormula.h`,
+> `skia/src/gpu/ganesh/effects/GrCustomXfermode.cpp`.
 
-    // Clip
-    if node.has_non_rect_clip() {
-        passes += 1.0; // mask pass
-    }
+Not all blend modes have the same cost. Three tiers:
 
-    screen_area * passes
-}
-```
+| Tier                   | Modes                                                                | Implementation                                      |
+| ---------------------- | -------------------------------------------------------------------- | --------------------------------------------------- |
+| Coefficient (cheapest) | Normal, Screen, SrcOver, Plus, Modulate                              | Hardware fixed-function blend — zero shader cost    |
+| Simple advanced        | Overlay, HardLight, Darken, Lighten                                  | Shared shader, ~10-20 lines, separable              |
+| Complex advanced       | ColorDodge, ColorBurn, SoftLight, Hue, Saturation, Color, Luminosity | Individual shaders, non-separable, guarded division |
+
+> **INFERENCE** — The ~81µs measured for blend mode (Multiply) is
+> entirely `save_layer` FBO overhead, not blend math. Multiply is a
+> coefficient blend mode (cheapest tier). The blend mode tier affects
+> ALU cost per pixel, which is negligible compared to FBO overhead at
+> typical node sizes. Per-paint blend modes (no `save_layer`) are
+> effectively free.
+
+---
 
-### Cache Hit vs. Miss Cost
+## Cache Hit vs. Miss
 
-A compositor/picture cache **hit** replaces the full rasterization pipeline
-with a single texture blit:
+> **BENCHMARK** — Measured with `skia_bench_cache_blit`.
 
-| State      | Effective multiplier          | What happens                                         |
-| ---------- | ----------------------------- | ---------------------------------------------------- |
-| Cache miss | `passes ×` (from table above) | Full rasterization: path tessellation, fill, effects |
-| Cache hit  | `~0.1×`                       | Single texture-sampled quad draw                     |
+| State      | Cost                         | What happens                                                  |
+| ---------- | ---------------------------- | ------------------------------------------------------------- |
+| Cache miss | ~70-300µs (effect-dependent) | Full rasterization with FBO overhead                          |
+| Cache hit  | ~5µs (constant)              | Single texture blit, independent of source complexity or size |
 
-The cost difference is **100-1000×**. Cache state is a binary signal — the
-single largest contributor to per-node cost variance.
+Hit/miss ratio for effect nodes: **~0.05×** (measured).
+Blit cost is ~5µs regardless of source effect complexity — confirmed
+with coefficient of variation check across 4 effect types.
+
+> **BENCHMARK** — At scale (136K nodes, 2600 visible), the compositor
+> cache serves all effect nodes as texture blits. Shadow and blur nodes
+> show `cache_hits = 2704, live_draws = 0`. Effect multipliers only
+> apply to **cache-miss frames** (first render, zoom change, scene
+> mutation).
 
 ---
 
-## Device Fill Rate Reference
+## Scale Behavior
+
+> **BENCHMARK** — Full Renderer pipeline with R-tree culling, picture
+> cache, and layer compositing. Measured with `skia_bench_scene_scale`.
+
+### Per-Visible-Node Cost (stable frames)
+
+| Scene Type       | 1K   | 5K   | 10K  | 50K  | 100K | 136K         |
+| ---------------- | ---- | ---- | ---- | ---- | ---- | ------------ |
+| Plain rects      | 0.41 | 0.38 | 0.40 | 0.43 | 0.54 | 0.89 µs/node |
+| All with shadow  | 0.49 | 0.45 | 0.46 | 0.47 | 0.64 | 0.87 µs/node |
+| All with blur    | 0.46 | 0.48 | 0.45 | 0.51 | 0.74 | 0.84 µs/node |
+| Mixed (70/20/10) | 0.85 | 0.81 | 0.72 | 0.80 | 1.03 | 1.17 µs/node |
+
+> **INFERENCE** — Per-visible-node cost is approximately additive
+> (linear) from 1K to 50K total nodes. Non-linear overhead appears at
+> 100K+ due to R-tree query and scene cache management scaling with
+> total scene size, not drawing cost. Visible count caps at ~2600 nodes
+> in a 1000×1000 viewport with 8×8 rects — R-tree culling works.
+
+---
+
+## Practical Cost Model
 
-The total pixel budget depends on device fill rate — the one value that
-varies per hardware. Everything else is derived from geometry and scene
-structure.
+> **HEURISTIC** — Based on all benchmarks above. For frame budget
+> decisions (skip or draw), the following is more accurate than
+> pixel-area-based prediction at typical node sizes.
+
+```
+frame_cost ≈ Σ visible_nodes(
+    if cache_hit:     ~5 µs
+    if cache_miss:    C_fixed(effect_type)
+)
+```
+
+Where `C_fixed` values are from the measured table above. The pixel-area
+component is negligible up to ~16M pixels per node on tested hardware.
+
+For nodes with multiple effects, sum the fixed costs (each effect
+that triggers a `save_layer` adds its own FBO overhead).
 
 ### Calibration
 
-Render a known workload (e.g., full-screen solid rect) and measure:
+Two device-specific constants must be measured at startup:
 
 ```
-pixels_per_ms = (screen_width × screen_height) / render_time_ms
+save_layer_overhead_us  = measured via single save_layer + draw + restore
+pixels_per_ms           = measured via full-screen solid rect
 ```
 
-### Reference Values (order-of-magnitude)
+Everything else is derived from scene structure (effect types, cache state).
+
+---
+
+## Device Fill Rate Reference
+
+> **BENCHMARK** — Baseline solid rect at 500².
+
+| Metric      | Value (M2 Pro)  |
+| ----------- | --------------- |
+| Fill rate   | ~146M pixels/ms |
+| 12ms budget | ~1.8B pixels    |
+
+> **HEURISTIC** — Order-of-magnitude reference.
 
 | Platform                 | Expected pixels_per_ms |
 | ------------------------ | ---------------------- |
@@ -207,58 +291,53 @@ pixels_per_ms = (screen_width × screen_height) / render_time_ms
 
 ## Chromium Reference
 
-Chromium's `cc/` compositor collects similar metrics but uses them differently:
+> **FACT** — From `cc/paint/display_item_list.h`, `cc/tiles/tile_manager.cc`.
+
+Chromium's `cc/` compositor collects these metrics:
 
-| Metric                                | Chromium Location              | Chromium Usage                                                |
+| Metric                                | Location                       | Usage                                                         |
 | ------------------------------------- | ------------------------------ | ------------------------------------------------------------- |
 | `TotalOpCount()`                      | `cc/paint/display_item_list.h` | Solid-color analysis gate                                     |
 | `num_slow_paths_up_to_min_for_MSAA()` | `cc/paint/display_item_list.h` | Page-level GPU raster veto                                    |
 | `has_save_layer_ops()`                | `cc/paint/display_item_list.h` | LCD text decision                                             |
-| `has_non_aa_paint()`                  | `cc/paint/display_item_list.h` | Antialiasing decisions                                        |
 | `BytesUsed()` / `OpBytesUsed()`       | `cc/paint/display_item_list.h` | Tracing / debugging                                           |
-| `AreaOfDrawText()`                    | `cc/paint/display_item_list.h` | Text coverage statistics                                      |
 | Solid color analysis                  | `cc/tiles/tile_manager.cc`     | Skip rasterization for uniform tiles (`kMaxOpsToAnalyze = 5`) |
 
-Chromium does **not** perform per-tile raster cost prediction. Tile
-scheduling is purely spatial (viewport distance + scroll velocity) with
-a memory budget constraint. Their architecture tolerates stale tiles
-(multi-threaded raster catches up across frames). Ours cannot — we render
-single-threaded with a hard per-frame deadline, requiring predictive
-budgeting.
+> **INFERENCE** — Based on source review, Chromium does not appear to
+> perform per-tile raster cost prediction. Tile scheduling is spatial
+> (viewport distance + scroll velocity) with a memory budget constraint.
+> Their multi-threaded raster architecture can tolerate stale tiles in
+> ways our single-threaded pipeline cannot.
 
 Local source: `/Users/softmarshmallow/Documents/Github/chromium/cc/`
 
 ---
 
-## Skia `Picture` Metrics (Available for Free)
+## Skia `Picture` Metrics
 
-Skia's `Picture` object exposes complexity metrics that are already
-computed during recording and cost nothing to query:
+> **FACT** — From `skia/include/core/SkPicture.h`.
 
-| Method                     | What it returns                    | Use                                |
-| -------------------------- | ---------------------------------- | ---------------------------------- |
-| `approximate_op_count()`   | Number of draw operations recorded | Secondary complexity signal        |
-| `approximate_bytes_used()` | Serialized size of the picture     | Memory pressure / complexity proxy |
+| Method                     | Returns                            | Cost to query       |
+| -------------------------- | ---------------------------------- | ------------------- |
+| `approximate_op_count()`   | Number of recorded draw operations | Free (stored field) |
+| `approximate_bytes_used()` | Serialized size of the picture     | Free (stored field) |
 
-These are stored fields, not computations. They complement the pixel-area
-model by capturing path complexity variance (a 1000-op picture with
-complex beziers vs. a 3-op picture with simple rects at the same pixel
-area).
+These capture path complexity variance that the fixed-cost model does
+not account for (e.g., a 1000-op picture with complex beziers vs. a
+3-op picture with simple rects).
 
 ---
 
-## Linearity Bounds
+## Benchmark Source
 
-The fill-rate model is linear under these conditions:
+All benchmarks use `HeadlessGpu` (offscreen Metal/GL surface), median
+of 50 iterations after 10 warmup, single rect per iteration unless
+noted otherwise.
 
-| Condition                          | Linear?             | Notes                                                  |
-| ---------------------------------- | ------------------- | ------------------------------------------------------ |
-| Work above ~10K pixels             | Yes                 | Below this, GPU launch overhead dominates (flat floor) |
-| Spatial texture access (normal 2D) | Yes                 | Bandwidth-bound, no cache thrashing                    |
-| Random texture access              | Can be super-linear | Rare in 2D rendering                                   |
-| Tile-based GPU (mobile)            | Mostly              | Large nodes spanning many tiles add per-tile overhead  |
-| Thermal throttling                 | N/A                 | Between-frame variance, not within-frame               |
-| VRAM pressure / swapping           | Non-linear          | Catastrophic; avoid by staying within budget           |
+| Benchmark                | What it measures                                                                   |
+| ------------------------ | ---------------------------------------------------------------------------------- |
+| `skia_bench_cost_model`  | Per-effect fixed cost, linearity, blur radius, fill rate, two-component extraction |
+| `skia_bench_cache_blit`  | Cache hit/miss ratio, blit constancy across effect types                           |
+| `skia_bench_scene_scale` | Full Renderer pipeline at 1K–136K nodes with culling and caching                   |
 
-For typical 2D canvas rendering (spatial access, nodes > 10K pixels),
-the linear model holds.
+Source: `crates/grida-canvas/examples/skia_bench/`

From 20a72e2e8e250ce8a9231d282441af06598e346b Mon Sep 17 00:00:00 2001
From: Universe <universe@grida.co>
Date: Wed, 1 Apr 2026 21:04:34 +0900
Subject: [PATCH 4/6] refactor(cg): extract cost prediction as debug-only
 module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move render cost estimation into isolated `runtime::cost_prediction`
module. Compute `predicted_cost_us` in FramePlan and display in devtools
overlay as `pred: Xµs (Y×)` for correlation against actual frame times.

This is instrumentation only — no behavioral changes to pan/zoom cache
blits, stable/unstable promotion, downscale rendering, or effect quality.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../src/runtime/cost_prediction.rs            | 153 ++++++++++++++++++
 crates/grida-canvas/src/runtime/mod.rs        |   1 +
 crates/grida-canvas/src/runtime/scene.rs      |  36 +++++
 crates/grida-canvas/src/window/application.rs |   7 +-
 4 files changed, 196 insertions(+), 1 deletion(-)
 create mode 100644 crates/grida-canvas/src/runtime/cost_prediction.rs

diff --git a/crates/grida-canvas/src/runtime/cost_prediction.rs b/crates/grida-canvas/src/runtime/cost_prediction.rs
new file mode 100644
index 0000000000..04dd737a5b
--- /dev/null
+++ b/crates/grida-canvas/src/runtime/cost_prediction.rs
@@ -0,0 +1,153 @@
+//! Render cost prediction — read-only metric for frame budget estimation.
+//!
+//! Estimates the GPU cost of rendering a frame based on the visible node set
+//! and their effects. All constants are fixed-overhead costs measured on
+//! Apple M2 Pro (Metal 4.1).
+//!
+//! This module is **debug/instrumentation only**. It does not influence
+//! rendering decisions. The predicted cost is reported in `FramePlan` and
+//! the devtools overlay for correlation analysis against actual frame times.
+//!
+//! ## Reference
+//!
+//! - [`docs/wg/feat-2d/render-cost-prediction.md`] — cost model derivation,
+//!   benchmark results, Skia blur algorithm analysis, blend mode tiers,
+//!   and calibration methodology.
+//! - [`crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs`] —
+//!   per-effect validation benchmark (fixed cost extraction, linearity).
+//! - [`crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs`] —
+//!   cache hit/miss ratio measurement.
+//! - [`crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs`] —
+//!   full Renderer pipeline at 1K–136K nodes.
+
+use crate::cg::fe::{FeBlur, FilterShadowEffect};
+use crate::cg::prelude::LayerBlendMode;
+use crate::painter::layer::PainterPictureLayer;
+
+// ── Measured fixed-overhead constants (µs) ──────────────────────────
+//
+// Per-operation FBO/pipeline switch costs, NOT per-pixel.
+// Source: skia_bench_cost_model (single rect, median of 50 runs).
+
+/// Baseline draw call + flush overhead (no save_layer).
+const COST_BASELINE_US: f64 = 12.0;
+
+/// Gaussian blur: FBO + shader dispatch. For σ > 4.0, each additional
+/// downsample level adds ~COST_BLUR_LEVEL_US.
+const COST_BLUR_BASE_US: f64 = 73.0;
+const COST_BLUR_LEVEL_US: f64 = 35.0;
+
+/// Drop shadow: FBO + shadow filter dispatch.
+const COST_SHADOW_US: f64 = 97.0;
+
+/// Inner shadow: FBO + clip + shadow filter dispatch.
+const COST_INNER_SHADOW_US: f64 = 72.0;
+
+/// Non-PassThrough blend mode: FBO + blend resolve.
+const COST_BLEND_MODE_US: f64 = 81.0;
+
+/// Backdrop blur: FBO + dst snapshot + blur.
+const COST_BACKDROP_BLUR_US: f64 = 110.0;
+
+/// Group opacity isolation (save_layer_alpha).
+const COST_OPACITY_ISOLATION_US: f64 = 20.0;
+
+/// Compositor cache hit: single texture blit (~5µs, size-independent).
+const COST_CACHE_HIT_US: f64 = 5.0;
+
+// ── Public API ──────────────────────────────────────────────────────
+
+/// Estimate the blur fixed cost based on sigma.
+///
+/// Skia uses direct convolution for σ ≤ 4.0 and recursive downsampling
+/// for larger values. Each downsample level adds a fixed FBO overhead.
+/// See `skia/src/core/SkBlurEngine.h` for the `kMaxLinearSigma = 4.0`
+/// constant that drives this.
+pub fn blur_cost_us(sigma: f32) -> f64 {
+    if sigma <= 0.03 {
+        return 0.0;
+    }
+    if sigma <= 4.0 {
+        return COST_BLUR_BASE_US;
+    }
+    let levels = (sigma / 4.0).log2().ceil() as u32;
+    COST_BLUR_BASE_US + levels as f64 * COST_BLUR_LEVEL_US
+}
+
+/// Estimate the fixed-overhead cost (µs) for rendering a single node.
+///
+/// `is_cache_hit`: true if the node will be drawn from the compositor
+/// layer cache (texture blit) rather than live-rasterized.
+pub fn estimate_node_cost(layer: &PainterPictureLayer, is_cache_hit: bool) -> f64 {
+    if is_cache_hit {
+        return COST_CACHE_HIT_US;
+    }
+
+    let mut cost = COST_BASELINE_US;
+
+    let (effects, base) = match layer {
+        PainterPictureLayer::Shape(s) => (&s.effects, &s.base),
+        PainterPictureLayer::Text(t) => (&t.effects, &t.base),
+        PainterPictureLayer::Vector(v) => (&v.effects, &v.base),
+    };
+
+    // Blur
+    if let Some(blur) = &effects.blur {
+        if blur.active {
+            let sigma = match &blur.blur {
+                FeBlur::Gaussian(g) => g.radius,
+                FeBlur::Progressive(p) => p.radius.max(p.radius2),
+            };
+            cost += blur_cost_us(sigma);
+        }
+    }
+
+    // Backdrop blur
+    if let Some(backdrop) = &effects.backdrop_blur {
+        if backdrop.active {
+            let sigma = match &backdrop.blur {
+                FeBlur::Gaussian(g) => g.radius,
+                FeBlur::Progressive(p) => p.radius.max(p.radius2),
+            };
+            cost += COST_BACKDROP_BLUR_US.max(blur_cost_us(sigma));
+        }
+    }
+
+    // Shadows
+    for shadow in &effects.shadows {
+        match shadow {
+            FilterShadowEffect::DropShadow(s) => {
+                if s.active {
+                    cost += COST_SHADOW_US.max(blur_cost_us(s.blur));
+                }
+            }
+            FilterShadowEffect::InnerShadow(s) => {
+                if s.active {
+                    cost += COST_INNER_SHADOW_US.max(blur_cost_us(s.blur));
+                }
+            }
+        }
+    }
+
+    // Glass (treated as backdrop blur)
+    if let Some(glass) = &effects.glass {
+        if glass.active {
+            cost += COST_BACKDROP_BLUR_US;
+        }
+    }
+
+    // Blend mode isolation (non-PassThrough requires save_layer)
+    if !matches!(base.blend_mode, LayerBlendMode::PassThrough) {
+        cost += COST_BLEND_MODE_US;
+    }
+
+    // Group opacity isolation
+    // Note: leaf nodes fold opacity into paint alpha; only groups need
+    // save_layer. We can't distinguish group vs leaf from
+    // PainterPictureLayer alone, so we conservatively add the cost.
+    if base.opacity < 1.0 {
+        cost += COST_OPACITY_ISOLATION_US;
+    }
+
+    cost
+}
diff --git a/crates/grida-canvas/src/runtime/mod.rs b/crates/grida-canvas/src/runtime/mod.rs
index 6584dd5762..f1f99f50ef 100644
--- a/crates/grida-canvas/src/runtime/mod.rs
+++ b/crates/grida-canvas/src/runtime/mod.rs
@@ -1,6 +1,7 @@
 pub mod camera;
 pub mod changes;
 pub mod config;
+pub mod cost_prediction;
 pub mod counter;
 pub mod effect_tree;
 pub mod font_repository;
diff --git a/crates/grida-canvas/src/runtime/scene.rs b/crates/grida-canvas/src/runtime/scene.rs
index 301ef821bb..19cba5da1e 100644
--- a/crates/grida-canvas/src/runtime/scene.rs
+++ b/crates/grida-canvas/src/runtime/scene.rs
@@ -155,6 +155,11 @@ pub struct FramePlan {
     pub compositor_indices: Vec<usize>,
     pub display_list_duration: Duration,
     pub display_list_size_estimated: usize,
+    /// Predicted frame cost in microseconds, based on the fixed-overhead cost
+    /// model (sum of per-effect FBO/pipeline costs for visible nodes).
+    /// See `docs/wg/feat-2d/render-cost-prediction.md` for derivation.
+    /// Zero for cache-hit frames (pan/zoom blit).
+    pub predicted_cost_us: f64,
 }
 
 /// Deferred frame plan: stores just the inputs so the expensive R-tree query
@@ -1120,6 +1125,7 @@ impl Renderer {
                         compositor_indices: Vec::new(),
                         display_list_duration: Duration::ZERO,
                         display_list_size_estimated: 0,
+                        predicted_cost_us: 0.0,
                     };
 
                     return FrameFlushStats {
@@ -1177,6 +1183,7 @@ impl Renderer {
                     compositor_indices: Vec::new(),
                     display_list_duration: Duration::ZERO,
                     display_list_size_estimated: 0,
+                    predicted_cost_us: 0.0,
                 },
             );
             if let Some((mid_flush_duration, frame_duration)) = zoom_cache_hit {
@@ -1194,6 +1201,7 @@ impl Renderer {
                     compositor_indices: Vec::new(),
                     display_list_duration: Duration::ZERO,
                     display_list_size_estimated: 0,
+                    predicted_cost_us: 0.0,
                 };
                 return FrameFlushStats {
                     frame: plan,
@@ -2087,6 +2095,10 @@ impl Renderer {
         pic
     }
 
+    // ── Render cost prediction ─────────────────────────────────────
+    // Read-only debug metric. Delegates to `runtime::cost_prediction`.
+    // See docs/wg/feat-2d/render-cost-prediction.md for derivation.
+
     /// Plan the frame for rendering.
     ///
     /// # Arguments
@@ -2190,6 +2202,29 @@ impl Renderer {
 
         let ll_len = regions.iter().map(|(_, indices)| indices.len()).sum();
 
+        // Predict frame cost: sum per-node fixed overhead costs.
+        let predicted_cost_us = {
+            let promoted_set: std::collections::HashSet<&NodeId> = promoted_ids.iter().collect();
+            let mut total = 0.0_f64;
+            // Live-drawn nodes (from regions)
+            for (_, region_indices) in &regions {
+                for &idx in region_indices {
+                    if let Some(entry) = self.scene_cache.layers.layers.get(idx) {
+                        total += crate::runtime::cost_prediction::estimate_node_cost(&entry.layer, false);
+                    }
+                }
+            }
+            // Promoted (cache-hit) nodes
+            for &idx in &compositor_indices {
+                if let Some(entry) = self.scene_cache.layers.layers.get(idx) {
+                    if promoted_set.contains(&entry.id) {
+                        total += crate::runtime::cost_prediction::estimate_node_cost(&entry.layer, true);
+                    }
+                }
+            }
+            total
+        };
+
         let __ll_duration = __start.elapsed();
 
         FramePlan {
@@ -2201,6 +2236,7 @@ impl Renderer {
             compositor_indices,
             display_list_duration: __ll_duration,
             display_list_size_estimated: ll_len,
+            predicted_cost_us,
         }
     }
 
diff --git a/crates/grida-canvas/src/window/application.rs b/crates/grida-canvas/src/window/application.rs
index 7de297a56f..1db614717e 100644
--- a/crates/grida-canvas/src/window/application.rs
+++ b/crates/grida-canvas/src/window/application.rs
@@ -1627,13 +1627,18 @@ impl UnknownTargetApplication {
         wall_time: std::time::Duration,
     ) {
         let s = format!(
-            "fps*: {:.0} | t: {:.2}ms | cam: {} | render: {:.1}ms | flush: {:.1}ms | frame: {:.1}ms | list: {:.1}ms ({:?}) | draw: {:.1}ms | $:pic: {:?} ({:?} use) | $:geo: {:?} | comp: {:?} ({:?} hit, {:.1}KB) | live: {:?} | res: {} | img: {} | fnt: {}",
+            "fps*: {:.0} | t: {:.2}ms | cam: {} | render: {:.1}ms | flush: {:.1}ms | frame: {:.1}ms | pred: {:.0}µs ({:.1}×) | list: {:.1}ms ({:?}) | draw: {:.1}ms | $:pic: {:?} ({:?} use) | $:geo: {:?} | comp: {:?} ({:?} hit, {:.1}KB) | live: {:?} | res: {} | img: {} | fnt: {}",
             1.0 / wall_time.as_secs_f64(),
             wall_time.as_secs_f64() * 1000.0,
             stats.frame.camera_change.label(),
             stats.total_duration.as_secs_f64() * 1000.0,
             stats.flush_duration.as_secs_f64() * 1000.0,
             stats.frame_duration.as_secs_f64() * 1000.0,
+            stats.frame.predicted_cost_us,
+            {
+                let actual_us = stats.frame_duration.as_secs_f64() * 1_000_000.0;
+                if actual_us > 0.0 { stats.frame.predicted_cost_us / actual_us } else { 0.0 }
+            },
             stats.frame.display_list_duration.as_secs_f64() * 1000.0,
             stats.frame.display_list_size_estimated,
             stats.draw.painter_duration.as_secs_f64() * 1000.0,

From 7fb4e28971059b3c594d78c32e85683d20997afd Mon Sep 17 00:00:00 2001
From: Universe <universe@grida.co>
Date: Wed, 1 Apr 2026 21:09:57 +0900
Subject: [PATCH 5/6] feat(tests): add box-margin.html to demonstrate CSS
 margin behaviors

This new HTML fixture illustrates various CSS margin behaviors, including margin collapse, negative margins, and auto margins. It provides visual examples and descriptions to aid understanding of how margins interact in different scenarios, enhancing the testing framework for CSS-related features.
---
 fixtures/test-html/L0/box-margin.html | 295 ++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)
 create mode 100644 fixtures/test-html/L0/box-margin.html

diff --git a/fixtures/test-html/L0/box-margin.html b/fixtures/test-html/L0/box-margin.html
new file mode 100644
index 0000000000..3ae840fb83
--- /dev/null
+++ b/fixtures/test-html/L0/box-margin.html
@@ -0,0 +1,295 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <title>Box: Margin</title>
+    <style>
+      * {
+        box-sizing: border-box;
+      }
+      body {
+        background: #030712;
+        color: #e2e8f0;
+        font-family: system-ui, sans-serif;
+        font-size: 14px;
+        padding: 24px;
+      }
+
+      h1 {
+        font-size: 20px;
+        color: #fff;
+        margin-bottom: 32px;
+      }
+      h2 {
+        font-size: 14px;
+        margin-top: 40px;
+        margin-bottom: 6px;
+        color: #94a3b8;
+        text-transform: uppercase;
+        letter-spacing: 0.05em;
+      }
+      p.desc {
+        font-size: 12px;
+        color: #64748b;
+        margin-bottom: 14px;
+      }
+
+      .demo {
+        background: #1e293b;
+        border: 1px solid #334155;
+        border-radius: 8px;
+        padding: 24px;
+        margin-bottom: 14px;
+        position: relative;
+      }
+
+      .demo-label {
+        position: absolute;
+        top: 8px;
+        right: 12px;
+        font-size: 11px;
+        color: #475569;
+        font-family: monospace;
+      }
+
+      .box {
+        background: #6366f1;
+        color: #fff;
+        padding: 12px 16px;
+        font-size: 12px;
+        font-family: monospace;
+      }
+
+      .box-alt {
+        background: #a855f7;
+      }
+      .box-green {
+        background: #059669;
+      }
+
+      .anno {
+        font-size: 11px;
+        color: #f87171;
+        font-family: monospace;
+        text-align: center;
+        padding: 4px;
+      }
+
+      .side {
+        display: flex;
+        flex-direction: row;
+        flex-wrap: wrap;
+        gap: 14px;
+        margin-bottom: 14px;
+      }
+
+      .side .demo {
+        margin-bottom: 0;
+        flex: 1;
+        min-width: 260px;
+      }
+
+      .collapse-container {
+        background: #0f172a;
+      }
+      .collapse-container .box {
+        margin: 30px 0;
+      }
+
+      .no-collapse-container {
+        display: flex;
+        flex-direction: column;
+        background: #0f172a;
+      }
+      .no-collapse-container .box {
+        margin: 30px 0;
+      }
+
+      .inline-container {
+        background: #0f172a;
+        line-height: 2;
+        padding: 12px;
+      }
+      .inline-box {
+        display: inline;
+        background: #6366f1;
+        color: #fff;
+        padding: 2px 6px;
+        font-size: 12px;
+        font-family: monospace;
+        margin: 40px 8px;
+      }
+
+      .inline-box-alt {
+        background: #a855f7;
+      }
+    </style>
+  </head>
+  <body>
+    <h1>CSS Margin Behaviors</h1>
+
+    <!-- 1. Margin Collapse (sibling) -->
+    <h2>1. Margin Collapse (block flow)</h2>
+    <p class="desc">
+      Both boxes have <code>margin: 30px 0</code>. In normal flow, the 30px
+      margins collapse into 30px (not 60px).
+    </p>
+    <div class="side">
+      <div class="demo">
+        <div class="demo-label">normal flow (collapsed)</div>
+        <div class="collapse-container">
+          <div class="box">margin: 30px 0</div>
+          <div class="anno">↕ 30px (collapsed)</div>
+          <div class="box box-alt">margin: 30px 0</div>
+        </div>
+      </div>
+      <div class="demo">
+        <div class="demo-label">flex column (no collapse)</div>
+        <div class="no-collapse-container">
+          <div class="box">margin: 30px 0</div>
+          <div class="anno">↕ 60px (no collapse)</div>
+          <div class="box box-alt">margin: 30px 0</div>
+        </div>
+      </div>
+    </div>
+
+    <!-- 2. Negative Margin -->
+    <h2>2. Negative Margin</h2>
+    <p class="desc">
+      Second box has <code>margin-top: -20px</code>, pulling it upward and
+      overlapping the first box.
+    </p>
+    <div class="demo">
+      <div style="background: #0f172a; padding-top: 40px">
+        <div class="box">box A</div>
+        <div
+          class="box box-alt"
+          style="margin-top: -20px; opacity: 0.85; position: relative"
+        >
+          box B — margin-top: -20px
+        </div>
+      </div>
+    </div>
+
+    <!-- 3. Margin Auto -->
+    <h2>3. Margin Auto</h2>
+    <p class="desc">
+      Auto margins distribute available space. Used for centering and
+      push-alignment.
+    </p>
+    <div class="side">
+      <div class="demo">
+        <div class="demo-label">margin: 0 auto</div>
+        <div style="background: #0f172a">
+          <div class="box" style="width: 180px; margin: 0 auto">centered</div>
+        </div>
+      </div>
+      <div class="demo">
+        <div class="demo-label">margin-left: auto</div>
+        <div style="background: #0f172a">
+          <div class="box box-alt" style="width: 180px; margin-left: auto">
+            pushed right
+          </div>
+        </div>
+      </div>
+    </div>
+    <div class="demo">
+      <div class="demo-label">flex + margin-left: auto (spacer pattern)</div>
+      <div style="display: flex; background: #0f172a">
+        <div class="box">A</div>
+        <div class="box box-alt" style="margin-left: auto">B (ml: auto)</div>
+        <div class="box box-green" style="margin-left: auto">C (ml: auto)</div>
+      </div>
+    </div>
+
+    <!-- 4. Background Boundary -->
+    <h2>4. Background Boundary</h2>
+    <p class="desc">
+      Margin is outside the background. Left: margin creates transparent gap.
+      Right: wrapper+padding equivalent — padding zone paints with the wrapper's
+      background.
+    </p>
+    <div class="side">
+      <div class="demo" style="padding: 0">
+        <div class="demo-label">margin: 24px</div>
+        <div class="box" style="margin: 24px">
+          background: blue; margin: 24px;
+        </div>
+      </div>
+      <div class="demo" style="padding: 0">
+        <div class="demo-label">wrapper padding: 24px</div>
+        <div style="padding: 24px; background: transparent">
+          <div class="box">wrapper { padding: 24px } → child</div>
+        </div>
+      </div>
+    </div>
+
+    <!-- 5. Inline Element Margin -->
+    <h2>5. Inline Element Margin</h2>
+    <p class="desc">
+      Inline elements ignore vertical margin. These <code>&lt;span&gt;</code>s
+      have <code>margin: 40px 8px</code> but only horizontal margin applies.
+    </p>
+    <div class="demo">
+      <div class="inline-container">
+        text before <span class="inline-box">span A</span> middle text
+        <span class="inline-box inline-box-alt">span B</span> text after
+        <br />
+        next line <span class="inline-box">span C</span> continues
+      </div>
+    </div>
+
+    <!-- 6. Unequal Collapse -->
+    <h2>6. Collapse Variants</h2>
+    <p class="desc">
+      Unequal margins: A has <code>margin-bottom: 50px</code>, B has
+      <code>margin-top: 20px</code>. Collapsed = max(50, 20) = 50px.
+    </p>
+    <div class="side">
+      <div class="demo">
+        <div class="demo-label">collapsed (50px, not 70px)</div>
+        <div class="collapse-container">
+          <div class="box" style="margin: 0 0 50px 0">mb: 50px</div>
+          <div class="box box-alt" style="margin: 20px 0 0 0">mt: 20px</div>
+        </div>
+      </div>
+      <div class="demo">
+        <div class="demo-label">wrapper+padding (70px total)</div>
+        <div style="background: #0f172a">
+          <div style="padding-bottom: 50px">
+            <div class="box">wrapper pb: 50px</div>
+          </div>
+          <div style="padding-top: 20px">
+            <div class="box box-alt">wrapper pt: 20px</div>
+          </div>
+        </div>
+      </div>
+    </div>
+
+    <!-- 7. Parent-Child Collapse -->
+    <h2>7. Parent-Child Collapse</h2>
+    <p class="desc">
+      A child's margin can collapse through its parent if the parent has no
+      border, padding, or BFC. Left: margin leaks out. Right: padding on parent
+      prevents collapse.
+    </p>
+    <div class="side">
+      <div class="demo">
+        <div class="demo-label">collapsed (margin leaks)</div>
+        <div style="background: rgba(248, 113, 113, 0.15)">
+          <div class="box" style="margin-top: 30px">child mt: 30px</div>
+        </div>
+        <div class="anno">
+          ↑ parent has no padding/border — child margin leaks out
+        </div>
+      </div>
+      <div class="demo">
+        <div class="demo-label">padding prevents collapse</div>
+        <div style="background: rgba(248, 113, 113, 0.15); padding: 1px">
+          <div class="box" style="margin-top: 30px">child mt: 30px</div>
+        </div>
+        <div class="anno">↑ parent has padding: 1px — margin stays inside</div>
+      </div>
+    </div>
+  </body>
+</html>

From 32e43feba33adc5308a818280676b50c148d7ea8 Mon Sep 17 00:00:00 2001
From: Universe <universe@grida.co>
Date: Wed, 1 Apr 2026 21:24:19 +0900
Subject: [PATCH 6/6] style(cg): cargo fmt

---
 .../skia_bench/skia_bench_cache_blit.rs       | 26 +++-----
 .../skia_bench/skia_bench_cost_model.rs       | 38 +++++++----
 .../skia_bench/skia_bench_scene_scale.rs      | 66 ++++++++++++-------
 crates/grida-canvas/src/runtime/scene.rs      |  8 ++-
 4 files changed, 85 insertions(+), 53 deletions(-)

diff --git a/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs b/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs
index c7598eaf65..11db38c65a 100644
--- a/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs
+++ b/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs
@@ -44,10 +44,7 @@ fn main() {
     }
 
     /// Measure median time (µs) for a drawing operation.
-    fn bench_draw(
-        surface: &mut Surface,
-        draw_fn: &dyn Fn(&skia_safe::Canvas),
-    ) -> f64 {
+    fn bench_draw(surface: &mut Surface, draw_fn: &dyn Fn(&skia_safe::Canvas)) -> f64 {
         // Warmup
         for _ in 0..WARMUP {
             let canvas = surface.canvas();
@@ -207,12 +204,7 @@ fn main() {
 
             // Cache hit: texture blit
             let hit_us = bench_draw(surface, &|canvas| {
-                canvas.draw_image_rect(
-                    &cached_image,
-                    None,
-                    dst_rect,
-                    &Paint::default(),
-                );
+                canvas.draw_image_rect(&cached_image, None, dst_rect, &Paint::default());
             });
 
             let ratio = hit_us / miss_us;
@@ -238,18 +230,20 @@ fn main() {
     println!("═══════════════════════════════════════════════════════════════════════════");
     println!("  SECTION 2: Blit Cost Constancy (same size, different source complexity)");
     println!("═══════════════════════════════════════════════════════════════════════════");
-    println!(
-        "  Blit cost should NOT vary with source effect complexity at the same size."
-    );
+    println!("  Blit cost should NOT vary with source effect complexity at the same size.");
     println!();
 
     for (si, &size) in sizes.iter().enumerate() {
         let blit_at_size: Vec<f64> = blit_times.iter().map(|bt| bt[si]).collect();
         let mean = blit_at_size.iter().sum::<f64>() / blit_at_size.len() as f64;
-        let variance =
-            blit_at_size.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / blit_at_size.len() as f64;
+        let variance = blit_at_size.iter().map(|v| (v - mean).powi(2)).sum::<f64>()
+            / blit_at_size.len() as f64;
         let stddev = variance.sqrt();
-        let cv = if mean > 0.0 { stddev / mean * 100.0 } else { 0.0 };
+        let cv = if mean > 0.0 {
+            stddev / mean * 100.0
+        } else {
+            0.0
+        };
 
         println!("  Size {}²:", size);
         for (ei, effect) in effects.iter().enumerate() {
diff --git a/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs b/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs
index 504cc5c045..bd065317b6 100644
--- a/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs
+++ b/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs
@@ -20,9 +20,7 @@ fn main() {
 #[cfg(feature = "native-gl-context")]
 fn main() {
     use cg::window::headless::HeadlessGpu;
-    use skia_safe::{
-        canvas::SaveLayerRec, image_filters, BlendMode, Color, Paint, Rect, Surface,
-    };
+    use skia_safe::{canvas::SaveLayerRec, image_filters, BlendMode, Color, Paint, Rect, Surface};
     use std::time::Instant;
 
     const W: i32 = 1000;
@@ -84,7 +82,11 @@ fn main() {
         let n = xs.len() as f64;
         let x_mean = xs.iter().sum::<f64>() / n;
         let y_mean = ys.iter().sum::<f64>() / n;
-        let ss_xy: f64 = xs.iter().zip(ys).map(|(x, y)| (x - x_mean) * (y - y_mean)).sum();
+        let ss_xy: f64 = xs
+            .iter()
+            .zip(ys)
+            .map(|(x, y)| (x - x_mean) * (y - y_mean))
+            .sum();
         let ss_xx: f64 = xs.iter().map(|x| (x - x_mean).powi(2)).sum();
         let ss_yy: f64 = ys.iter().map(|y| (y - y_mean).powi(2)).sum();
         if ss_xx == 0.0 || ss_yy == 0.0 {
@@ -208,7 +210,7 @@ fn main() {
             name: "opacity 0.5 (save_layer_alpha)",
             predicted: 2.0,
             draw: Box::new(|canvas, rect| {
-                canvas.save_layer_alpha(Some(rect),128);
+                canvas.save_layer_alpha(Some(rect), 128);
                 let mut p = Paint::default();
                 p.set_color(Color::from_argb(255, 66, 133, 244));
                 canvas.draw_rect(rect, &p);
@@ -310,8 +312,8 @@ fn main() {
             name: "2x nested save_layer",
             predicted: 5.0,
             draw: Box::new(|canvas, rect| {
-                canvas.save_layer_alpha(Some(rect),255);
-                canvas.save_layer_alpha(Some(rect),255);
+                canvas.save_layer_alpha(Some(rect), 255);
+                canvas.save_layer_alpha(Some(rect), 255);
                 let mut p = Paint::default();
                 p.set_color(Color::from_argb(255, 66, 133, 244));
                 canvas.draw_rect(rect, &p);
@@ -324,9 +326,9 @@ fn main() {
             name: "3x nested save_layer",
             predicted: 7.0,
             draw: Box::new(|canvas, rect| {
-                canvas.save_layer_alpha(Some(rect),255);
-                canvas.save_layer_alpha(Some(rect),255);
-                canvas.save_layer_alpha(Some(rect),255);
+                canvas.save_layer_alpha(Some(rect), 255);
+                canvas.save_layer_alpha(Some(rect), 255);
+                canvas.save_layer_alpha(Some(rect), 255);
                 let mut p = Paint::default();
                 p.set_color(Color::from_argb(255, 66, 133, 244));
                 canvas.draw_rect(rect, &p);
@@ -391,13 +393,20 @@ fn main() {
         "  {:<35} {:>10} {:>10} {:>10} {:>6}",
         "Effect", "Predicted", "Measured", "Time(µs)", "Status"
     );
-    println!("  {:-<35} {:->10} {:->10} {:->10} {:->6}", "", "", "", "", "");
+    println!(
+        "  {:-<35} {:->10} {:->10} {:->10} {:->6}",
+        "", "", "", "", ""
+    );
 
     for (vi, variant) in variants.iter().enumerate() {
         let time_us = results[vi][size_idx_200];
         let measured = time_us / baseline_200;
         let ratio = measured / variant.predicted;
-        let status = if ratio >= 0.5 && ratio <= 2.0 { "OK" } else { "WARN" };
+        let status = if ratio >= 0.5 && ratio <= 2.0 {
+            "OK"
+        } else {
+            "WARN"
+        };
         println!(
             "  {:<35} {:>9.1}× {:>9.2}× {:>10.1} {:>6}",
             variant.name, variant.predicted, measured, time_us, status
@@ -469,7 +478,10 @@ fn main() {
     let pixels_per_ms = pixels_per_us * 1000.0;
     let budget_12ms = pixels_per_ms * 12.0;
 
-    println!("  Baseline (solid rect) at 500×500: {:.1} µs", baseline_500_us);
+    println!(
+        "  Baseline (solid rect) at 500×500: {:.1} µs",
+        baseline_500_us
+    );
     println!("  Fill rate: {:.1}M pixels/ms", pixels_per_ms / 1_000_000.0);
     println!(
         "  12ms frame budget: {:.1}B pixels ({:.0}M pixels)",
diff --git a/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs b/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs
index c27db1f0e9..ba7efa5bfb 100644
--- a/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs
+++ b/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs
@@ -70,34 +70,30 @@ fn main() {
 
                 let effects = match scene_type {
                     SceneType::Plain => LayerEffects::default(),
-                    SceneType::WithShadow => LayerEffects::from_array(vec![
-                        FilterEffect::DropShadow(FeShadow {
+                    SceneType::WithShadow => {
+                        LayerEffects::from_array(vec![FilterEffect::DropShadow(FeShadow {
                             dx: 2.0,
                             dy: 2.0,
                             blur: 4.0,
                             spread: 0.0,
                             color: CGColor::from_rgba(0, 0, 0, 128),
                             active: true,
-                        }),
-                    ]),
-                    SceneType::WithBlur => {
-                        LayerEffects::new().blur(3.0)
+                        })])
                     }
+                    SceneType::WithBlur => LayerEffects::new().blur(3.0),
                     SceneType::Mixed => {
                         let kind = i % 10;
                         if kind < 7 {
                             LayerEffects::default() // 70% plain
                         } else if kind < 9 {
-                            LayerEffects::from_array(vec![
-                                FilterEffect::DropShadow(FeShadow {
-                                    dx: 2.0,
-                                    dy: 2.0,
-                                    blur: 4.0,
-                                    spread: 0.0,
-                                    color: CGColor::from_rgba(0, 0, 0, 128),
-                                    active: true,
-                                }),
-                            ]) // 20% shadow
+                            LayerEffects::from_array(vec![FilterEffect::DropShadow(FeShadow {
+                                dx: 2.0,
+                                dy: 2.0,
+                                blur: 4.0,
+                                spread: 0.0,
+                                color: CGColor::from_rgba(0, 0, 0, 128),
+                                active: true,
+                            })]) // 20% shadow
                         } else {
                             LayerEffects::new().blur(3.0) // 10% blur
                         }
@@ -202,9 +198,18 @@ fn main() {
         flush_times.sort_by(|a, b| a.partial_cmp(b).unwrap());
         total_times.sort_by(|a, b| a.partial_cmp(b).unwrap());
 
-        let frame_us = frame_times.get(frame_times.len() / 2).copied().unwrap_or(0.0);
-        let flush_us = flush_times.get(flush_times.len() / 2).copied().unwrap_or(0.0);
-        let total_us = total_times.get(total_times.len() / 2).copied().unwrap_or(0.0);
+        let frame_us = frame_times
+            .get(frame_times.len() / 2)
+            .copied()
+            .unwrap_or(0.0);
+        let flush_us = flush_times
+            .get(flush_times.len() / 2)
+            .copied()
+            .unwrap_or(0.0);
+        let total_us = total_times
+            .get(total_times.len() / 2)
+            .copied()
+            .unwrap_or(0.0);
         let per_visible = if last_visible > 0 {
             total_us / last_visible as f64
         } else {
@@ -242,7 +247,13 @@ fn main() {
 
     for &scene_type in &scene_types {
         for &count in &counts {
-            eprint!("\r  [{}/{}] {} × {}k", done + 1, total_configs, scene_type.label(), count / 1000);
+            eprint!(
+                "\r  [{}/{}] {} × {}k",
+                done + 1,
+                total_configs,
+                scene_type.label(),
+                count / 1000
+            );
             results.push(run_scale_bench(&mut renderer, count, scene_type));
             done += 1;
         }
@@ -257,7 +268,15 @@ fn main() {
     println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
     println!(
         "  {:<22} {:>8} {:>8} {:>10} {:>10} {:>10} {:>10} {:>8} {:>8}",
-        "Scene Type", "Nodes", "Visible", "Frame(µs)", "Flush(µs)", "Total(µs)", "Per-vis", "Hits", "Live"
+        "Scene Type",
+        "Nodes",
+        "Visible",
+        "Frame(µs)",
+        "Flush(µs)",
+        "Total(µs)",
+        "Per-vis",
+        "Hits",
+        "Live"
     );
     println!(
         "  {:-<22} {:->8} {:->8} {:->10} {:->10} {:->10} {:->10} {:->8} {:->8}",
@@ -330,7 +349,10 @@ fn main() {
 
     if let Some(base) = plain_1k {
         let base_per_vis = base.per_visible_us;
-        println!("  Baseline per-visible-node cost (plain, 1k): {:.2} µs", base_per_vis);
+        println!(
+            "  Baseline per-visible-node cost (plain, 1k): {:.2} µs",
+            base_per_vis
+        );
         println!();
         println!(
             "  {:<22} {:>8} {:>12} {:>12} {:>10}",
diff --git a/crates/grida-canvas/src/runtime/scene.rs b/crates/grida-canvas/src/runtime/scene.rs
index 19cba5da1e..b3baf4d10d 100644
--- a/crates/grida-canvas/src/runtime/scene.rs
+++ b/crates/grida-canvas/src/runtime/scene.rs
@@ -2210,7 +2210,10 @@ impl Renderer {
             for (_, region_indices) in &regions {
                 for &idx in region_indices {
                     if let Some(entry) = self.scene_cache.layers.layers.get(idx) {
-                        total += crate::runtime::cost_prediction::estimate_node_cost(&entry.layer, false);
+                        total += crate::runtime::cost_prediction::estimate_node_cost(
+                            &entry.layer,
+                            false,
+                        );
                     }
                 }
             }
@@ -2218,7 +2221,8 @@ impl Renderer {
             for &idx in &compositor_indices {
                 if let Some(entry) = self.scene_cache.layers.layers.get(idx) {
                     if promoted_set.contains(&entry.id) {
-                        total += crate::runtime::cost_prediction::estimate_node_cost(&entry.layer, true);
+                        total +=
+                            crate::runtime::cost_prediction::estimate_node_cost(&entry.layer, true);
                     }
                 }
             }