diff --git a/crates/grida-canvas/Cargo.toml b/crates/grida-canvas/Cargo.toml
index 41a837dfd3..d6d839f33f 100644
--- a/crates/grida-canvas/Cargo.toml
+++ b/crates/grida-canvas/Cargo.toml
@@ -149,6 +149,21 @@ path = "examples/skia_bench/skia_bench_cache_image.rs"
 name = "skia_bench_cache_text"
 path = "examples/skia_bench/skia_bench_cache_text.rs"
 
+[[example]]
+name = "skia_bench_cost_model"
+path = "examples/skia_bench/skia_bench_cost_model.rs"
+required-features = ["native-gl-context"]
+
+[[example]]
+name = "skia_bench_cache_blit"
+path = "examples/skia_bench/skia_bench_cache_blit.rs"
+required-features = ["native-gl-context"]
+
+[[example]]
+name = "skia_bench_scene_scale"
+path = "examples/skia_bench/skia_bench_scene_scale.rs"
+required-features = ["native-gl-context"]
+
 # ── IO tools ─────────────────────────────────────────────────────
 [[example]]
 name = "tool_io_grida"
diff --git a/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs b/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs
new file mode 100644
index 0000000000..11db38c65a
--- /dev/null
+++ b/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs
@@ -0,0 +1,265 @@
+//! Cache Hit vs. Miss Cost Ratio Benchmark
+//!
+//! Measures the actual cost ratio between a cache hit (GPU texture blit) and
+//! a cache miss (full rasterization). Validates the ~0.1× estimate from
+//! `docs/wg/feat-2d/render-cost-prediction.md`.
+//!
+//! Run with:
+//! ```bash
+//! cargo run -p cg --example skia_bench_cache_blit --features native-gl-context --release
+//! ```
+
+#[cfg(not(feature = "native-gl-context"))]
+fn main() {
+    eprintln!("This example requires --features native-gl-context");
+}
+
+#[cfg(feature = "native-gl-context")]
+fn main() {
+    use cg::window::headless::HeadlessGpu;
+    use skia_safe::{
+        canvas::SaveLayerRec, image_filters, Color, Image, ImageInfo, Paint, Rect, Surface,
+    };
+    use std::time::Instant;
+
+    const W: i32 = 1000;
+    const H: i32 = 1000;
+    const WARMUP: u32 = 10;
+    const ITERS: u32 = 50;
+
+    let mut gpu = HeadlessGpu::new(W, H).expect("GPU init");
+    gpu.print_gl_info();
+    println!();
+
+    let surface = &mut gpu.surface;
+
+    // ── Helpers ──────────────────────────────────────────────────────
+
+    fn flush(s: &mut Surface) {
+        if let Some(mut ctx) = s.recording_context() {
+            if let Some(mut d) = ctx.as_direct_context() {
+                d.flush_and_submit();
+            }
+        }
+    }
+
+    /// Measure median time (µs) for a drawing operation.
+    fn bench_draw(surface: &mut Surface, draw_fn: &dyn Fn(&skia_safe::Canvas)) -> f64 {
+        // Warmup
+        for _ in 0..WARMUP {
+            let canvas = surface.canvas();
+            canvas.clear(Color::WHITE);
+            draw_fn(canvas);
+            flush(surface);
+        }
+        // Measure
+        let mut timings = Vec::with_capacity(ITERS as usize);
+        for _ in 0..ITERS {
+            let t0 = Instant::now();
+            let canvas = surface.canvas();
+            canvas.clear(Color::WHITE);
+            draw_fn(canvas);
+            flush(surface);
+            timings.push(t0.elapsed().as_nanos() as f64 / 1000.0);
+        }
+        timings.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        timings[timings.len() / 2]
+    }
+
+    /// Capture a rect with effects into a GPU-resident Image.
+    fn capture_to_image(
+        surface: &mut Surface,
+        size: i32,
+        draw_fn: &dyn Fn(&skia_safe::Canvas, Rect),
+    ) -> Image {
+        let info = ImageInfo::new_n32_premul((size, size), None);
+        let mut offscreen = surface.new_surface(&info).expect("offscreen surface");
+        {
+            let canvas = offscreen.canvas();
+            canvas.clear(Color::TRANSPARENT);
+            let rect = Rect::from_xywh(0.0, 0.0, size as f32, size as f32);
+            draw_fn(canvas, rect);
+        }
+        flush(surface);
+        offscreen.image_snapshot()
+    }
+
+    // ── Effect configurations ───────────────────────────────────────
+
+    struct EffectConfig {
+        name: &'static str,
+        draw: Box<dyn Fn(&skia_safe::Canvas, Rect)>,
+    }
+
+    let shadow_filter = image_filters::drop_shadow(
+        (4.0, 4.0),
+        (8.0, 8.0),
+        Color::from_argb(128, 0, 0, 0),
+        None,
+        None,
+        None,
+    );
+    let blur_filter = image_filters::blur((8.0, 8.0), None, None, None);
+
+    let sf = shadow_filter.clone();
+    let blf = blur_filter.clone();
+    let sf2 = shadow_filter.clone();
+
+    let effects: Vec<EffectConfig> = vec![
+        EffectConfig {
+            name: "solid rect",
+            draw: Box::new(|canvas, rect| {
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+            }),
+        },
+        EffectConfig {
+            name: "rect + blur (s=8)",
+            draw: Box::new(move |canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_image_filter(blf.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        EffectConfig {
+            name: "rect + shadow (s=8)",
+            draw: Box::new(move |canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_image_filter(sf.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        EffectConfig {
+            name: "complex (3 fills + stroke + shadow)",
+            draw: Box::new(move |canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_image_filter(sf2.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                // 3 fills
+                let mut p1 = Paint::default();
+                p1.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p1);
+                let mut p2 = Paint::default();
+                p2.set_color(Color::from_argb(128, 255, 0, 0));
+                canvas.draw_rect(rect, &p2);
+                let mut p3 = Paint::default();
+                p3.set_color(Color::from_argb(64, 0, 255, 0));
+                canvas.draw_rect(rect, &p3);
+                // 1 stroke
+                let mut s = Paint::default();
+                s.set_color(Color::BLACK);
+                s.set_style(skia_safe::PaintStyle::Stroke);
+                s.set_stroke_width(2.0);
+                canvas.draw_rect(rect, &s);
+                canvas.restore();
+            }),
+        },
+    ];
+
+    let sizes: [i32; 3] = [100, 200, 500];
+
+    // ── Run benchmarks ──────────────────────────────────────────────
+
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 1: Cache Hit vs. Miss Ratio");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  {:<36} {:>5} {:>10} {:>10} {:>10}",
+        "Effect", "Size", "Miss(µs)", "Hit(µs)", "Ratio"
+    );
+    println!(
+        "  {:-<36} {:->5} {:->10} {:->10} {:->10}",
+        "", "", "", "", ""
+    );
+
+    // blit_times[effect_idx][size_idx] for constancy check
+    let mut blit_times: Vec<Vec<f64>> = vec![Vec::new(); effects.len()];
+
+    for (ei, effect) in effects.iter().enumerate() {
+        for (si, &size) in sizes.iter().enumerate() {
+            let sizef = size as f32;
+            let cx = (W as f32 - sizef) / 2.0;
+            let cy = (H as f32 - sizef) / 2.0;
+            let dst_rect = Rect::from_xywh(cx, cy, sizef, sizef);
+
+            // Cache miss: full rasterize
+            let miss_us = bench_draw(surface, &|canvas| {
+                (effect.draw)(canvas, dst_rect);
+            });
+
+            // Capture to GPU texture
+            let cached_image = capture_to_image(surface, size, &*effect.draw);
+
+            // Cache hit: texture blit
+            let hit_us = bench_draw(surface, &|canvas| {
+                canvas.draw_image_rect(&cached_image, None, dst_rect, &Paint::default());
+            });
+
+            let ratio = hit_us / miss_us;
+            blit_times[ei].push(hit_us);
+
+            println!(
+                "  {:<36} {:>4}² {:>10.1} {:>10.1} {:>9.3}×",
+                effect.name, size, miss_us, hit_us, ratio
+            );
+
+            eprint!(
+                "\r  [{}/{}]",
+                ei * sizes.len() + si + 1,
+                effects.len() * sizes.len()
+            );
+        }
+    }
+    eprintln!("\r  Done.{:40}", "");
+
+    // ── Output Section 2: Blit Constancy ────────────────────────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 2: Blit Cost Constancy (same size, different source complexity)");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  Blit cost should NOT vary with source effect complexity at the same size.");
+    println!();
+
+    for (si, &size) in sizes.iter().enumerate() {
+        let blit_at_size: Vec<f64> = blit_times.iter().map(|bt| bt[si]).collect();
+        let mean = blit_at_size.iter().sum::<f64>() / blit_at_size.len() as f64;
+        let variance = blit_at_size.iter().map(|v| (v - mean).powi(2)).sum::<f64>()
+            / blit_at_size.len() as f64;
+        let stddev = variance.sqrt();
+        let cv = if mean > 0.0 {
+            stddev / mean * 100.0
+        } else {
+            0.0
+        };
+
+        println!("  Size {}²:", size);
+        for (ei, effect) in effects.iter().enumerate() {
+            println!("    {:<36} {:>8.1} µs", effect.name, blit_times[ei][si]);
+        }
+        println!(
+            "    mean={:.1} µs  stddev={:.1} µs  CV={:.1}%  {}",
+            mean,
+            stddev,
+            cv,
+            if cv < 10.0 { "OK" } else { "WARN (>10%)" }
+        );
+        println!();
+    }
+
+    println!("  Expected: CV < 10% at each size (blit cost independent of source complexity)");
+    println!("  Reference: predicted cache-hit ratio ~0.1× (from cost model doc)");
+    println!();
+}
diff --git a/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs b/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs
new file mode 100644
index 0000000000..bd065317b6
--- /dev/null
+++ b/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs
@@ -0,0 +1,549 @@
+//! Render Cost Model Validation Benchmark
+//!
+//! Validates the structural pixel-cost model from
+//! `docs/wg/feat-2d/render-cost-prediction.md` against real GPU measurements.
+//!
+//! Unlike `skia_bench_effects` (10K tiny rects, per-rect overhead), this draws
+//! **one rect per iteration at controlled sizes** to isolate per-pixel cost
+//! from per-draw-call overhead.
+//!
+//! Run with:
+//! ```bash
+//! cargo run -p cg --example skia_bench_cost_model --features native-gl-context --release
+//! ```
+
+#[cfg(not(feature = "native-gl-context"))]
+fn main() {
+    eprintln!("This example requires --features native-gl-context");
+}
+
+#[cfg(feature = "native-gl-context")]
+fn main() {
+    use cg::window::headless::HeadlessGpu;
+    use skia_safe::{canvas::SaveLayerRec, image_filters, BlendMode, Color, Paint, Rect, Surface};
+    use std::time::Instant;
+
+    const W: i32 = 1000;
+    const H: i32 = 1000;
+    const WARMUP: u32 = 10;
+    const ITERS: u32 = 50;
+
+    let mut gpu = HeadlessGpu::new(W, H).expect("GPU init");
+    gpu.print_gl_info();
+    println!();
+
+    let surface = &mut gpu.surface;
+
+    // ── Helpers ──────────────────────────────────────────────────────
+
+    fn flush(s: &mut Surface) {
+        if let Some(mut ctx) = s.recording_context() {
+            if let Some(mut d) = ctx.as_direct_context() {
+                d.flush_and_submit();
+            }
+        }
+    }
+
+    /// Run a single-rect benchmark at the given size.
+    /// Returns the **median** duration in microseconds.
+    fn bench_single_rect(
+        surface: &mut Surface,
+        size: f32,
+        draw_fn: &dyn Fn(&skia_safe::Canvas, Rect),
+    ) -> f64 {
+        let cx = (W as f32 - size) / 2.0;
+        let cy = (H as f32 - size) / 2.0;
+        let rect = Rect::from_xywh(cx, cy, size, size);
+
+        // Warmup
+        for _ in 0..WARMUP {
+            let canvas = surface.canvas();
+            canvas.clear(Color::WHITE);
+            draw_fn(canvas, rect);
+            flush(surface);
+        }
+
+        // Measure
+        let mut timings = Vec::with_capacity(ITERS as usize);
+        for _ in 0..ITERS {
+            let t0 = Instant::now();
+            let canvas = surface.canvas();
+            canvas.clear(Color::WHITE);
+            draw_fn(canvas, rect);
+            flush(surface);
+            timings.push(t0.elapsed().as_nanos() as f64 / 1000.0); // microseconds
+        }
+        timings.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        timings[timings.len() / 2] // median
+    }
+
+    /// Compute R-squared for linear fit of (xs, ys).
+    fn r_squared(xs: &[f64], ys: &[f64]) -> f64 {
+        let n = xs.len() as f64;
+        let x_mean = xs.iter().sum::<f64>() / n;
+        let y_mean = ys.iter().sum::<f64>() / n;
+        let ss_xy: f64 = xs
+            .iter()
+            .zip(ys)
+            .map(|(x, y)| (x - x_mean) * (y - y_mean))
+            .sum();
+        let ss_xx: f64 = xs.iter().map(|x| (x - x_mean).powi(2)).sum();
+        let ss_yy: f64 = ys.iter().map(|y| (y - y_mean).powi(2)).sum();
+        if ss_xx == 0.0 || ss_yy == 0.0 {
+            return 0.0;
+        }
+        let r = ss_xy / (ss_xx * ss_yy).sqrt();
+        r * r
+    }
+
+    // ── Variant definitions ─────────────────────────────────────────
+
+    struct Variant {
+        name: &'static str,
+        predicted: f64,
+        draw: Box<dyn Fn(&skia_safe::Canvas, Rect)>,
+    }
+
+    let shadow_filter_s8 = image_filters::drop_shadow(
+        (4.0, 4.0),
+        (8.0, 8.0),
+        Color::from_argb(128, 0, 0, 0),
+        None,
+        None,
+        None,
+    );
+
+    let shadow_filter_s8_only = image_filters::drop_shadow_only(
+        (2.0, 2.0),
+        (6.0, 6.0),
+        Color::from_argb(128, 0, 0, 0),
+        None,
+        None,
+        None,
+    );
+
+    let blur_filter_5 = image_filters::blur((5.0, 5.0), None, None, None);
+    let blur_filter_50 = image_filters::blur((50.0, 50.0), None, None, None);
+    let backdrop_blur_8 = image_filters::blur((8.0, 8.0), None, None, None).unwrap();
+
+    // Clone filters for closures
+    let sf8 = shadow_filter_s8.clone();
+    let sf8o = shadow_filter_s8_only.clone();
+    let bf5 = blur_filter_5.clone();
+    let bf50 = blur_filter_50.clone();
+    let sf8_for_combo = shadow_filter_s8.clone();
+    let bf5_for_combo = blur_filter_5.clone();
+    let bd8 = backdrop_blur_8.clone();
+
+    let variants: Vec<Variant> = vec![
+        // 1. Baseline
+        Variant {
+            name: "baseline (solid rect)",
+            predicted: 1.0,
+            draw: Box::new(|canvas, rect| {
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+            }),
+        },
+        // 2. +1 extra fill
+        Variant {
+            name: "+1 fill (2 fills total)",
+            predicted: 2.0,
+            draw: Box::new(|canvas, rect| {
+                let mut p1 = Paint::default();
+                p1.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p1);
+                let mut p2 = Paint::default();
+                p2.set_color(Color::from_argb(128, 255, 0, 0));
+                canvas.draw_rect(rect, &p2);
+            }),
+        },
+        // 3. +2 extra fills
+        Variant {
+            name: "+2 fills (3 fills total)",
+            predicted: 3.0,
+            draw: Box::new(|canvas, rect| {
+                let mut p1 = Paint::default();
+                p1.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p1);
+                let mut p2 = Paint::default();
+                p2.set_color(Color::from_argb(128, 255, 0, 0));
+                canvas.draw_rect(rect, &p2);
+                let mut p3 = Paint::default();
+                p3.set_color(Color::from_argb(128, 0, 255, 0));
+                canvas.draw_rect(rect, &p3);
+            }),
+        },
+        // 4. +1 stroke
+        Variant {
+            name: "+1 stroke",
+            predicted: 2.0,
+            draw: Box::new(|canvas, rect| {
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                let mut s = Paint::default();
+                s.set_color(Color::BLACK);
+                s.set_style(skia_safe::PaintStyle::Stroke);
+                s.set_stroke_width(2.0);
+                canvas.draw_rect(rect, &s);
+            }),
+        },
+        // 5. Non-normal blend mode (save_layer)
+        Variant {
+            name: "blend mode (Multiply)",
+            predicted: 2.0,
+            draw: Box::new(|canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_blend_mode(BlendMode::Multiply);
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        // 6. Opacity (save_layer_alpha)
+        Variant {
+            name: "opacity 0.5 (save_layer_alpha)",
+            predicted: 2.0,
+            draw: Box::new(|canvas, rect| {
+                canvas.save_layer_alpha(Some(rect), 128);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        // 7. Gaussian blur (r=5)
+        Variant {
+            name: "blur (r=5)",
+            predicted: 4.0,
+            draw: Box::new(move |canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_image_filter(bf5.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        // 8. Gaussian blur (r=50) — should be ~same cost (radius independence)
+        Variant {
+            name: "blur (r=50)",
+            predicted: 4.0,
+            draw: Box::new(move |canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_image_filter(bf50.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        // 9. Drop shadow (with content)
+        Variant {
+            name: "drop shadow (s=8)",
+            predicted: 6.0,
+            draw: Box::new(move |canvas, rect| {
+                let mut lp = Paint::default();
+                lp.set_image_filter(sf8.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+            }),
+        },
+        // 10. Inner shadow (clip + shadow_only)
+        Variant {
+            name: "inner shadow (s=6)",
+            predicted: 6.0,
+            draw: Box::new(move |canvas, rect| {
+                // Base rect
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 240, 240, 240));
+                canvas.draw_rect(rect, &p);
+                // Clipped inner shadow
+                canvas.save();
+                canvas.clip_rect(rect, None, None);
+                let mut lp = Paint::default();
+                lp.set_image_filter(sf8o.clone());
+                let rec = SaveLayerRec::default().bounds(&rect).paint(&lp);
+                canvas.save_layer(&rec);
+                let mut sp = Paint::default();
+                sp.set_color(Color::from_argb(255, 240, 240, 240));
+                canvas.draw_rect(rect, &sp);
+                canvas.restore();
+                canvas.restore();
+            }),
+        },
+        // 11. Drop shadow + blur combined
+        Variant {
+            name: "shadow + blur combo",
+            predicted: 9.0,
+            draw: Box::new(move |canvas, rect| {
+                // Outer: blur
+                let mut blur_p = Paint::default();
+                blur_p.set_image_filter(bf5_for_combo.clone());
+                let blur_rec = SaveLayerRec::default().bounds(&rect).paint(&blur_p);
+                canvas.save_layer(&blur_rec);
+                // Inner: shadow
+                let mut shadow_p = Paint::default();
+                shadow_p.set_image_filter(sf8_for_combo.clone());
+                let shadow_rec = SaveLayerRec::default().bounds(&rect).paint(&shadow_p);
+                canvas.save_layer(&shadow_rec);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+                canvas.restore();
+            }),
+        },
+        // 12. 2x nested save_layer (no effects, pure isolation cost)
+        Variant {
+            name: "2x nested save_layer",
+            predicted: 5.0,
+            draw: Box::new(|canvas, rect| {
+                canvas.save_layer_alpha(Some(rect), 255);
+                canvas.save_layer_alpha(Some(rect), 255);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+                canvas.restore();
+            }),
+        },
+        // 13. 3x nested save_layer
+        Variant {
+            name: "3x nested save_layer",
+            predicted: 7.0,
+            draw: Box::new(|canvas, rect| {
+                canvas.save_layer_alpha(Some(rect), 255);
+                canvas.save_layer_alpha(Some(rect), 255);
+                canvas.save_layer_alpha(Some(rect), 255);
+                let mut p = Paint::default();
+                p.set_color(Color::from_argb(255, 66, 133, 244));
+                canvas.draw_rect(rect, &p);
+                canvas.restore();
+                canvas.restore();
+                canvas.restore();
+            }),
+        },
+        // 14. Backdrop blur
+        Variant {
+            name: "backdrop blur (s=8)",
+            predicted: 4.0,
+            draw: Box::new(move |canvas, rect| {
+                // Background content
+                let mut bg = Paint::default();
+                bg.set_color(Color::from_argb(255, 200, 50, 100));
+                canvas.draw_rect(rect, &bg);
+                // Backdrop blur layer on top
+                let lp = Paint::default();
+                let rec = SaveLayerRec::default()
+                    .bounds(&rect)
+                    .backdrop(&bd8)
+                    .paint(&lp);
+                canvas.save_layer(&rec);
+                let mut overlay = Paint::default();
+                overlay.set_color(Color::from_argb(80, 255, 255, 255));
+                canvas.draw_rect(rect, &overlay);
+                canvas.restore();
+            }),
+        },
+    ];
+
+    // ── Run benchmarks ──────────────────────────────────────────────
+
+    let sizes: [f32; 8] = [50.0, 100.0, 200.0, 300.0, 500.0, 1000.0, 2000.0, 4000.0];
+    let pixel_areas: Vec<f64> = sizes.iter().map(|s| (*s as f64) * (*s as f64)).collect();
+
+    // results[variant_idx][size_idx] = median_us
+    let mut results: Vec<Vec<f64>> = Vec::new();
+
+    for (vi, variant) in variants.iter().enumerate() {
+        let mut row = Vec::new();
+        for &size in &sizes {
+            let us = bench_single_rect(surface, size, &*variant.draw);
+            row.push(us);
+        }
+        eprint!("\r  [{}/{}] {:<35}", vi + 1, variants.len(), variant.name);
+        results.push(row);
+    }
+    eprintln!("\r  Done.{:40}", "");
+
+    // ── Output Section 1: Cost Multiplier Table (at 200²) ───────────
+
+    let size_idx_200 = 2; // 200.0 is index 2
+    let baseline_200 = results[0][size_idx_200];
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 1: Cost Multiplier Validation (at 200×200)");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  {:<35} {:>10} {:>10} {:>10} {:>6}",
+        "Effect", "Predicted", "Measured", "Time(µs)", "Status"
+    );
+    println!(
+        "  {:-<35} {:->10} {:->10} {:->10} {:->6}",
+        "", "", "", "", ""
+    );
+
+    for (vi, variant) in variants.iter().enumerate() {
+        let time_us = results[vi][size_idx_200];
+        let measured = time_us / baseline_200;
+        let ratio = measured / variant.predicted;
+        let status = if ratio >= 0.5 && ratio <= 2.0 {
+            "OK"
+        } else {
+            "WARN"
+        };
+        println!(
+            "  {:<35} {:>9.1}× {:>9.2}× {:>10.1} {:>6}",
+            variant.name, variant.predicted, measured, time_us, status
+        );
+    }
+
+    // ── Output Section 2: Linearity Table ───────────────────────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 2: Linearity (time vs. pixel area)");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  {:<35} {:>7} {:>7} {:>7} {:>7} {:>7} {:>7} {:>7} {:>7} {:>6}",
+        "Effect", "50²", "100²", "200²", "300²", "500²", "1000²", "2000²", "4000²", "R²"
+    );
+    println!(
+        "  {:-<35} {:->7} {:->7} {:->7} {:->7} {:->7} {:->7} {:->7} {:->7} {:->6}",
+        "", "", "", "", "", "", "", "", "", ""
+    );
+
+    for (vi, variant) in variants.iter().enumerate() {
+        let row = &results[vi];
+        let r2 = r_squared(&pixel_areas, row);
+        println!(
+            "  {:<35} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>5.3}",
+            variant.name, row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], r2
+        );
+    }
+
+    // ── Output Section 3: Blur Radius Independence ──────────────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 3: Blur Radius Independence (r=5 vs r=50)");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  {:<10} {:>10} {:>10} {:>10}",
+        "Size", "r=5 (µs)", "r=50 (µs)", "Ratio"
+    );
+    println!("  {:-<10} {:->10} {:->10} {:->10}", "", "", "", "");
+
+    let blur5_idx = 6; // "blur (r=5)"
+    let blur50_idx = 7; // "blur (r=50)"
+    for (si, &size) in sizes.iter().enumerate() {
+        let t5 = results[blur5_idx][si];
+        let t50 = results[blur50_idx][si];
+        let ratio = t50 / t5;
+        println!(
+            "  {:<10} {:>10.1} {:>10.1} {:>9.2}×",
+            format!("{}²", size as i32),
+            t5,
+            t50,
+            ratio
+        );
+    }
+
+    // ── Output Section 4: Device Fill Rate Calibration ──────────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 4: Device Fill Rate Calibration");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+
+    // Use baseline at 500² for the most stable measurement
+    let baseline_500_us = results[0][4]; // 500² = 250_000 pixels
+    let pixels_500 = 500.0 * 500.0;
+    let pixels_per_us = pixels_500 / baseline_500_us;
+    let pixels_per_ms = pixels_per_us * 1000.0;
+    let budget_12ms = pixels_per_ms * 12.0;
+
+    println!(
+        "  Baseline (solid rect) at 500×500: {:.1} µs",
+        baseline_500_us
+    );
+    println!("  Fill rate: {:.1}M pixels/ms", pixels_per_ms / 1_000_000.0);
+    println!(
+        "  12ms frame budget: {:.1}B pixels ({:.0}M pixels)",
+        budget_12ms / 1_000_000_000.0,
+        budget_12ms / 1_000_000.0
+    );
+    println!();
+
+    println!("  Reference (from docs/wg/feat-2d/render-cost-prediction.md):");
+    println!("    Desktop GPU (discrete)   ~500M pixels/ms");
+    println!("    Desktop GPU (integrated) ~100M pixels/ms");
+    println!("    WebGL (WASM, desktop)    ~50-100M pixels/ms");
+    println!("    WebGL (WASM, mobile)     ~10-30M pixels/ms");
+
+    // ── Output Section 5: Two-Component Formula Extraction ──────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 5: Two-Component Formula (C_fixed + area × C_per_pixel)");
+    println!("═══════════════════════════════════════════════════════════════════════════");
+    println!("  Solving from 200² and 4000² measurements:");
+    println!();
+    println!(
+        "  {:<35} {:>10} {:>10} {:>12} {:>12}",
+        "Effect", "C_fixed(µs)", "C_pixel(ns/px)", "t@200²(µs)", "t@4000²(µs)"
+    );
+    println!(
+        "  {:-<35} {:->10} {:->10} {:->12} {:->12}",
+        "", "", "", "", ""
+    );
+
+    let area_small = 200.0_f64 * 200.0; // 40,000
+    let area_large = 4000.0_f64 * 4000.0; // 16,000,000
+    let idx_200 = 2usize; // index of 200.0 in sizes
+    let idx_4000 = 7usize; // index of 4000.0 in sizes
+
+    for (vi, variant) in variants.iter().enumerate() {
+        let t_small = results[vi][idx_200];
+        let t_large = results[vi][idx_4000];
+
+        // Solve: t_small = C_fixed + area_small * C_pixel
+        //        t_large = C_fixed + area_large * C_pixel
+        // → C_pixel = (t_large - t_small) / (area_large - area_small)
+        // → C_fixed = t_small - area_small * C_pixel
+        let c_pixel = (t_large - t_small) / (area_large - area_small); // µs per pixel
+        let c_fixed = t_small - area_small * c_pixel;
+
+        let c_pixel_ns = c_pixel * 1000.0; // ns per pixel
+
+        println!(
+            "  {:<35} {:>10.1} {:>10.3} {:>12.1} {:>12.1}",
+            variant.name,
+            c_fixed.max(0.0),
+            c_pixel_ns.max(0.0),
+            t_small,
+            t_large
+        );
+    }
+
+    println!();
+    println!("  C_fixed = per-save_layer FBO/pipeline overhead (device-specific)");
+    println!("  C_pixel = per-pixel bandwidth cost (ns/pixel)");
+    println!("  Cost model: node_cost = C_fixed + screen_area × C_pixel × passes");
+    println!();
+}
diff --git a/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs b/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs
new file mode 100644
index 0000000000..ba7efa5bfb
--- /dev/null
+++ b/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs
@@ -0,0 +1,391 @@
+//! Scene-Scale Cost Model Benchmark
+//!
+//! Measures full-engine render cost at scale (1K–136K nodes) with the complete
+//! Renderer pipeline: R-tree culling, picture cache, layer compositing, GPU flush.
+//!
+//! This complements `skia_bench_cost_model` (single-node isolation) by testing
+//! whether per-node costs are additive at scale or whether GPU batching,
+//! memory pressure, and cache behavior introduce non-linear effects.
+//!
+//! Run with:
+//! ```bash
+//! cargo run -p cg --example skia_bench_scene_scale --features native-gl-context --release
+//! ```
+
+#[cfg(not(feature = "native-gl-context"))]
+fn main() {
+    eprintln!("This example requires --features native-gl-context");
+}
+
+#[cfg(feature = "native-gl-context")]
+fn main() {
+    use cg::cg::prelude::*;
+    use cg::node::scene_graph::{Parent, SceneGraph};
+    use cg::node::schema::*;
+    use cg::runtime::scene::FrameFlushResult;
+    use cg::window::headless::HeadlessGpu;
+    use math2::transform::AffineTransform;
+    use std::time::Instant;
+
+    const W: i32 = 1000;
+    const H: i32 = 1000;
+    const WARMUP: u32 = 5;
+    const ITERS: u32 = 20;
+
+    let mut gpu = HeadlessGpu::new(W, H).expect("GPU init");
+    gpu.print_gl_info();
+    println!();
+
+    // ── Scene builders ──────────────────────────────────────────────
+
+    #[derive(Clone, Copy)]
+    enum SceneType {
+        Plain,
+        WithShadow,
+        WithBlur,
+        Mixed, // 70% plain, 20% shadow, 10% blur
+    }
+
+    impl SceneType {
+        fn label(&self) -> &'static str {
+            match self {
+                SceneType::Plain => "plain rects",
+                SceneType::WithShadow => "all with shadow",
+                SceneType::WithBlur => "all with blur",
+                SceneType::Mixed => "mixed (70/20/10)",
+            }
+        }
+    }
+
+    fn build_scene(count: usize, scene_type: SceneType) -> Scene {
+        let mut graph = SceneGraph::new();
+        let cols = (count as f64).sqrt().ceil() as usize;
+
+        let rectangles: Vec<Node> = (0..count)
+            .map(|i| {
+                let col = i % cols;
+                let row = i / cols;
+                let x = (col as f32) * 10.0;
+                let y = (row as f32) * 10.0;
+
+                let effects = match scene_type {
+                    SceneType::Plain => LayerEffects::default(),
+                    SceneType::WithShadow => {
+                        LayerEffects::from_array(vec![FilterEffect::DropShadow(FeShadow {
+                            dx: 2.0,
+                            dy: 2.0,
+                            blur: 4.0,
+                            spread: 0.0,
+                            color: CGColor::from_rgba(0, 0, 0, 128),
+                            active: true,
+                        })])
+                    }
+                    SceneType::WithBlur => LayerEffects::new().blur(3.0),
+                    SceneType::Mixed => {
+                        let kind = i % 10;
+                        if kind < 7 {
+                            LayerEffects::default() // 70% plain
+                        } else if kind < 9 {
+                            LayerEffects::from_array(vec![FilterEffect::DropShadow(FeShadow {
+                                dx: 2.0,
+                                dy: 2.0,
+                                blur: 4.0,
+                                spread: 0.0,
+                                color: CGColor::from_rgba(0, 0, 0, 128),
+                                active: true,
+                            })]) // 20% shadow
+                        } else {
+                            LayerEffects::new().blur(3.0) // 10% blur
+                        }
+                    }
+                };
+
+                Node::Rectangle(RectangleNodeRec {
+                    active: true,
+                    opacity: 1.0,
+                    blend_mode: LayerBlendMode::default(),
+                    mask: None,
+                    transform: AffineTransform::new(x, y, 0.0),
+                    size: Size {
+                        width: 8.0,
+                        height: 8.0,
+                    },
+                    corner_radius: RectangularCornerRadius::zero(),
+                    corner_smoothing: CornerSmoothing::default(),
+                    fills: Paints::new([Paint::from(CGColor::from_rgba(
+                        66,
+                        (133 + i % 50) as u8,
+                        244,
+                        255,
+                    ))]),
+                    strokes: Paints::default(),
+                    stroke_style: StrokeStyle {
+                        stroke_align: StrokeAlign::Inside,
+                        stroke_cap: StrokeCap::default(),
+                        stroke_join: StrokeJoin::default(),
+                        stroke_miter_limit: StrokeMiterLimit::default(),
+                        stroke_dash_array: None,
+                    },
+                    stroke_width: StrokeWidth::default(),
+                    effects,
+                    layout_child: None,
+                })
+            })
+            .collect();
+
+        graph.append_children(rectangles, Parent::Root);
+
+        Scene {
+            name: format!("scale_{}_{}", count, scene_type.label()),
+            background_color: Some(CGColor::WHITE),
+            graph,
+        }
+    }
+
+    // ── Benchmark runner ────────────────────────────────────────────
+
+    struct ScaleResult {
+        scene_type: &'static str,
+        node_count: usize,
+        visible_count: usize,
+        frame_us: f64,
+        flush_us: f64,
+        total_us: f64,
+        per_visible_us: f64,
+        cache_hits: usize,
+        live_draws: usize,
+    }
+
+    fn run_scale_bench(
+        renderer: &mut cg::runtime::scene::Renderer,
+        count: usize,
+        scene_type: SceneType,
+    ) -> ScaleResult {
+        let scene = build_scene(count, scene_type);
+        renderer.load_scene(scene);
+
+        // Measure stable frames (full draw, no image cache).
+        // load_scene queues a stable frame automatically.
+        // Each iteration: flush (draws), then queue next stable frame.
+        // Stable frames always do a full draw — no pan/zoom image cache reuse.
+        let mut frame_times = Vec::with_capacity((WARMUP + ITERS) as usize);
+        let mut flush_times = Vec::with_capacity((WARMUP + ITERS) as usize);
+        let mut total_times = Vec::with_capacity((WARMUP + ITERS) as usize);
+        let mut last_visible = 0usize;
+        let mut last_cache_hits = 0usize;
+        let mut last_live_draws = 0usize;
+
+        for i in 0..(WARMUP + ITERS) {
+            renderer.queue_stable();
+            let t0 = Instant::now();
+            let result = renderer.flush();
+            let wall = t0.elapsed();
+
+            if let FrameFlushResult::OK(stats) = result {
+                if i >= WARMUP {
+                    frame_times.push(stats.frame_duration.as_nanos() as f64 / 1000.0);
+                    flush_times.push(stats.flush_duration.as_nanos() as f64 / 1000.0);
+                    total_times.push(wall.as_nanos() as f64 / 1000.0);
+                }
+                last_visible = stats.draw.live_draw_count + stats.draw.layer_image_cache_hits;
+                last_cache_hits = stats.draw.layer_image_cache_hits;
+                last_live_draws = stats.draw.live_draw_count;
+            }
+        }
+
+        // Use median
+        frame_times.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        flush_times.sort_by(|a, b| a.partial_cmp(b).unwrap());
+        total_times.sort_by(|a, b| a.partial_cmp(b).unwrap());
+
+        let frame_us = frame_times
+            .get(frame_times.len() / 2)
+            .copied()
+            .unwrap_or(0.0);
+        let flush_us = flush_times
+            .get(flush_times.len() / 2)
+            .copied()
+            .unwrap_or(0.0);
+        let total_us = total_times
+            .get(total_times.len() / 2)
+            .copied()
+            .unwrap_or(0.0);
+        let per_visible = if last_visible > 0 {
+            total_us / last_visible as f64
+        } else {
+            0.0
+        };
+
+        ScaleResult {
+            scene_type: scene_type.label(),
+            node_count: count,
+            visible_count: last_visible,
+            frame_us,
+            flush_us,
+            total_us,
+            per_visible_us: per_visible,
+            cache_hits: last_cache_hits,
+            live_draws: last_live_draws,
+        }
+    }
+
+    // ── Run all configurations ──────────────────────────────────────
+
+    let counts = [1_000, 5_000, 10_000, 50_000, 100_000, 136_000];
+    let scene_types = [
+        SceneType::Plain,
+        SceneType::WithShadow,
+        SceneType::WithBlur,
+        SceneType::Mixed,
+    ];
+
+    let mut renderer = gpu.create_renderer();
+    let mut results: Vec<ScaleResult> = Vec::new();
+
+    let total_configs = counts.len() * scene_types.len();
+    let mut done = 0;
+
+    for &scene_type in &scene_types {
+        for &count in &counts {
+            eprint!(
+                "\r  [{}/{}] {} × {}k",
+                done + 1,
+                total_configs,
+                scene_type.label(),
+                count / 1000
+            );
+            results.push(run_scale_bench(&mut renderer, count, scene_type));
+            done += 1;
+        }
+    }
+    eprintln!("\r  Done.{:60}", "");
+
+    // ── Output Section 1: Scale Table ───────────────────────────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 1: Frame Time vs. Node Count (unstable frames, full Renderer pipeline)");
+    println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  {:<22} {:>8} {:>8} {:>10} {:>10} {:>10} {:>10} {:>8} {:>8}",
+        "Scene Type",
+        "Nodes",
+        "Visible",
+        "Frame(µs)",
+        "Flush(µs)",
+        "Total(µs)",
+        "Per-vis",
+        "Hits",
+        "Live"
+    );
+    println!(
+        "  {:-<22} {:->8} {:->8} {:->10} {:->10} {:->10} {:->10} {:->8} {:->8}",
+        "", "", "", "", "", "", "", "", ""
+    );
+
+    for r in &results {
+        println!(
+            "  {:<22} {:>7}k {:>8} {:>10.0} {:>10.0} {:>10.0} {:>9.2} {:>8} {:>8}",
+            r.scene_type,
+            r.node_count / 1000,
+            r.visible_count,
+            r.frame_us,
+            r.flush_us,
+            r.total_us,
+            r.per_visible_us,
+            r.cache_hits,
+            r.live_draws
+        );
+    }
+
+    // ── Output Section 2: Linearity Check ───────────────────────────
+
+    println!();
+    println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 2: Per-Node Cost Linearity (total_us / visible_count across scales)");
+    println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
+    println!(
+        "  {:<22} {:>10} {:>10} {:>10} {:>10} {:>10} {:>10}",
+        "Scene Type", "1k", "5k", "10k", "50k", "100k", "136k"
+    );
+    println!(
+        "  {:-<22} {:->10} {:->10} {:->10} {:->10} {:->10} {:->10}",
+        "", "", "", "", "", "", ""
+    );
+
+    for scene_type in &scene_types {
+        let label = scene_type.label();
+        let per_vis: Vec<String> = counts
+            .iter()
+            .map(|&count| {
+                results
+                    .iter()
+                    .find(|r| r.node_count == count && r.scene_type == label)
+                    .map(|r| format!("{:.2}", r.per_visible_us))
+                    .unwrap_or_else(|| "-".to_string())
+            })
+            .collect();
+        println!(
+            "  {:<22} {:>10} {:>10} {:>10} {:>10} {:>10} {:>10}",
+            label, per_vis[0], per_vis[1], per_vis[2], per_vis[3], per_vis[4], per_vis[5]
+        );
+    }
+    println!();
+    println!("  If per-visible cost is flat → cost model is additive (linear scaling).");
+    println!("  If per-visible cost increases with N → non-linear overhead at scale.");
+    println!();
+
+    // ── Output Section 3: Predicted vs Measured ─────────────────────
+
+    println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
+    println!("  SECTION 3: Predicted vs. Measured (using cost model)");
+    println!("═══════════════════════════════════════════════════════════════════════════════════════════════════");
+    println!();
+
+    // Find baseline per-visible cost from plain 1k
+    let plain_1k = results
+        .iter()
+        .find(|r| r.node_count == 1_000 && r.scene_type == "plain rects");
+
+    if let Some(base) = plain_1k {
+        let base_per_vis = base.per_visible_us;
+        println!(
+            "  Baseline per-visible-node cost (plain, 1k): {:.2} µs",
+            base_per_vis
+        );
+        println!();
+        println!(
+            "  {:<22} {:>8} {:>12} {:>12} {:>10}",
+            "Scene Type", "Nodes", "Predicted(µs)", "Measured(µs)", "Ratio"
+        );
+        println!(
+            "  {:-<22} {:->8} {:->12} {:->12} {:->10}",
+            "", "", "", "", ""
+        );
+
+        for r in &results {
+            // Prediction: plain baseline per node × visible count × effect multiplier
+            let multiplier = match r.scene_type {
+                "plain rects" => 1.0,
+                "all with shadow" => 6.0, // 1 base + 5 shadow
+                "all with blur" => 4.0,   // 1 base + 3 blur(σ=3)
+                "mixed (70/20/10)" => 0.7 * 1.0 + 0.2 * 6.0 + 0.1 * 4.0, // 2.3
+                _ => 1.0,
+            };
+            let predicted = base_per_vis * r.visible_count as f64 * multiplier;
+            let measured = r.total_us;
+            let ratio = measured / predicted;
+
+            println!(
+                "  {:<22} {:>7}k {:>12.0} {:>12.0} {:>9.2}×",
+                r.scene_type,
+                r.node_count / 1000,
+                predicted,
+                measured,
+                ratio
+            );
+        }
+    }
+
+    println!();
+}
diff --git a/crates/grida-canvas/src/cache/picture.rs b/crates/grida-canvas/src/cache/picture.rs
index 23e1941084..f6800f69db 100644
--- a/crates/grida-canvas/src/cache/picture.rs
+++ b/crates/grida-canvas/src/cache/picture.rs
@@ -25,6 +25,10 @@ pub struct PictureCache {
     default_store: NodeIdHashMap<NodeId, Picture>,
     /// Store for non-default render variants (variant key != 0).
     variant_store: NodeIdHashMap<(NodeId, u64), Picture>,
+    /// Monotonically increasing counter incremented on any cache mutation
+    /// (insert, invalidate, invalidate_node). The prefill loop uses this
+    /// to skip the 136K-iteration cache-hit check when nothing changed.
+    generation: u64,
 }
 
 impl PictureCache {
@@ -33,6 +37,7 @@ impl PictureCache {
             strategy: PictureCacheStrategy::default(),
             default_store: new_node_id_map(),
             variant_store: new_node_id_map(),
+            generation: 0,
         }
     }
 
@@ -49,8 +54,17 @@ impl PictureCache {
         self.default_store.get(id)
     }
 
+    /// Returns the current cache generation counter. This increments on
+    /// every mutation (insert, invalidate). Callers can compare generations
+    /// to detect whether the cache contents have changed.
+    #[inline]
+    pub fn generation(&self) -> u64 {
+        self.generation
+    }
+
     pub fn set_node_picture(&mut self, id: NodeId, picture: Picture) {
         self.default_store.insert(id, picture);
+        self.generation = self.generation.wrapping_add(1);
     }
 
     /// Lookup a picture for a node in a specific render variant.
@@ -69,15 +83,25 @@ impl PictureCache {
     pub fn set_node_picture_variant(&mut self, id: NodeId, variant_key: u64, picture: Picture) {
         if variant_key == 0 {
             self.default_store.insert(id, picture);
-            return;
+        } else {
+            self.variant_store.insert((id, variant_key), picture);
         }
-        self.variant_store.insert((id, variant_key), picture);
+        self.generation = self.generation.wrapping_add(1);
     }
 
     pub fn len(&self) -> usize {
         self.default_store.len() + self.variant_store.len()
     }
 
+    /// Returns true when the variant store has no entries.
+    /// When this is true AND variant unification is enabled, ALL cached
+    /// pictures live under the default key (0), making the prefill skip
+    /// safe across stable/unstable transitions.
+    #[inline]
+    pub fn variant_store_is_empty(&self) -> bool {
+        self.variant_store.is_empty()
+    }
+
     pub fn depth(&self) -> Option<usize> {
         self.strategy.depth
     }
@@ -85,6 +109,7 @@ impl PictureCache {
     pub fn invalidate(&mut self) {
         self.default_store.clear();
         self.variant_store.clear();
+        self.generation = self.generation.wrapping_add(1);
     }
 
     /// Invalidate cached pictures for a single node (all variants).
@@ -96,5 +121,6 @@ impl PictureCache {
     pub fn invalidate_node(&mut self, id: NodeId) {
         self.default_store.remove(&id);
         self.variant_store.retain(|&(nid, _), _| nid != id);
+        self.generation = self.generation.wrapping_add(1);
     }
 }
diff --git a/crates/grida-canvas/src/runtime/cost_prediction.rs b/crates/grida-canvas/src/runtime/cost_prediction.rs
new file mode 100644
index 0000000000..04dd737a5b
--- /dev/null
+++ b/crates/grida-canvas/src/runtime/cost_prediction.rs
@@ -0,0 +1,153 @@
+//! Render cost prediction — read-only metric for frame budget estimation.
+//!
+//! Estimates the GPU cost of rendering a frame based on the visible node set
+//! and their effects. All constants are fixed-overhead costs measured on
+//! Apple M2 Pro (Metal 4.1).
+//!
+//! This module is **debug/instrumentation only**. It does not influence
+//! rendering decisions. The predicted cost is reported in `FramePlan` and
+//! the devtools overlay for correlation analysis against actual frame times.
+//!
+//! ## Reference
+//!
+//! - [`docs/wg/feat-2d/render-cost-prediction.md`] — cost model derivation,
+//!   benchmark results, Skia blur algorithm analysis, blend mode tiers,
+//!   and calibration methodology.
+//! - [`crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs`] —
+//!   per-effect validation benchmark (fixed cost extraction, linearity).
+//! - [`crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs`] —
+//!   cache hit/miss ratio measurement.
+//! - [`crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs`] —
+//!   full Renderer pipeline at 1K–136K nodes.
+
+use crate::cg::fe::{FeBlur, FilterShadowEffect};
+use crate::cg::prelude::LayerBlendMode;
+use crate::painter::layer::PainterPictureLayer;
+
+// ── Measured fixed-overhead constants (µs) ──────────────────────────
+//
+// Per-operation FBO/pipeline switch costs, NOT per-pixel.
+// Source: skia_bench_cost_model (single rect, median of 50 runs).
+
+/// Baseline draw call + flush overhead (no save_layer).
+const COST_BASELINE_US: f64 = 12.0;
+
+/// Gaussian blur: FBO + shader dispatch. For σ > 4.0, each additional
+/// downsample level adds ~COST_BLUR_LEVEL_US.
+const COST_BLUR_BASE_US: f64 = 73.0;
+const COST_BLUR_LEVEL_US: f64 = 35.0;
+
+/// Drop shadow: FBO + shadow filter dispatch.
+const COST_SHADOW_US: f64 = 97.0;
+
+/// Inner shadow: FBO + clip + shadow filter dispatch.
+const COST_INNER_SHADOW_US: f64 = 72.0;
+
+/// Non-PassThrough blend mode: FBO + blend resolve.
+const COST_BLEND_MODE_US: f64 = 81.0;
+
+/// Backdrop blur: FBO + dst snapshot + blur.
+const COST_BACKDROP_BLUR_US: f64 = 110.0;
+
+/// Group opacity isolation (save_layer_alpha).
+const COST_OPACITY_ISOLATION_US: f64 = 20.0;
+
+/// Compositor cache hit: single texture blit (~5µs, size-independent).
+const COST_CACHE_HIT_US: f64 = 5.0;
+
+// ── Public API ──────────────────────────────────────────────────────
+
+/// Estimate the blur fixed cost based on sigma.
+///
+/// Skia uses direct convolution for σ ≤ 4.0 and recursive downsampling
+/// for larger values. Each downsample level adds a fixed FBO overhead.
+/// See `skia/src/core/SkBlurEngine.h` for the `kMaxLinearSigma = 4.0`
+/// constant that drives this.
+pub fn blur_cost_us(sigma: f32) -> f64 {
+    if sigma <= 0.03 {
+        return 0.0;
+    }
+    if sigma <= 4.0 {
+        return COST_BLUR_BASE_US;
+    }
+    let levels = (sigma / 4.0).log2().ceil() as u32;
+    COST_BLUR_BASE_US + levels as f64 * COST_BLUR_LEVEL_US
+}
+
+/// Estimate the fixed-overhead cost (µs) for rendering a single node.
+///
+/// `is_cache_hit`: true if the node will be drawn from the compositor
+/// layer cache (texture blit) rather than live-rasterized.
+pub fn estimate_node_cost(layer: &PainterPictureLayer, is_cache_hit: bool) -> f64 {
+    if is_cache_hit {
+        return COST_CACHE_HIT_US;
+    }
+
+    let mut cost = COST_BASELINE_US;
+
+    let (effects, base) = match layer {
+        PainterPictureLayer::Shape(s) => (&s.effects, &s.base),
+        PainterPictureLayer::Text(t) => (&t.effects, &t.base),
+        PainterPictureLayer::Vector(v) => (&v.effects, &v.base),
+    };
+
+    // Blur
+    if let Some(blur) = &effects.blur {
+        if blur.active {
+            let sigma = match &blur.blur {
+                FeBlur::Gaussian(g) => g.radius,
+                FeBlur::Progressive(p) => p.radius.max(p.radius2),
+            };
+            cost += blur_cost_us(sigma);
+        }
+    }
+
+    // Backdrop blur
+    if let Some(backdrop) = &effects.backdrop_blur {
+        if backdrop.active {
+            let sigma = match &backdrop.blur {
+                FeBlur::Gaussian(g) => g.radius,
+                FeBlur::Progressive(p) => p.radius.max(p.radius2),
+            };
+            cost += COST_BACKDROP_BLUR_US.max(blur_cost_us(sigma));
+        }
+    }
+
+    // Shadows
+    for shadow in &effects.shadows {
+        match shadow {
+            FilterShadowEffect::DropShadow(s) => {
+                if s.active {
+                    cost += COST_SHADOW_US.max(blur_cost_us(s.blur));
+                }
+            }
+            FilterShadowEffect::InnerShadow(s) => {
+                if s.active {
+                    cost += COST_INNER_SHADOW_US.max(blur_cost_us(s.blur));
+                }
+            }
+        }
+    }
+
+    // Glass (treated as backdrop blur)
+    if let Some(glass) = &effects.glass {
+        if glass.active {
+            cost += COST_BACKDROP_BLUR_US;
+        }
+    }
+
+    // Blend mode isolation (non-PassThrough requires save_layer)
+    if !matches!(base.blend_mode, LayerBlendMode::PassThrough) {
+        cost += COST_BLEND_MODE_US;
+    }
+
+    // Group opacity isolation
+    // Note: leaf nodes fold opacity into paint alpha; only groups need
+    // save_layer. We can't distinguish group vs leaf from
+    // PainterPictureLayer alone, so we conservatively add the cost.
+    if base.opacity < 1.0 {
+        cost += COST_OPACITY_ISOLATION_US;
+    }
+
+    cost
+}
diff --git a/crates/grida-canvas/src/runtime/mod.rs b/crates/grida-canvas/src/runtime/mod.rs
index 6584dd5762..f1f99f50ef 100644
--- a/crates/grida-canvas/src/runtime/mod.rs
+++ b/crates/grida-canvas/src/runtime/mod.rs
@@ -1,6 +1,7 @@
 pub mod camera;
 pub mod changes;
 pub mod config;
+pub mod cost_prediction;
 pub mod counter;
 pub mod effect_tree;
 pub mod font_repository;
diff --git a/crates/grida-canvas/src/runtime/scene.rs b/crates/grida-canvas/src/runtime/scene.rs
index d92990ca67..b3baf4d10d 100644
--- a/crates/grida-canvas/src/runtime/scene.rs
+++ b/crates/grida-canvas/src/runtime/scene.rs
@@ -155,6 +155,11 @@ pub struct FramePlan {
     pub compositor_indices: Vec<usize>,
     pub display_list_duration: Duration,
     pub display_list_size_estimated: usize,
+    /// Predicted frame cost in microseconds, based on the fixed-overhead cost
+    /// model (sum of per-effect FBO/pipeline costs for visible nodes).
+    /// See `docs/wg/feat-2d/render-cost-prediction.md` for derivation.
+    /// Zero for cache-hit frames (pan/zoom blit).
+    pub predicted_cost_us: f64,
 }
 
 /// Deferred frame plan: stores just the inputs so the expensive R-tree query
@@ -358,6 +363,13 @@ pub struct Renderer {
     /// [`apply_changes`] consumes the set once per frame and performs
     /// the correct invalidation for every cache layer.
     changes: ChangeSet,
+    /// Picture cache generation + variant key at the time of the last
+    /// successful prefill. When the cache generation and variant key
+    /// match, the prefill loop can be skipped entirely — all pictures
+    /// are already cached from a previous frame.
+    last_prefill_generation: u64,
+    last_prefill_variant_key: u64,
+    last_prefill_layer_count: usize,
 }
 
 impl Renderer {
@@ -385,6 +397,37 @@ impl Renderer {
         // True when the policy differs from STANDARD only in effect-related
         // fields — content, compositing, and clip policies are unchanged.
         let can_unify = variant_key != 0 && policy.is_effect_only_variant();
+
+        // Skip-prefill fast path: when the picture cache generation hasn't
+        // changed since the last prefill AND we're using the same variant
+        // key AND the layer count matches, every picture from the previous
+        // prefill is still valid. Skip the O(N) iteration entirely.
+        //
+        // For variant key tracking: when can_unify is true AND the variant
+        // store is empty (no per-variant entries — all nodes are effect-free),
+        // we track key=0 since all pictures live under the default key. This
+        // is safe across stable/unstable transitions for effect-free scenes.
+        // Scenes WITH effects track the actual variant_key.
+        //
+        // On 135K-node scenes at fit zoom, this eliminates ~800µs of HashMap
+        // lookups on every cache-warm frame (the common case during view-only
+        // pan/zoom interaction and settle frames).
+        let effective_key_for_tracking =
+            if can_unify && self.scene_cache.picture.variant_store_is_empty() {
+                0
+            } else {
+                variant_key
+            };
+
+        let current_gen = self.scene_cache.picture.generation();
+        let layer_count: usize = plan.regions.iter().map(|(_, idx)| idx.len()).sum();
+        if current_gen == self.last_prefill_generation
+            && effective_key_for_tracking == self.last_prefill_variant_key
+            && layer_count == self.last_prefill_layer_count
+        {
+            return;
+        }
+
         // Prefill picture cache for visible layers so Painter can reuse pictures even with masks.
         // Fast path: skip clone + recording when the picture is already cached (common case
         // on cache-warm frames). The clone of LayerEntry is expensive because it deep-copies
@@ -433,6 +476,17 @@ impl Renderer {
                 }
             }
         }
+
+        // Update tracking state for future skip-prefill checks.
+        let effective_key_after = if can_unify && self.scene_cache.picture.variant_store_is_empty()
+        {
+            0
+        } else {
+            variant_key
+        };
+        self.last_prefill_generation = self.scene_cache.picture.generation();
+        self.last_prefill_variant_key = effective_key_after;
+        self.last_prefill_layer_count = layer_count;
     }
 
     /// Pre-extract blit data for all promoted nodes.
@@ -608,6 +662,9 @@ impl Renderer {
             pan_image_cache: None,
             zoom_image_cache: None,
             changes: ChangeSet::new(),
+            last_prefill_generation: u64::MAX,
+            last_prefill_variant_key: u64::MAX,
+            last_prefill_layer_count: 0,
         }
     }
 
@@ -1068,6 +1125,7 @@ impl Renderer {
                         compositor_indices: Vec::new(),
                         display_list_duration: Duration::ZERO,
                         display_list_size_estimated: 0,
+                        predicted_cost_us: 0.0,
                     };
 
                     return FrameFlushStats {
@@ -1125,6 +1183,7 @@ impl Renderer {
                     compositor_indices: Vec::new(),
                     display_list_duration: Duration::ZERO,
                     display_list_size_estimated: 0,
+                    predicted_cost_us: 0.0,
                 },
             );
             if let Some((mid_flush_duration, frame_duration)) = zoom_cache_hit {
@@ -1142,6 +1201,7 @@ impl Renderer {
                     compositor_indices: Vec::new(),
                     display_list_duration: Duration::ZERO,
                     display_list_size_estimated: 0,
+                    predicted_cost_us: 0.0,
                 };
                 return FrameFlushStats {
                     frame: plan,
@@ -1488,6 +1548,9 @@ impl Renderer {
         self.scene_cache = cache::scene::SceneCache::new();
         self.pan_image_cache = None;
         self.zoom_image_cache = None;
+        self.last_prefill_generation = u64::MAX;
+        self.last_prefill_variant_key = u64::MAX;
+        self.last_prefill_layer_count = 0;
         self.images.clear_missing_tracking();
         if let Some(scene) = self.scene.as_ref() {
             #[cfg(feature = "perf")]
@@ -2032,6 +2095,10 @@ impl Renderer {
         pic
     }
 
+    // ── Render cost prediction ─────────────────────────────────────
+    // Read-only debug metric. Delegates to `runtime::cost_prediction`.
+    // See docs/wg/feat-2d/render-cost-prediction.md for derivation.
+
     /// Plan the frame for rendering.
     ///
     /// # Arguments
@@ -2135,6 +2202,33 @@ impl Renderer {
 
         let ll_len = regions.iter().map(|(_, indices)| indices.len()).sum();
 
+        // Predict frame cost: sum per-node fixed overhead costs.
+        let predicted_cost_us = {
+            let promoted_set: std::collections::HashSet<&NodeId> = promoted_ids.iter().collect();
+            let mut total = 0.0_f64;
+            // Live-drawn nodes (from regions)
+            for (_, region_indices) in &regions {
+                for &idx in region_indices {
+                    if let Some(entry) = self.scene_cache.layers.layers.get(idx) {
+                        total += crate::runtime::cost_prediction::estimate_node_cost(
+                            &entry.layer,
+                            false,
+                        );
+                    }
+                }
+            }
+            // Promoted (cache-hit) nodes
+            for &idx in &compositor_indices {
+                if let Some(entry) = self.scene_cache.layers.layers.get(idx) {
+                    if promoted_set.contains(&entry.id) {
+                        total +=
+                            crate::runtime::cost_prediction::estimate_node_cost(&entry.layer, true);
+                    }
+                }
+            }
+            total
+        };
+
         let __ll_duration = __start.elapsed();
 
         FramePlan {
@@ -2146,6 +2240,7 @@ impl Renderer {
             compositor_indices,
             display_list_duration: __ll_duration,
             display_list_size_estimated: ll_len,
+            predicted_cost_us,
         }
     }
 
diff --git a/crates/grida-canvas/src/window/application.rs b/crates/grida-canvas/src/window/application.rs
index 7de297a56f..1db614717e 100644
--- a/crates/grida-canvas/src/window/application.rs
+++ b/crates/grida-canvas/src/window/application.rs
@@ -1627,13 +1627,18 @@ impl UnknownTargetApplication {
         wall_time: std::time::Duration,
     ) {
         let s = format!(
-            "fps*: {:.0} | t: {:.2}ms | cam: {} | render: {:.1}ms | flush: {:.1}ms | frame: {:.1}ms | list: {:.1}ms ({:?}) | draw: {:.1}ms | $:pic: {:?} ({:?} use) | $:geo: {:?} | comp: {:?} ({:?} hit, {:.1}KB) | live: {:?} | res: {} | img: {} | fnt: {}",
+            "fps*: {:.0} | t: {:.2}ms | cam: {} | render: {:.1}ms | flush: {:.1}ms | frame: {:.1}ms | pred: {:.0}µs ({:.1}×) | list: {:.1}ms ({:?}) | draw: {:.1}ms | $:pic: {:?} ({:?} use) | $:geo: {:?} | comp: {:?} ({:?} hit, {:.1}KB) | live: {:?} | res: {} | img: {} | fnt: {}",
             1.0 / wall_time.as_secs_f64(),
             wall_time.as_secs_f64() * 1000.0,
             stats.frame.camera_change.label(),
             stats.total_duration.as_secs_f64() * 1000.0,
             stats.flush_duration.as_secs_f64() * 1000.0,
             stats.frame_duration.as_secs_f64() * 1000.0,
+            stats.frame.predicted_cost_us,
+            {
+                let actual_us = stats.frame_duration.as_secs_f64() * 1_000_000.0;
+                if actual_us > 0.0 { stats.frame.predicted_cost_us / actual_us } else { 0.0 }
+            },
             stats.frame.display_list_duration.as_secs_f64() * 1000.0,
             stats.frame.display_list_size_estimated,
             stats.draw.painter_duration.as_secs_f64() * 1000.0,
diff --git a/docs/wg/feat-2d/optimization.md b/docs/wg/feat-2d/optimization.md
index e70bb3d5b5..fe6f5b51ff 100644
--- a/docs/wg/feat-2d/optimization.md
+++ b/docs/wg/feat-2d/optimization.md
@@ -1187,6 +1187,47 @@ expensive full redraws.
     - `runtime/scene.rs` — `apply_changes()` for `last_had_data_changes`
     - `window/application.rs` — `frame()` vs `redraw()` dual-path issue
 
+48. **Picture Cache Prefill Skip (Generation Tracking)** ✅ IMPLEMENTED
+
+    The `prefill_picture_cache_for_plan()` loop iterates ALL visible nodes
+    each frame to check if their `SkPicture` is cached, doing a HashMap
+    lookup per node. On cache-warm frames (the common case during view-only
+    pan/zoom), every lookup succeeds and no work is done — but the iteration
+    itself costs O(N) per frame.
+
+    **The optimization:** track a monotonically increasing `generation`
+    counter on `PictureCache` that increments on any mutation (insert,
+    invalidate). The prefill stores the generation, variant key, and layer
+    count after each successful pass. On the next frame, if all three
+    match, the entire loop is skipped in O(1).
+
+    For effect-free scenes (the common case for large design docs), the
+    variant key unification optimization stores all pictures under key=0
+    regardless of stable/unstable quality. The generation-based skip is
+    safe across stable/unstable transitions because the cache contents
+    are identical.
+
+    **Measured impact (Apple M2 Pro, GPU benchmark, 01-135k 135K nodes):**
+
+    | Scenario | Metric | Before | After | Delta |
+    | -------- | ------ | ------ | ----- | ----- |
+    | rt_pan_fast_fit | p50 frame | 111 µs | 76 µs | **-32%** |
+    | rt_pan_fast_fit | p95 frame | 263 µs | 153 µs | **-42%** |
+    | rt_pan_slow_fit | settle | 2,323 µs | 1,836 µs | **-21%** |
+    | pan_settle_slow_fit | avg | 87 µs | 59 µs | **-32%** |
+    | pan_settle_slow_fit | settle | 1,034 µs | 709 µs | **-31%** |
+
+    **Criterion (CPU raster, 2000-node scene, statistically rigorous):**
+
+    | Scene | Change | p-value |
+    | ----- | ------ | ------- |
+    | large_baseline/pan | **-14.0%** | < 0.01 |
+    | large_baseline/pan_zoomed_in | -5.4% | 0.02 |
+    | large_compositing/pan | -4.2% | 0.02 |
+
+    Implementation: `PictureCache.generation` in `cache/picture.rs`,
+    `Renderer.last_prefill_*` tracking in `runtime/scene.rs`.
+
 ---
 
 This list is designed to evolve the renderer from single-threaded mode to
diff --git a/docs/wg/feat-2d/render-cost-prediction.md b/docs/wg/feat-2d/render-cost-prediction.md
index 0c4c1f1152..d6de03f2a5 100644
--- a/docs/wg/feat-2d/render-cost-prediction.md
+++ b/docs/wg/feat-2d/render-cost-prediction.md
@@ -12,9 +12,13 @@ tags:
 
 # Render Cost Prediction
 
-Reference sheet for computing GPU render cost of 2D scene operations
-**before drawing**. All constants and formulas are derived from GPU
-pipeline structure, not empirical tuning.
+Reference sheet for estimating GPU render cost of 2D scene operations
+**before drawing**. Each claim is labeled as one of:
+
+- **FACT** — verified from Skia/Chromium source or hardware specification
+- **BENCHMARK** — measured locally (Apple M2 Pro, Metal 4.1, Skia 0.93)
+- **INFERENCE** — derived from facts and benchmarks, not directly proven
+- **HEURISTIC** — useful approximation, known to have exceptions
 
 Related:
 
@@ -23,178 +27,258 @@ Related:
 
 ---
 
-## Core Principle: Fill Rate Dominance
+## Dominant Cost: Fixed Overhead per Operation
 
-2D GPU rendering is **memory-bandwidth bound**, not compute bound. The
-fragment shader for a rect fill is ~1 ALU op; even a Gaussian blur pass
-is ~10 ALU ops per pixel. Modern GPUs execute trillions of ALU ops/sec,
-but memory bandwidth is 50-200 GB/s. Each pixel read/write is 4-16 bytes.
+> **BENCHMARK** — Confirmed by measuring identical effects at 200² through
+> 4000² pixels (100× area range). Per-pixel cost component is near zero;
+> total time is constant regardless of area.
 
-Therefore:
+On our measured hardware (M2 Pro, Metal), the cost of most 2D operations
+is dominated by **fixed per-operation overhead** — primarily GPU render
+target switches (`save_layer` / FBO allocation) — not by pixel fill rate.
 
-```
-frame_cost ≈ total_pixels_touched / memory_bandwidth
-```
+The fixed overhead comes from (**FACT**, traced to Skia/GL source):
+
+1. **GPU texture allocation** (~15-30µs) — `glTexStorage2D()`, synchronous
+   on most drivers. Skia's `GrResourceCache` pools textures to mitigate
+   this, but cache misses still pay full cost.
+2. **FBO state change** (~20-40µs) — `glFramebufferTexture2D()`, forces
+   GPU pipeline flush. Unavoidable in GL/Metal immediate-mode API.
+3. **Resource allocator** (~5-15µs) — CPU-side scratch key lookup in
+   `GrResourceAllocator`.
 
-This relationship is **linear**. Double the pixels, double the time.
-No surprises, no non-linear scaling — as long as you stay within VRAM
-and don't hit texture cache thrashing (rare in 2D; access is spatially
-coherent).
+Source: `skia/src/gpu/ganesh/GrGLGpu.cpp` (texture alloc),
+`skia/src/gpu/ganesh/GrResourceAllocator.cpp` (scratch pool).
 
-This means render cost can be pre-computed as an **ALU/pixel budget**:
-count the pixels the GPU will touch, apply structural multipliers per
-effect, and compare against a calibrated device budget.
+> **INFERENCE** — Many common 2D workloads are bandwidth-dominated for
+> simple fills, but effects requiring `save_layer` (blur, shadow, blend
+> mode isolation, group opacity) are dominated by fixed overhead at
+> typical node sizes (< ~1M pixels). The pixel-proportional component
+> becomes significant only at very large sizes or high zoom.
 
 ---
 
-## Effect Cost Constants
-
-These are not magic numbers or tuning parameters. They are the
-**structural pass counts** of each rendering operation — how many
-full-area read-write cycles the GPU performs.
-
-| Effect                                | Pixel Multiplier     | Derivation                                                 |
-| ------------------------------------- | -------------------- | ---------------------------------------------------------- |
-| Plain shape (rect, ellipse, polygon)  | `1×`                 | Single fill pass                                           |
-| Additional fill (N fills on one node) | `+1×` per extra fill | Each fill is a separate pass                               |
-| Additional stroke                     | `+1×` per stroke     | Separate pass                                              |
-| Non-rect clip path                    | `+1×`                | Mask pass + masked content                                 |
-| Rect clip                             | `+0×`                | Hardware scissor — free                                    |
-| Blend mode (non-normal)               | `+1×`                | Requires offscreen isolation layer                         |
-| Group opacity (alpha < 1.0 on group)  | `+1×`                | `save_layer` for isolated compositing                      |
-| Gaussian blur                         | `+3×`                | Downsample pyramid (~1.33×) + blur + upsample + composite  |
-| Drop shadow                           | `+5×`                | Draw shape (1×) + blur pipeline (3×) + composite back (1×) |
-| Inner shadow                          | `+5×`                | Same as drop shadow, inverted mask                         |
-| Backdrop filter (background blur)     | `+3×`                | Snapshot dst + blur + composite                            |
-| Layer blur (on node itself)           | `+3×`                | Offscreen + blur + composite                               |
-| Image fill                            | `+0×` over base      | Texture sample replaces color fill — same bandwidth        |
-| Multiple shadows                      | `+5×` per shadow     | Each shadow is independent                                 |
-
-### Blur Radius Independence
-
-Skia (and most GPU frameworks) implement Gaussian blur via a **downsample
-pyramid**, not a brute-force kernel convolution:
+## Measured Fixed Cost per Operation
+
+> **BENCHMARK** — Single rect, median of 50 runs after 10 warmup.
+> Constant across 50²–4000² pixel area (R² ≈ 0 for most effects).
+
+| Operation                    | C_fixed (µs) | What triggers it                            |
+| ---------------------------- | ------------ | ------------------------------------------- |
+| Baseline (no `save_layer`)   | ~12          | GPU draw call + flush overhead              |
+| `save_layer_alpha` (opacity) | ~20          | 1 FBO switch                                |
+| 2× nested `save_layer`       | ~32          | 2 FBO switches                              |
+| 3× nested `save_layer`       | ~43          | 3 FBO switches (~11µs per additional layer) |
+| Blur (σ=5)                   | ~73          | FBO + blur shader dispatch                  |
+| Inner shadow (σ=6)           | ~72          | FBO + clip + shadow filter dispatch         |
+| Blend mode (Multiply)        | ~81          | FBO + blend resolve                         |
+| Drop shadow (σ=8)            | ~97          | FBO + shadow filter dispatch                |
+| Backdrop blur (σ=8)          | ~110         | FBO + dst snapshot + blur                   |
+| Blur (σ=50)                  | ~207         | FBO + multiple downsample dispatches        |
+| Shadow + blur combo          | ~307         | 2 nested FBOs + both filter dispatches      |
+
+> **INFERENCE** — For frame budget estimation, counting the number of
+> `save_layer`-inducing operations and summing their fixed costs is more
+> accurate than pixel-area-based prediction, at least up to ~16M pixels
+> per node on this hardware.
 
-```
-large sigma → downsample 2× → downsample 2× → ... → blur at reduced size → upsample
-```
+---
 
-Total pixel work = `area × (1 + 1/4 + 1/16 + ...) ≈ area × 1.33` (geometric
-series), plus the blur pass at reduced resolution. The cost is approximately
-**constant regardless of blur radius**. The pyramid absorbs the radius.
+## Blur Cost: Depends on Sigma
 
-### `save_layer` / `save_layer_alpha` — The Hidden Spike Source
+### Skia Constants
 
-`save_layer` is the single most expensive primitive in Skia. It allocates an
-offscreen surface, renders content into it, then composites back.
+> **FACT** — From `skia/src/core/SkBlurEngine.h`.
+
+- `kMaxSamples = 28` — max texture samples per GPU blur pass (hardcoded)
+- `kMaxLinearSigma = 4.0` — max sigma for direct convolution (hardcoded)
+- `SigmaToRadius(σ) = ⌈3 × σ⌉` — sigma-to-radius conversion
+- `LinearKernelWidth(r) = r + 1` — samples per 1D pass (hardware bilinear)
+- σ ≤ 0.03 is treated as identity (no-op)
+
+### Skia Blur Strategy
+
+> **FACT** — From `skia/src/gpu/ganesh/GrBlurUtils.cpp`.
 
 ```
-save_layer_cost = layer_bounds_area × zoom² × 2  (write to offscreen + read back)
+σ ≤ 4.0 and small kernel  →  single 2D convolution pass (≤28 samples)
+σ ≤ 4.0                   →  two separable 1D passes
+σ > 4.0                   →  downsample until σ ≤ 4.0, blur, upsample (recursive)
 ```
 
-Critical: **they cascade multiplicatively with nesting depth**.
+For σ ≤ 4.0, the pass count varies:
 
+- If `KernelWidth(rX) × KernelWidth(rY) ≤ 28`: single 2D pass
+- Otherwise: two separable 1D passes
+
+> **HEURISTIC** — The following formula estimates pass count for σ > 4.0.
+> The exact count depends on image dimensions and Skia's internal
+> rounding, so treat this as an approximation.
+
+```rust
+fn blur_pass_estimate(sigma: f32) -> u32 {
+    if sigma <= 0.03 {
+        return 0; // identity
+    }
+    if sigma <= 4.0 {
+        return 2; // 1–2 passes (1D separable or single 2D)
+    }
+    let levels = ((sigma / 4.0).log2()).ceil() as u32;
+    2 + levels * 2 // 2 blur passes + downsample/upsample per level
+}
 ```
-save_layer              ← offscreen A (full group bounds)
-  save_layer            ← offscreen B (child bounds)
-    save_layer          ← offscreen C (grandchild bounds)
-      draw rect
-    restore             → composite C into B
-  restore               → composite B into A
-restore                 → composite A into target
-```
 
-Three nested layers on the same area = `area × 6` bandwidth, not `area × 2`.
+### Blur Radius Dependence
+
+> **BENCHMARK** — Blur σ=50 is consistently ~2.8× more expensive than
+> σ=5 across all tested sizes. This ratio is stable, confirming that
+> cost scales with downsample level count.
 
-#### Implicit `save_layer` triggers
+| Size  | σ=5 (µs) | σ=50 (µs) | Ratio |
+| ----- | -------- | --------- | ----- |
+| 50²   | 74       | 211       | 2.87× |
+| 100²  | 65       | 193       | 2.99× |
+| 200²  | 73       | 207       | 2.84× |
+| 500²  | 76       | 208       | 2.74× |
+| 4000² | 77       | 230       | 3.00× |
 
-Skia inserts `save_layer` implicitly for these conditions. The cost estimator
-must account for them even when the application code does not call `save_layer`
-explicitly:
+### `reduce_blur()` — Interactive Quality Reduction
 
-| Trigger                                   | Reason                                                          |
-| ----------------------------------------- | --------------------------------------------------------------- |
-| Non-normal blend mode on a group          | Isolated offscreen to blend against dst                         |
-| Group opacity (alpha < 1.0 with children) | Children must composite together first, then alpha applied once |
-| Blur / backdrop filter                    | Reads from dst, needs snapshot                                  |
-| Clip + antialiasing on groups             | Soft-edge mask requires offscreen                               |
-| `ColorFilter` on a group                  | Applied after children composite                                |
+> **FACT** — From `crates/grida-canvas/src/painter/painter.rs`.
+
+The painter implements `reduce_blur()` which divides sigma by 4×
+during interactive frames (`RenderPolicy::EffectQuality::Reduced`).
+This moves most blurs into the σ ≤ 4.0 direct convolution range.
+Example: σ=20 → σ=5 (eliminates ~2 downsample levels).
 
 ---
 
-## Per-Node Cost Formula
+## `save_layer` Triggers
 
-```rust
-fn estimated_fill_pixels(node: &Node, zoom: f32, viewport: &Rect) -> f64 {
-    let screen_area = clipped_area(&node.bounds, viewport) * (zoom * zoom) as f64;
+> **FACT** — From Skia's `SkCanvas::internalSaveLayer()` and observed
+> painter behavior. The cost estimator must account for implicit
+> `save_layer` insertions even when the application code does not call
+> `save_layer` explicitly.
 
-    // Base draw
-    let mut passes: f64 = 1.0;
+| Trigger                                   | Reason                                  |
+| ----------------------------------------- | --------------------------------------- |
+| Non-normal blend mode on a group          | Isolated offscreen to blend against dst |
+| Group opacity (alpha < 1.0 with children) | Children must composite together first  |
+| Blur / backdrop filter                    | Needs offscreen for filter input        |
+| Clip + antialiasing on groups             | Soft-edge mask requires offscreen       |
+| `ColorFilter` on a group                  | Applied after children composite        |
 
-    // Extra fills/strokes beyond the first
-    passes += (node.fill_count.saturating_sub(1)) as f64;
-    passes += node.stroke_count as f64;
+> **FACT** — `save_layer` costs cascade with nesting depth.
+> Each additional layer adds ~11µs fixed overhead (measured from
+> 2× vs 3× nested `save_layer`: 32µs → 43µs).
 
-    // Effects
-    for shadow in &node.shadows {
-        if shadow.visible {
-            passes += 5.0; // shape + blur pipeline + composite
-        }
-    }
-    if node.has_blur() {
-        passes += 3.0; // downsample + blur + composite
-    }
-    if node.has_backdrop_blur() {
-        passes += 3.0;
-    }
+### Blend Mode Tiers
 
-    // Isolation layers (implicit save_layer)
-    if node.blend_mode != BlendMode::Normal {
-        passes += 1.0; // offscreen + composite
-    }
-    if node.opacity < 1.0 && node.has_children() {
-        passes += 1.0; // group opacity isolation
-    }
+> **FACT** — From `skia/src/gpu/Blend.h`, `skia/src/gpu/BlendFormula.h`,
+> `skia/src/gpu/ganesh/effects/GrCustomXfermode.cpp`.
 
-    // Clip
-    if node.has_non_rect_clip() {
-        passes += 1.0; // mask pass
-    }
+Not all blend modes have the same cost. Three tiers:
 
-    screen_area * passes
-}
-```
+| Tier                   | Modes                                                                | Implementation                                      |
+| ---------------------- | -------------------------------------------------------------------- | --------------------------------------------------- |
+| Coefficient (cheapest) | Normal, Screen, SrcOver, Plus, Modulate                              | Hardware fixed-function blend — zero shader cost    |
+| Simple advanced        | Overlay, HardLight, Darken, Lighten                                  | Shared shader, ~10-20 lines, separable              |
+| Complex advanced       | ColorDodge, ColorBurn, SoftLight, Hue, Saturation, Color, Luminosity | Individual shaders, non-separable, guarded division |
+
+> **INFERENCE** — The ~81µs measured for blend mode (Multiply) is
+> entirely `save_layer` FBO overhead, not blend math. Multiply is a
+> coefficient blend mode (cheapest tier). The blend mode tier affects
+> ALU cost per pixel, which is negligible compared to FBO overhead at
+> typical node sizes. Per-paint blend modes (no `save_layer`) are
+> effectively free.
+
+---
 
-### Cache Hit vs. Miss Cost
+## Cache Hit vs. Miss
 
-A compositor/picture cache **hit** replaces the full rasterization pipeline
-with a single texture blit:
+> **BENCHMARK** — Measured with `skia_bench_cache_blit`.
 
-| State      | Effective multiplier          | What happens                                         |
-| ---------- | ----------------------------- | ---------------------------------------------------- |
-| Cache miss | `passes ×` (from table above) | Full rasterization: path tessellation, fill, effects |
-| Cache hit  | `~0.1×`                       | Single texture-sampled quad draw                     |
+| State      | Cost                         | What happens                                                  |
+| ---------- | ---------------------------- | ------------------------------------------------------------- |
+| Cache miss | ~70-300µs (effect-dependent) | Full rasterization with FBO overhead                          |
+| Cache hit  | ~5µs (constant)              | Single texture blit, independent of source complexity or size |
 
-The cost difference is **100-1000×**. Cache state is a binary signal — the
-single largest contributor to per-node cost variance.
+Hit/miss ratio for effect nodes: **~0.05×** (measured).
+Blit cost is ~5µs regardless of source effect complexity — confirmed
+with coefficient of variation check across 4 effect types.
+
+> **BENCHMARK** — At scale (136K nodes, 2600 visible), the compositor
+> cache serves all effect nodes as texture blits. Shadow and blur nodes
+> show `cache_hits = 2704, live_draws = 0`. Effect multipliers only
+> apply to **cache-miss frames** (first render, zoom change, scene
+> mutation).
 
 ---
 
-## Device Fill Rate Reference
+## Scale Behavior
+
+> **BENCHMARK** — Full Renderer pipeline with R-tree culling, picture
+> cache, and layer compositing. Measured with `skia_bench_scene_scale`.
+
+### Per-Visible-Node Cost (stable frames)
+
+| Scene Type       | 1K   | 5K   | 10K  | 50K  | 100K | 136K         |
+| ---------------- | ---- | ---- | ---- | ---- | ---- | ------------ |
+| Plain rects      | 0.41 | 0.38 | 0.40 | 0.43 | 0.54 | 0.89 µs/node |
+| All with shadow  | 0.49 | 0.45 | 0.46 | 0.47 | 0.64 | 0.87 µs/node |
+| All with blur    | 0.46 | 0.48 | 0.45 | 0.51 | 0.74 | 0.84 µs/node |
+| Mixed (70/20/10) | 0.85 | 0.81 | 0.72 | 0.80 | 1.03 | 1.17 µs/node |
+
+> **INFERENCE** — Per-visible-node cost is approximately additive
+> (linear) from 1K to 50K total nodes. Non-linear overhead appears at
+> 100K+ due to R-tree query and scene cache management scaling with
+> total scene size, not drawing cost. Visible count caps at ~2600 nodes
+> in a 1000×1000 viewport with 8×8 rects — R-tree culling works.
+
+---
+
+## Practical Cost Model
 
-The total pixel budget depends on device fill rate — the one value that
-varies per hardware. Everything else is derived from geometry and scene
-structure.
+> **HEURISTIC** — Based on all benchmarks above. For frame budget
+> decisions (skip or draw), the following is more accurate than
+> pixel-area-based prediction at typical node sizes.
+
+```
+frame_cost ≈ Σ visible_nodes(
+    if cache_hit:     ~5 µs
+    if cache_miss:    C_fixed(effect_type)
+)
+```
+
+Where `C_fixed` values are from the measured table above. The pixel-area
+component is negligible up to ~16M pixels per node on tested hardware.
+
+For nodes with multiple effects, sum the fixed costs (each effect
+that triggers a `save_layer` adds its own FBO overhead).
 
 ### Calibration
 
-Render a known workload (e.g., full-screen solid rect) and measure:
+Two device-specific constants must be measured at startup:
 
 ```
-pixels_per_ms = (screen_width × screen_height) / render_time_ms
+save_layer_overhead_us  = measured via single save_layer + draw + restore
+pixels_per_ms           = measured via full-screen solid rect
 ```
 
-### Reference Values (order-of-magnitude)
+Everything else is derived from scene structure (effect types, cache state).
+
+---
+
+## Device Fill Rate Reference
+
+> **BENCHMARK** — Baseline solid rect at 500².
+
+| Metric      | Value (M2 Pro)  |
+| ----------- | --------------- |
+| Fill rate   | ~146M pixels/ms |
+| 12ms budget | ~1.8B pixels    |
+
+> **HEURISTIC** — Order-of-magnitude reference.
 
 | Platform                 | Expected pixels_per_ms |
 | ------------------------ | ---------------------- |
@@ -207,58 +291,53 @@ pixels_per_ms = (screen_width × screen_height) / render_time_ms
 
 ## Chromium Reference
 
-Chromium's `cc/` compositor collects similar metrics but uses them differently:
+> **FACT** — From `cc/paint/display_item_list.h`, `cc/tiles/tile_manager.cc`.
+
+Chromium's `cc/` compositor collects these metrics:
 
-| Metric                                | Chromium Location              | Chromium Usage                                                |
+| Metric                                | Location                       | Usage                                                         |
 | ------------------------------------- | ------------------------------ | ------------------------------------------------------------- |
 | `TotalOpCount()`                      | `cc/paint/display_item_list.h` | Solid-color analysis gate                                     |
 | `num_slow_paths_up_to_min_for_MSAA()` | `cc/paint/display_item_list.h` | Page-level GPU raster veto                                    |
 | `has_save_layer_ops()`                | `cc/paint/display_item_list.h` | LCD text decision                                             |
-| `has_non_aa_paint()`                  | `cc/paint/display_item_list.h` | Antialiasing decisions                                        |
 | `BytesUsed()` / `OpBytesUsed()`       | `cc/paint/display_item_list.h` | Tracing / debugging                                           |
-| `AreaOfDrawText()`                    | `cc/paint/display_item_list.h` | Text coverage statistics                                      |
 | Solid color analysis                  | `cc/tiles/tile_manager.cc`     | Skip rasterization for uniform tiles (`kMaxOpsToAnalyze = 5`) |
 
-Chromium does **not** perform per-tile raster cost prediction. Tile
-scheduling is purely spatial (viewport distance + scroll velocity) with
-a memory budget constraint. Their architecture tolerates stale tiles
-(multi-threaded raster catches up across frames). Ours cannot — we render
-single-threaded with a hard per-frame deadline, requiring predictive
-budgeting.
+> **INFERENCE** — Based on source review, Chromium does not appear to
+> perform per-tile raster cost prediction. Tile scheduling is spatial
+> (viewport distance + scroll velocity) with a memory budget constraint.
+> Their multi-threaded raster architecture can tolerate stale tiles in
+> ways our single-threaded pipeline cannot.
 
 Local source: `/Users/softmarshmallow/Documents/Github/chromium/cc/`
 
 ---
 
-## Skia `Picture` Metrics (Available for Free)
+## Skia `Picture` Metrics
 
-Skia's `Picture` object exposes complexity metrics that are already
-computed during recording and cost nothing to query:
+> **FACT** — From `skia/include/core/SkPicture.h`.
 
-| Method                     | What it returns                    | Use                                |
-| -------------------------- | ---------------------------------- | ---------------------------------- |
-| `approximate_op_count()`   | Number of draw operations recorded | Secondary complexity signal        |
-| `approximate_bytes_used()` | Serialized size of the picture     | Memory pressure / complexity proxy |
+| Method                     | Returns                            | Cost to query       |
+| -------------------------- | ---------------------------------- | ------------------- |
+| `approximate_op_count()`   | Number of recorded draw operations | Free (stored field) |
+| `approximate_bytes_used()` | Serialized size of the picture     | Free (stored field) |
 
-These are stored fields, not computations. They complement the pixel-area
-model by capturing path complexity variance (a 1000-op picture with
-complex beziers vs. a 3-op picture with simple rects at the same pixel
-area).
+These capture path complexity variance that the fixed-cost model does
+not account for (e.g., a 1000-op picture with complex beziers vs. a
+3-op picture with simple rects).
 
 ---
 
-## Linearity Bounds
+## Benchmark Source
 
-The fill-rate model is linear under these conditions:
+All benchmarks use `HeadlessGpu` (offscreen Metal/GL surface), median
+of 50 iterations after 10 warmup, single rect per iteration unless
+noted otherwise.
 
-| Condition                          | Linear?             | Notes                                                  |
-| ---------------------------------- | ------------------- | ------------------------------------------------------ |
-| Work above ~10K pixels             | Yes                 | Below this, GPU launch overhead dominates (flat floor) |
-| Spatial texture access (normal 2D) | Yes                 | Bandwidth-bound, no cache thrashing                    |
-| Random texture access              | Can be super-linear | Rare in 2D rendering                                   |
-| Tile-based GPU (mobile)            | Mostly              | Large nodes spanning many tiles add per-tile overhead  |
-| Thermal throttling                 | N/A                 | Between-frame variance, not within-frame               |
-| VRAM pressure / swapping           | Non-linear          | Catastrophic; avoid by staying within budget           |
+| Benchmark                | What it measures                                                                   |
+| ------------------------ | ---------------------------------------------------------------------------------- |
+| `skia_bench_cost_model`  | Per-effect fixed cost, linearity, blur radius, fill rate, two-component extraction |
+| `skia_bench_cache_blit`  | Cache hit/miss ratio, blit constancy across effect types                           |
+| `skia_bench_scene_scale` | Full Renderer pipeline at 1K–136K nodes with culling and caching                   |
 
-For typical 2D canvas rendering (spatial access, nodes > 10K pixels),
-the linear model holds.
+Source: `crates/grida-canvas/examples/skia_bench/`
diff --git a/fixtures/test-html/L0/box-margin.html b/fixtures/test-html/L0/box-margin.html
new file mode 100644
index 0000000000..3ae840fb83
--- /dev/null
+++ b/fixtures/test-html/L0/box-margin.html
@@ -0,0 +1,295 @@
+<!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <title>Box: Margin</title>
+    <style>
+      * {
+        box-sizing: border-box;
+      }
+      body {
+        background: #030712;
+        color: #e2e8f0;
+        font-family: system-ui, sans-serif;
+        font-size: 14px;
+        padding: 24px;
+      }
+
+      h1 {
+        font-size: 20px;
+        color: #fff;
+        margin-bottom: 32px;
+      }
+      h2 {
+        font-size: 14px;
+        margin-top: 40px;
+        margin-bottom: 6px;
+        color: #94a3b8;
+        text-transform: uppercase;
+        letter-spacing: 0.05em;
+      }
+      p.desc {
+        font-size: 12px;
+        color: #64748b;
+        margin-bottom: 14px;
+      }
+
+      .demo {
+        background: #1e293b;
+        border: 1px solid #334155;
+        border-radius: 8px;
+        padding: 24px;
+        margin-bottom: 14px;
+        position: relative;
+      }
+
+      .demo-label {
+        position: absolute;
+        top: 8px;
+        right: 12px;
+        font-size: 11px;
+        color: #475569;
+        font-family: monospace;
+      }
+
+      .box {
+        background: #6366f1;
+        color: #fff;
+        padding: 12px 16px;
+        font-size: 12px;
+        font-family: monospace;
+      }
+
+      .box-alt {
+        background: #a855f7;
+      }
+      .box-green {
+        background: #059669;
+      }
+
+      .anno {
+        font-size: 11px;
+        color: #f87171;
+        font-family: monospace;
+        text-align: center;
+        padding: 4px;
+      }
+
+      .side {
+        display: flex;
+        flex-direction: row;
+        flex-wrap: wrap;
+        gap: 14px;
+        margin-bottom: 14px;
+      }
+
+      .side .demo {
+        margin-bottom: 0;
+        flex: 1;
+        min-width: 260px;
+      }
+
+      .collapse-container {
+        background: #0f172a;
+      }
+      .collapse-container .box {
+        margin: 30px 0;
+      }
+
+      .no-collapse-container {
+        display: flex;
+        flex-direction: column;
+        background: #0f172a;
+      }
+      .no-collapse-container .box {
+        margin: 30px 0;
+      }
+
+      .inline-container {
+        background: #0f172a;
+        line-height: 2;
+        padding: 12px;
+      }
+      .inline-box {
+        display: inline;
+        background: #6366f1;
+        color: #fff;
+        padding: 2px 6px;
+        font-size: 12px;
+        font-family: monospace;
+        margin: 40px 8px;
+      }
+
+      .inline-box-alt {
+        background: #a855f7;
+      }
+    </style>
+  </head>
+  <body>
+    <h1>CSS Margin Behaviors</h1>
+
+    <!-- 1. Margin Collapse (sibling) -->
+    <h2>1. Margin Collapse (block flow)</h2>
+    <p class="desc">
+      Both boxes have <code>margin: 30px 0</code>. In normal flow, the 30px
+      margins collapse into 30px (not 60px).
+    </p>
+    <div class="side">
+      <div class="demo">
+        <div class="demo-label">normal flow (collapsed)</div>
+        <div class="collapse-container">
+          <div class="box">margin: 30px 0</div>
+          <div class="anno">↕ 30px (collapsed)</div>
+          <div class="box box-alt">margin: 30px 0</div>
+        </div>
+      </div>
+      <div class="demo">
+        <div class="demo-label">flex column (no collapse)</div>
+        <div class="no-collapse-container">
+          <div class="box">margin: 30px 0</div>
+          <div class="anno">↕ 60px (no collapse)</div>
+          <div class="box box-alt">margin: 30px 0</div>
+        </div>
+      </div>
+    </div>
+
+    <!-- 2. Negative Margin -->
+    <h2>2. Negative Margin</h2>
+    <p class="desc">
+      Second box has <code>margin-top: -20px</code>, pulling it upward and
+      overlapping the first box.
+    </p>
+    <div class="demo">
+      <div style="background: #0f172a; padding-top: 40px">
+        <div class="box">box A</div>
+        <div
+          class="box box-alt"
+          style="margin-top: -20px; opacity: 0.85; position: relative"
+        >
+          box B — margin-top: -20px
+        </div>
+      </div>
+    </div>
+
+    <!-- 3. Margin Auto -->
+    <h2>3. Margin Auto</h2>
+    <p class="desc">
+      Auto margins distribute available space. Used for centering and
+      push-alignment.
+    </p>
+    <div class="side">
+      <div class="demo">
+        <div class="demo-label">margin: 0 auto</div>
+        <div style="background: #0f172a">
+          <div class="box" style="width: 180px; margin: 0 auto">centered</div>
+        </div>
+      </div>
+      <div class="demo">
+        <div class="demo-label">margin-left: auto</div>
+        <div style="background: #0f172a">
+          <div class="box box-alt" style="width: 180px; margin-left: auto">
+            pushed right
+          </div>
+        </div>
+      </div>
+    </div>
+    <div class="demo">
+      <div class="demo-label">flex + margin-left: auto (spacer pattern)</div>
+      <div style="display: flex; background: #0f172a">
+        <div class="box">A</div>
+        <div class="box box-alt" style="margin-left: auto">B (ml: auto)</div>
+        <div class="box box-green" style="margin-left: auto">C (ml: auto)</div>
+      </div>
+    </div>
+
+    <!-- 4. Background Boundary -->
+    <h2>4. Background Boundary</h2>
+    <p class="desc">
+      Margin is outside the background. Left: margin creates transparent gap.
+      Right: wrapper+padding equivalent — padding zone paints with the wrapper's
+      background.
+    </p>
+    <div class="side">
+      <div class="demo" style="padding: 0">
+        <div class="demo-label">margin: 24px</div>
+        <div class="box" style="margin: 24px">
+          background: blue; margin: 24px;
+        </div>
+      </div>
+      <div class="demo" style="padding: 0">
+        <div class="demo-label">wrapper padding: 24px</div>
+        <div style="padding: 24px; background: transparent">
+          <div class="box">wrapper { padding: 24px } → child</div>
+        </div>
+      </div>
+    </div>
+
+    <!-- 5. Inline Element Margin -->
+    <h2>5. Inline Element Margin</h2>
+    <p class="desc">
+      Inline elements ignore vertical margin. These <code>&lt;span&gt;</code>s
+      have <code>margin: 40px 8px</code> but only horizontal margin applies.
+    </p>
+    <div class="demo">
+      <div class="inline-container">
+        text before <span class="inline-box">span A</span> middle text
+        <span class="inline-box inline-box-alt">span B</span> text after
+        <br />
+        next line <span class="inline-box">span C</span> continues
+      </div>
+    </div>
+
+    <!-- 6. Unequal Collapse -->
+    <h2>6. Collapse Variants</h2>
+    <p class="desc">
+      Unequal margins: A has <code>margin-bottom: 50px</code>, B has
+      <code>margin-top: 20px</code>. Collapsed = max(50, 20) = 50px.
+    </p>
+    <div class="side">
+      <div class="demo">
+        <div class="demo-label">collapsed (50px, not 70px)</div>
+        <div class="collapse-container">
+          <div class="box" style="margin: 0 0 50px 0">mb: 50px</div>
+          <div class="box box-alt" style="margin: 20px 0 0 0">mt: 20px</div>
+        </div>
+      </div>
+      <div class="demo">
+        <div class="demo-label">wrapper+padding (70px total)</div>
+        <div style="background: #0f172a">
+          <div style="padding-bottom: 50px">
+            <div class="box">wrapper pb: 50px</div>
+          </div>
+          <div style="padding-top: 20px">
+            <div class="box box-alt">wrapper pt: 20px</div>
+          </div>
+        </div>
+      </div>
+    </div>
+
+    <!-- 7. Parent-Child Collapse -->
+    <h2>7. Parent-Child Collapse</h2>
+    <p class="desc">
+      A child's margin can collapse through its parent if the parent has no
+      border, padding, or BFC. Left: margin leaks out. Right: padding on parent
+      prevents collapse.
+    </p>
+    <div class="side">
+      <div class="demo">
+        <div class="demo-label">collapsed (margin leaks)</div>
+        <div style="background: rgba(248, 113, 113, 0.15)">
+          <div class="box" style="margin-top: 30px">child mt: 30px</div>
+        </div>
+        <div class="anno">
+          ↑ parent has no padding/border — child margin leaks out
+        </div>
+      </div>
+      <div class="demo">
+        <div class="demo-label">padding prevents collapse</div>
+        <div style="background: rgba(248, 113, 113, 0.15); padding: 1px">
+          <div class="box" style="margin-top: 30px">child mt: 30px</div>
+        </div>
+        <div class="anno">↑ parent has padding: 1px — margin stays inside</div>
+      </div>
+    </div>
+  </body>
+</html>