diff --git a/crates/grida-canvas/Cargo.toml b/crates/grida-canvas/Cargo.toml index 41a837dfd3..d6d839f33f 100644 --- a/crates/grida-canvas/Cargo.toml +++ b/crates/grida-canvas/Cargo.toml @@ -149,6 +149,21 @@ path = "examples/skia_bench/skia_bench_cache_image.rs" name = "skia_bench_cache_text" path = "examples/skia_bench/skia_bench_cache_text.rs" +[[example]] +name = "skia_bench_cost_model" +path = "examples/skia_bench/skia_bench_cost_model.rs" +required-features = ["native-gl-context"] + +[[example]] +name = "skia_bench_cache_blit" +path = "examples/skia_bench/skia_bench_cache_blit.rs" +required-features = ["native-gl-context"] + +[[example]] +name = "skia_bench_scene_scale" +path = "examples/skia_bench/skia_bench_scene_scale.rs" +required-features = ["native-gl-context"] + # ── IO tools ───────────────────────────────────────────────────── [[example]] name = "tool_io_grida" diff --git a/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs b/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs new file mode 100644 index 0000000000..11db38c65a --- /dev/null +++ b/crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs @@ -0,0 +1,265 @@ +//! Cache Hit vs. Miss Cost Ratio Benchmark +//! +//! Measures the actual cost ratio between a cache hit (GPU texture blit) and +//! a cache miss (full rasterization). Validates the ~0.1× estimate from +//! `docs/wg/feat-2d/render-cost-prediction.md`. +//! +//! Run with: +//! ```bash +//! cargo run -p cg --example skia_bench_cache_blit --features native-gl-context --release +//! 
``` + +#[cfg(not(feature = "native-gl-context"))] +fn main() { + eprintln!("This example requires --features native-gl-context"); +} + +#[cfg(feature = "native-gl-context")] +fn main() { + use cg::window::headless::HeadlessGpu; + use skia_safe::{ + canvas::SaveLayerRec, image_filters, Color, Image, ImageInfo, Paint, Rect, Surface, + }; + use std::time::Instant; + + const W: i32 = 1000; + const H: i32 = 1000; + const WARMUP: u32 = 10; + const ITERS: u32 = 50; + + let mut gpu = HeadlessGpu::new(W, H).expect("GPU init"); + gpu.print_gl_info(); + println!(); + + let surface = &mut gpu.surface; + + // ── Helpers ────────────────────────────────────────────────────── + + fn flush(s: &mut Surface) { + if let Some(mut ctx) = s.recording_context() { + if let Some(mut d) = ctx.as_direct_context() { + d.flush_and_submit(); + } + } + } + + /// Measure median time (µs) for a drawing operation. + fn bench_draw(surface: &mut Surface, draw_fn: &dyn Fn(&skia_safe::Canvas)) -> f64 { + // Warmup + for _ in 0..WARMUP { + let canvas = surface.canvas(); + canvas.clear(Color::WHITE); + draw_fn(canvas); + flush(surface); + } + // Measure + let mut timings = Vec::with_capacity(ITERS as usize); + for _ in 0..ITERS { + let t0 = Instant::now(); + let canvas = surface.canvas(); + canvas.clear(Color::WHITE); + draw_fn(canvas); + flush(surface); + timings.push(t0.elapsed().as_nanos() as f64 / 1000.0); + } + timings.sort_by(|a, b| a.partial_cmp(b).unwrap()); + timings[timings.len() / 2] + } + + /// Capture a rect with effects into a GPU-resident Image. 
+ fn capture_to_image( + surface: &mut Surface, + size: i32, + draw_fn: &dyn Fn(&skia_safe::Canvas, Rect), + ) -> Image { + let info = ImageInfo::new_n32_premul((size, size), None); + let mut offscreen = surface.new_surface(&info).expect("offscreen surface"); + { + let canvas = offscreen.canvas(); + canvas.clear(Color::TRANSPARENT); + let rect = Rect::from_xywh(0.0, 0.0, size as f32, size as f32); + draw_fn(canvas, rect); + } + flush(surface); + offscreen.image_snapshot() + } + + // ── Effect configurations ─────────────────────────────────────── + + struct EffectConfig { + name: &'static str, + draw: Box, + } + + let shadow_filter = image_filters::drop_shadow( + (4.0, 4.0), + (8.0, 8.0), + Color::from_argb(128, 0, 0, 0), + None, + None, + None, + ); + let blur_filter = image_filters::blur((8.0, 8.0), None, None, None); + + let sf = shadow_filter.clone(); + let blf = blur_filter.clone(); + let sf2 = shadow_filter.clone(); + + let effects: Vec = vec![ + EffectConfig { + name: "solid rect", + draw: Box::new(|canvas, rect| { + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + }), + }, + EffectConfig { + name: "rect + blur (s=8)", + draw: Box::new(move |canvas, rect| { + let mut lp = Paint::default(); + lp.set_image_filter(blf.clone()); + let rec = SaveLayerRec::default().bounds(&rect).paint(&lp); + canvas.save_layer(&rec); + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + canvas.restore(); + }), + }, + EffectConfig { + name: "rect + shadow (s=8)", + draw: Box::new(move |canvas, rect| { + let mut lp = Paint::default(); + lp.set_image_filter(sf.clone()); + let rec = SaveLayerRec::default().bounds(&rect).paint(&lp); + canvas.save_layer(&rec); + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + canvas.restore(); + }), + }, + EffectConfig { + name: "complex (3 fills + stroke + 
shadow)", + draw: Box::new(move |canvas, rect| { + let mut lp = Paint::default(); + lp.set_image_filter(sf2.clone()); + let rec = SaveLayerRec::default().bounds(&rect).paint(&lp); + canvas.save_layer(&rec); + // 3 fills + let mut p1 = Paint::default(); + p1.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p1); + let mut p2 = Paint::default(); + p2.set_color(Color::from_argb(128, 255, 0, 0)); + canvas.draw_rect(rect, &p2); + let mut p3 = Paint::default(); + p3.set_color(Color::from_argb(64, 0, 255, 0)); + canvas.draw_rect(rect, &p3); + // 1 stroke + let mut s = Paint::default(); + s.set_color(Color::BLACK); + s.set_style(skia_safe::PaintStyle::Stroke); + s.set_stroke_width(2.0); + canvas.draw_rect(rect, &s); + canvas.restore(); + }), + }, + ]; + + let sizes: [i32; 3] = [100, 200, 500]; + + // ── Run benchmarks ────────────────────────────────────────────── + + println!("═══════════════════════════════════════════════════════════════════════════"); + println!(" SECTION 1: Cache Hit vs. 
Miss Ratio"); + println!("═══════════════════════════════════════════════════════════════════════════"); + println!( + " {:<36} {:>5} {:>10} {:>10} {:>10}", + "Effect", "Size", "Miss(µs)", "Hit(µs)", "Ratio" + ); + println!( + " {:-<36} {:->5} {:->10} {:->10} {:->10}", + "", "", "", "", "" + ); + + // blit_times[effect_idx][size_idx] for constancy check + let mut blit_times: Vec> = vec![Vec::new(); effects.len()]; + + for (ei, effect) in effects.iter().enumerate() { + for (si, &size) in sizes.iter().enumerate() { + let sizef = size as f32; + let cx = (W as f32 - sizef) / 2.0; + let cy = (H as f32 - sizef) / 2.0; + let dst_rect = Rect::from_xywh(cx, cy, sizef, sizef); + + // Cache miss: full rasterize + let miss_us = bench_draw(surface, &|canvas| { + (effect.draw)(canvas, dst_rect); + }); + + // Capture to GPU texture + let cached_image = capture_to_image(surface, size, &*effect.draw); + + // Cache hit: texture blit + let hit_us = bench_draw(surface, &|canvas| { + canvas.draw_image_rect(&cached_image, None, dst_rect, &Paint::default()); + }); + + let ratio = hit_us / miss_us; + blit_times[ei].push(hit_us); + + println!( + " {:<36} {:>4}² {:>10.1} {:>10.1} {:>9.3}×", + effect.name, size, miss_us, hit_us, ratio + ); + + eprint!( + "\r [{}/{}]", + ei * sizes.len() + si + 1, + effects.len() * sizes.len() + ); + } + } + eprintln!("\r Done.{:40}", ""); + + // ── Output Section 2: Blit Constancy ──────────────────────────── + + println!(); + println!("═══════════════════════════════════════════════════════════════════════════"); + println!(" SECTION 2: Blit Cost Constancy (same size, different source complexity)"); + println!("═══════════════════════════════════════════════════════════════════════════"); + println!(" Blit cost should NOT vary with source effect complexity at the same size."); + println!(); + + for (si, &size) in sizes.iter().enumerate() { + let blit_at_size: Vec = blit_times.iter().map(|bt| bt[si]).collect(); + let mean = blit_at_size.iter().sum::() / 
blit_at_size.len() as f64; + let variance = blit_at_size.iter().map(|v| (v - mean).powi(2)).sum::() + / blit_at_size.len() as f64; + let stddev = variance.sqrt(); + let cv = if mean > 0.0 { + stddev / mean * 100.0 + } else { + 0.0 + }; + + println!(" Size {}²:", size); + for (ei, effect) in effects.iter().enumerate() { + println!(" {:<36} {:>8.1} µs", effect.name, blit_times[ei][si]); + } + println!( + " mean={:.1} µs stddev={:.1} µs CV={:.1}% {}", + mean, + stddev, + cv, + if cv < 10.0 { "OK" } else { "WARN (>10%)" } + ); + println!(); + } + + println!(" Expected: CV < 10% at each size (blit cost independent of source complexity)"); + println!(" Reference: predicted cache-hit ratio ~0.1× (from cost model doc)"); + println!(); +} diff --git a/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs b/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs new file mode 100644 index 0000000000..bd065317b6 --- /dev/null +++ b/crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs @@ -0,0 +1,549 @@ +//! Render Cost Model Validation Benchmark +//! +//! Validates the structural pixel-cost model from +//! `docs/wg/feat-2d/render-cost-prediction.md` against real GPU measurements. +//! +//! Unlike `skia_bench_effects` (10K tiny rects, per-rect overhead), this draws +//! **one rect per iteration at controlled sizes** to isolate per-pixel cost +//! from per-draw-call overhead. +//! +//! Run with: +//! ```bash +//! cargo run -p cg --example skia_bench_cost_model --features native-gl-context --release +//! 
``` + +#[cfg(not(feature = "native-gl-context"))] +fn main() { + eprintln!("This example requires --features native-gl-context"); +} + +#[cfg(feature = "native-gl-context")] +fn main() { + use cg::window::headless::HeadlessGpu; + use skia_safe::{canvas::SaveLayerRec, image_filters, BlendMode, Color, Paint, Rect, Surface}; + use std::time::Instant; + + const W: i32 = 1000; + const H: i32 = 1000; + const WARMUP: u32 = 10; + const ITERS: u32 = 50; + + let mut gpu = HeadlessGpu::new(W, H).expect("GPU init"); + gpu.print_gl_info(); + println!(); + + let surface = &mut gpu.surface; + + // ── Helpers ────────────────────────────────────────────────────── + + fn flush(s: &mut Surface) { + if let Some(mut ctx) = s.recording_context() { + if let Some(mut d) = ctx.as_direct_context() { + d.flush_and_submit(); + } + } + } + + /// Run a single-rect benchmark at the given size. + /// Returns the **median** duration in microseconds. + fn bench_single_rect( + surface: &mut Surface, + size: f32, + draw_fn: &dyn Fn(&skia_safe::Canvas, Rect), + ) -> f64 { + let cx = (W as f32 - size) / 2.0; + let cy = (H as f32 - size) / 2.0; + let rect = Rect::from_xywh(cx, cy, size, size); + + // Warmup + for _ in 0..WARMUP { + let canvas = surface.canvas(); + canvas.clear(Color::WHITE); + draw_fn(canvas, rect); + flush(surface); + } + + // Measure + let mut timings = Vec::with_capacity(ITERS as usize); + for _ in 0..ITERS { + let t0 = Instant::now(); + let canvas = surface.canvas(); + canvas.clear(Color::WHITE); + draw_fn(canvas, rect); + flush(surface); + timings.push(t0.elapsed().as_nanos() as f64 / 1000.0); // microseconds + } + timings.sort_by(|a, b| a.partial_cmp(b).unwrap()); + timings[timings.len() / 2] // median + } + + /// Compute R-squared for linear fit of (xs, ys). 
+ fn r_squared(xs: &[f64], ys: &[f64]) -> f64 { + let n = xs.len() as f64; + let x_mean = xs.iter().sum::() / n; + let y_mean = ys.iter().sum::() / n; + let ss_xy: f64 = xs + .iter() + .zip(ys) + .map(|(x, y)| (x - x_mean) * (y - y_mean)) + .sum(); + let ss_xx: f64 = xs.iter().map(|x| (x - x_mean).powi(2)).sum(); + let ss_yy: f64 = ys.iter().map(|y| (y - y_mean).powi(2)).sum(); + if ss_xx == 0.0 || ss_yy == 0.0 { + return 0.0; + } + let r = ss_xy / (ss_xx * ss_yy).sqrt(); + r * r + } + + // ── Variant definitions ───────────────────────────────────────── + + struct Variant { + name: &'static str, + predicted: f64, + draw: Box, + } + + let shadow_filter_s8 = image_filters::drop_shadow( + (4.0, 4.0), + (8.0, 8.0), + Color::from_argb(128, 0, 0, 0), + None, + None, + None, + ); + + let shadow_filter_s8_only = image_filters::drop_shadow_only( + (2.0, 2.0), + (6.0, 6.0), + Color::from_argb(128, 0, 0, 0), + None, + None, + None, + ); + + let blur_filter_5 = image_filters::blur((5.0, 5.0), None, None, None); + let blur_filter_50 = image_filters::blur((50.0, 50.0), None, None, None); + let backdrop_blur_8 = image_filters::blur((8.0, 8.0), None, None, None).unwrap(); + + // Clone filters for closures + let sf8 = shadow_filter_s8.clone(); + let sf8o = shadow_filter_s8_only.clone(); + let bf5 = blur_filter_5.clone(); + let bf50 = blur_filter_50.clone(); + let sf8_for_combo = shadow_filter_s8.clone(); + let bf5_for_combo = blur_filter_5.clone(); + let bd8 = backdrop_blur_8.clone(); + + let variants: Vec = vec![ + // 1. Baseline + Variant { + name: "baseline (solid rect)", + predicted: 1.0, + draw: Box::new(|canvas, rect| { + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + }), + }, + // 2. 
+1 extra fill + Variant { + name: "+1 fill (2 fills total)", + predicted: 2.0, + draw: Box::new(|canvas, rect| { + let mut p1 = Paint::default(); + p1.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p1); + let mut p2 = Paint::default(); + p2.set_color(Color::from_argb(128, 255, 0, 0)); + canvas.draw_rect(rect, &p2); + }), + }, + // 3. +2 extra fills + Variant { + name: "+2 fills (3 fills total)", + predicted: 3.0, + draw: Box::new(|canvas, rect| { + let mut p1 = Paint::default(); + p1.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p1); + let mut p2 = Paint::default(); + p2.set_color(Color::from_argb(128, 255, 0, 0)); + canvas.draw_rect(rect, &p2); + let mut p3 = Paint::default(); + p3.set_color(Color::from_argb(128, 0, 255, 0)); + canvas.draw_rect(rect, &p3); + }), + }, + // 4. +1 stroke + Variant { + name: "+1 stroke", + predicted: 2.0, + draw: Box::new(|canvas, rect| { + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + let mut s = Paint::default(); + s.set_color(Color::BLACK); + s.set_style(skia_safe::PaintStyle::Stroke); + s.set_stroke_width(2.0); + canvas.draw_rect(rect, &s); + }), + }, + // 5. Non-normal blend mode (save_layer) + Variant { + name: "blend mode (Multiply)", + predicted: 2.0, + draw: Box::new(|canvas, rect| { + let mut lp = Paint::default(); + lp.set_blend_mode(BlendMode::Multiply); + let rec = SaveLayerRec::default().bounds(&rect).paint(&lp); + canvas.save_layer(&rec); + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + canvas.restore(); + }), + }, + // 6. 
Opacity (save_layer_alpha) + Variant { + name: "opacity 0.5 (save_layer_alpha)", + predicted: 2.0, + draw: Box::new(|canvas, rect| { + canvas.save_layer_alpha(Some(rect), 128); + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + canvas.restore(); + }), + }, + // 7. Gaussian blur (r=5) + Variant { + name: "blur (r=5)", + predicted: 4.0, + draw: Box::new(move |canvas, rect| { + let mut lp = Paint::default(); + lp.set_image_filter(bf5.clone()); + let rec = SaveLayerRec::default().bounds(&rect).paint(&lp); + canvas.save_layer(&rec); + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + canvas.restore(); + }), + }, + // 8. Gaussian blur (r=50) — should be ~same cost (radius independence) + Variant { + name: "blur (r=50)", + predicted: 4.0, + draw: Box::new(move |canvas, rect| { + let mut lp = Paint::default(); + lp.set_image_filter(bf50.clone()); + let rec = SaveLayerRec::default().bounds(&rect).paint(&lp); + canvas.save_layer(&rec); + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + canvas.restore(); + }), + }, + // 9. Drop shadow (with content) + Variant { + name: "drop shadow (s=8)", + predicted: 6.0, + draw: Box::new(move |canvas, rect| { + let mut lp = Paint::default(); + lp.set_image_filter(sf8.clone()); + let rec = SaveLayerRec::default().bounds(&rect).paint(&lp); + canvas.save_layer(&rec); + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + canvas.restore(); + }), + }, + // 10. 
Inner shadow (clip + shadow_only) + Variant { + name: "inner shadow (s=6)", + predicted: 6.0, + draw: Box::new(move |canvas, rect| { + // Base rect + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 240, 240, 240)); + canvas.draw_rect(rect, &p); + // Clipped inner shadow + canvas.save(); + canvas.clip_rect(rect, None, None); + let mut lp = Paint::default(); + lp.set_image_filter(sf8o.clone()); + let rec = SaveLayerRec::default().bounds(&rect).paint(&lp); + canvas.save_layer(&rec); + let mut sp = Paint::default(); + sp.set_color(Color::from_argb(255, 240, 240, 240)); + canvas.draw_rect(rect, &sp); + canvas.restore(); + canvas.restore(); + }), + }, + // 11. Drop shadow + blur combined + Variant { + name: "shadow + blur combo", + predicted: 9.0, + draw: Box::new(move |canvas, rect| { + // Outer: blur + let mut blur_p = Paint::default(); + blur_p.set_image_filter(bf5_for_combo.clone()); + let blur_rec = SaveLayerRec::default().bounds(&rect).paint(&blur_p); + canvas.save_layer(&blur_rec); + // Inner: shadow + let mut shadow_p = Paint::default(); + shadow_p.set_image_filter(sf8_for_combo.clone()); + let shadow_rec = SaveLayerRec::default().bounds(&rect).paint(&shadow_p); + canvas.save_layer(&shadow_rec); + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + canvas.restore(); + canvas.restore(); + }), + }, + // 12. 2x nested save_layer (no effects, pure isolation cost) + Variant { + name: "2x nested save_layer", + predicted: 5.0, + draw: Box::new(|canvas, rect| { + canvas.save_layer_alpha(Some(rect), 255); + canvas.save_layer_alpha(Some(rect), 255); + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + canvas.restore(); + canvas.restore(); + }), + }, + // 13. 
3x nested save_layer + Variant { + name: "3x nested save_layer", + predicted: 7.0, + draw: Box::new(|canvas, rect| { + canvas.save_layer_alpha(Some(rect), 255); + canvas.save_layer_alpha(Some(rect), 255); + canvas.save_layer_alpha(Some(rect), 255); + let mut p = Paint::default(); + p.set_color(Color::from_argb(255, 66, 133, 244)); + canvas.draw_rect(rect, &p); + canvas.restore(); + canvas.restore(); + canvas.restore(); + }), + }, + // 14. Backdrop blur + Variant { + name: "backdrop blur (s=8)", + predicted: 4.0, + draw: Box::new(move |canvas, rect| { + // Background content + let mut bg = Paint::default(); + bg.set_color(Color::from_argb(255, 200, 50, 100)); + canvas.draw_rect(rect, &bg); + // Backdrop blur layer on top + let lp = Paint::default(); + let rec = SaveLayerRec::default() + .bounds(&rect) + .backdrop(&bd8) + .paint(&lp); + canvas.save_layer(&rec); + let mut overlay = Paint::default(); + overlay.set_color(Color::from_argb(80, 255, 255, 255)); + canvas.draw_rect(rect, &overlay); + canvas.restore(); + }), + }, + ]; + + // ── Run benchmarks ────────────────────────────────────────────── + + let sizes: [f32; 8] = [50.0, 100.0, 200.0, 300.0, 500.0, 1000.0, 2000.0, 4000.0]; + let pixel_areas: Vec = sizes.iter().map(|s| (*s as f64) * (*s as f64)).collect(); + + // results[variant_idx][size_idx] = median_us + let mut results: Vec> = Vec::new(); + + for (vi, variant) in variants.iter().enumerate() { + let mut row = Vec::new(); + for &size in &sizes { + let us = bench_single_rect(surface, size, &*variant.draw); + row.push(us); + } + eprint!("\r [{}/{}] {:<35}", vi + 1, variants.len(), variant.name); + results.push(row); + } + eprintln!("\r Done.{:40}", ""); + + // ── Output Section 1: Cost Multiplier Table (at 200²) ─────────── + + let size_idx_200 = 2; // 200.0 is index 2 + let baseline_200 = results[0][size_idx_200]; + + println!(); + println!("═══════════════════════════════════════════════════════════════════════════"); + println!(" SECTION 1: Cost Multiplier 
Validation (at 200×200)"); + println!("═══════════════════════════════════════════════════════════════════════════"); + println!( + " {:<35} {:>10} {:>10} {:>10} {:>6}", + "Effect", "Predicted", "Measured", "Time(µs)", "Status" + ); + println!( + " {:-<35} {:->10} {:->10} {:->10} {:->6}", + "", "", "", "", "" + ); + + for (vi, variant) in variants.iter().enumerate() { + let time_us = results[vi][size_idx_200]; + let measured = time_us / baseline_200; + let ratio = measured / variant.predicted; + let status = if ratio >= 0.5 && ratio <= 2.0 { + "OK" + } else { + "WARN" + }; + println!( + " {:<35} {:>9.1}× {:>9.2}× {:>10.1} {:>6}", + variant.name, variant.predicted, measured, time_us, status + ); + } + + // ── Output Section 2: Linearity Table ─────────────────────────── + + println!(); + println!("═══════════════════════════════════════════════════════════════════════════"); + println!(" SECTION 2: Linearity (time vs. pixel area)"); + println!("═══════════════════════════════════════════════════════════════════════════"); + println!( + " {:<35} {:>7} {:>7} {:>7} {:>7} {:>7} {:>7} {:>7} {:>7} {:>6}", + "Effect", "50²", "100²", "200²", "300²", "500²", "1000²", "2000²", "4000²", "R²" + ); + println!( + " {:-<35} {:->7} {:->7} {:->7} {:->7} {:->7} {:->7} {:->7} {:->7} {:->6}", + "", "", "", "", "", "", "", "", "", "" + ); + + for (vi, variant) in variants.iter().enumerate() { + let row = &results[vi]; + let r2 = r_squared(&pixel_areas, row); + println!( + " {:<35} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>7.0} {:>5.3}", + variant.name, row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], r2 + ); + } + + // ── Output Section 3: Blur Radius Independence ────────────────── + + println!(); + println!("═══════════════════════════════════════════════════════════════════════════"); + println!(" SECTION 3: Blur Radius Independence (r=5 vs r=50)"); + println!("═══════════════════════════════════════════════════════════════════════════"); + println!( + 
" {:<10} {:>10} {:>10} {:>10}", + "Size", "r=5 (µs)", "r=50 (µs)", "Ratio" + ); + println!(" {:-<10} {:->10} {:->10} {:->10}", "", "", "", ""); + + let blur5_idx = 6; // "blur (r=5)" + let blur50_idx = 7; // "blur (r=50)" + for (si, &size) in sizes.iter().enumerate() { + let t5 = results[blur5_idx][si]; + let t50 = results[blur50_idx][si]; + let ratio = t50 / t5; + println!( + " {:<10} {:>10.1} {:>10.1} {:>9.2}×", + format!("{}²", size as i32), + t5, + t50, + ratio + ); + } + + // ── Output Section 4: Device Fill Rate Calibration ────────────── + + println!(); + println!("═══════════════════════════════════════════════════════════════════════════"); + println!(" SECTION 4: Device Fill Rate Calibration"); + println!("═══════════════════════════════════════════════════════════════════════════"); + + // Use baseline at 500² for the most stable measurement + let baseline_500_us = results[0][4]; // 500² = 250_000 pixels + let pixels_500 = 500.0 * 500.0; + let pixels_per_us = pixels_500 / baseline_500_us; + let pixels_per_ms = pixels_per_us * 1000.0; + let budget_12ms = pixels_per_ms * 12.0; + + println!( + " Baseline (solid rect) at 500×500: {:.1} µs", + baseline_500_us + ); + println!(" Fill rate: {:.1}M pixels/ms", pixels_per_ms / 1_000_000.0); + println!( + " 12ms frame budget: {:.1}B pixels ({:.0}M pixels)", + budget_12ms / 1_000_000_000.0, + budget_12ms / 1_000_000.0 + ); + println!(); + + println!(" Reference (from docs/wg/feat-2d/render-cost-prediction.md):"); + println!(" Desktop GPU (discrete) ~500M pixels/ms"); + println!(" Desktop GPU (integrated) ~100M pixels/ms"); + println!(" WebGL (WASM, desktop) ~50-100M pixels/ms"); + println!(" WebGL (WASM, mobile) ~10-30M pixels/ms"); + + // ── Output Section 5: Two-Component Formula Extraction ────────── + + println!(); + println!("═══════════════════════════════════════════════════════════════════════════"); + println!(" SECTION 5: Two-Component Formula (C_fixed + area × C_per_pixel)"); + 
println!("═══════════════════════════════════════════════════════════════════════════"); + println!(" Solving from 200² and 4000² measurements:"); + println!(); + println!( + " {:<35} {:>10} {:>10} {:>12} {:>12}", + "Effect", "C_fixed(µs)", "C_pixel(ns/px)", "t@200²(µs)", "t@4000²(µs)" + ); + println!( + " {:-<35} {:->10} {:->10} {:->12} {:->12}", + "", "", "", "", "" + ); + + let area_small = 200.0_f64 * 200.0; // 40,000 + let area_large = 4000.0_f64 * 4000.0; // 16,000,000 + let idx_200 = 2usize; // index of 200.0 in sizes + let idx_4000 = 7usize; // index of 4000.0 in sizes + + for (vi, variant) in variants.iter().enumerate() { + let t_small = results[vi][idx_200]; + let t_large = results[vi][idx_4000]; + + // Solve: t_small = C_fixed + area_small * C_pixel + // t_large = C_fixed + area_large * C_pixel + // → C_pixel = (t_large - t_small) / (area_large - area_small) + // → C_fixed = t_small - area_small * C_pixel + let c_pixel = (t_large - t_small) / (area_large - area_small); // µs per pixel + let c_fixed = t_small - area_small * c_pixel; + + let c_pixel_ns = c_pixel * 1000.0; // ns per pixel + + println!( + " {:<35} {:>10.1} {:>10.3} {:>12.1} {:>12.1}", + variant.name, + c_fixed.max(0.0), + c_pixel_ns.max(0.0), + t_small, + t_large + ); + } + + println!(); + println!(" C_fixed = per-save_layer FBO/pipeline overhead (device-specific)"); + println!(" C_pixel = per-pixel bandwidth cost (ns/pixel)"); + println!(" Cost model: node_cost = C_fixed + screen_area × C_pixel × passes"); + println!(); +} diff --git a/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs b/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs new file mode 100644 index 0000000000..ba7efa5bfb --- /dev/null +++ b/crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs @@ -0,0 +1,391 @@ +//! Scene-Scale Cost Model Benchmark +//! +//! Measures full-engine render cost at scale (1K–136K nodes) with the complete +//! 
Renderer pipeline: R-tree culling, picture cache, layer compositing, GPU flush. +//! +//! This complements `skia_bench_cost_model` (single-node isolation) by testing +//! whether per-node costs are additive at scale or whether GPU batching, +//! memory pressure, and cache behavior introduce non-linear effects. +//! +//! Run with: +//! ```bash +//! cargo run -p cg --example skia_bench_scene_scale --features native-gl-context --release +//! ``` + +#[cfg(not(feature = "native-gl-context"))] +fn main() { + eprintln!("This example requires --features native-gl-context"); +} + +#[cfg(feature = "native-gl-context")] +fn main() { + use cg::cg::prelude::*; + use cg::node::scene_graph::{Parent, SceneGraph}; + use cg::node::schema::*; + use cg::runtime::scene::FrameFlushResult; + use cg::window::headless::HeadlessGpu; + use math2::transform::AffineTransform; + use std::time::Instant; + + const W: i32 = 1000; + const H: i32 = 1000; + const WARMUP: u32 = 5; + const ITERS: u32 = 20; + + let mut gpu = HeadlessGpu::new(W, H).expect("GPU init"); + gpu.print_gl_info(); + println!(); + + // ── Scene builders ────────────────────────────────────────────── + + #[derive(Clone, Copy)] + enum SceneType { + Plain, + WithShadow, + WithBlur, + Mixed, // 70% plain, 20% shadow, 10% blur + } + + impl SceneType { + fn label(&self) -> &'static str { + match self { + SceneType::Plain => "plain rects", + SceneType::WithShadow => "all with shadow", + SceneType::WithBlur => "all with blur", + SceneType::Mixed => "mixed (70/20/10)", + } + } + } + + fn build_scene(count: usize, scene_type: SceneType) -> Scene { + let mut graph = SceneGraph::new(); + let cols = (count as f64).sqrt().ceil() as usize; + + let rectangles: Vec = (0..count) + .map(|i| { + let col = i % cols; + let row = i / cols; + let x = (col as f32) * 10.0; + let y = (row as f32) * 10.0; + + let effects = match scene_type { + SceneType::Plain => LayerEffects::default(), + SceneType::WithShadow => { + 
LayerEffects::from_array(vec![FilterEffect::DropShadow(FeShadow { + dx: 2.0, + dy: 2.0, + blur: 4.0, + spread: 0.0, + color: CGColor::from_rgba(0, 0, 0, 128), + active: true, + })]) + } + SceneType::WithBlur => LayerEffects::new().blur(3.0), + SceneType::Mixed => { + let kind = i % 10; + if kind < 7 { + LayerEffects::default() // 70% plain + } else if kind < 9 { + LayerEffects::from_array(vec![FilterEffect::DropShadow(FeShadow { + dx: 2.0, + dy: 2.0, + blur: 4.0, + spread: 0.0, + color: CGColor::from_rgba(0, 0, 0, 128), + active: true, + })]) // 20% shadow + } else { + LayerEffects::new().blur(3.0) // 10% blur + } + } + }; + + Node::Rectangle(RectangleNodeRec { + active: true, + opacity: 1.0, + blend_mode: LayerBlendMode::default(), + mask: None, + transform: AffineTransform::new(x, y, 0.0), + size: Size { + width: 8.0, + height: 8.0, + }, + corner_radius: RectangularCornerRadius::zero(), + corner_smoothing: CornerSmoothing::default(), + fills: Paints::new([Paint::from(CGColor::from_rgba( + 66, + (133 + i % 50) as u8, + 244, + 255, + ))]), + strokes: Paints::default(), + stroke_style: StrokeStyle { + stroke_align: StrokeAlign::Inside, + stroke_cap: StrokeCap::default(), + stroke_join: StrokeJoin::default(), + stroke_miter_limit: StrokeMiterLimit::default(), + stroke_dash_array: None, + }, + stroke_width: StrokeWidth::default(), + effects, + layout_child: None, + }) + }) + .collect(); + + graph.append_children(rectangles, Parent::Root); + + Scene { + name: format!("scale_{}_{}", count, scene_type.label()), + background_color: Some(CGColor::WHITE), + graph, + } + } + + // ── Benchmark runner ──────────────────────────────────────────── + + struct ScaleResult { + scene_type: &'static str, + node_count: usize, + visible_count: usize, + frame_us: f64, + flush_us: f64, + total_us: f64, + per_visible_us: f64, + cache_hits: usize, + live_draws: usize, + } + + fn run_scale_bench( + renderer: &mut cg::runtime::scene::Renderer, + count: usize, + scene_type: SceneType, + ) -> 
ScaleResult { + let scene = build_scene(count, scene_type); + renderer.load_scene(scene); + + // Measure stable frames (full draw, no image cache). + // load_scene queues a stable frame automatically. + // Each iteration: flush (draws), then queue next stable frame. + // Stable frames always do a full draw — no pan/zoom image cache reuse. + let mut frame_times = Vec::with_capacity((WARMUP + ITERS) as usize); + let mut flush_times = Vec::with_capacity((WARMUP + ITERS) as usize); + let mut total_times = Vec::with_capacity((WARMUP + ITERS) as usize); + let mut last_visible = 0usize; + let mut last_cache_hits = 0usize; + let mut last_live_draws = 0usize; + + for i in 0..(WARMUP + ITERS) { + renderer.queue_stable(); + let t0 = Instant::now(); + let result = renderer.flush(); + let wall = t0.elapsed(); + + if let FrameFlushResult::OK(stats) = result { + if i >= WARMUP { + frame_times.push(stats.frame_duration.as_nanos() as f64 / 1000.0); + flush_times.push(stats.flush_duration.as_nanos() as f64 / 1000.0); + total_times.push(wall.as_nanos() as f64 / 1000.0); + } + last_visible = stats.draw.live_draw_count + stats.draw.layer_image_cache_hits; + last_cache_hits = stats.draw.layer_image_cache_hits; + last_live_draws = stats.draw.live_draw_count; + } + } + + // Use median + frame_times.sort_by(|a, b| a.partial_cmp(b).unwrap()); + flush_times.sort_by(|a, b| a.partial_cmp(b).unwrap()); + total_times.sort_by(|a, b| a.partial_cmp(b).unwrap()); + + let frame_us = frame_times + .get(frame_times.len() / 2) + .copied() + .unwrap_or(0.0); + let flush_us = flush_times + .get(flush_times.len() / 2) + .copied() + .unwrap_or(0.0); + let total_us = total_times + .get(total_times.len() / 2) + .copied() + .unwrap_or(0.0); + let per_visible = if last_visible > 0 { + total_us / last_visible as f64 + } else { + 0.0 + }; + + ScaleResult { + scene_type: scene_type.label(), + node_count: count, + visible_count: last_visible, + frame_us, + flush_us, + total_us, + per_visible_us: per_visible, + 
cache_hits: last_cache_hits, + live_draws: last_live_draws, + } + } + + // ── Run all configurations ────────────────────────────────────── + + let counts = [1_000, 5_000, 10_000, 50_000, 100_000, 136_000]; + let scene_types = [ + SceneType::Plain, + SceneType::WithShadow, + SceneType::WithBlur, + SceneType::Mixed, + ]; + + let mut renderer = gpu.create_renderer(); + let mut results: Vec<ScaleResult> = Vec::new(); + + let total_configs = counts.len() * scene_types.len(); + let mut done = 0; + + for &scene_type in &scene_types { + for &count in &counts { + eprint!( + "\r [{}/{}] {} × {}k", + done + 1, + total_configs, + scene_type.label(), + count / 1000 + ); + results.push(run_scale_bench(&mut renderer, count, scene_type)); + done += 1; + } + } + eprintln!("\r Done.{:60}", ""); + + // ── Output Section 1: Scale Table ─────────────────────────────── + + println!(); + println!("═══════════════════════════════════════════════════════════════════════════════════════════════════"); + println!(" SECTION 1: Frame Time vs. 
Node Count (stable frames, full Renderer pipeline)"); + println!("═══════════════════════════════════════════════════════════════════════════════════════════════════"); + println!( + " {:<22} {:>8} {:>8} {:>10} {:>10} {:>10} {:>10} {:>8} {:>8}", + "Scene Type", + "Nodes", + "Visible", + "Frame(µs)", + "Flush(µs)", + "Total(µs)", + "Per-vis", + "Hits", + "Live" + ); + println!( + " {:-<22} {:->8} {:->8} {:->10} {:->10} {:->10} {:->10} {:->8} {:->8}", + "", "", "", "", "", "", "", "", "" + ); + + for r in &results { + println!( + " {:<22} {:>7}k {:>8} {:>10.0} {:>10.0} {:>10.0} {:>9.2} {:>8} {:>8}", + r.scene_type, + r.node_count / 1000, + r.visible_count, + r.frame_us, + r.flush_us, + r.total_us, + r.per_visible_us, + r.cache_hits, + r.live_draws + ); + } + + // ── Output Section 2: Linearity Check ─────────────────────────── + + println!(); + println!("═══════════════════════════════════════════════════════════════════════════════════════════════════"); + println!(" SECTION 2: Per-Node Cost Linearity (total_us / visible_count across scales)"); + println!("═══════════════════════════════════════════════════════════════════════════════════════════════════"); + println!( + " {:<22} {:>10} {:>10} {:>10} {:>10} {:>10} {:>10}", + "Scene Type", "1k", "5k", "10k", "50k", "100k", "136k" + ); + println!( + " {:-<22} {:->10} {:->10} {:->10} {:->10} {:->10} {:->10}", + "", "", "", "", "", "", "" + ); + + for scene_type in &scene_types { + let label = scene_type.label(); + let per_vis: Vec<String> = counts + .iter() + .map(|&count| { + results + .iter() + .find(|r| r.node_count == count && r.scene_type == label) + .map(|r| format!("{:.2}", r.per_visible_us)) + .unwrap_or_else(|| "-".to_string()) + }) + .collect(); + println!( + " {:<22} {:>10} {:>10} {:>10} {:>10} {:>10} {:>10}", + label, per_vis[0], per_vis[1], per_vis[2], per_vis[3], per_vis[4], per_vis[5] + ); + } + println!(); + println!(" If per-visible cost is flat → cost model is additive (linear scaling)."); + println!(" If 
per-visible cost increases with N → non-linear overhead at scale."); + println!(); + + // ── Output Section 3: Predicted vs Measured ───────────────────── + + println!("═══════════════════════════════════════════════════════════════════════════════════════════════════"); + println!(" SECTION 3: Predicted vs. Measured (using cost model)"); + println!("═══════════════════════════════════════════════════════════════════════════════════════════════════"); + println!(); + + // Find baseline per-visible cost from plain 1k + let plain_1k = results + .iter() + .find(|r| r.node_count == 1_000 && r.scene_type == "plain rects"); + + if let Some(base) = plain_1k { + let base_per_vis = base.per_visible_us; + println!( + " Baseline per-visible-node cost (plain, 1k): {:.2} µs", + base_per_vis + ); + println!(); + println!( + " {:<22} {:>8} {:>12} {:>12} {:>10}", + "Scene Type", "Nodes", "Predicted(µs)", "Measured(µs)", "Ratio" + ); + println!( + " {:-<22} {:->8} {:->12} {:->12} {:->10}", + "", "", "", "", "" + ); + + for r in &results { + // Prediction: plain baseline per node × visible count × effect multiplier + let multiplier = match r.scene_type { + "plain rects" => 1.0, + "all with shadow" => 6.0, // 1 base + 5 shadow + "all with blur" => 4.0, // 1 base + 3 blur(σ=3) + "mixed (70/20/10)" => 0.7 * 1.0 + 0.2 * 6.0 + 0.1 * 4.0, // 2.3 + _ => 1.0, + }; + let predicted = base_per_vis * r.visible_count as f64 * multiplier; + let measured = r.total_us; + let ratio = measured / predicted; + + println!( + " {:<22} {:>7}k {:>12.0} {:>12.0} {:>9.2}×", + r.scene_type, + r.node_count / 1000, + predicted, + measured, + ratio + ); + } + } + + println!(); +} diff --git a/crates/grida-canvas/src/cache/picture.rs b/crates/grida-canvas/src/cache/picture.rs index 23e1941084..f6800f69db 100644 --- a/crates/grida-canvas/src/cache/picture.rs +++ b/crates/grida-canvas/src/cache/picture.rs @@ -25,6 +25,10 @@ pub struct PictureCache { default_store: NodeIdHashMap, /// Store for non-default render 
variants (variant key != 0). variant_store: NodeIdHashMap<(NodeId, u64), Picture>, + /// Monotonically increasing counter incremented on any cache mutation + /// (insert, invalidate, invalidate_node). The prefill loop uses this + /// to skip the 136K-iteration cache-hit check when nothing changed. + generation: u64, } impl PictureCache { @@ -33,6 +37,7 @@ impl PictureCache { strategy: PictureCacheStrategy::default(), default_store: new_node_id_map(), variant_store: new_node_id_map(), + generation: 0, } } @@ -49,8 +54,17 @@ impl PictureCache { self.default_store.get(id) } + /// Returns the current cache generation counter. This increments on + /// every mutation (insert, invalidate). Callers can compare generations + /// to detect whether the cache contents have changed. + #[inline] + pub fn generation(&self) -> u64 { + self.generation + } + pub fn set_node_picture(&mut self, id: NodeId, picture: Picture) { self.default_store.insert(id, picture); + self.generation = self.generation.wrapping_add(1); } /// Lookup a picture for a node in a specific render variant. @@ -69,15 +83,25 @@ impl PictureCache { pub fn set_node_picture_variant(&mut self, id: NodeId, variant_key: u64, picture: Picture) { if variant_key == 0 { self.default_store.insert(id, picture); - return; + } else { + self.variant_store.insert((id, variant_key), picture); } - self.variant_store.insert((id, variant_key), picture); + self.generation = self.generation.wrapping_add(1); } pub fn len(&self) -> usize { self.default_store.len() + self.variant_store.len() } + /// Returns true when the variant store has no entries. + /// When this is true AND variant unification is enabled, ALL cached + /// pictures live under the default key (0), making the prefill skip + /// safe across stable/unstable transitions. 
+ #[inline] + pub fn variant_store_is_empty(&self) -> bool { + self.variant_store.is_empty() + } + pub fn depth(&self) -> Option { self.strategy.depth } @@ -85,6 +109,7 @@ impl PictureCache { pub fn invalidate(&mut self) { self.default_store.clear(); self.variant_store.clear(); + self.generation = self.generation.wrapping_add(1); } /// Invalidate cached pictures for a single node (all variants). @@ -96,5 +121,6 @@ impl PictureCache { pub fn invalidate_node(&mut self, id: NodeId) { self.default_store.remove(&id); self.variant_store.retain(|&(nid, _), _| nid != id); + self.generation = self.generation.wrapping_add(1); } } diff --git a/crates/grida-canvas/src/runtime/cost_prediction.rs b/crates/grida-canvas/src/runtime/cost_prediction.rs new file mode 100644 index 0000000000..04dd737a5b --- /dev/null +++ b/crates/grida-canvas/src/runtime/cost_prediction.rs @@ -0,0 +1,153 @@ +//! Render cost prediction — read-only metric for frame budget estimation. +//! +//! Estimates the GPU cost of rendering a frame based on the visible node set +//! and their effects. All constants are fixed-overhead costs measured on +//! Apple M2 Pro (Metal 4.1). +//! +//! This module is **debug/instrumentation only**. It does not influence +//! rendering decisions. The predicted cost is reported in `FramePlan` and +//! the devtools overlay for correlation analysis against actual frame times. +//! +//! ## Reference +//! +//! - [`docs/wg/feat-2d/render-cost-prediction.md`] — cost model derivation, +//! benchmark results, Skia blur algorithm analysis, blend mode tiers, +//! and calibration methodology. +//! - [`crates/grida-canvas/examples/skia_bench/skia_bench_cost_model.rs`] — +//! per-effect validation benchmark (fixed cost extraction, linearity). +//! - [`crates/grida-canvas/examples/skia_bench/skia_bench_cache_blit.rs`] — +//! cache hit/miss ratio measurement. +//! - [`crates/grida-canvas/examples/skia_bench/skia_bench_scene_scale.rs`] — +//! full Renderer pipeline at 1K–136K nodes. 
+ +use crate::cg::fe::{FeBlur, FilterShadowEffect}; +use crate::cg::prelude::LayerBlendMode; +use crate::painter::layer::PainterPictureLayer; + +// ── Measured fixed-overhead constants (µs) ────────────────────────── +// +// Per-operation FBO/pipeline switch costs, NOT per-pixel. +// Source: skia_bench_cost_model (single rect, median of 50 runs). + +/// Baseline draw call + flush overhead (no save_layer). +const COST_BASELINE_US: f64 = 12.0; + +/// Gaussian blur: FBO + shader dispatch. For σ > 4.0, each additional +/// downsample level adds ~COST_BLUR_LEVEL_US. +const COST_BLUR_BASE_US: f64 = 73.0; +const COST_BLUR_LEVEL_US: f64 = 35.0; + +/// Drop shadow: FBO + shadow filter dispatch. +const COST_SHADOW_US: f64 = 97.0; + +/// Inner shadow: FBO + clip + shadow filter dispatch. +const COST_INNER_SHADOW_US: f64 = 72.0; + +/// Non-PassThrough blend mode: FBO + blend resolve. +const COST_BLEND_MODE_US: f64 = 81.0; + +/// Backdrop blur: FBO + dst snapshot + blur. +const COST_BACKDROP_BLUR_US: f64 = 110.0; + +/// Group opacity isolation (save_layer_alpha). +const COST_OPACITY_ISOLATION_US: f64 = 20.0; + +/// Compositor cache hit: single texture blit (~5µs, size-independent). +const COST_CACHE_HIT_US: f64 = 5.0; + +// ── Public API ────────────────────────────────────────────────────── + +/// Estimate the blur fixed cost based on sigma. +/// +/// Skia uses direct convolution for σ ≤ 4.0 and recursive downsampling +/// for larger values. Each downsample level adds a fixed FBO overhead. +/// See `skia/src/core/SkBlurEngine.h` for the `kMaxLinearSigma = 4.0` +/// constant that drives this. +pub fn blur_cost_us(sigma: f32) -> f64 { + if sigma <= 0.03 { + return 0.0; + } + if sigma <= 4.0 { + return COST_BLUR_BASE_US; + } + let levels = (sigma / 4.0).log2().ceil() as u32; + COST_BLUR_BASE_US + levels as f64 * COST_BLUR_LEVEL_US +} + +/// Estimate the fixed-overhead cost (µs) for rendering a single node. 
+/// +/// `is_cache_hit`: true if the node will be drawn from the compositor +/// layer cache (texture blit) rather than live-rasterized. +pub fn estimate_node_cost(layer: &PainterPictureLayer, is_cache_hit: bool) -> f64 { + if is_cache_hit { + return COST_CACHE_HIT_US; + } + + let mut cost = COST_BASELINE_US; + + let (effects, base) = match layer { + PainterPictureLayer::Shape(s) => (&s.effects, &s.base), + PainterPictureLayer::Text(t) => (&t.effects, &t.base), + PainterPictureLayer::Vector(v) => (&v.effects, &v.base), + }; + + // Blur + if let Some(blur) = &effects.blur { + if blur.active { + let sigma = match &blur.blur { + FeBlur::Gaussian(g) => g.radius, + FeBlur::Progressive(p) => p.radius.max(p.radius2), + }; + cost += blur_cost_us(sigma); + } + } + + // Backdrop blur + if let Some(backdrop) = &effects.backdrop_blur { + if backdrop.active { + let sigma = match &backdrop.blur { + FeBlur::Gaussian(g) => g.radius, + FeBlur::Progressive(p) => p.radius.max(p.radius2), + }; + cost += COST_BACKDROP_BLUR_US.max(blur_cost_us(sigma)); + } + } + + // Shadows + for shadow in &effects.shadows { + match shadow { + FilterShadowEffect::DropShadow(s) => { + if s.active { + cost += COST_SHADOW_US.max(blur_cost_us(s.blur)); + } + } + FilterShadowEffect::InnerShadow(s) => { + if s.active { + cost += COST_INNER_SHADOW_US.max(blur_cost_us(s.blur)); + } + } + } + } + + // Glass (treated as backdrop blur) + if let Some(glass) = &effects.glass { + if glass.active { + cost += COST_BACKDROP_BLUR_US; + } + } + + // Blend mode isolation (non-PassThrough requires save_layer) + if !matches!(base.blend_mode, LayerBlendMode::PassThrough) { + cost += COST_BLEND_MODE_US; + } + + // Group opacity isolation + // Note: leaf nodes fold opacity into paint alpha; only groups need + // save_layer. We can't distinguish group vs leaf from + // PainterPictureLayer alone, so we conservatively add the cost. 
+ if base.opacity < 1.0 { + cost += COST_OPACITY_ISOLATION_US; + } + + cost +} diff --git a/crates/grida-canvas/src/runtime/mod.rs b/crates/grida-canvas/src/runtime/mod.rs index 6584dd5762..f1f99f50ef 100644 --- a/crates/grida-canvas/src/runtime/mod.rs +++ b/crates/grida-canvas/src/runtime/mod.rs @@ -1,6 +1,7 @@ pub mod camera; pub mod changes; pub mod config; +pub mod cost_prediction; pub mod counter; pub mod effect_tree; pub mod font_repository; diff --git a/crates/grida-canvas/src/runtime/scene.rs b/crates/grida-canvas/src/runtime/scene.rs index d92990ca67..b3baf4d10d 100644 --- a/crates/grida-canvas/src/runtime/scene.rs +++ b/crates/grida-canvas/src/runtime/scene.rs @@ -155,6 +155,11 @@ pub struct FramePlan { pub compositor_indices: Vec, pub display_list_duration: Duration, pub display_list_size_estimated: usize, + /// Predicted frame cost in microseconds, based on the fixed-overhead cost + /// model (sum of per-effect FBO/pipeline costs for visible nodes). + /// See `docs/wg/feat-2d/render-cost-prediction.md` for derivation. + /// Zero for cache-hit frames (pan/zoom blit). + pub predicted_cost_us: f64, } /// Deferred frame plan: stores just the inputs so the expensive R-tree query @@ -358,6 +363,13 @@ pub struct Renderer { /// [`apply_changes`] consumes the set once per frame and performs /// the correct invalidation for every cache layer. changes: ChangeSet, + /// Picture cache generation + variant key at the time of the last + /// successful prefill. When the cache generation and variant key + /// match, the prefill loop can be skipped entirely — all pictures + /// are already cached from a previous frame. + last_prefill_generation: u64, + last_prefill_variant_key: u64, + last_prefill_layer_count: usize, } impl Renderer { @@ -385,6 +397,37 @@ impl Renderer { // True when the policy differs from STANDARD only in effect-related // fields — content, compositing, and clip policies are unchanged. 
let can_unify = variant_key != 0 && policy.is_effect_only_variant(); + + // Skip-prefill fast path: when the picture cache generation hasn't + // changed since the last prefill AND we're using the same variant + // key AND the layer count matches, every picture from the previous + // prefill is still valid. Skip the O(N) iteration entirely. + // + // For variant key tracking: when can_unify is true AND the variant + // store is empty (no per-variant entries — all nodes are effect-free), + // we track key=0 since all pictures live under the default key. This + // is safe across stable/unstable transitions for effect-free scenes. + // Scenes WITH effects track the actual variant_key. + // + // On 135K-node scenes at fit zoom, this eliminates ~800µs of HashMap + // lookups on every cache-warm frame (the common case during view-only + // pan/zoom interaction and settle frames). + let effective_key_for_tracking = + if can_unify && self.scene_cache.picture.variant_store_is_empty() { + 0 + } else { + variant_key + }; + + let current_gen = self.scene_cache.picture.generation(); + let layer_count: usize = plan.regions.iter().map(|(_, idx)| idx.len()).sum(); + if current_gen == self.last_prefill_generation + && effective_key_for_tracking == self.last_prefill_variant_key + && layer_count == self.last_prefill_layer_count + { + return; + } + // Prefill picture cache for visible layers so Painter can reuse pictures even with masks. // Fast path: skip clone + recording when the picture is already cached (common case // on cache-warm frames). The clone of LayerEntry is expensive because it deep-copies @@ -433,6 +476,17 @@ impl Renderer { } } } + + // Update tracking state for future skip-prefill checks. 
+ let effective_key_after = if can_unify && self.scene_cache.picture.variant_store_is_empty() + { + 0 + } else { + variant_key + }; + self.last_prefill_generation = self.scene_cache.picture.generation(); + self.last_prefill_variant_key = effective_key_after; + self.last_prefill_layer_count = layer_count; } /// Pre-extract blit data for all promoted nodes. @@ -608,6 +662,9 @@ impl Renderer { pan_image_cache: None, zoom_image_cache: None, changes: ChangeSet::new(), + last_prefill_generation: u64::MAX, + last_prefill_variant_key: u64::MAX, + last_prefill_layer_count: 0, } } @@ -1068,6 +1125,7 @@ impl Renderer { compositor_indices: Vec::new(), display_list_duration: Duration::ZERO, display_list_size_estimated: 0, + predicted_cost_us: 0.0, }; return FrameFlushStats { @@ -1125,6 +1183,7 @@ impl Renderer { compositor_indices: Vec::new(), display_list_duration: Duration::ZERO, display_list_size_estimated: 0, + predicted_cost_us: 0.0, }, ); if let Some((mid_flush_duration, frame_duration)) = zoom_cache_hit { @@ -1142,6 +1201,7 @@ impl Renderer { compositor_indices: Vec::new(), display_list_duration: Duration::ZERO, display_list_size_estimated: 0, + predicted_cost_us: 0.0, }; return FrameFlushStats { frame: plan, @@ -1488,6 +1548,9 @@ impl Renderer { self.scene_cache = cache::scene::SceneCache::new(); self.pan_image_cache = None; self.zoom_image_cache = None; + self.last_prefill_generation = u64::MAX; + self.last_prefill_variant_key = u64::MAX; + self.last_prefill_layer_count = 0; self.images.clear_missing_tracking(); if let Some(scene) = self.scene.as_ref() { #[cfg(feature = "perf")] @@ -2032,6 +2095,10 @@ impl Renderer { pic } + // ── Render cost prediction ───────────────────────────────────── + // Read-only debug metric. Delegates to `runtime::cost_prediction`. + // See docs/wg/feat-2d/render-cost-prediction.md for derivation. + /// Plan the frame for rendering. 
/// /// # Arguments @@ -2135,6 +2202,33 @@ impl Renderer { let ll_len = regions.iter().map(|(_, indices)| indices.len()).sum(); + // Predict frame cost: sum per-node fixed overhead costs. + let predicted_cost_us = { + let promoted_set: std::collections::HashSet<&NodeId> = promoted_ids.iter().collect(); + let mut total = 0.0_f64; + // Live-drawn nodes (from regions) + for (_, region_indices) in &regions { + for &idx in region_indices { + if let Some(entry) = self.scene_cache.layers.layers.get(idx) { + total += crate::runtime::cost_prediction::estimate_node_cost( + &entry.layer, + false, + ); + } + } + } + // Promoted (cache-hit) nodes + for &idx in &compositor_indices { + if let Some(entry) = self.scene_cache.layers.layers.get(idx) { + if promoted_set.contains(&entry.id) { + total += + crate::runtime::cost_prediction::estimate_node_cost(&entry.layer, true); + } + } + } + total + }; + let __ll_duration = __start.elapsed(); FramePlan { @@ -2146,6 +2240,7 @@ impl Renderer { compositor_indices, display_list_duration: __ll_duration, display_list_size_estimated: ll_len, + predicted_cost_us, } } diff --git a/crates/grida-canvas/src/window/application.rs b/crates/grida-canvas/src/window/application.rs index 7de297a56f..1db614717e 100644 --- a/crates/grida-canvas/src/window/application.rs +++ b/crates/grida-canvas/src/window/application.rs @@ -1627,13 +1627,18 @@ impl UnknownTargetApplication { wall_time: std::time::Duration, ) { let s = format!( - "fps*: {:.0} | t: {:.2}ms | cam: {} | render: {:.1}ms | flush: {:.1}ms | frame: {:.1}ms | list: {:.1}ms ({:?}) | draw: {:.1}ms | $:pic: {:?} ({:?} use) | $:geo: {:?} | comp: {:?} ({:?} hit, {:.1}KB) | live: {:?} | res: {} | img: {} | fnt: {}", + "fps*: {:.0} | t: {:.2}ms | cam: {} | render: {:.1}ms | flush: {:.1}ms | frame: {:.1}ms | pred: {:.0}µs ({:.1}×) | list: {:.1}ms ({:?}) | draw: {:.1}ms | $:pic: {:?} ({:?} use) | $:geo: {:?} | comp: {:?} ({:?} hit, {:.1}KB) | live: {:?} | res: {} | img: {} | fnt: {}", 1.0 / 
wall_time.as_secs_f64(), wall_time.as_secs_f64() * 1000.0, stats.frame.camera_change.label(), stats.total_duration.as_secs_f64() * 1000.0, stats.flush_duration.as_secs_f64() * 1000.0, stats.frame_duration.as_secs_f64() * 1000.0, + stats.frame.predicted_cost_us, + { + let actual_us = stats.frame_duration.as_secs_f64() * 1_000_000.0; + if actual_us > 0.0 { stats.frame.predicted_cost_us / actual_us } else { 0.0 } + }, stats.frame.display_list_duration.as_secs_f64() * 1000.0, stats.frame.display_list_size_estimated, stats.draw.painter_duration.as_secs_f64() * 1000.0, diff --git a/docs/wg/feat-2d/optimization.md b/docs/wg/feat-2d/optimization.md index e70bb3d5b5..fe6f5b51ff 100644 --- a/docs/wg/feat-2d/optimization.md +++ b/docs/wg/feat-2d/optimization.md @@ -1187,6 +1187,47 @@ expensive full redraws. - `runtime/scene.rs` — `apply_changes()` for `last_had_data_changes` - `window/application.rs` — `frame()` vs `redraw()` dual-path issue +48. **Picture Cache Prefill Skip (Generation Tracking)** ✅ IMPLEMENTED + + The `prefill_picture_cache_for_plan()` loop iterates ALL visible nodes + each frame to check if their `SkPicture` is cached, doing a HashMap + lookup per node. On cache-warm frames (the common case during view-only + pan/zoom), every lookup succeeds and no work is done — but the iteration + itself costs O(N) per frame. + + **The optimization:** track a monotonically increasing `generation` + counter on `PictureCache` that increments on any mutation (insert, + invalidate). The prefill stores the generation, variant key, and layer + count after each successful pass. On the next frame, if all three + match, the entire loop is skipped in O(1). + + For effect-free scenes (the common case for large design docs), the + variant key unification optimization stores all pictures under key=0 + regardless of stable/unstable quality. The generation-based skip is + safe across stable/unstable transitions because the cache contents + are identical. 
+ + **Measured impact (Apple M2 Pro, GPU benchmark, 01-135k 135K nodes):** + + | Scenario | Metric | Before | After | Delta | + | -------- | ------ | ------ | ----- | ----- | + | rt_pan_fast_fit | p50 frame | 111 µs | 76 µs | **-32%** | + | rt_pan_fast_fit | p95 frame | 263 µs | 153 µs | **-42%** | + | rt_pan_slow_fit | settle | 2,323 µs | 1,836 µs | **-21%** | + | pan_settle_slow_fit | avg | 87 µs | 59 µs | **-32%** | + | pan_settle_slow_fit | settle | 1,034 µs | 709 µs | **-31%** | + + **Criterion (CPU raster, 2000-node scene, statistically rigorous):** + + | Scene | Change | p-value | + | ----- | ------ | ------- | + | large_baseline/pan | **-14.0%** | < 0.01 | + | large_baseline/pan_zoomed_in | -5.4% | 0.02 | + | large_compositing/pan | -4.2% | 0.02 | + + Implementation: `PictureCache.generation` in `cache/picture.rs`, + `Renderer.last_prefill_*` tracking in `runtime/scene.rs`. + --- This list is designed to evolve the renderer from single-threaded mode to diff --git a/docs/wg/feat-2d/render-cost-prediction.md b/docs/wg/feat-2d/render-cost-prediction.md index 0c4c1f1152..d6de03f2a5 100644 --- a/docs/wg/feat-2d/render-cost-prediction.md +++ b/docs/wg/feat-2d/render-cost-prediction.md @@ -12,9 +12,13 @@ tags: # Render Cost Prediction -Reference sheet for computing GPU render cost of 2D scene operations -**before drawing**. All constants and formulas are derived from GPU -pipeline structure, not empirical tuning. +Reference sheet for estimating GPU render cost of 2D scene operations +**before drawing**. 
Each claim is labeled as one of: + +- **FACT** — verified from Skia/Chromium source or hardware specification +- **BENCHMARK** — measured locally (Apple M2 Pro, Metal 4.1, Skia 0.93) +- **INFERENCE** — derived from facts and benchmarks, not directly proven +- **HEURISTIC** — useful approximation, known to have exceptions Related: @@ -23,178 +27,258 @@ Related: --- -## Core Principle: Fill Rate Dominance +## Dominant Cost: Fixed Overhead per Operation -2D GPU rendering is **memory-bandwidth bound**, not compute bound. The -fragment shader for a rect fill is ~1 ALU op; even a Gaussian blur pass -is ~10 ALU ops per pixel. Modern GPUs execute trillions of ALU ops/sec, -but memory bandwidth is 50-200 GB/s. Each pixel read/write is 4-16 bytes. +> **BENCHMARK** — Confirmed by measuring identical effects at 200² through +> 4000² pixels (400× area range). Per-pixel cost component is near zero; +> total time is constant regardless of area. -Therefore: +On our measured hardware (M2 Pro, Metal), the cost of most 2D operations +is dominated by **fixed per-operation overhead** — primarily GPU render +target switches (`save_layer` / FBO allocation) — not by pixel fill rate. -``` -frame_cost ≈ total_pixels_touched / memory_bandwidth -``` +The fixed overhead comes from (**FACT**, traced to Skia/GL source): + +1. **GPU texture allocation** (~15-30µs) — `glTexStorage2D()`, synchronous + on most drivers. Skia's `GrResourceCache` pools textures to mitigate + this, but cache misses still pay full cost. +2. **FBO state change** (~20-40µs) — `glFramebufferTexture2D()`, forces + GPU pipeline flush. Unavoidable in GL/Metal immediate-mode API. +3. **Resource allocator** (~5-15µs) — CPU-side scratch key lookup in + `GrResourceAllocator`. -This relationship is **linear**. Double the pixels, double the time. -No surprises, no non-linear scaling — as long as you stay within VRAM -and don't hit texture cache thrashing (rare in 2D; access is spatially -coherent). 
+Source: `skia/src/gpu/ganesh/GrGLGpu.cpp` (texture alloc), +`skia/src/gpu/ganesh/GrResourceAllocator.cpp` (scratch pool). -This means render cost can be pre-computed as an **ALU/pixel budget**: -count the pixels the GPU will touch, apply structural multipliers per -effect, and compare against a calibrated device budget. +> **INFERENCE** — Many common 2D workloads are bandwidth-dominated for +> simple fills, but effects requiring `save_layer` (blur, shadow, blend +> mode isolation, group opacity) are dominated by fixed overhead at +> typical node sizes (< ~1M pixels). The pixel-proportional component +> becomes significant only at very large sizes or high zoom. --- -## Effect Cost Constants - -These are not magic numbers or tuning parameters. They are the -**structural pass counts** of each rendering operation — how many -full-area read-write cycles the GPU performs. - -| Effect | Pixel Multiplier | Derivation | -| ------------------------------------- | -------------------- | ---------------------------------------------------------- | -| Plain shape (rect, ellipse, polygon) | `1×` | Single fill pass | -| Additional fill (N fills on one node) | `+1×` per extra fill | Each fill is a separate pass | -| Additional stroke | `+1×` per stroke | Separate pass | -| Non-rect clip path | `+1×` | Mask pass + masked content | -| Rect clip | `+0×` | Hardware scissor — free | -| Blend mode (non-normal) | `+1×` | Requires offscreen isolation layer | -| Group opacity (alpha < 1.0 on group) | `+1×` | `save_layer` for isolated compositing | -| Gaussian blur | `+3×` | Downsample pyramid (~1.33×) + blur + upsample + composite | -| Drop shadow | `+5×` | Draw shape (1×) + blur pipeline (3×) + composite back (1×) | -| Inner shadow | `+5×` | Same as drop shadow, inverted mask | -| Backdrop filter (background blur) | `+3×` | Snapshot dst + blur + composite | -| Layer blur (on node itself) | `+3×` | Offscreen + blur + composite | -| Image fill | `+0×` over base | Texture sample replaces 
color fill — same bandwidth | -| Multiple shadows | `+5×` per shadow | Each shadow is independent | - -### Blur Radius Independence - -Skia (and most GPU frameworks) implement Gaussian blur via a **downsample -pyramid**, not a brute-force kernel convolution: +## Measured Fixed Cost per Operation + +> **BENCHMARK** — Single rect, median of 50 runs after 10 warmup. +> Constant across 50²–4000² pixel area (R² ≈ 0 for most effects). + +| Operation | C_fixed (µs) | What triggers it | +| ---------------------------- | ------------ | ------------------------------------------- | +| Baseline (no `save_layer`) | ~12 | GPU draw call + flush overhead | +| `save_layer_alpha` (opacity) | ~20 | 1 FBO switch | +| 2× nested `save_layer` | ~32 | 2 FBO switches | +| 3× nested `save_layer` | ~43 | 3 FBO switches (~11µs per additional layer) | +| Blur (σ=5) | ~73 | FBO + blur shader dispatch | +| Inner shadow (σ=6) | ~72 | FBO + clip + shadow filter dispatch | +| Blend mode (Multiply) | ~81 | FBO + blend resolve | +| Drop shadow (σ=8) | ~97 | FBO + shadow filter dispatch | +| Backdrop blur (σ=8) | ~110 | FBO + dst snapshot + blur | +| Blur (σ=50) | ~207 | FBO + multiple downsample dispatches | +| Shadow + blur combo | ~307 | 2 nested FBOs + both filter dispatches | + +> **INFERENCE** — For frame budget estimation, counting the number of +> `save_layer`-inducing operations and summing their fixed costs is more +> accurate than pixel-area-based prediction, at least up to ~16M pixels +> per node on this hardware. -``` -large sigma → downsample 2× → downsample 2× → ... → blur at reduced size → upsample -``` +--- -Total pixel work = `area × (1 + 1/4 + 1/16 + ...) ≈ area × 1.33` (geometric -series), plus the blur pass at reduced resolution. The cost is approximately -**constant regardless of blur radius**. The pyramid absorbs the radius. 
+## Blur Cost: Depends on Sigma -### `save_layer` / `save_layer_alpha` — The Hidden Spike Source +### Skia Constants -`save_layer` is the single most expensive primitive in Skia. It allocates an -offscreen surface, renders content into it, then composites back. +> **FACT** — From `skia/src/core/SkBlurEngine.h`. + +- `kMaxSamples = 28` — max texture samples per GPU blur pass (hardcoded) +- `kMaxLinearSigma = 4.0` — max sigma for direct convolution (hardcoded) +- `SigmaToRadius(σ) = ⌈3 × σ⌉` — sigma-to-radius conversion +- `LinearKernelWidth(r) = r + 1` — samples per 1D pass (hardware bilinear) +- σ ≤ 0.03 is treated as identity (no-op) + +### Skia Blur Strategy + +> **FACT** — From `skia/src/gpu/ganesh/GrBlurUtils.cpp`. ``` -save_layer_cost = layer_bounds_area × zoom² × 2 (write to offscreen + read back) +σ ≤ 4.0 and small kernel → single 2D convolution pass (≤28 samples) +σ ≤ 4.0 → two separable 1D passes +σ > 4.0 → downsample until σ ≤ 4.0, blur, upsample (recursive) ``` -Critical: **they cascade multiplicatively with nesting depth**. +For σ ≤ 4.0, the pass count varies: +- If `KernelWidth(rX) × KernelWidth(rY) ≤ 28`: single 2D pass +- Otherwise: two separable 1D passes + +> **HEURISTIC** — The following formula estimates pass count for σ > 4.0. +> The exact count depends on image dimensions and Skia's internal +> rounding, so treat this as an approximation. 
+ +```rust +fn blur_pass_estimate(sigma: f32) -> u32 { + if sigma <= 0.03 { + return 0; // identity + } + if sigma <= 4.0 { + return 2; // 1–2 passes (1D separable or single 2D) + } + let levels = ((sigma / 4.0).log2()).ceil() as u32; + 2 + levels * 2 // 2 blur passes + downsample/upsample per level +} ``` -save_layer ← offscreen A (full group bounds) - save_layer ← offscreen B (child bounds) - save_layer ← offscreen C (grandchild bounds) - draw rect - restore → composite C into B - restore → composite B into A -restore → composite A into target -``` -Three nested layers on the same area = `area × 6` bandwidth, not `area × 2`. +### Blur Radius Dependence + +> **BENCHMARK** — Blur σ=50 is consistently ~2.8× more expensive than +> σ=5 across all tested sizes. This ratio is stable, confirming that +> cost scales with downsample level count. -#### Implicit `save_layer` triggers +| Size | σ=5 (µs) | σ=50 (µs) | Ratio | +| ----- | -------- | --------- | ----- | +| 50² | 74 | 211 | 2.87× | +| 100² | 65 | 193 | 2.99× | +| 200² | 73 | 207 | 2.84× | +| 500² | 76 | 208 | 2.74× | +| 4000² | 77 | 230 | 3.00× | -Skia inserts `save_layer` implicitly for these conditions. The cost estimator -must account for them even when the application code does not call `save_layer` -explicitly: +### `reduce_blur()` — Interactive Quality Reduction -| Trigger | Reason | -| ----------------------------------------- | --------------------------------------------------------------- | -| Non-normal blend mode on a group | Isolated offscreen to blend against dst | -| Group opacity (alpha < 1.0 with children) | Children must composite together first, then alpha applied once | -| Blur / backdrop filter | Reads from dst, needs snapshot | -| Clip + antialiasing on groups | Soft-edge mask requires offscreen | -| `ColorFilter` on a group | Applied after children composite | +> **FACT** — From `crates/grida-canvas/src/painter/painter.rs`. 
+ +The painter implements `reduce_blur()` which divides sigma by 4× +during interactive frames (`RenderPolicy::EffectQuality::Reduced`). +This moves most blurs into the σ ≤ 4.0 direct convolution range. +Example: σ=20 → σ=5 (eliminates ~2 downsample levels). --- -## Per-Node Cost Formula +## `save_layer` Triggers -```rust -fn estimated_fill_pixels(node: &Node, zoom: f32, viewport: &Rect) -> f64 { - let screen_area = clipped_area(&node.bounds, viewport) * (zoom * zoom) as f64; +> **FACT** — From Skia's `SkCanvas::internalSaveLayer()` and observed +> painter behavior. The cost estimator must account for implicit +> `save_layer` insertions even when the application code does not call +> `save_layer` explicitly. - // Base draw - let mut passes: f64 = 1.0; +| Trigger | Reason | +| ----------------------------------------- | --------------------------------------- | +| Non-normal blend mode on a group | Isolated offscreen to blend against dst | +| Group opacity (alpha < 1.0 with children) | Children must composite together first | +| Blur / backdrop filter | Needs offscreen for filter input | +| Clip + antialiasing on groups | Soft-edge mask requires offscreen | +| `ColorFilter` on a group | Applied after children composite | - // Extra fills/strokes beyond the first - passes += (node.fill_count.saturating_sub(1)) as f64; - passes += node.stroke_count as f64; +> **FACT** — `save_layer` costs cascade with nesting depth. +> Each additional layer adds ~11µs fixed overhead (measured from +> 2× vs 3× nested `save_layer`: 32µs → 43µs). 
- // Effects - for shadow in &node.shadows { - if shadow.visible { - passes += 5.0; // shape + blur pipeline + composite - } - } - if node.has_blur() { - passes += 3.0; // downsample + blur + composite - } - if node.has_backdrop_blur() { - passes += 3.0; - } +### Blend Mode Tiers - // Isolation layers (implicit save_layer) - if node.blend_mode != BlendMode::Normal { - passes += 1.0; // offscreen + composite - } - if node.opacity < 1.0 && node.has_children() { - passes += 1.0; // group opacity isolation - } +> **FACT** — From `skia/src/gpu/Blend.h`, `skia/src/gpu/BlendFormula.h`, +> `skia/src/gpu/ganesh/effects/GrCustomXfermode.cpp`. - // Clip - if node.has_non_rect_clip() { - passes += 1.0; // mask pass - } +Not all blend modes have the same cost. Three tiers: - screen_area * passes -} -``` +| Tier | Modes | Implementation | +| ---------------------- | -------------------------------------------------------------------- | --------------------------------------------------- | +| Coefficient (cheapest) | Normal, Screen, SrcOver, Plus, Modulate | Hardware fixed-function blend — zero shader cost | +| Simple advanced | Overlay, HardLight, Darken, Lighten | Shared shader, ~10-20 lines, separable | +| Complex advanced | ColorDodge, ColorBurn, SoftLight, Hue, Saturation, Color, Luminosity | Individual shaders, non-separable, guarded division | + +> **INFERENCE** — The ~81µs measured for blend mode (Multiply) is +> almost entirely `save_layer` FBO overhead, not blend math. Note that +> Multiply (`SkBlendMode::kMultiply`) is a separable advanced mode in +> Skia, not a coefficient mode (that is Modulate), so it is not in the +> cheapest tier. The blend mode tier affects +> ALU cost per pixel, which is negligible compared to FBO overhead at +> typical node sizes. Per-paint blend modes (no `save_layer`) are +> effectively free. + +--- -### Cache Hit vs. Miss Cost +## Cache Hit vs. Miss -A compositor/picture cache **hit** replaces the full rasterization pipeline -with a single texture blit: +> **BENCHMARK** — Measured with `skia_bench_cache_blit`. 
-| State | Effective multiplier | What happens | -| ---------- | ----------------------------- | ---------------------------------------------------- | -| Cache miss | `passes ×` (from table above) | Full rasterization: path tessellation, fill, effects | -| Cache hit | `~0.1×` | Single texture-sampled quad draw | +| State | Cost | What happens | +| ---------- | ---------------------------- | ------------------------------------------------------------- | +| Cache miss | ~70-300µs (effect-dependent) | Full rasterization with FBO overhead | +| Cache hit | ~5µs (constant) | Single texture blit, independent of source complexity or size | -The cost difference is **100-1000×**. Cache state is a binary signal — the -single largest contributor to per-node cost variance. +Hit/miss ratio for effect nodes: **~0.05×** (measured). +Blit cost is ~5µs regardless of source effect complexity — confirmed +with coefficient of variation check across 4 effect types. + +> **BENCHMARK** — At scale (136K nodes, 2600 visible), the compositor +> cache serves all effect nodes as texture blits. Shadow and blur nodes +> show `cache_hits = 2704, live_draws = 0`. Effect multipliers only +> apply to **cache-miss frames** (first render, zoom change, scene +> mutation). --- -## Device Fill Rate Reference +## Scale Behavior + +> **BENCHMARK** — Full Renderer pipeline with R-tree culling, picture +> cache, and layer compositing. Measured with `skia_bench_scene_scale`. 
+ +### Per-Visible-Node Cost (stable frames) + +| Scene Type | 1K | 5K | 10K | 50K | 100K | 136K | +| ---------------- | ---- | ---- | ---- | ---- | ---- | ------------ | +| Plain rects | 0.41 | 0.38 | 0.40 | 0.43 | 0.54 | 0.89 µs/node | +| All with shadow | 0.49 | 0.45 | 0.46 | 0.47 | 0.64 | 0.87 µs/node | +| All with blur | 0.46 | 0.48 | 0.45 | 0.51 | 0.74 | 0.84 µs/node | +| Mixed (70/20/10) | 0.85 | 0.81 | 0.72 | 0.80 | 1.03 | 1.17 µs/node | + +> **INFERENCE** — Per-visible-node cost is approximately additive +> (linear) from 1K to 50K total nodes. Non-linear overhead appears at +> 100K+ due to R-tree query and scene cache management scaling with +> total scene size, not drawing cost. Visible count caps at ~2600 nodes +> in a 1000×1000 viewport with 8×8 rects — R-tree culling works. + +--- + +## Practical Cost Model -The total pixel budget depends on device fill rate — the one value that -varies per hardware. Everything else is derived from geometry and scene -structure. +> **HEURISTIC** — Based on all benchmarks above. For frame budget +> decisions (skip or draw), the following is more accurate than +> pixel-area-based prediction at typical node sizes. + +``` +frame_cost ≈ Σ visible_nodes( + if cache_hit: ~5 µs + if cache_miss: C_fixed(effect_type) +) +``` + +Where `C_fixed` values are from the measured table above. The pixel-area +component is negligible up to ~16M pixels per node on tested hardware. + +For nodes with multiple effects, sum the fixed costs (each effect +that triggers a `save_layer` adds its own FBO overhead). 
### Calibration -Render a known workload (e.g., full-screen solid rect) and measure: +Two device-specific constants must be measured at startup: ``` -pixels_per_ms = (screen_width × screen_height) / render_time_ms +save_layer_overhead_us = measured via single save_layer + draw + restore +pixels_per_ms = measured via full-screen solid rect ``` -### Reference Values (order-of-magnitude) +Everything else is derived from scene structure (effect types, cache state). + +--- + +## Device Fill Rate Reference + +> **BENCHMARK** — Baseline solid rect at 500². + +| Metric | Value (M2 Pro) | +| ----------- | --------------- | +| Fill rate | ~146M pixels/ms | +| 12ms budget | ~1.8B pixels | + +> **HEURISTIC** — Order-of-magnitude reference. | Platform | Expected pixels_per_ms | | ------------------------ | ---------------------- | @@ -207,58 +291,53 @@ pixels_per_ms = (screen_width × screen_height) / render_time_ms ## Chromium Reference -Chromium's `cc/` compositor collects similar metrics but uses them differently: +> **FACT** — From `cc/paint/display_item_list.h`, `cc/tiles/tile_manager.cc`. 
+ +Chromium's `cc/` compositor collects these metrics: -| Metric | Chromium Location | Chromium Usage | +| Metric | Location | Usage | | ------------------------------------- | ------------------------------ | ------------------------------------------------------------- | | `TotalOpCount()` | `cc/paint/display_item_list.h` | Solid-color analysis gate | | `num_slow_paths_up_to_min_for_MSAA()` | `cc/paint/display_item_list.h` | Page-level GPU raster veto | | `has_save_layer_ops()` | `cc/paint/display_item_list.h` | LCD text decision | -| `has_non_aa_paint()` | `cc/paint/display_item_list.h` | Antialiasing decisions | | `BytesUsed()` / `OpBytesUsed()` | `cc/paint/display_item_list.h` | Tracing / debugging | -| `AreaOfDrawText()` | `cc/paint/display_item_list.h` | Text coverage statistics | | Solid color analysis | `cc/tiles/tile_manager.cc` | Skip rasterization for uniform tiles (`kMaxOpsToAnalyze = 5`) | -Chromium does **not** perform per-tile raster cost prediction. Tile -scheduling is purely spatial (viewport distance + scroll velocity) with -a memory budget constraint. Their architecture tolerates stale tiles -(multi-threaded raster catches up across frames). Ours cannot — we render -single-threaded with a hard per-frame deadline, requiring predictive -budgeting. +> **INFERENCE** — Based on source review, Chromium does not appear to +> perform per-tile raster cost prediction. Tile scheduling is spatial +> (viewport distance + scroll velocity) with a memory budget constraint. +> Their multi-threaded raster architecture can tolerate stale tiles in +> ways our single-threaded pipeline cannot. Local source: `/Users/softmarshmallow/Documents/Github/chromium/cc/` --- -## Skia `Picture` Metrics (Available for Free) +## Skia `Picture` Metrics -Skia's `Picture` object exposes complexity metrics that are already -computed during recording and cost nothing to query: +> **FACT** — From `skia/include/core/SkPicture.h`. 
-| Method | What it returns | Use | -| -------------------------- | ---------------------------------- | ---------------------------------- | -| `approximate_op_count()` | Number of draw operations recorded | Secondary complexity signal | -| `approximate_bytes_used()` | Serialized size of the picture | Memory pressure / complexity proxy | +| Method | Returns | Cost to query | +| -------------------------- | ---------------------------------- | ------------------- | +| `approximate_op_count()` | Number of recorded draw operations | Free (stored field) | +| `approximate_bytes_used()` | Serialized size of the picture | Free (stored field) | -These are stored fields, not computations. They complement the pixel-area -model by capturing path complexity variance (a 1000-op picture with -complex beziers vs. a 3-op picture with simple rects at the same pixel -area). +These capture path complexity variance that the fixed-cost model does +not account for (e.g., a 1000-op picture with complex beziers vs. a +3-op picture with simple rects). --- -## Linearity Bounds +## Benchmark Source -The fill-rate model is linear under these conditions: +All benchmarks use `HeadlessGpu` (offscreen Metal/GL surface), median +of 50 iterations after 10 warmup, single rect per iteration unless +noted otherwise. -| Condition | Linear? 
| Notes | -| ---------------------------------- | ------------------- | ------------------------------------------------------ | -| Work above ~10K pixels | Yes | Below this, GPU launch overhead dominates (flat floor) | -| Spatial texture access (normal 2D) | Yes | Bandwidth-bound, no cache thrashing | -| Random texture access | Can be super-linear | Rare in 2D rendering | -| Tile-based GPU (mobile) | Mostly | Large nodes spanning many tiles add per-tile overhead | -| Thermal throttling | N/A | Between-frame variance, not within-frame | -| VRAM pressure / swapping | Non-linear | Catastrophic; avoid by staying within budget | +| Benchmark | What it measures | +| ------------------------ | ---------------------------------------------------------------------------------- | +| `skia_bench_cost_model` | Per-effect fixed cost, linearity, blur radius, fill rate, two-component extraction | +| `skia_bench_cache_blit` | Cache hit/miss ratio, blit constancy across effect types | +| `skia_bench_scene_scale` | Full Renderer pipeline at 1K–136K nodes with culling and caching | -For typical 2D canvas rendering (spatial access, nodes > 10K pixels), -the linear model holds. +Source: `crates/grida-canvas/examples/skia_bench/` diff --git a/fixtures/test-html/L0/box-margin.html b/fixtures/test-html/L0/box-margin.html new file mode 100644 index 0000000000..3ae840fb83 --- /dev/null +++ b/fixtures/test-html/L0/box-margin.html @@ -0,0 +1,295 @@ + + + + + Box: Margin + + + +

CSS Margin Behaviors

+ + +

1. Margin Collapse (block flow)

+

+ Both boxes have margin: 30px 0. In normal flow, the 30px + margins collapse into 30px (not 60px). +

+
+
+
normal flow (collapsed)
+
+
margin: 30px 0
+
↕ 30px (collapsed)
+
margin: 30px 0
+
+
+
+
flex column (no collapse)
+
+
margin: 30px 0
+
↕ 60px (no collapse)
+
margin: 30px 0
+
+
+
+ + +

2. Negative Margin

+

+ Second box has margin-top: -20px, pulling it upward and + overlapping the first box. +

+
+
+
box A
+
+ box B — margin-top: -20px +
+
+
+ + +

3. Margin Auto

+

+ Auto margins distribute available space. Used for centering and + push-alignment. +

+
+
+
margin: 0 auto
+
+
centered
+
+
+
+
margin-left: auto
+
+
+ pushed right +
+
+
+
+
+
flex + margin-left: auto (spacer pattern)
+
+
A
+
B (ml: auto)
+
C (ml: auto)
+
+
+ + +

4. Background Boundary

+

+ Margin is outside the background. Left: margin creates transparent gap. + Right: wrapper+padding equivalent — padding zone paints with the wrapper's + background. +

+
+
+
margin: 24px
+
+ background: blue; margin: 24px; +
+
+
+
wrapper padding: 24px
+
+
wrapper { padding: 24px } → child
+
+
+
+ + +

5. Inline Element Margin

+

+ Inline elements ignore vertical margin. These <span>s + have margin: 40px 8px but only horizontal margin applies. +

+
+
+ text before span A middle text + span B text after +
+ next line span C continues +
+
+ + +

6. Collapse Variants

+

+ Unequal margins: A has margin-bottom: 50px, B has + margin-top: 20px. Collapsed = max(50, 20) = 50px. +

+
+
+
collapsed (50px, not 70px)
+
+
mb: 50px
+
mt: 20px
+
+
+
+
wrapper+padding (70px total)
+
+
+
wrapper pb: 50px
+
+
+
wrapper pt: 20px
+
+
+
+
+ + +

7. Parent-Child Collapse

+

+ A child's margin can collapse through its parent if the parent has no + border, padding, or BFC. Left: margin leaks out. Right: padding on parent + prevents collapse. +

+
+
+
collapsed (margin leaks)
+
+
child mt: 30px
+
+
+ ↑ parent has no padding/border — child margin leaks out +
+
+
+
padding prevents collapse
+
+
child mt: 30px
+
+
↑ parent has padding: 1px — margin stays inside
+
+
+ +