gridaco · softmarshmallow · Mar 30, 2026 · Mar 30, 2026
diff --git a/crates/grida-canvas/src/runtime/scene.rs b/crates/grida-canvas/src/runtime/scene.rs
@@ -1329,31 +1329,31 @@ impl Renderer {
         // unstable frame can use the cache. Without this, stable frames
         // clear the cache (in queue()) but don't recapture, causing
         // the next unstable frame to do another expensive full draw.
+        //
+        // Single snapshot: both caches need the same image on non-zoom
+        // frames. image_snapshot() is copy-on-write but still allocates
+        // a handle — sharing avoids the second allocation.
         if self.backend.is_gpu() {
             let vm = self.camera.view_matrix();
+            let image = surface.image_snapshot();
 
             // Pan image cache: only useful when zoom is constant.
             if !plan.camera_change.zoom_changed() {
-                let image = surface.image_snapshot();
                 self.pan_image_cache = Some(PanImageCache {
-                    image,
+                    image: image.clone(),
                     origin_tx: vm.matrix[0][2],
                     origin_ty: vm.matrix[1][2],
                 });
             }
 
             // Zoom image cache: capture after every full draw so that
             // the next zoom frame can use a scaled blit instead of
-            // re-drawing. We snapshot the surface once; the cost is
-            // amortized over all subsequent zoom cache-hit frames.
-            {
-                let image = surface.image_snapshot();
-                self.zoom_image_cache = Some(ZoomImageCache {
-                    image,
-                    zoom: self.camera.get_zoom(),
-                    view_matrix: vm,
-                });
-            }
+            // re-drawing.
+            self.zoom_image_cache = Some(ZoomImageCache {
+                image,
+                zoom: self.camera.get_zoom(),
+                view_matrix: vm,
+            });
         }
 
         // Compositor update (GPU-only).
@@ -2055,13 +2055,46 @@ impl Renderer {
         let mut promoted_ids: Vec<NodeId> = Vec::new();
         let mut regions: Vec<(rect::Rectangle, Vec<usize>)> = Vec::new();
 
-        // Query the R-tree once for all visible layer indices.
-        let mut indices = self.scene_cache.intersects(bounds);
+        // Full-viewport fast path: when the camera viewport fully contains
+        // the scene envelope (R-tree root AABB), ALL indexed layers are
+        // visible. Skip the R-tree query + sort and emit 0..n: an O(1) check
+        // plus an O(n) fill vs. an O(n log n) sort. Queue time drops ~1600 →
+        // ~500 us on 135K-node scenes at fit zoom (the common view-only case).
+        //
+        // Correctness: only valid when every layer has render bounds (i.e. the
+        // R-tree indexes all layers). update_layers() uses filter_map to
+        // skip layers without render bounds, so layer_count can exceed
+        // the R-tree size. We guard against this by requiring the counts
+        // match — when they don't, the R-tree query correctly excludes
+        // the bounds-less layers.
+        let layer_count = self.scene_cache.layers.layers.len();
+        let rtree_size = self.scene_cache.layer_index.size();
+        let all_visible = layer_count == rtree_size
+            && match self.scene_cache.scene_envelope() {
+                None => true, // empty scene → trivially "all visible"
+                Some(envelope) => {
+                    let lower = envelope.lower();
+                    let upper = envelope.upper();
+                    bounds.x <= lower[0]
+                        && bounds.y <= lower[1]
+                        && bounds.x + bounds.width >= upper[0]
+                        && bounds.y + bounds.height >= upper[1]
+                }
+            };
 
-        // TODO: sort is expensive — consider incremental visible-set
-        // update (item 19) for pan-only frames where the entering/exiting
-        // sets are tiny.
-        indices.sort();
+        let indices = if all_visible {
+            // All layers visible — sequential indices, already sorted.
+            (0..layer_count).collect::<Vec<_>>()
+        } else {
+            // Partial visibility — R-tree spatial query.
+            let mut queried = self.scene_cache.intersects(bounds);
+            // sort_unstable is typically faster than the stable sort for
+            // integer data and sorts in place, with no auxiliary buffer
+            // allocation. Draw order correctness only requires sorted indices,
+            // not stability.
+            queried.sort_unstable();
+            queried
+        };
 
         // Pre-filter compositor-relevant indices during the same pass.
         // Nodes without expensive effects (the vast majority) are skipped

diff --git a/docs/wg/feat-2d/optimization.md b/docs/wg/feat-2d/optimization.md
@@ -459,6 +459,40 @@ Related:
     the geometry cache), RenderSurface culling can be enabled using the
     stored `viewport` rect.
 
+10c. **Full-Viewport Frame Plan Fast Path** ✅ IMPLEMENTED
+
+    When the camera viewport fully contains the scene envelope (R-tree
+    root AABB), ALL layers are visible. The R-tree traversal and index
+    sort are redundant — we can emit `0..n` directly with an O(n) fill.
+
+    Detection uses `scene_envelope()` (O(1) R-tree root node read), an
+    AABB containment check, and a guard that every layer is indexed. It
+    fires at fit zoom and any zoom level showing the entire document.
+
+    Additionally, partial-viewport frames now use `sort_unstable()`
+    instead of the stable `sort()`; it is typically faster for integer
+    data and sorts in place, avoiding the O(n) merge buffer allocation.
+
+    A third sub-optimization shares the GPU `image_snapshot()` between
+    the pan and zoom image caches on non-zoom frames, avoiding a
+    redundant snapshot handle allocation.
+
+    **Measured impact (Apple M2 Pro, GPU benchmark, 01-135k 135K nodes):**
+
+    | Scenario | Metric | Before | After | Delta |
+    | -------- | ------ | ------ | ----- | ----- |
+    | rt_pan_slow_fit | queue_us | 1,598 | 485 | **-70%** |
+    | rt_pan_slow_fit | settle_us | 3,388 | 1,049 | **-69%** |
+    | rt_pan_slow_fit | p95 frame | 6,317 | 1,199 | **-81%** |
+    | rt_pan_slow_zoomed | p50 frame | 300 | 151 | **-50%** |
+    | rt_pan_fast_fit | p50 frame | 82 | 41 | **-50%** |
+    | fl_16ms | p50 frame | 97 | 61 | **-37%** |
+
+    The optimization is most impactful at fit zoom on large scenes where
+    all nodes are visible — exactly the view-only reading experience.
+
+    Implementation: `Renderer::frame()` in `runtime/scene.rs`.
+
 11. **Minimize Canvas State Changes**
     - Reuse transforms and paints.
     - Precompute common values like DPI × Zoom × ViewMatrix.