From 8f4c68f1fb404d512820a2a13999747ae3f9df56 Mon Sep 17 00:00:00 2001 From: Brian Dilinila Date: Tue, 10 Mar 2026 19:23:52 -0700 Subject: [PATCH 01/11] Add --video support for kitless renderers backends --- .../isaaclab/isaaclab/envs/direct_marl_env.py | 60 +++++++++++++++-- .../isaaclab/isaaclab/envs/direct_rl_env.py | 62 +++++++++++++++-- .../isaaclab/envs/manager_based_rl_env.py | 66 ++++++++++++++++--- 3 files changed, 168 insertions(+), 20 deletions(-) diff --git a/source/isaaclab/isaaclab/envs/direct_marl_env.py b/source/isaaclab/isaaclab/envs/direct_marl_env.py index eb0a359e4f5..7994a1a5b88 100644 --- a/source/isaaclab/isaaclab/envs/direct_marl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_marl_env.py @@ -521,13 +521,18 @@ def render(self, recompute: bool = False) -> np.ndarray | None: if self.render_mode == "human" or self.render_mode is None: return None elif self.render_mode == "rgb_array": - # check that if any render could have happened + # Prefer TiledCamera when available — works for all backends (kitless and Kit-based) + # and produces consistent, scene-content frames. Fall back to the omni.replicator + # viewer-camera path only when no TiledCamera with RGB output exists in the scene. + if self._find_video_camera() is not None: + return self._render_tiled_camera_rgb_array() if not self.sim.has_gui and not self.sim.has_offscreen_render: raise RuntimeError( - f"Cannot render '{self.render_mode}' - no GUI and offscreen rendering not enabled." - " If running headless, make sure --enable_cameras is set." + "Cannot render 'rgb_array': no TiledCamera sensor with RGB output was found in" + " the scene, and neither GUI nor offscreen rendering is available." + " Add a TiledCamera sensor to the scene configuration to enable video recording." ) - # create the annotator if it does not exist + # Kit-based fallback: use an omni.replicator annotator on the viewer camera. 
if not hasattr(self, "_rgb_annotator"): import omni.replicator.core as rep @@ -542,7 +547,6 @@ def render(self, recompute: bool = False) -> np.ndarray | None: rgb_data = self._rgb_annotator.get_data() # convert to numpy array rgb_data = np.frombuffer(rgb_data, dtype=np.uint8).reshape(*rgb_data.shape) - # return the rgb data # note: initially the renderer is warming up and returns empty data if rgb_data.size == 0: return np.zeros((self.cfg.viewer.resolution[1], self.cfg.viewer.resolution[0], 3), dtype=np.uint8) @@ -553,6 +557,52 @@ def render(self, recompute: bool = False) -> np.ndarray | None: f"Render mode '{self.render_mode}' is not supported. Please use: {self.metadata['render_modes']}." ) + def _find_video_camera(self): + """ + Locates and caches the first TiledCamera sensor with RGB output. + Previously used the omni.replicator viewer camera which had RGB output. + Returns ``None`` if absent. + """ + if not hasattr(self, "_video_camera"): + from isaaclab.sensors.camera import TiledCamera + + self._video_camera = None + for sensor in self.scene.sensors.values(): + if isinstance(sensor, TiledCamera): + output = sensor.data.output + if "rgb" in output or "rgba" in output: + self._video_camera = sensor + break + return self._video_camera + + def _render_tiled_camera_rgb_array(self) -> np.ndarray: + """Return a square tile-grid of RGB frames from the scene's TiledCamera. + + Create a square grid of tiles. This method reads directly from the + TiledCamera sensor buffer to generate the tiles. + + Returns: + RGB image of shape ``(G*H, G*W, 3)`` and dtype ``uint8``, where + ``G = ceil(sqrt(num_envs))`` and ``(H, W)`` is the per-tile resolution. 
+ """ + output = self._video_camera.data.output + # shape: [num_envs, H, W, 3], uint8 + rgb_all = output["rgb"] if "rgb" in output else output["rgba"][..., :3] + + n_envs = int(rgb_all.shape[0]) + grid_size = math.ceil(math.sqrt(n_envs)) + n_slots = grid_size * grid_size + tiles = rgb_all.cpu().numpy() # [n_envs, H, W, 3] + H, W = tiles.shape[1], tiles.shape[2] + # Pad unused slots with black to fill the square grid. + pad = n_slots - n_envs + if pad > 0: + tiles = np.concatenate([tiles, np.zeros((pad, H, W, 3), dtype=tiles.dtype)], axis=0) + # [grid_size, grid_size, H, W, 3] → [grid_size*H, grid_size*W, 3] + grid = tiles.reshape(grid_size, grid_size, H, W, 3) + grid = grid.transpose(0, 2, 1, 3, 4) # [grid_size, H, grid_size, W, 3] + return grid.reshape(grid_size * H, grid_size * W, 3) + def close(self): """Cleanup for the environment.""" if not self._is_closed: diff --git a/source/isaaclab/isaaclab/envs/direct_rl_env.py b/source/isaaclab/isaaclab/envs/direct_rl_env.py index b362ac72bc2..013b9281ac0 100644 --- a/source/isaaclab/isaaclab/envs/direct_rl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_rl_env.py @@ -489,13 +489,18 @@ def render(self, recompute: bool = False) -> np.ndarray | None: if self.render_mode == "human" or self.render_mode is None: return None elif self.render_mode == "rgb_array": - # check that if any render could have happened + # Prefer TiledCamera when available — works for all backends (kitless and Kit-based) + # and produces consistent, scene-content frames. Fall back to the omni.replicator + # viewer-camera path only when no TiledCamera with RGB output exists in the scene. + if self._find_video_camera() is not None: + return self._render_tiled_camera_rgb_array() if not self.sim.has_gui and not self.sim.has_offscreen_render: raise RuntimeError( - f"Cannot render '{self.render_mode}' - no GUI and offscreen rendering not enabled." - " If running headless, make sure --enable_cameras is set." 
+ "Cannot render 'rgb_array': no TiledCamera sensor with RGB output was found in" + " the scene, and neither GUI nor offscreen rendering is available." + " Add a TiledCamera sensor to the scene configuration to enable video recording." ) - # create the annotator if it does not exist + # Kit-based fallback: use an omni.replicator annotator on the viewer camera. if not hasattr(self, "_rgb_annotator"): import omni.replicator.core as rep @@ -510,8 +515,7 @@ def render(self, recompute: bool = False) -> np.ndarray | None: rgb_data = self._rgb_annotator.get_data() # convert to numpy array rgb_data = np.frombuffer(rgb_data, dtype=np.uint8).reshape(*rgb_data.shape) - # return the rgb data - # note: initially the renerer is warming up and returns empty data + # note: initially the renderer is warming up and returns empty data if rgb_data.size == 0: return np.zeros((self.cfg.viewer.resolution[1], self.cfg.viewer.resolution[0], 3), dtype=np.uint8) else: @@ -521,6 +525,52 @@ def render(self, recompute: bool = False) -> np.ndarray | None: f"Render mode '{self.render_mode}' is not supported. Please use: {self.metadata['render_modes']}." ) + def _find_video_camera(self): + """ + Locates and caches the first TiledCamera sensor with RGB output. + Previously used the omni.replicator viewer camera which had RGB output. + Returns ``None`` if absent. + """ + if not hasattr(self, "_video_camera"): + from isaaclab.sensors.camera import TiledCamera + + self._video_camera = None + for sensor in self.scene.sensors.values(): + if isinstance(sensor, TiledCamera): + output = sensor.data.output + if "rgb" in output or "rgba" in output: + self._video_camera = sensor + break + return self._video_camera + + def _render_tiled_camera_rgb_array(self) -> np.ndarray: + """Return a square tile-grid of RGB frames from the scene's TiledCamera. + + Create a square grid of tiles. This method reads directly from the + TiledCamera sensor buffer to generate the tiles. 
+ + Returns: + RGB image of shape ``(G*H, G*W, 3)`` and dtype ``uint8``, where + ``G = ceil(sqrt(num_envs))`` and ``(H, W)`` is the per-tile resolution. + """ + output = self._video_camera.data.output + # shape: [num_envs, H, W, 3], uint8 + rgb_all = output["rgb"] if "rgb" in output else output["rgba"][..., :3] + + n_envs = int(rgb_all.shape[0]) + grid_size = math.ceil(math.sqrt(n_envs)) + n_slots = grid_size * grid_size + tiles = rgb_all.cpu().numpy() # [n_envs, H, W, 3] + H, W = tiles.shape[1], tiles.shape[2] + # Pad unused slots with black to fill the square grid. + pad = n_slots - n_envs + if pad > 0: + tiles = np.concatenate([tiles, np.zeros((pad, H, W, 3), dtype=tiles.dtype)], axis=0) + # [grid_size, grid_size, H, W, 3] → [grid_size*H, grid_size*W, 3] + grid = tiles.reshape(grid_size, grid_size, H, W, 3) + grid = grid.transpose(0, 2, 1, 3, 4) # [grid_size, H, grid_size, W, 3] + return grid.reshape(grid_size * H, grid_size * W, 3) + def close(self): """Cleanup for the environment.""" if not self._is_closed: diff --git a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py index d08b7e3be3a..30ac1ea88d2 100644 --- a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py +++ b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py @@ -270,15 +270,18 @@ def render(self, recompute: bool = False) -> np.ndarray | None: if self.render_mode == "human" or self.render_mode is None: return None elif self.render_mode == "rgb_array": - # check that if any render could have happened - # Check for GUI, offscreen rendering, or visualizers - has_visualizers = bool(self.sim.get_setting("/isaaclab/visualizer")) - if not (self.sim.has_gui or self.sim.has_offscreen_render or has_visualizers): + # Prefer TiledCamera when available — works for all backends (kitless and Kit-based) + # and produces consistent, scene-content frames. 
Fall back to the omni.replicator + # viewer-camera path only when no TiledCamera with RGB output exists in the scene. + if self._find_video_camera() is not None: + return self._render_tiled_camera_rgb_array() + if not self.sim.has_gui and not self.sim.has_offscreen_render: raise RuntimeError( - f"Cannot render '{self.render_mode}' - no GUI and offscreen rendering not enabled." - " If running headless, make sure --enable_cameras is set." + "Cannot render 'rgb_array': no TiledCamera sensor with RGB output was found in" + " the scene, and neither GUI nor offscreen rendering is available." + " Add a TiledCamera sensor to the scene configuration to enable video recording." ) - # create the annotator if it does not exist + # Kit-based fallback: use an omni.replicator annotator on the viewer camera. if not hasattr(self, "_rgb_annotator"): import omni.replicator.core as rep @@ -293,8 +296,7 @@ def render(self, recompute: bool = False) -> np.ndarray | None: rgb_data = self._rgb_annotator.get_data() # convert to numpy array rgb_data = np.frombuffer(rgb_data, dtype=np.uint8).reshape(*rgb_data.shape) - # return the rgb data - # note: initially the renerer is warming up and returns empty data + # note: initially the renderer is warming up and returns empty data if rgb_data.size == 0: return np.zeros((self.cfg.viewer.resolution[1], self.cfg.viewer.resolution[0], 3), dtype=np.uint8) else: @@ -304,6 +306,52 @@ def render(self, recompute: bool = False) -> np.ndarray | None: f"Render mode '{self.render_mode}' is not supported. Please use: {self.metadata['render_modes']}." ) + def _find_video_camera(self): + """ + Locates and caches the first TiledCamera sensor with RGB output. + Previously used the omni.replicator viewer camera which had RGB output. + Returns ``None`` if absent. 
+ """ + if not hasattr(self, "_video_camera"): + from isaaclab.sensors.camera import TiledCamera + + self._video_camera = None + for sensor in self.scene.sensors.values(): + if isinstance(sensor, TiledCamera): + output = sensor.data.output + if "rgb" in output or "rgba" in output: + self._video_camera = sensor + break + return self._video_camera + + def _render_tiled_camera_rgb_array(self) -> np.ndarray: + """Return a square tile-grid of RGB frames from the scene's TiledCamera. + + Create a square grid of tiles. This method reads directly from the + TiledCamera sensor buffer to generate the tiles. + + Returns: + RGB image of shape ``(G*H, G*W, 3)`` and dtype ``uint8``, where + ``G = ceil(sqrt(num_envs))`` and ``(H, W)`` is the per-tile resolution. + """ + output = self._video_camera.data.output + # shape: [num_envs, H, W, 3], uint8 + rgb_all = output["rgb"] if "rgb" in output else output["rgba"][..., :3] + + n_envs = int(rgb_all.shape[0]) + grid_size = math.ceil(math.sqrt(n_envs)) + n_slots = grid_size * grid_size + tiles = rgb_all.cpu().numpy() # [n_envs, H, W, 3] + H, W = tiles.shape[1], tiles.shape[2] + # Pad unused slots with black to fill the square grid. 
+ pad = n_slots - n_envs + if pad > 0: + tiles = np.concatenate([tiles, np.zeros((pad, H, W, 3), dtype=tiles.dtype)], axis=0) + # [grid_size, grid_size, H, W, 3] → [grid_size*H, grid_size*W, 3] + grid = tiles.reshape(grid_size, grid_size, H, W, 3) + grid = grid.transpose(0, 2, 1, 3, 4) # [grid_size, H, grid_size, W, 3] + return grid.reshape(grid_size * H, grid_size * W, 3) + def close(self): if not self._is_closed: # destructor is order-sensitive From b628c4d36f2fc180e5c54fa3fb16b7c9b48fe6b8 Mon Sep 17 00:00:00 2001 From: Brian Dilinila Date: Wed, 11 Mar 2026 14:44:33 -0700 Subject: [PATCH 02/11] Refactor video recording into VideoRecorder class with caching and contiguous memory --- .../isaaclab/isaaclab/envs/direct_marl_env.py | 62 +++------- .../isaaclab/envs/direct_marl_env_cfg.py | 14 +++ .../isaaclab/isaaclab/envs/direct_rl_env.py | 62 +++------- .../isaaclab/envs/direct_rl_env_cfg.py | 14 +++ .../isaaclab/envs/manager_based_env_cfg.py | 14 +++ .../isaaclab/envs/manager_based_rl_env.py | 63 +++------- .../isaaclab/envs/utils/video_recorder.py | 115 ++++++++++++++++++ .../isaaclab/envs/utils/video_recorder_cfg.py | 44 +++++++ 8 files changed, 244 insertions(+), 144 deletions(-) create mode 100644 source/isaaclab/isaaclab/envs/utils/video_recorder.py create mode 100644 source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py diff --git a/source/isaaclab/isaaclab/envs/direct_marl_env.py b/source/isaaclab/isaaclab/envs/direct_marl_env.py index 7994a1a5b88..284ed815ab6 100644 --- a/source/isaaclab/isaaclab/envs/direct_marl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_marl_env.py @@ -35,6 +35,8 @@ from .common import ActionType, AgentID, EnvStepReturn, ObsType, StateType from .direct_marl_env_cfg import DirectMARLEnvCfg from .ui import ViewportCameraController +from .utils.video_recorder import VideoRecorder +from .utils.video_recorder_cfg import VideoRecorderCfg from .utils.spaces import sample_space, spec_to_gym_space # import logger @@ -226,6 +228,14 @@ 
def _init_sim(self, render_mode: str | None = None, **kwargs): if noise_model is not None } + # instantiate the viewport recorder for rgb_array video capture + if self.cfg.video_recorder is not None: + self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( + self.cfg.video_recorder, self.scene + ) + else: + self.video_recorder = None + # perform events at the start of the simulation if self.cfg.events: # we print it here to make the logging consistent @@ -524,8 +534,10 @@ def render(self, recompute: bool = False) -> np.ndarray | None: # Prefer TiledCamera when available — works for all backends (kitless and Kit-based) # and produces consistent, scene-content frames. Fall back to the omni.replicator # viewer-camera path only when no TiledCamera with RGB output exists in the scene. - if self._find_video_camera() is not None: - return self._render_tiled_camera_rgb_array() + if self.video_recorder is not None: + frame = self.video_recorder.render_rgb_array() + if frame is not None: + return frame if not self.sim.has_gui and not self.sim.has_offscreen_render: raise RuntimeError( "Cannot render 'rgb_array': no TiledCamera sensor with RGB output was found in" @@ -557,52 +569,6 @@ def render(self, recompute: bool = False) -> np.ndarray | None: f"Render mode '{self.render_mode}' is not supported. Please use: {self.metadata['render_modes']}." ) - def _find_video_camera(self): - """ - Locates and caches the first TiledCamera sensor with RGB output. - Previously used the omni.replicator viewer camera which had RGB output. - Returns ``None`` if absent. 
- """ - if not hasattr(self, "_video_camera"): - from isaaclab.sensors.camera import TiledCamera - - self._video_camera = None - for sensor in self.scene.sensors.values(): - if isinstance(sensor, TiledCamera): - output = sensor.data.output - if "rgb" in output or "rgba" in output: - self._video_camera = sensor - break - return self._video_camera - - def _render_tiled_camera_rgb_array(self) -> np.ndarray: - """Return a square tile-grid of RGB frames from the scene's TiledCamera. - - Create a square grid of tiles. This method reads directly from the - TiledCamera sensor buffer to generate the tiles. - - Returns: - RGB image of shape ``(G*H, G*W, 3)`` and dtype ``uint8``, where - ``G = ceil(sqrt(num_envs))`` and ``(H, W)`` is the per-tile resolution. - """ - output = self._video_camera.data.output - # shape: [num_envs, H, W, 3], uint8 - rgb_all = output["rgb"] if "rgb" in output else output["rgba"][..., :3] - - n_envs = int(rgb_all.shape[0]) - grid_size = math.ceil(math.sqrt(n_envs)) - n_slots = grid_size * grid_size - tiles = rgb_all.cpu().numpy() # [n_envs, H, W, 3] - H, W = tiles.shape[1], tiles.shape[2] - # Pad unused slots with black to fill the square grid. 
- pad = n_slots - n_envs - if pad > 0: - tiles = np.concatenate([tiles, np.zeros((pad, H, W, 3), dtype=tiles.dtype)], axis=0) - # [grid_size, grid_size, H, W, 3] → [grid_size*H, grid_size*W, 3] - grid = tiles.reshape(grid_size, grid_size, H, W, 3) - grid = grid.transpose(0, 2, 1, 3, 4) # [grid_size, H, grid_size, W, 3] - return grid.reshape(grid_size * H, grid_size * W, 3) - def close(self): """Cleanup for the environment.""" if not self._is_closed: diff --git a/source/isaaclab/isaaclab/envs/direct_marl_env_cfg.py b/source/isaaclab/isaaclab/envs/direct_marl_env_cfg.py index b22a6169d7a..962c1ffb99b 100644 --- a/source/isaaclab/isaaclab/envs/direct_marl_env_cfg.py +++ b/source/isaaclab/isaaclab/envs/direct_marl_env_cfg.py @@ -17,6 +17,7 @@ from isaaclab.utils.noise import NoiseModelCfg from .common import AgentID, SpaceType, ViewerCfg +from .utils.video_recorder_cfg import VideoRecorderCfg @configclass @@ -234,3 +235,16 @@ class DirectMARLEnvCfg: log_dir: str | None = None """Directory for logging experiment artifacts. Defaults to None, in which case no specific log directory is set.""" + + video_recorder: VideoRecorderCfg = VideoRecorderCfg() + """Configuration for the viewport recorder used when ``render_mode="rgb_array"``. + + Defaults to a :class:`~isaaclab.envs.VideoRecorderCfg` that captures all environments + in a square tile-grid using :class:`~isaaclab.envs.VideoRecorder`. + + Set :attr:`~isaaclab.envs.VideoRecorderCfg.class_type` to a custom subclass to swap the + capture implementation without modifying environment code. Set to ``None`` to disable + TiledCamera-based recording entirely and fall back to the Kit-based omni.replicator path. 
+ + CLI example: ``env.video_recorder.video_num_tiles=9`` + """ diff --git a/source/isaaclab/isaaclab/envs/direct_rl_env.py b/source/isaaclab/isaaclab/envs/direct_rl_env.py index 013b9281ac0..af4a800b691 100644 --- a/source/isaaclab/isaaclab/envs/direct_rl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_rl_env.py @@ -32,6 +32,8 @@ from .common import VecEnvObs, VecEnvStepReturn from .direct_rl_env_cfg import DirectRLEnvCfg from .ui import ViewportCameraController +from .utils.video_recorder import VideoRecorder +from .utils.video_recorder_cfg import VideoRecorderCfg from .utils.spaces import sample_space, spec_to_gym_space if has_kit(): @@ -229,6 +231,14 @@ def _init_sim(self, render_mode: str | None = None, **kwargs): self.cfg.observation_noise_model, num_envs=self.num_envs, device=self.device ) + # instantiate the viewport recorder for rgb_array video capture + if self.cfg.video_recorder is not None: + self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( + self.cfg.video_recorder, self.scene + ) + else: + self.video_recorder = None + # perform events at the start of the simulation if self.cfg.events: # we print it here to make the logging consistent @@ -492,8 +502,10 @@ def render(self, recompute: bool = False) -> np.ndarray | None: # Prefer TiledCamera when available — works for all backends (kitless and Kit-based) # and produces consistent, scene-content frames. Fall back to the omni.replicator # viewer-camera path only when no TiledCamera with RGB output exists in the scene. 
- if self._find_video_camera() is not None: - return self._render_tiled_camera_rgb_array() + if self.video_recorder is not None: + frame = self.video_recorder.render_rgb_array() + if frame is not None: + return frame if not self.sim.has_gui and not self.sim.has_offscreen_render: raise RuntimeError( "Cannot render 'rgb_array': no TiledCamera sensor with RGB output was found in" @@ -525,52 +537,6 @@ def render(self, recompute: bool = False) -> np.ndarray | None: f"Render mode '{self.render_mode}' is not supported. Please use: {self.metadata['render_modes']}." ) - def _find_video_camera(self): - """ - Locates and caches the first TiledCamera sensor with RGB output. - Previously used the omni.replicator viewer camera which had RGB output. - Returns ``None`` if absent. - """ - if not hasattr(self, "_video_camera"): - from isaaclab.sensors.camera import TiledCamera - - self._video_camera = None - for sensor in self.scene.sensors.values(): - if isinstance(sensor, TiledCamera): - output = sensor.data.output - if "rgb" in output or "rgba" in output: - self._video_camera = sensor - break - return self._video_camera - - def _render_tiled_camera_rgb_array(self) -> np.ndarray: - """Return a square tile-grid of RGB frames from the scene's TiledCamera. - - Create a square grid of tiles. This method reads directly from the - TiledCamera sensor buffer to generate the tiles. - - Returns: - RGB image of shape ``(G*H, G*W, 3)`` and dtype ``uint8``, where - ``G = ceil(sqrt(num_envs))`` and ``(H, W)`` is the per-tile resolution. - """ - output = self._video_camera.data.output - # shape: [num_envs, H, W, 3], uint8 - rgb_all = output["rgb"] if "rgb" in output else output["rgba"][..., :3] - - n_envs = int(rgb_all.shape[0]) - grid_size = math.ceil(math.sqrt(n_envs)) - n_slots = grid_size * grid_size - tiles = rgb_all.cpu().numpy() # [n_envs, H, W, 3] - H, W = tiles.shape[1], tiles.shape[2] - # Pad unused slots with black to fill the square grid. 
- pad = n_slots - n_envs - if pad > 0: - tiles = np.concatenate([tiles, np.zeros((pad, H, W, 3), dtype=tiles.dtype)], axis=0) - # [grid_size, grid_size, H, W, 3] → [grid_size*H, grid_size*W, 3] - grid = tiles.reshape(grid_size, grid_size, H, W, 3) - grid = grid.transpose(0, 2, 1, 3, 4) # [grid_size, H, grid_size, W, 3] - return grid.reshape(grid_size * H, grid_size * W, 3) - def close(self): """Cleanup for the environment.""" if not self._is_closed: diff --git a/source/isaaclab/isaaclab/envs/direct_rl_env_cfg.py b/source/isaaclab/isaaclab/envs/direct_rl_env_cfg.py index fd40b3104c2..c7c11bdb2e9 100644 --- a/source/isaaclab/isaaclab/envs/direct_rl_env_cfg.py +++ b/source/isaaclab/isaaclab/envs/direct_rl_env_cfg.py @@ -16,6 +16,7 @@ from isaaclab.utils.noise import NoiseModelCfg from .common import SpaceType, ViewerCfg +from .utils.video_recorder_cfg import VideoRecorderCfg @configclass @@ -254,3 +255,16 @@ class DirectRLEnvCfg: log_dir: str | None = None """Directory for logging experiment artifacts. Defaults to None, in which case no specific log directory is set.""" + + video_recorder: VideoRecorderCfg = VideoRecorderCfg() + """Configuration for the viewport recorder used when ``render_mode="rgb_array"``. + + Defaults to a :class:`~isaaclab.envs.VideoRecorderCfg` that captures all environments + in a square tile-grid using :class:`~isaaclab.envs.VideoRecorder`. + + Set :attr:`~isaaclab.envs.VideoRecorderCfg.class_type` to a custom subclass to swap the + capture implementation without modifying environment code. Set to ``None`` to disable + TiledCamera-based recording entirely and fall back to the Kit-based omni.replicator path. 
+ + CLI example: ``env.video_recorder.video_num_tiles=9`` + """ diff --git a/source/isaaclab/isaaclab/envs/manager_based_env_cfg.py b/source/isaaclab/isaaclab/envs/manager_based_env_cfg.py index 24a88d5e72c..b231d278e44 100644 --- a/source/isaaclab/isaaclab/envs/manager_based_env_cfg.py +++ b/source/isaaclab/isaaclab/envs/manager_based_env_cfg.py @@ -26,6 +26,7 @@ from isaaclab.utils import configclass from .common import ViewerCfg +from .utils.video_recorder_cfg import VideoRecorderCfg @configclass @@ -163,3 +164,16 @@ class ManagerBasedEnvCfg: log_dir: str | None = None """Directory for logging experiment artifacts. Defaults to None, in which case no specific log directory is set.""" + + video_recorder: VideoRecorderCfg = VideoRecorderCfg() + """Configuration for the viewport recorder used when ``render_mode="rgb_array"``. + + Defaults to a :class:`~isaaclab.envs.VideoRecorderCfg` that captures all environments + in a square tile-grid using :class:`~isaaclab.envs.VideoRecorder`. + + Set :attr:`~isaaclab.envs.VideoRecorderCfg.class_type` to a custom subclass to swap the + capture implementation without modifying environment code. Set to ``None`` to disable + TiledCamera-based recording entirely and fall back to the Kit-based omni.replicator path. 
+ + CLI example: ``env.video_recorder.video_num_tiles=9`` + """ diff --git a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py index 30ac1ea88d2..e2b5635b0f9 100644 --- a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py +++ b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py @@ -20,6 +20,8 @@ from .common import VecEnvStepReturn from .manager_based_env import ManagerBasedEnv from .manager_based_rl_env_cfg import ManagerBasedRLEnvCfg +from .utils.video_recorder import VideoRecorder +from .utils.video_recorder_cfg import VideoRecorderCfg class ManagerBasedRLEnv(ManagerBasedEnv, gym.Env): @@ -86,6 +88,15 @@ def __init__(self, cfg: ManagerBasedRLEnvCfg, render_mode: str | None = None, ** # produced video matches the simulation self.metadata["render_fps"] = 1 / self.step_dt self.has_rtx_sensors = self.sim.get_setting("/isaaclab/render/rtx_sensors") + + # instantiate the viewport recorder for rgb_array video capture + if self.cfg.video_recorder is not None: + self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( + self.cfg.video_recorder, self.scene + ) + else: + self.video_recorder = None + print("[INFO]: Completed setting up the environment...") """ @@ -273,8 +284,10 @@ def render(self, recompute: bool = False) -> np.ndarray | None: # Prefer TiledCamera when available — works for all backends (kitless and Kit-based) # and produces consistent, scene-content frames. Fall back to the omni.replicator # viewer-camera path only when no TiledCamera with RGB output exists in the scene. 
- if self._find_video_camera() is not None: - return self._render_tiled_camera_rgb_array() + if self.video_recorder is not None: + frame = self.video_recorder.render_rgb_array() + if frame is not None: + return frame if not self.sim.has_gui and not self.sim.has_offscreen_render: raise RuntimeError( "Cannot render 'rgb_array': no TiledCamera sensor with RGB output was found in" @@ -306,52 +319,6 @@ def render(self, recompute: bool = False) -> np.ndarray | None: f"Render mode '{self.render_mode}' is not supported. Please use: {self.metadata['render_modes']}." ) - def _find_video_camera(self): - """ - Locates and caches the first TiledCamera sensor with RGB output. - Previously used the omni.replicator viewer camera which had RGB output. - Returns ``None`` if absent. - """ - if not hasattr(self, "_video_camera"): - from isaaclab.sensors.camera import TiledCamera - - self._video_camera = None - for sensor in self.scene.sensors.values(): - if isinstance(sensor, TiledCamera): - output = sensor.data.output - if "rgb" in output or "rgba" in output: - self._video_camera = sensor - break - return self._video_camera - - def _render_tiled_camera_rgb_array(self) -> np.ndarray: - """Return a square tile-grid of RGB frames from the scene's TiledCamera. - - Create a square grid of tiles. This method reads directly from the - TiledCamera sensor buffer to generate the tiles. - - Returns: - RGB image of shape ``(G*H, G*W, 3)`` and dtype ``uint8``, where - ``G = ceil(sqrt(num_envs))`` and ``(H, W)`` is the per-tile resolution. - """ - output = self._video_camera.data.output - # shape: [num_envs, H, W, 3], uint8 - rgb_all = output["rgb"] if "rgb" in output else output["rgba"][..., :3] - - n_envs = int(rgb_all.shape[0]) - grid_size = math.ceil(math.sqrt(n_envs)) - n_slots = grid_size * grid_size - tiles = rgb_all.cpu().numpy() # [n_envs, H, W, 3] - H, W = tiles.shape[1], tiles.shape[2] - # Pad unused slots with black to fill the square grid. 
- pad = n_slots - n_envs - if pad > 0: - tiles = np.concatenate([tiles, np.zeros((pad, H, W, 3), dtype=tiles.dtype)], axis=0) - # [grid_size, grid_size, H, W, 3] → [grid_size*H, grid_size*W, 3] - grid = tiles.reshape(grid_size, grid_size, H, W, 3) - grid = grid.transpose(0, 2, 1, 3, 4) # [grid_size, H, grid_size, W, 3] - return grid.reshape(grid_size * H, grid_size * W, 3) - def close(self): if not self._is_closed: # destructor is order-sensitive diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder.py b/source/isaaclab/isaaclab/envs/utils/video_recorder.py new file mode 100644 index 00000000000..d328f7c374b --- /dev/null +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder.py @@ -0,0 +1,115 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Viewport recorder for capturing video frames from a :class:`~isaaclab.sensors.camera.TiledCamera`.""" + +from __future__ import annotations + +import math +from typing import TYPE_CHECKING + +import numpy as np + +if TYPE_CHECKING: + from isaaclab.scene import InteractiveScene + from .video_recorder_cfg import VideoRecorderCfg + + +class VideoRecorder: + """Records video frames from the scene's :class:`~isaaclab.sensors.camera.TiledCamera`. + + On the first :meth:`render_rgb_array` call this class searches the scene for the first + ``TiledCamera`` sensor with ``"rgb"`` or ``"rgba"`` output and caches the camera reference + together with all grid-layout constants so subsequent calls are allocation-free (except for + the unavoidable GPU-to-CPU transfer and the final tile-stitch reshape). + + The default implementation reads *all* ``num_envs`` frames from the TiledCamera buffer on + the GPU and slices the first ``cfg.video_num_tiles`` on the CPU (Option A). 
Swap + ``cfg.class_type`` for a custom subclass to change this behaviour without touching any + environment code. + + Args: + cfg: Configuration for this recorder. + scene: The interactive scene that owns the sensors. + """ + + def __init__(self, cfg: VideoRecorderCfg, scene: InteractiveScene): + self.cfg = cfg + self._scene = scene + + def render_rgb_array(self) -> np.ndarray | None: + """Return a square tile-grid RGB frame, or ``None`` if no suitable camera exists. + + Returns: + RGB image of shape ``(G*H, G*W, 3)`` and dtype ``uint8``, where + ``G = ceil(sqrt(video_num_tiles))`` and ``(H, W)`` is the per-tile resolution, + or ``None`` when no :class:`~isaaclab.sensors.camera.TiledCamera` with RGB output + is present in the scene. + """ + if self._find_video_camera() is None: + return None + return self._render_tiled_camera_rgb_array() + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def _find_video_camera(self): + """ + Locates and caches the first TiledCamera sensor with RGB output. + Previously used the omni.replicator viewer camera which had RGB output. + Returns ``None`` if absent. + """ + if not hasattr(self, "_video_camera"): + from isaaclab.sensors.camera import TiledCamera + + self._video_camera = None + for sensor in self._scene.sensors.values(): + if isinstance(sensor, TiledCamera): + output = sensor.data.output + if "rgb" in output or "rgba" in output: + self._video_camera = sensor + self._video_rgb_key = "rgb" if "rgb" in output else "rgba" + # Cache all grid constants — these are fixed for the lifetime of the env. 
+ n_total = int(sensor.data.output[self._video_rgb_key].shape[0]) + n_envs = n_total if self.cfg.video_num_tiles < 0 else min(self.cfg.video_num_tiles, n_total) + self._video_n_envs = n_envs + self._video_grid_size = math.ceil(math.sqrt(n_envs)) + n_slots = self._video_grid_size * self._video_grid_size + H = int(sensor.data.output[self._video_rgb_key].shape[1]) + W = int(sensor.data.output[self._video_rgb_key].shape[2]) + self._video_H = H + self._video_W = W + # Pre-allocate the black padding block (zero-copy when pad == 0). + pad = n_slots - n_envs + self._video_pad = np.zeros((pad, H, W, 3), dtype=np.uint8) if pad > 0 else None + break + return self._video_camera + + def _render_tiled_camera_rgb_array(self) -> np.ndarray: + """Return a square tile-grid of RGB frames from the scene's TiledCamera. + + Create a square grid of tiles. This method reads directly from the + TiledCamera sensor buffer to generate the tiles. + + Returns: + RGB image of shape ``(G*H, G*W, 3)`` and dtype ``uint8``, where + ``G = ceil(sqrt(num_envs))`` and ``(H, W)`` is the per-tile resolution. + """ + rgb_all = self._video_camera.data.output[self._video_rgb_key] + # Drop alpha channel once on GPU before the CPU transfer. + if self._video_rgb_key == "rgba": + rgb_all = rgb_all[..., :3] + + # .contiguous() ensures the reshape below returns a zero-copy view. + tiles = rgb_all[: self._video_n_envs].contiguous().cpu().numpy() # [n_envs, H, W, 3] + if self._video_pad is not None: + tiles = np.concatenate([tiles, self._video_pad], axis=0) + # [grid_size, grid_size, H, W, 3] → [grid_size*H, grid_size*W, 3] + g, H, W = self._video_grid_size, self._video_H, self._video_W + grid = tiles.reshape(g, g, H, W, 3) + grid = grid.transpose(0, 2, 1, 3, 4) + # after transpose the strides are non-standard; reshape must copy here. 
+ return grid.reshape(g * H, g * W, 3) diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py b/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py new file mode 100644 index 00000000000..0d9b6287728 --- /dev/null +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Configuration for :class:`~isaaclab.envs.utils.video_recorder.VideoRecorder`.""" + +from __future__ import annotations + +from isaaclab.utils import configclass + +from .video_recorder import VideoRecorder + + +@configclass +class VideoRecorderCfg: + """Configuration for :class:`~isaaclab.envs.utils.video_recorder.VideoRecorder`. + + Set :attr:`class_type` to a custom subclass of + :class:`~isaaclab.envs.utils.video_recorder.VideoRecorder` to swap the + video-capture implementation (e.g. an Option-B pipeline that only renders + ``video_num_tiles`` cameras on the GPU) without modifying any environment code. + """ + + class_type: type = VideoRecorder + """The recorder class to instantiate. Must accept ``(cfg, scene)`` as constructor arguments. + Defaults to :class:`~isaaclab.envs.utils.video_recorder.VideoRecorder`. + """ + + video_num_tiles: int = -1 + """Number of environment tiles to include in each video frame when using ``render_mode="rgb_array"``. + Defaults to -1, which renders all environments. + + Environments are arranged into a square grid of size + ``ceil(sqrt(video_num_tiles)) * ceil(sqrt(video_num_tiles))``, with unused slots filled with + black. 
For example: + + * ``-1``: all environments (default) + * ``1``: single environment (1*1) + * ``4``: first 4 environments (2*2 grid) + * ``9``: first 9 environments (3*3 grid) + + CLI example: ``env.video_recorder.video_num_tiles=9`` + """ From 556154273c9cd4ed3604c941960b1bf4a31ff996 Mon Sep 17 00:00:00 2001 From: Brian Dilinila Date: Thu, 12 Mar 2026 12:59:34 -0700 Subject: [PATCH 03/11] tiled camera-based video recording functionality --- .../isaaclab/isaaclab/envs/direct_marl_env.py | 20 ++- .../isaaclab/isaaclab/envs/direct_rl_env.py | 20 ++- .../isaaclab/envs/manager_based_env.py | 11 ++ .../isaaclab/envs/manager_based_rl_env.py | 14 +- .../isaaclab/envs/utils/video_recorder.py | 139 +++++++++++++++--- .../isaaclab/envs/utils/video_recorder_cfg.py | 73 +++++++++ 6 files changed, 232 insertions(+), 45 deletions(-) diff --git a/source/isaaclab/isaaclab/envs/direct_marl_env.py b/source/isaaclab/isaaclab/envs/direct_marl_env.py index 284ed815ab6..9eebdc89e18 100644 --- a/source/isaaclab/isaaclab/envs/direct_marl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_marl_env.py @@ -170,6 +170,18 @@ def _init_sim(self, render_mode: str | None = None, **kwargs): if "prestartup" in self.event_manager.available_modes: self.event_manager.apply(mode="prestartup") + # Instantiate the video recorder before sim.reset() so that any fallback TiledCamera + # (used for state-based envs without an observation camera) is spawned into the USD + # stage and registered for the PHYSICS_READY callback before physics initialises. + # Forward render_mode so VideoRecorder only spawns fallback cameras when --video is active. 
+ if self.cfg.video_recorder is not None: + self.cfg.video_recorder.render_mode = render_mode + self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( + self.cfg.video_recorder, self.scene + ) + else: + self.video_recorder = None + # play the simulator to activate physics handles # note: this activates the physics simulation view that exposes TensorAPIs # note: when started in extension mode, first call sim.reset_async() and then initialize the managers @@ -228,14 +240,6 @@ def _init_sim(self, render_mode: str | None = None, **kwargs): if noise_model is not None } - # instantiate the viewport recorder for rgb_array video capture - if self.cfg.video_recorder is not None: - self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( - self.cfg.video_recorder, self.scene - ) - else: - self.video_recorder = None - # perform events at the start of the simulation if self.cfg.events: # we print it here to make the logging consistent diff --git a/source/isaaclab/isaaclab/envs/direct_rl_env.py b/source/isaaclab/isaaclab/envs/direct_rl_env.py index af4a800b691..e821bc92a6f 100644 --- a/source/isaaclab/isaaclab/envs/direct_rl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_rl_env.py @@ -175,6 +175,18 @@ def _init_sim(self, render_mode: str | None = None, **kwargs): if "prestartup" in self.event_manager.available_modes: self.event_manager.apply(mode="prestartup") + # Instantiate the video recorder before sim.reset() so that any fallback TiledCamera + # (used for state-based envs without an observation camera) is spawned into the USD + # stage and registered for the PHYSICS_READY callback before physics initialises. + # Forward render_mode so VideoRecorder only spawns fallback cameras when --video is active. 
+ if self.cfg.video_recorder is not None: + self.cfg.video_recorder.render_mode = render_mode + self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( + self.cfg.video_recorder, self.scene + ) + else: + self.video_recorder = None + # play the simulator to activate physics handles # note: this activates the physics simulation view that exposes TensorAPIs # note: when started in extension mode, first call sim.reset_async() and then initialize the managers @@ -231,14 +243,6 @@ def _init_sim(self, render_mode: str | None = None, **kwargs): self.cfg.observation_noise_model, num_envs=self.num_envs, device=self.device ) - # instantiate the viewport recorder for rgb_array video capture - if self.cfg.video_recorder is not None: - self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( - self.cfg.video_recorder, self.scene - ) - else: - self.video_recorder = None - # perform events at the start of the simulation if self.cfg.events: # we print it here to make the logging consistent diff --git a/source/isaaclab/isaaclab/envs/manager_based_env.py b/source/isaaclab/isaaclab/envs/manager_based_env.py index 33327dc0186..c47dbfd89a5 100644 --- a/source/isaaclab/isaaclab/envs/manager_based_env.py +++ b/source/isaaclab/isaaclab/envs/manager_based_env.py @@ -26,6 +26,7 @@ from .manager_based_env_cfg import ManagerBasedEnvCfg from .ui import ViewportCameraController from .utils.io_descriptors import export_articulations_data, export_scene_data +from .utils.video_recorder import VideoRecorder # import logger logger = logging.getLogger(__name__) @@ -182,6 +183,16 @@ def _init_sim(self): if "prestartup" in self.event_manager.available_modes: self.event_manager.apply(mode="prestartup") + # Instantiate the video recorder before sim.reset() so that any fallback TiledCamera + # (used for state-based envs without an observation camera) is spawned into the USD + # stage and registered for the PHYSICS_READY callback before physics initialises. 
+ if self.cfg.video_recorder is not None: + self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( + self.cfg.video_recorder, self.scene + ) + else: + self.video_recorder = None + # play the simulator to activate physics handles # note: this activates the physics simulation view that exposes TensorAPIs # note: when started in extension mode, first call sim.reset_async() and then initialize the managers diff --git a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py index e2b5635b0f9..fb5e586d6f3 100644 --- a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py +++ b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py @@ -21,7 +21,6 @@ from .manager_based_env import ManagerBasedEnv from .manager_based_rl_env_cfg import ManagerBasedRLEnvCfg from .utils.video_recorder import VideoRecorder -from .utils.video_recorder_cfg import VideoRecorderCfg class ManagerBasedRLEnv(ManagerBasedEnv, gym.Env): @@ -78,6 +77,11 @@ def __init__(self, cfg: ManagerBasedRLEnvCfg, render_mode: str | None = None, ** # initialize the episode length buffer BEFORE loading the managers to use it in mdp functions. self.episode_length_buf = torch.zeros(cfg.scene.num_envs, device=cfg.sim.device, dtype=torch.long) + # Forward render_mode to VideoRecorderCfg before super().__init__() creates VideoRecorder, + # so fallback cameras are only spawned when --video is active (render_mode="rgb_array"). + if cfg.video_recorder is not None: + cfg.video_recorder.render_mode = render_mode + # initialize the base class to setup the scene. 
super().__init__(cfg=cfg) # store the render mode @@ -89,14 +93,6 @@ def __init__(self, cfg: ManagerBasedRLEnvCfg, render_mode: str | None = None, ** self.metadata["render_fps"] = 1 / self.step_dt self.has_rtx_sensors = self.sim.get_setting("/isaaclab/render/rtx_sensors") - # instantiate the viewport recorder for rgb_array video capture - if self.cfg.video_recorder is not None: - self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( - self.cfg.video_recorder, self.scene - ) - else: - self.video_recorder = None - print("[INFO]: Completed setting up the environment...") """ diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder.py b/source/isaaclab/isaaclab/envs/utils/video_recorder.py index d328f7c374b..e0475223ecc 100644 --- a/source/isaaclab/isaaclab/envs/utils/video_recorder.py +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder.py @@ -30,6 +30,18 @@ class VideoRecorder: ``cfg.class_type`` for a custom subclass to change this behaviour without touching any environment code. + **Camera selection priority:** + + 1. An existing :class:`~isaaclab.sensors.camera.TiledCamera` found in the scene sensors + (vision-based env path — the observation camera is reused for free). + 2. A dedicated video camera grid instantiated from ``cfg.fallback_camera_cfg`` + (state-based env path — no observation camera exists, so one camera per environment + is spawned, up to ``cfg.video_num_tiles``). + + For the fallback cameras to be initialised correctly they **must** be created before + ``sim.reset()`` is called, so :class:`VideoRecorder` must be instantiated before + ``sim.reset()`` in the environment setup. The environment base classes handle this. + Args: cfg: Configuration for this recorder. scene: The interactive scene that owns the sensors. 
@@ -38,6 +50,14 @@ class VideoRecorder: def __init__(self, cfg: VideoRecorderCfg, scene: InteractiveScene): self.cfg = cfg self._scene = scene + self._fallback_tiled_camera = None + + # Spawn fallback cameras only when video recording is actually requested. + # cfg.render_mode is set to "rgb_array" by the env base class when --video is active + # (forwarded from the render_mode argument of gym.make / the env constructor). + # Gating here avoids GPU overhead in ordinary training runs that don't record video. + if cfg.fallback_camera_cfg is not None and cfg.render_mode == "rgb_array": + self._fallback_tiled_camera = self._spawn_fallback_cameras(cfg, scene) def render_rgb_array(self) -> np.ndarray | None: """Return a square tile-grid RGB frame, or ``None`` if no suitable camera exists. @@ -46,7 +66,7 @@ def render_rgb_array(self) -> np.ndarray | None: RGB image of shape ``(G*H, G*W, 3)`` and dtype ``uint8``, where ``G = ceil(sqrt(video_num_tiles))`` and ``(H, W)`` is the per-tile resolution, or ``None`` when no :class:`~isaaclab.sensors.camera.TiledCamera` with RGB output - is present in the scene. + is present in the scene or configured as a fallback. """ if self._find_video_camera() is None: return None @@ -56,48 +76,127 @@ def render_rgb_array(self) -> np.ndarray | None: # Internal helpers # ------------------------------------------------------------------ - def _find_video_camera(self): + @staticmethod + def _spawn_fallback_cameras(cfg: VideoRecorderCfg, scene: InteractiveScene): + """Spawn one video camera prim per environment (up to ``cfg.video_num_tiles``) and + return a single :class:`~isaaclab.sensors.camera.TiledCamera` covering all of them. + + Camera prims are spawned at ``/World/envs/env_{i}/VideoCamera`` for + ``i in range(n_cameras)``, then a ``TiledCamera`` with the regex prim path + ``/World/envs/env_.*/VideoCamera`` is created so that all spawned prims are + discovered and rendered as tiles. 
+ + This must be called **before** ``sim.reset()`` so the prims exist in the USD stage + and the ``TiledCamera`` can register for the ``PHYSICS_READY`` callback. """ - Locates and caches the first TiledCamera sensor with RGB output. - Previously used the omni.replicator viewer camera which had RGB output. - Returns ``None`` if absent. + import torch + + from isaaclab.sensors.camera import TiledCamera + from isaaclab.utils.math import convert_camera_frame_orientation_convention + + camera_cfg = cfg.fallback_camera_cfg + + # Pre-compute the OpenGL rotation offset (mirrors Camera.__init__ logic). + n_total_envs = scene.num_envs + rot = torch.tensor(camera_cfg.offset.rot, dtype=torch.float32, device="cpu").unsqueeze(0) + rot_offset = convert_camera_frame_orientation_convention( + rot, origin=camera_cfg.offset.convention, target="opengl" + ) + rot_offset = rot_offset.squeeze(0).cpu().numpy() + + # Ensure vertical_aperture is set before calling the spawn func. + spawn_cfg = camera_cfg.spawn + if spawn_cfg.vertical_aperture is None: + spawn_cfg = spawn_cfg.replace( + vertical_aperture=spawn_cfg.horizontal_aperture * camera_cfg.height / camera_cfg.width + ) + + # TiledCamera requires exactly one camera prim per environment (count == num_envs). + # We must therefore spawn cameras for ALL environments, not just video_num_tiles of them. + # The video_num_tiles limit is applied at render time in _render_tiled_camera_rgb_array, + # which only reads the first N tiles — the same behaviour as vision-based observation cameras. + for i in range(n_total_envs): + prim_path_i = f"/World/envs/env_{i}/VideoCamera" + spawn_cfg.func(prim_path_i, spawn_cfg, translation=camera_cfg.offset.pos, orientation=rot_offset) + + # Create one TiledCamera that discovers all spawned prims via the regex path. + # spawn=None tells Camera.__init__ to skip re-spawning; it will verify the prims exist. 
+ tiled_cfg = camera_cfg.replace( + prim_path="/World/envs/env_.*/VideoCamera", + spawn=None, + ) + return TiledCamera(tiled_cfg) + + def _find_video_camera(self): + """Locate and cache the TiledCamera to use for video recording. + + Search order: + 1. Observation TiledCamera already in the scene (vision-based env path, zero extra cost). + 2. Dedicated fallback TiledCamera from ``cfg.fallback_camera_cfg`` (state-based env path). + + Returns ``None`` if neither source is available. + + Previously used the omni.replicator viewer camera which had RGB output only for + Kit-based backends (``physx`` / ``newton,isaacsim_rtx_renderer``). """ if not hasattr(self, "_video_camera"): from isaaclab.sensors.camera import TiledCamera self._video_camera = None + + # Priority 1: observation TiledCamera in the scene (vision-based env path). for sensor in self._scene.sensors.values(): if isinstance(sensor, TiledCamera): output = sensor.data.output if "rgb" in output or "rgba" in output: self._video_camera = sensor - self._video_rgb_key = "rgb" if "rgb" in output else "rgba" - # Cache all grid constants — these are fixed for the lifetime of the env. - n_total = int(sensor.data.output[self._video_rgb_key].shape[0]) - n_envs = n_total if self.cfg.video_num_tiles < 0 else min(self.cfg.video_num_tiles, n_total) - self._video_n_envs = n_envs - self._video_grid_size = math.ceil(math.sqrt(n_envs)) - n_slots = self._video_grid_size * self._video_grid_size - H = int(sensor.data.output[self._video_rgb_key].shape[1]) - W = int(sensor.data.output[self._video_rgb_key].shape[2]) - self._video_H = H - self._video_W = W - # Pre-allocate the black padding block (zero-copy when pad == 0). - pad = n_slots - n_envs - self._video_pad = np.zeros((pad, H, W, 3), dtype=np.uint8) if pad > 0 else None break + + # Priority 2: fallback video camera (state-based env path). 
+ if self._video_camera is None and self._fallback_tiled_camera is not None: + if self._fallback_tiled_camera.is_initialized: + output = self._fallback_tiled_camera.data.output + if "rgb" in output or "rgba" in output: + self._video_camera = self._fallback_tiled_camera + + # Cache all grid constants — these are fixed for the lifetime of the env. + if self._video_camera is not None: + output = self._video_camera.data.output + self._video_rgb_key = "rgb" if "rgb" in output else "rgba" + n_total = int(output[self._video_rgb_key].shape[0]) + n_envs = n_total if self.cfg.video_num_tiles < 0 else min(self.cfg.video_num_tiles, n_total) + self._video_n_envs = n_envs + self._video_grid_size = math.ceil(math.sqrt(n_envs)) + n_slots = self._video_grid_size * self._video_grid_size + H = int(output[self._video_rgb_key].shape[1]) + W = int(output[self._video_rgb_key].shape[2]) + self._video_H = H + self._video_W = W + # Pre-allocate the black padding block (zero-copy when pad == 0). + pad = n_slots - n_envs + self._video_pad = np.zeros((pad, H, W, 3), dtype=np.uint8) if pad > 0 else None + return self._video_camera def _render_tiled_camera_rgb_array(self) -> np.ndarray: - """Return a square tile-grid of RGB frames from the scene's TiledCamera. + """Return a square tile-grid of RGB frames from the TiledCamera. Create a square grid of tiles. This method reads directly from the TiledCamera sensor buffer to generate the tiles. + If using the dedicated fallback video cameras (not observation sensors), + this method calls ``update()`` on them first to trigger a fresh render pass. + Observation TiledCameras are updated by ``scene.update()`` during the + environment step and do not need an extra update here. + Returns: RGB image of shape ``(G*H, G*W, 3)`` and dtype ``uint8``, where ``G = ceil(sqrt(num_envs))`` and ``(H, W)`` is the per-tile resolution. """ + # Fallback cameras are not updated by scene.update(), so drive them manually. 
+ if self._video_camera is self._fallback_tiled_camera: + self._fallback_tiled_camera.update(dt=0.0, force_recompute=True) + rgb_all = self._video_camera.data.output[self._video_rgb_key] # Drop alpha channel once on GPU before the CPU transfer. if self._video_rgb_key == "rgba": diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py b/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py index 0d9b6287728..07726ed5a14 100644 --- a/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py @@ -7,11 +7,47 @@ from __future__ import annotations +import isaaclab.sim as sim_utils +from isaaclab.sensors.camera import TiledCameraCfg from isaaclab.utils import configclass from .video_recorder import VideoRecorder +DEFAULT_VIDEO_FALLBACK_CAMERA_CFG = TiledCameraCfg( + prim_path="/World/envs/env_0/VideoCamera", + update_period=0.0, + height=480, + width=640, + data_types=["rgb"], + spawn=sim_utils.PinholeCameraCfg( + focal_length=24.0, + focus_distance=400.0, + horizontal_aperture=20.955, + clipping_range=(0.1, 1.0e5), + ), + offset=TiledCameraCfg.OffsetCfg(pos=(-7.0, 0.0, 3.0), rot=(0.0, 0.1045, 0.0, 0.9945), convention="world"), +) +"""Default fallback :class:`~isaaclab.sensors.camera.TiledCameraCfg` for state-based video recording. + +Places a pinhole camera at ``/World/envs/env_0/VideoCamera`` offset ``(-7, 0, 3)`` from +env_0's origin, angled ~12° downward in the world frame. This matches the camera position used +by ``Isaac-Cartpole-RGB-v0`` and gives a reasonable side view for medium-scale environments +(env spacing ~4 m). + +This is the **default** value of :attr:`VideoRecorderCfg.fallback_camera_cfg`. No action is +needed in task configs — fallback cameras are automatically available for all state-based +environments. Spawning only occurs when :attr:`VideoRecorderCfg.render_mode` is ``"rgb_array"`` +(i.e. ``--video`` is active), so ordinary training runs incur zero overhead. 
+ +To customise the pose for a different environment scale, override in the task's ``__post_init__``:: + + self.video_recorder.fallback_camera_cfg = self.video_recorder.fallback_camera_cfg.replace( + offset=TiledCameraCfg.OffsetCfg(pos=(-3.0, 0.0, 2.0), rot=(0.0, 0.1045, 0.0, 0.9945), convention="world"), + ) +""" + + @configclass class VideoRecorderCfg: """Configuration for :class:`~isaaclab.envs.utils.video_recorder.VideoRecorder`. @@ -27,6 +63,19 @@ class VideoRecorderCfg: Defaults to :class:`~isaaclab.envs.utils.video_recorder.VideoRecorder`. """ + render_mode: str | None = None + """The render mode forwarded from the environment constructor. + + Populated automatically by the environment base classes from the ``render_mode`` argument + passed to :func:`gymnasium.make` (or the environment constructor directly). User code + should not set this field manually. + + When ``None`` (the default, i.e. ``--video`` was **not** passed), :class:`VideoRecorder` + skips spawning any fallback cameras so that state-based runs incur zero overhead. + Only when this is ``"rgb_array"`` does the recorder allocate GPU resources for the + fallback camera grid. + """ + video_num_tiles: int = -1 """Number of environment tiles to include in each video frame when using ``render_mode="rgb_array"``. Defaults to -1, which renders all environments. @@ -42,3 +91,27 @@ class VideoRecorderCfg: CLI example: ``env.video_recorder.video_num_tiles=9`` """ + + fallback_camera_cfg: object = DEFAULT_VIDEO_FALLBACK_CAMERA_CFG + """Optional :class:`~isaaclab.sensors.camera.TiledCameraCfg` used to spawn a dedicated + video-only camera for state-based environments (no observation ``TiledCamera`` in the scene). + + Defaults to :data:`DEFAULT_VIDEO_FALLBACK_CAMERA_CFG` — a pinhole camera placed at + ``(-7, 0, 3)`` relative to env_0's origin, giving a reasonable side view for environments + with ~4 m spacing. Set to ``None`` to disable fallback cameras entirely (e.g. 
for + vision-based tasks that already have an observation :class:`~isaaclab.sensors.camera.TiledCamera`). + + Spawning is **gated on** :attr:`render_mode` ``== "rgb_array"`` (i.e. ``--video`` must be + active), so the default value causes zero overhead during ordinary training runs. + + To customise the pose for a different environment scale, override in the task's ``__post_init__``:: + + self.video_recorder.fallback_camera_cfg = self.video_recorder.fallback_camera_cfg.replace( + offset=TiledCameraCfg.OffsetCfg(pos=(-3.0, 0.0, 2.0), rot=(0.0, 0.1045, 0.0, 0.9945), convention="world"), + ) + + .. note:: + The prim path in the cfg must start with ``/World/envs/env_0/`` so that the OVRTX + renderer path check succeeds and ``TiledCamera`` correctly infers ``num_envs`` from + the scene. + """ From d03f028b2b045718779a94e12d4bc58f3af45abc Mon Sep 17 00:00:00 2001 From: Brian Dilinila Date: Thu, 12 Mar 2026 16:17:56 -0700 Subject: [PATCH 04/11] Add --video=perspective mode for perspective recording --- scripts/benchmarks/benchmark_non_rl.py | 13 +- scripts/benchmarks/benchmark_rlgames.py | 13 +- scripts/benchmarks/benchmark_rsl_rl.py | 13 +- .../reinforcement_learning/rl_games/play.py | 13 +- .../reinforcement_learning/rl_games/train.py | 13 +- scripts/reinforcement_learning/rlinf/play.py | 9 +- scripts/reinforcement_learning/rsl_rl/play.py | 13 +- .../reinforcement_learning/rsl_rl/train.py | 13 +- scripts/reinforcement_learning/sb3/play.py | 13 +- scripts/reinforcement_learning/sb3/train.py | 13 +- scripts/reinforcement_learning/skrl/play.py | 13 +- scripts/reinforcement_learning/skrl/train.py | 13 +- scripts/sim2sim_transfer/rsl_rl_transfer.py | 13 +- .../isaaclab/isaaclab/envs/direct_marl_env.py | 19 +- .../isaaclab/isaaclab/envs/direct_rl_env.py | 22 +- .../isaaclab/envs/manager_based_rl_env.py | 19 +- .../isaaclab/envs/utils/video_recorder.py | 268 +++++++++++++++--- .../isaaclab/envs/utils/video_recorder_cfg.py | 53 +++- 18 files changed, 484 insertions(+), 62 
deletions(-) diff --git a/scripts/benchmarks/benchmark_non_rl.py b/scripts/benchmarks/benchmark_non_rl.py index aee3be21a40..dfda247a0db 100644 --- a/scripts/benchmarks/benchmark_non_rl.py +++ b/scripts/benchmarks/benchmark_non_rl.py @@ -16,7 +16,14 @@ # add argparse arguments parser = argparse.ArgumentParser(description="Train an RL agent with RL-Games.") -parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") +parser.add_argument( + "--video", + nargs="?", + const="perspective", + default=None, + metavar="MODE", + help="Record videos during training. MODE is 'perspective' (default, wide-angle isometric view) or 'tiled' (camera-sensor tile-grid).", +) parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).") parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.") @@ -140,6 +147,10 @@ def main( task_startup_time_begin = time.perf_counter_ns() + # Forward the video mode ("tiled" / "perspective") to the recorder config before env creation. 
+ if args_cli.video and hasattr(env_cfg, "video_recorder") and env_cfg.video_recorder is not None: + env_cfg.video_recorder.video_mode = args_cli.video + # create isaac environment env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None) # wrap for video recording diff --git a/scripts/benchmarks/benchmark_rlgames.py b/scripts/benchmarks/benchmark_rlgames.py index 86786026493..e68db05187e 100644 --- a/scripts/benchmarks/benchmark_rlgames.py +++ b/scripts/benchmarks/benchmark_rlgames.py @@ -16,7 +16,14 @@ # add argparse arguments parser = argparse.ArgumentParser(description="Train an RL agent with RL-Games.") -parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") +parser.add_argument( + "--video", + nargs="?", + const="perspective", + default=None, + metavar="MODE", + help="Record videos during training. MODE is 'perspective' (default, wide-angle isometric view) or 'tiled' (camera-sensor tile-grid).", +) parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).") parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.") @@ -195,6 +202,10 @@ def main( task_startup_time_begin = time.perf_counter_ns() + # Forward the video mode ("tiled" / "perspective") to the recorder config before env creation. 
+ if args_cli.video and hasattr(env_cfg, "video_recorder") and env_cfg.video_recorder is not None: + env_cfg.video_recorder.video_mode = args_cli.video + # create isaac environment env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None) # wrap for video recording diff --git a/scripts/benchmarks/benchmark_rsl_rl.py b/scripts/benchmarks/benchmark_rsl_rl.py index e0c6eb68b5d..a1582950462 100644 --- a/scripts/benchmarks/benchmark_rsl_rl.py +++ b/scripts/benchmarks/benchmark_rsl_rl.py @@ -19,7 +19,14 @@ # add argparse arguments parser = argparse.ArgumentParser(description="Train an RL agent with RSL-RL.") -parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") +parser.add_argument( + "--video", + nargs="?", + const="perspective", + default=None, + metavar="MODE", + help="Record videos during training. MODE is 'perspective' (default, wide-angle isometric view) or 'tiled' (camera-sensor tile-grid).", +) parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).") parser.add_argument("--num_envs", type=int, default=4096, help="Number of environments to simulate.") @@ -182,6 +189,10 @@ def main( task_startup_time_begin = time.perf_counter_ns() + # Forward the video mode ("tiled" / "perspective") to the recorder config before env creation. 
+ if args_cli.video and hasattr(env_cfg, "video_recorder") and env_cfg.video_recorder is not None: + env_cfg.video_recorder.video_mode = args_cli.video + # create isaac environment env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None) # wrap for video recording diff --git a/scripts/reinforcement_learning/rl_games/play.py b/scripts/reinforcement_learning/rl_games/play.py index eb2390af90d..5762d45f801 100644 --- a/scripts/reinforcement_learning/rl_games/play.py +++ b/scripts/reinforcement_learning/rl_games/play.py @@ -32,7 +32,14 @@ # -- argparse ---------------------------------------------------------------- parser = argparse.ArgumentParser(description="Play a checkpoint of an RL agent from RL-Games.") -parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") +parser.add_argument( + "--video", + nargs="?", + const="perspective", + default=None, + metavar="MODE", + help="Record videos during playing. MODE is 'perspective' (default, wide-angle isometric view) or 'tiled' (camera-sensor tile-grid).", +) parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument( "--disable_fabric", action="store_true", default=False, help="Disable fabric and use USD I/O operations." @@ -114,6 +121,10 @@ def main(): obs_groups = agent_cfg["params"]["env"].get("obs_groups") concate_obs_groups = agent_cfg["params"]["env"].get("concate_obs_groups", True) + # Forward the video mode ("tiled" / "perspective") to the recorder config before env creation. 
+ if args_cli.video and hasattr(env_cfg, "video_recorder") and env_cfg.video_recorder is not None: + env_cfg.video_recorder.video_mode = args_cli.video + # create isaac environment env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None) diff --git a/scripts/reinforcement_learning/rl_games/train.py b/scripts/reinforcement_learning/rl_games/train.py index 5ad13b401bb..cfc260941db 100644 --- a/scripts/reinforcement_learning/rl_games/train.py +++ b/scripts/reinforcement_learning/rl_games/train.py @@ -36,7 +36,14 @@ # -- argparse ---------------------------------------------------------------- parser = argparse.ArgumentParser(description="Train an RL agent with RL-Games.") -parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") +parser.add_argument( + "--video", + nargs="?", + const="perspective", + default=None, + metavar="MODE", + help="Record videos during training. MODE is 'perspective' (default, wide-angle isometric view) or 'tiled' (camera-sensor tile-grid).", +) parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).") parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.") @@ -158,6 +165,10 @@ def main(): # set the log directory for the environment env_cfg.log_dir = os.path.join(log_root_path, log_dir) + # Forward the video mode ("tiled" / "perspective") to the recorder config before env creation. 
+    if args_cli.video and hasattr(env_cfg, "video_recorder") and env_cfg.video_recorder is not None:
+        env_cfg.video_recorder.video_mode = args_cli.video
+
     # create isaac environment
     env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None)
diff --git a/scripts/reinforcement_learning/rlinf/play.py b/scripts/reinforcement_learning/rlinf/play.py
index f63e02d3e1f..c3782567617 100644
--- a/scripts/reinforcement_learning/rlinf/play.py
+++ b/scripts/reinforcement_learning/rlinf/play.py
@@ -50,7 +50,14 @@
 parser.add_argument(
     "--num_episodes", type=int, default=None, help="Number of evaluation episodes (overrides config if set)."
 )
-parser.add_argument("--video", action="store_true", default=False, help="Enable video recording.")
+parser.add_argument(
+    "--video",
+    nargs="?",
+    const="tiled",
+    default=None,
+    metavar="MODE",
+    help="Enable video recording. MODE is 'tiled' (default) or 'perspective' (not yet supported for rlinf).",
+)
 cli_args.add_rlinf_args(parser)
 args_cli = parser.parse_args()
diff --git a/scripts/reinforcement_learning/rsl_rl/play.py b/scripts/reinforcement_learning/rsl_rl/play.py
index f790f627a22..3b87e88e170 100644
--- a/scripts/reinforcement_learning/rsl_rl/play.py
+++ b/scripts/reinforcement_learning/rsl_rl/play.py
@@ -40,7 +40,14 @@
 # -- argparse ----------------------------------------------------------------
 parser = argparse.ArgumentParser(description="Train an RL agent with RSL-RL.")
-parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.")
+parser.add_argument(
+    "--video",
+    nargs="?",
+    const="perspective",
+    default=None,
+    metavar="MODE",
+    help="Record videos during playing. 
MODE is 'perspective' (default, wide-angle isometric view) or 'tiled' (camera-sensor tile-grid).", +) parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument( "--disable_fabric", action="store_true", default=False, help="Disable fabric and use USD I/O operations." @@ -109,6 +116,10 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen # set the log directory for the environment env_cfg.log_dir = log_dir + # Forward the video mode ("tiled" / "perspective") to the recorder config before env creation. + if args_cli.video and hasattr(env_cfg, "video_recorder") and env_cfg.video_recorder is not None: + env_cfg.video_recorder.video_mode = args_cli.video + # create isaac environment env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None) diff --git a/scripts/reinforcement_learning/rsl_rl/train.py b/scripts/reinforcement_learning/rsl_rl/train.py index 7ca2d3156da..fac9142dd6a 100644 --- a/scripts/reinforcement_learning/rsl_rl/train.py +++ b/scripts/reinforcement_learning/rsl_rl/train.py @@ -45,7 +45,14 @@ # -- argparse ---------------------------------------------------------------- parser = argparse.ArgumentParser(description="Train an RL agent with RSL-RL.") -parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") +parser.add_argument( + "--video", + nargs="?", + const="perspective", + default=None, + metavar="MODE", + help="Record videos during training. 
MODE is 'perspective' (default, wide-angle isometric view) or 'tiled' (camera-sensor tile-grid).", +) parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).") parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.") @@ -147,6 +154,10 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen # set the log directory for the environment (works for all environment types) env_cfg.log_dir = log_dir + # Forward the video mode ("tiled" / "perspective") to the recorder config before env creation. + if args_cli.video and hasattr(env_cfg, "video_recorder") and env_cfg.video_recorder is not None: + env_cfg.video_recorder.video_mode = args_cli.video + # create isaac environment env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None) diff --git a/scripts/reinforcement_learning/sb3/play.py b/scripts/reinforcement_learning/sb3/play.py index a6f222d346c..73d56f5ccd2 100644 --- a/scripts/reinforcement_learning/sb3/play.py +++ b/scripts/reinforcement_learning/sb3/play.py @@ -30,7 +30,14 @@ # -- argparse ---------------------------------------------------------------- parser = argparse.ArgumentParser(description="Play a checkpoint of an RL agent from Stable-Baselines3.") -parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") +parser.add_argument( + "--video", + nargs="?", + const="perspective", + default=None, + metavar="MODE", + help="Record videos during playing. 
MODE is 'perspective' (default, wide-angle isometric view) or 'tiled' (camera-sensor tile-grid).", +) parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument( "--disable_fabric", action="store_true", default=False, help="Disable fabric and use USD I/O operations." @@ -107,6 +114,10 @@ def main(): # set the log directory for the environment env_cfg.log_dir = log_dir + # Forward the video mode ("tiled" / "perspective") to the recorder config before env creation. + if args_cli.video and hasattr(env_cfg, "video_recorder") and env_cfg.video_recorder is not None: + env_cfg.video_recorder.video_mode = args_cli.video + # create isaac environment env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None) diff --git a/scripts/reinforcement_learning/sb3/train.py b/scripts/reinforcement_learning/sb3/train.py index bd79599d1fd..98148db3708 100644 --- a/scripts/reinforcement_learning/sb3/train.py +++ b/scripts/reinforcement_learning/sb3/train.py @@ -38,7 +38,14 @@ # -- argparse ---------------------------------------------------------------- parser = argparse.ArgumentParser(description="Train an RL agent with Stable-Baselines3.") -parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") +parser.add_argument( + "--video", + nargs="?", + const="perspective", + default=None, + metavar="MODE", + help="Record videos during training. 
MODE is 'perspective' (default, wide-angle isometric view) or 'tiled' (camera-sensor tile-grid).", +) parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).") parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.") @@ -137,6 +144,10 @@ def main(): # set the log directory for the environment env_cfg.log_dir = log_dir + # Forward the video mode ("tiled" / "perspective") to the recorder config before env creation. + if args_cli.video and hasattr(env_cfg, "video_recorder") and env_cfg.video_recorder is not None: + env_cfg.video_recorder.video_mode = args_cli.video + # create isaac environment env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None) diff --git a/scripts/reinforcement_learning/skrl/play.py b/scripts/reinforcement_learning/skrl/play.py index 0349d405967..da7f36dd5a7 100644 --- a/scripts/reinforcement_learning/skrl/play.py +++ b/scripts/reinforcement_learning/skrl/play.py @@ -35,7 +35,14 @@ # -- argparse ---------------------------------------------------------------- parser = argparse.ArgumentParser(description="Play a checkpoint of an RL agent from skrl.") -parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") +parser.add_argument( + "--video", + nargs="?", + const="perspective", + default=None, + metavar="MODE", + help="Record videos during playing. MODE is 'perspective' (default, wide-angle isometric view) or 'tiled' (camera-sensor tile-grid).", +) parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument( "--disable_fabric", action="store_true", default=False, help="Disable fabric and use USD I/O operations." 
@@ -150,6 +157,10 @@ def main(): # set the log directory for the environment env_cfg.log_dir = log_dir + # Forward the video mode ("tiled" / "perspective") to the recorder config before env creation. + if args_cli.video and hasattr(env_cfg, "video_recorder") and env_cfg.video_recorder is not None: + env_cfg.video_recorder.video_mode = args_cli.video + # create isaac environment env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None) diff --git a/scripts/reinforcement_learning/skrl/train.py b/scripts/reinforcement_learning/skrl/train.py index 750ebeb8798..cb5eeb85752 100644 --- a/scripts/reinforcement_learning/skrl/train.py +++ b/scripts/reinforcement_learning/skrl/train.py @@ -40,7 +40,14 @@ # -- argparse ---------------------------------------------------------------- parser = argparse.ArgumentParser(description="Train an RL agent with skrl.") -parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") +parser.add_argument( + "--video", + nargs="?", + const="perspective", + default=None, + metavar="MODE", + help="Record videos during training. MODE is 'perspective' (default, wide-angle isometric view) or 'tiled' (camera-sensor tile-grid).", +) parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument("--video_interval", type=int, default=2000, help="Interval between video recordings (in steps).") parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.") @@ -173,6 +180,10 @@ def main(): # set the log directory for the environment env_cfg.log_dir = log_dir + # Forward the video mode ("tiled" / "perspective") to the recorder config before env creation. 
+ if args_cli.video and hasattr(env_cfg, "video_recorder") and env_cfg.video_recorder is not None: + env_cfg.video_recorder.video_mode = args_cli.video + # create isaac environment env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None) diff --git a/scripts/sim2sim_transfer/rsl_rl_transfer.py b/scripts/sim2sim_transfer/rsl_rl_transfer.py index 4de3c42b7a8..78021ebf518 100644 --- a/scripts/sim2sim_transfer/rsl_rl_transfer.py +++ b/scripts/sim2sim_transfer/rsl_rl_transfer.py @@ -19,7 +19,14 @@ # add argparse arguments parser = argparse.ArgumentParser(description="Play an RL agent with RSL-RL with policy transfer.") -parser.add_argument("--video", action="store_true", default=False, help="Record videos during training.") +parser.add_argument( + "--video", + nargs="?", + const="perspective", + default=None, + metavar="MODE", + help="Record videos during transfer. MODE is 'perspective' (default, wide-angle isometric view) or 'tiled' (camera-sensor tile-grid).", +) parser.add_argument("--video_length", type=int, default=200, help="Length of the recorded video (in steps).") parser.add_argument( "--disable_fabric", action="store_true", default=False, help="Disable fabric and use USD I/O operations." @@ -171,6 +178,10 @@ def main(env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, agen # set the log directory for the environment (works for all environment types) env_cfg.log_dir = log_dir + # Forward the video mode ("tiled" / "perspective") to the recorder config before env creation. 
+ if args_cli.video and hasattr(env_cfg, "video_recorder") and env_cfg.video_recorder is not None: + env_cfg.video_recorder.video_mode = args_cli.video + # create isaac environment env = gym.make(args_cli.task, cfg=env_cfg, render_mode="rgb_array" if args_cli.video else None) diff --git a/source/isaaclab/isaaclab/envs/direct_marl_env.py b/source/isaaclab/isaaclab/envs/direct_marl_env.py index 9eebdc89e18..1bb2400d94d 100644 --- a/source/isaaclab/isaaclab/envs/direct_marl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_marl_env.py @@ -535,20 +535,31 @@ def render(self, recompute: bool = False) -> np.ndarray | None: if self.render_mode == "human" or self.render_mode is None: return None elif self.render_mode == "rgb_array": - # Prefer TiledCamera when available — works for all backends (kitless and Kit-based) - # and produces consistent, scene-content frames. Fall back to the omni.replicator - # viewer-camera path only when no TiledCamera with RGB output exists in the scene. + # Prefer TiledCamera in tiled mode; works for all backends and produces + # consistent per-agent frames. In perspective mode the recorder returns None + # intentionally (bypassing TiledCamera entirely) so we always reach the + # omni.replicator Kit-viewport path below. if self.video_recorder is not None: frame = self.video_recorder.render_rgb_array() if frame is not None: return frame - if not self.sim.has_gui and not self.sim.has_offscreen_render: + _perspective_mode = ( + self.video_recorder is not None + and self.cfg.video_recorder is not None + and getattr(self.cfg.video_recorder, "video_mode", "tiled") == "perspective" + ) + if not _perspective_mode and not self.sim.has_gui and not self.sim.has_offscreen_render: raise RuntimeError( "Cannot render 'rgb_array': no TiledCamera sensor with RGB output was found in" " the scene, and neither GUI nor offscreen rendering is available." " Add a TiledCamera sensor to the scene configuration to enable video recording." 
) # Kit-based fallback: use an omni.replicator annotator on the viewer camera. + # /OmniverseKit_Persp is NOT an RTX sensor, so the guard above may have skipped + # sim.render() when has_rtx_sensors=True (e.g., vision tasks with TiledCamera). + # Force a render pass here so the annotator receives non-empty data. + if self.has_rtx_sensors: + self.sim.render() if not hasattr(self, "_rgb_annotator"): import omni.replicator.core as rep diff --git a/source/isaaclab/isaaclab/envs/direct_rl_env.py b/source/isaaclab/isaaclab/envs/direct_rl_env.py index e821bc92a6f..f2812062e69 100644 --- a/source/isaaclab/isaaclab/envs/direct_rl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_rl_env.py @@ -503,20 +503,34 @@ def render(self, recompute: bool = False) -> np.ndarray | None: if self.render_mode == "human" or self.render_mode is None: return None elif self.render_mode == "rgb_array": - # Prefer TiledCamera when available — works for all backends (kitless and Kit-based) - # and produces consistent, scene-content frames. Fall back to the omni.replicator - # viewer-camera path only when no TiledCamera with RGB output exists in the scene. + # Prefer TiledCamera in tiled mode; works for all backends and produces + # consistent per-agent frames. In perspective mode the recorder returns None + # intentionally (bypassing TiledCamera entirely) so we always reach the + # omni.replicator Kit-viewport path below. if self.video_recorder is not None: frame = self.video_recorder.render_rgb_array() if frame is not None: return frame - if not self.sim.has_gui and not self.sim.has_offscreen_render: + # In perspective mode the recorder returns None intentionally so we fall through + # to the omni.replicator viewport path below. Skip the has_offscreen_render guard + # in that case; the annotator works in non-headless Kit sessions too. 
+ _perspective_mode = ( + self.video_recorder is not None + and self.cfg.video_recorder is not None + and getattr(self.cfg.video_recorder, "video_mode", "tiled") == "perspective" + ) + if not _perspective_mode and not self.sim.has_gui and not self.sim.has_offscreen_render: raise RuntimeError( "Cannot render 'rgb_array': no TiledCamera sensor with RGB output was found in" " the scene, and neither GUI nor offscreen rendering is available." " Add a TiledCamera sensor to the scene configuration to enable video recording." ) # Kit-based fallback: use an omni.replicator annotator on the viewer camera. + # /OmniverseKit_Persp is NOT an RTX sensor, so the guard above may have skipped + # sim.render() when has_rtx_sensors=True (e.g., vision tasks with TiledCamera). + # Force a render pass here so the annotator receives non-empty data. + if self.has_rtx_sensors: + self.sim.render() if not hasattr(self, "_rgb_annotator"): import omni.replicator.core as rep diff --git a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py index fb5e586d6f3..68f9be35aac 100644 --- a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py +++ b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py @@ -277,20 +277,31 @@ def render(self, recompute: bool = False) -> np.ndarray | None: if self.render_mode == "human" or self.render_mode is None: return None elif self.render_mode == "rgb_array": - # Prefer TiledCamera when available — works for all backends (kitless and Kit-based) - # and produces consistent, scene-content frames. Fall back to the omni.replicator - # viewer-camera path only when no TiledCamera with RGB output exists in the scene. + # Prefer TiledCamera in tiled mode; works for all backends and produces + # consistent per-agent frames. In perspective mode the recorder returns None + # intentionally (bypassing TiledCamera entirely) so we always reach the + # omni.replicator Kit-viewport path below. 
if self.video_recorder is not None: frame = self.video_recorder.render_rgb_array() if frame is not None: return frame - if not self.sim.has_gui and not self.sim.has_offscreen_render: + _perspective_mode = ( + self.video_recorder is not None + and self.cfg.video_recorder is not None + and getattr(self.cfg.video_recorder, "video_mode", "tiled") == "perspective" + ) + if not _perspective_mode and not self.sim.has_gui and not self.sim.has_offscreen_render: raise RuntimeError( "Cannot render 'rgb_array': no TiledCamera sensor with RGB output was found in" " the scene, and neither GUI nor offscreen rendering is available." " Add a TiledCamera sensor to the scene configuration to enable video recording." ) # Kit-based fallback: use an omni.replicator annotator on the viewer camera. + # /OmniverseKit_Persp is NOT an RTX sensor, so the guard above may have skipped + # sim.render() when has_rtx_sensors=True (e.g., vision tasks with TiledCamera). + # Force a render pass here so the annotator receives non-empty data. 
+ if self.has_rtx_sensors: + self.sim.render() if not hasattr(self, "_rgb_annotator"): import omni.replicator.core as rep diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder.py b/source/isaaclab/isaaclab/envs/utils/video_recorder.py index e0475223ecc..158c71ef371 100644 --- a/source/isaaclab/isaaclab/envs/utils/video_recorder.py +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder.py @@ -3,10 +3,12 @@ # # SPDX-License-Identifier: BSD-3-Clause -"""Viewport recorder for capturing video frames from a :class:`~isaaclab.sensors.camera.TiledCamera`.""" +"""Video recorder for capturing video frames from either a Newton OpenGL perspective +viewer or a :class:`~isaaclab.sensors.camera.TiledCamera` sensor.""" from __future__ import annotations +import logging import math from typing import TYPE_CHECKING @@ -16,31 +18,43 @@ from isaaclab.scene import InteractiveScene from .video_recorder_cfg import VideoRecorderCfg +logger = logging.getLogger(__name__) + class VideoRecorder: - """Records video frames from the scene's :class:`~isaaclab.sensors.camera.TiledCamera`. + """Records video frames from the scene's active renderer. + + The recording strategy is determined by :attr:`~VideoRecorderCfg.video_mode`: + + **``video_mode = "perspective"`` (default)** + + The TiledCamera is **bypassed** entirely, even when one is in the scene. + + * **Newton backends** - headless :class:`newton.viewer.ViewerGL` renders an isometric + wide-angle view of all environments (limited to ``video_num_tiles`` when set). + * **Kit backends** - returns ``None`` so that the environment's ``render()`` method + falls through to the ``omni.replicator.core`` Kit viewport camera path + (``/OmniverseKit_Persp``). 
+ + **``video_mode = "tiled"``** - On the first :meth:`render_rgb_array` call this class searches the scene for the first - ``TiledCamera`` sensor with ``"rgb"`` or ``"rgba"`` output and caches the camera reference - together with all grid-layout constants so subsequent calls are allocation-free (except for - the unavoidable GPU-to-CPU transfer and the final tile-stitch reshape). + Frame sources are tried in priority order on every :meth:`render_rgb_array` call: - The default implementation reads *all* ``num_envs`` frames from the TiledCamera buffer on - the GPU and slices the first ``cfg.video_num_tiles`` on the CPU (Option A). Swap - ``cfg.class_type`` for a custom subclass to change this behaviour without touching any - environment code. + 1. **Observation** :class:`~isaaclab.sensors.camera.TiledCamera` already present in + the scene; vision-based env path. Reuses the agent's own camera sensor at zero + extra cost and produces a square tile-grid of per-agent views. - **Camera selection priority:** + 2. **Newton OpenGL perspective viewer** - Newton backends with no observation + ``TiledCamera``. A headless :class:`newton.viewer.ViewerGL` is lazy-initialised + on the first call and renders an isometric perspective of all environments + (limited to ``video_num_tiles`` when that field is set). - 1. An existing :class:`~isaaclab.sensors.camera.TiledCamera` found in the scene sensors - (vision-based env path — the observation camera is reused for free). - 2. A dedicated video camera grid instantiated from ``cfg.fallback_camera_cfg`` - (state-based env path — no observation camera exists, so one camera per environment - is spawned, up to ``cfg.video_num_tiles``). + 3. **Fallback** :class:`~isaaclab.sensors.camera.TiledCamera` - state-based env path + with Kit-based backends. A camera prim is spawned per environment before + ``sim.reset()``. 
- For the fallback cameras to be initialised correctly they **must** be created before - ``sim.reset()`` is called, so :class:`VideoRecorder` must be instantiated before - ``sim.reset()`` in the environment setup. The environment base classes handle this. + For fallback cameras to initialise correctly they **must** be created before + ``sim.reset()`` is called; the environment base classes handle this. Args: cfg: Configuration for this recorder. @@ -52,28 +66,212 @@ def __init__(self, cfg: VideoRecorderCfg, scene: InteractiveScene): self._scene = scene self._fallback_tiled_camera = None - # Spawn fallback cameras only when video recording is actually requested. - # cfg.render_mode is set to "rgb_array" by the env base class when --video is active - # (forwarded from the render_mode argument of gym.make / the env constructor). - # Gating here avoids GPU overhead in ordinary training runs that don't record video. - if cfg.fallback_camera_cfg is not None and cfg.render_mode == "rgb_array": - self._fallback_tiled_camera = self._spawn_fallback_cameras(cfg, scene) + # Newton GL perspective viewer; lazy-initialised on first render call. + self._gl_viewer = None + self._gl_viewer_initialized = False # True once _try_init_gl_viewer() has run + + if cfg.render_mode == "rgb_array": + # Enable EGL-backed headless rendering for pyglet before ViewerGL is ever + # imported. Must be set before the first 'import pyglet.window'. This is a + # no-op when pyglet is not installed (GL viewer simply stays None). + try: + import pyglet + + if not pyglet.options.get("headless", False): + pyglet.options["headless"] = True + except ImportError: + pass + + # Skip spawning fallback TiledCameras when: + # (a) a Newton backend is active; the GL perspective viewer handles state-based + # rendering so creating per-env camera prims would waste GPU resources, or + # (b) perspective mode is requested; TiledCamera is not used in that path. 
+        _newton_backend = self._is_newton_backend()
+        if cfg.fallback_camera_cfg is not None and not _newton_backend and cfg.video_mode == "tiled":
+            self._fallback_tiled_camera = self._spawn_fallback_cameras(cfg, scene)
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
 
     def render_rgb_array(self) -> np.ndarray | None:
-        """Return a square tile-grid RGB frame, or ``None`` if no suitable camera exists.
+        """Return an RGB frame for video recording, or ``None`` when unavailable.
 
-        Returns:
-            RGB image of shape ``(G*H, G*W, 3)`` and dtype ``uint8``, where
-            ``G = ceil(sqrt(video_num_tiles))`` and ``(H, W)`` is the per-tile resolution,
-            or ``None`` when no :class:`~isaaclab.sensors.camera.TiledCamera` with RGB output
-            is present in the scene or configured as a fallback.
+        The frame source depends on :attr:`~VideoRecorderCfg.video_mode`:
+
+        **``"tiled"`` mode**:
+
+        * Source 1 - observation :class:`~isaaclab.sensors.camera.TiledCamera`:
+          returns a square tile-grid ``(G*H, G*W, 3)`` uint8 array,
+          where ``G = ceil(sqrt(video_num_tiles))``.
+        * Source 2 - Newton GL perspective viewer (state-based + Newton backend):
+          returns ``(gl_viewer_height, gl_viewer_width, 3)`` uint8.
+        * Source 3 - fallback :class:`~isaaclab.sensors.camera.TiledCamera`
+          (state-based + Kit backend): same tile-grid shape as source 1.
+
+        **``"perspective"`` mode** (default):
+
+        * Newton backends: Newton GL perspective viewer (same shape as source 2).
+        * Kit backends: returns ``None`` so the environment's ``render()`` method
+          falls through to the ``omni.replicator.core`` viewport camera path.
        """
-        if self._find_video_camera() is None:
+        if self.cfg.video_mode == "perspective":
+            # Perspective mode: bypass TiledCamera entirely.
+            # Newton backends → GL viewer; Kit backends → return None (env render() continues).
+            if not self._gl_viewer_initialized:
+                self._try_init_gl_viewer()
+            if self._gl_viewer is not None:
+                return self._render_newton_gl_rgb_array()
+            # No GL viewer (Kit backend) → signal the env to use its Kit perspective path.
+            return None
+
+        # --- Tiled mode - priority chain. ---------------------------------
+
+        # Source 1: observation TiledCamera (vision-based path).
+        # _find_video_camera() sets self._video_camera and caches grid constants.
+        video_camera = self._find_video_camera()
+        has_obs_camera = video_camera is not None and video_camera is not self._fallback_tiled_camera
+        if has_obs_camera:
+            return self._render_tiled_camera_rgb_array()
+
+        # Source 2: Newton GL perspective viewer (state-based + Newton backend).
+        if not self._gl_viewer_initialized:
+            self._try_init_gl_viewer()
+        if self._gl_viewer is not None:
+            return self._render_newton_gl_rgb_array()
+
+        # Source 3: fallback TiledCamera (state-based + Kit backend).
+        if video_camera is None:
             return None
         return self._render_tiled_camera_rgb_array()
 
     # ------------------------------------------------------------------
-    # Internal helpers
+    # Internal helpers - Newton GL viewer
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _is_newton_backend() -> bool:
+        """Return ``True`` when the active scene data provider is Newton-based.
+
+        Detected by duck-typing: Newton providers expose ``get_newton_model()``,
+        while PhysX providers do not. Safe to call before ``sim.reset()`` since
+        the provider is registered during scene setup.
+        """
+        try:
+            from isaaclab.sim import SimulationContext
+
+            sdp = SimulationContext.instance().initialize_scene_data_provider()
+            return hasattr(sdp, "get_newton_model")
+        except Exception:
+            return False
+
+    def _try_init_gl_viewer(self) -> None:
+        """Lazy-initialise the Newton OpenGL perspective viewer.
+ + Called once on the first :meth:`render_rgb_array` invocation, at which point + ``sim.reset()`` has already been called so the Newton model is fully built. + On failure the viewer stays ``None`` and the caller falls through to the next + source: source 3 (fallback TiledCamera) in tiled mode, or ``None`` (Kit + viewport path) in perspective mode. + """ + self._gl_viewer_initialized = True + try: + from isaaclab.sim import SimulationContext + + sdp = SimulationContext.instance().initialize_scene_data_provider() + model = sdp.get_newton_model() + if model is None: + return + + import pyglet + + pyglet.options["headless"] = True + from newton.viewer import ViewerGL + + max_worlds = ( + None if self.cfg.video_num_tiles < 0 else min(self.cfg.video_num_tiles, model.world_count) + ) + + viewer = ViewerGL( + width=self.cfg.gl_viewer_width, + height=self.cfg.gl_viewer_height, + headless=True, + ) + # set_model() auto-computes per-world visual offsets from body positions. + viewer.set_model(model, max_worlds=max_worlds) + # Zero additional spacing - world positions are already in model body_q. + viewer.set_world_offsets((0.0, 0.0, 0.0)) + viewer.up_axis = 2 # Z-up + + self._gl_viewer = viewer + + # Frame the camera once using the current (reset) physics state. + # 1. Set an isometric CAD-viewport angle (pitch/yaw in degrees) so that + # _frame_camera_on_model() preserves the viewing direction and only + # adjusts the distance to fit the scene. + # 2. Run a throwaway begin_frame/log_state/end_frame cycle so the viewer + # has geometry context (needed for accurate bounding-box computation). + # 3. Call _frame_camera_on_model() to auto-set the distance. + # All subsequent renders in _render_newton_gl_rgb_array() reuse this camera. 
+ try: + import warp as wp + + sim = SimulationContext.instance() + state = sdp.get_newton_state() + dt = sim.get_physics_dt() + # Match the Kit /OmniverseKit_Persp default FOV (60°) so the distance + # computed by _frame_camera_on_model() is consistent. Newton GL defaults + # to 45°, which places the camera ~1.3× further back for the same extent. + viewer.camera.fov = 60.0 + # Isometric angle: ~35° down, 45° to the right - matches the style of + # the Kit /OmniverseKit_Persp default "user" viewport camera. + viewer.set_camera(pos=wp.vec3(0.0, 0.0, 0.0), pitch=-35.0, yaw=45.0) + viewer.begin_frame(dt) + viewer.log_state(state) + viewer.end_frame() + viewer._frame_camera_on_model() + except Exception as frame_exc: + logger.warning("[VideoRecorder] GL viewer camera framing failed: %s", frame_exc) + + logger.info( + "[VideoRecorder] Newton GL perspective viewer ready (%dx%d, max_worlds=%s).", + self.cfg.gl_viewer_width, + self.cfg.gl_viewer_height, + max_worlds, + ) + except Exception as exc: + logger.warning("[VideoRecorder] Newton GL viewer unavailable: %s", exc) + + def _render_newton_gl_rgb_array(self) -> np.ndarray | None: + """Render one perspective frame from the Newton OpenGL viewer. + + Returns: + RGB array of shape ``(gl_viewer_height, gl_viewer_width, 3)`` and + dtype ``uint8``, or ``None`` on error. + """ + try: + from isaaclab.sim import SimulationContext + + sim = SimulationContext.instance() + sdp = sim.initialize_scene_data_provider() + state = sdp.get_newton_state() + + # Use the actual physics timestep so that the viewer does not treat + # dt=0 as a no-op and skip drawing geometry on frames after the first. 
+ dt = sim.get_physics_dt() + + viewer = self._gl_viewer + viewer.begin_frame(dt) + viewer.log_state(state) + viewer.end_frame() # renders scene geometry to the off-screen FBO + frame = viewer.get_frame() # wp.array (H, W, 3) uint8 - GPU readback via PBO + return frame.numpy() + except Exception as exc: + logger.warning("[VideoRecorder] GL frame capture failed: %s", exc) + return None + + # ------------------------------------------------------------------ + # Internal helpers - TiledCamera (sources 1 and 3) # ------------------------------------------------------------------ @staticmethod @@ -114,7 +312,7 @@ def _spawn_fallback_cameras(cfg: VideoRecorderCfg, scene: InteractiveScene): # TiledCamera requires exactly one camera prim per environment (count == num_envs). # We must therefore spawn cameras for ALL environments, not just video_num_tiles of them. # The video_num_tiles limit is applied at render time in _render_tiled_camera_rgb_array, - # which only reads the first N tiles — the same behaviour as vision-based observation cameras. + # which only reads the first N tiles - the same behaviour as vision-based observation cameras. for i in range(n_total_envs): prim_path_i = f"/World/envs/env_{i}/VideoCamera" spawn_cfg.func(prim_path_i, spawn_cfg, translation=camera_cfg.offset.pos, orientation=rot_offset) @@ -159,7 +357,7 @@ def _find_video_camera(self): if "rgb" in output or "rgba" in output: self._video_camera = self._fallback_tiled_camera - # Cache all grid constants — these are fixed for the lifetime of the env. + # Cache all grid constants - these are fixed for the lifetime of the env. 
if self._video_camera is not None: output = self._video_camera.data.output self._video_rgb_key = "rgb" if "rgb" in output else "rgba" diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py b/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py index 07726ed5a14..b0d0b57cbfc 100644 --- a/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py @@ -36,7 +36,7 @@ (env spacing ~4 m). This is the **default** value of :attr:`VideoRecorderCfg.fallback_camera_cfg`. No action is -needed in task configs — fallback cameras are automatically available for all state-based +needed in task configs - fallback cameras are automatically available for all state-based environments. Spawning only occurs when :attr:`VideoRecorderCfg.render_mode` is ``"rgb_array"`` (i.e. ``--video`` is active), so ordinary training runs incur zero overhead. @@ -76,6 +76,33 @@ class VideoRecorderCfg: fallback camera grid. """ + video_mode: str = "perspective" + """Video recording mode. One of ``"tiled"`` or ``"perspective"``. + + * ``"perspective"`` *(default)* - captures a single wide-angle isometric view of the + scene. + + * **Newton backends** (Newton Warp or OVRTX renderer): a headless + :class:`newton.viewer.ViewerGL` renders an isometric perspective of all + environments (or the first ``video_num_tiles`` when that field is set). + * **Kit backends** (PhysX + RTX renderer): the Kit viewport camera + ``/OmniverseKit_Persp`` is captured via ``omni.replicator.core``. + + The TiledCamera sensor is **bypassed** entirely, even when one is present in the + scene (e.g. vision-based tasks), giving a human-readable view instead of the + agent's raw pixel observations. + + * ``"tiled"`` - reads pixel data from a + :class:`~isaaclab.sensors.camera.TiledCamera`. On vision-based tasks the agent's + own observation camera is reused at zero extra cost and the output is a square + tile-grid of per-agent views. 
On state-based tasks with Kit-based backends a + fallback :class:`~isaaclab.sensors.camera.TiledCamera` (``fallback_camera_cfg``) is + spawned. On Newton backends the Newton OpenGL perspective viewer is used instead. + + Set via the ``--video`` CLI flag (``--video=perspective`` / ``--video=tiled``), or + as a Hydra override: ``env.video_recorder.video_mode=tiled``. + """ + video_num_tiles: int = -1 """Number of environment tiles to include in each video frame when using ``render_mode="rgb_array"``. Defaults to -1, which renders all environments. @@ -96,7 +123,7 @@ class VideoRecorderCfg: """Optional :class:`~isaaclab.sensors.camera.TiledCameraCfg` used to spawn a dedicated video-only camera for state-based environments (no observation ``TiledCamera`` in the scene). - Defaults to :data:`DEFAULT_VIDEO_FALLBACK_CAMERA_CFG` — a pinhole camera placed at + Defaults to :data:`DEFAULT_VIDEO_FALLBACK_CAMERA_CFG` - a pinhole camera placed at ``(-7, 0, 3)`` relative to env_0's origin, giving a reasonable side view for environments with ~4 m spacing. Set to ``None`` to disable fallback cameras entirely (e.g. for vision-based tasks that already have an observation :class:`~isaaclab.sensors.camera.TiledCamera`). @@ -104,6 +131,9 @@ class VideoRecorderCfg: Spawning is **gated on** :attr:`render_mode` ``== "rgb_array"`` (i.e. ``--video`` must be active), so the default value causes zero overhead during ordinary training runs. + For Newton-based backends (Newton Warp or OVRTX renderer), the Newton OpenGL perspective + viewer is used instead of fallback TiledCameras - see :attr:`gl_viewer_width`. + To customise the pose for a different environment scale, override in the task's ``__post_init__``:: self.video_recorder.fallback_camera_cfg = self.video_recorder.fallback_camera_cfg.replace( @@ -115,3 +145,22 @@ class VideoRecorderCfg: renderer path check succeeds and ``TiledCamera`` correctly infers ``num_envs`` from the scene. 
""" + + gl_viewer_width: int = 1280 + """Width in pixels of the Newton OpenGL perspective video frame. + + Only used when the active physics/renderer backend exposes a Newton model + (i.e. Newton Warp or OVRTX renderer presets). In that case :class:`VideoRecorder` + spawns a headless :class:`newton.viewer.ViewerGL` instance that renders an isometric + perspective view of all environments (limited to :attr:`video_num_tiles` when set), + replacing the fallback :class:`~isaaclab.sensors.camera.TiledCamera` grid. + + This perspective path is activated only when ``render_mode == "rgb_array"`` + (i.e. ``--video`` is active). Regular training runs are unaffected. + """ + + gl_viewer_height: int = 720 + """Height in pixels of the Newton OpenGL perspective video frame. + + See :attr:`gl_viewer_width` for full description. + """ From 2cca98d1c9eec5daaa6dc1a5ec975f856a458c4b Mon Sep 17 00:00:00 2001 From: Brian Dilinila Date: Thu, 12 Mar 2026 18:59:00 -0700 Subject: [PATCH 05/11] Match Newton GL perspective camera to OmniverseKit_Persp camera through FOV conversion and position --- .../isaaclab/envs/utils/video_recorder.py | 44 +++++++++---------- .../isaaclab/envs/utils/video_recorder_cfg.py | 20 +++++++++ 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder.py b/source/isaaclab/isaaclab/envs/utils/video_recorder.py index 158c71ef371..14faf241c58 100644 --- a/source/isaaclab/isaaclab/envs/utils/video_recorder.py +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder.py @@ -205,33 +205,31 @@ def _try_init_gl_viewer(self) -> None: self._gl_viewer = viewer - # Frame the camera once using the current (reset) physics state. - # 1. Set an isometric CAD-viewport angle (pitch/yaw in degrees) so that - # _frame_camera_on_model() preserves the viewing direction and only - # adjusts the distance to fit the scene. - # 2. 
Run a throwaway begin_frame/log_state/end_frame cycle so the viewer - # has geometry context (needed for accurate bounding-box computation). - # 3. Call _frame_camera_on_model() to auto-set the distance. - # All subsequent renders in _render_newton_gl_rgb_array() reuse this camera. + # Position the camera to match the Kit /OmniverseKit_Persp viewport. + # Convert cfg.camera_eye / cfg.camera_lookat (same defaults as ViewerCfg) + # into Newton GL pitch/yaw (Z-up convention, degrees). try: import warp as wp - sim = SimulationContext.instance() - state = sdp.get_newton_state() - dt = sim.get_physics_dt() - # Match the Kit /OmniverseKit_Persp default FOV (60°) so the distance - # computed by _frame_camera_on_model() is consistent. Newton GL defaults - # to 45°, which places the camera ~1.3× further back for the same extent. - viewer.camera.fov = 60.0 - # Isometric angle: ~35° down, 45° to the right - matches the style of - # the Kit /OmniverseKit_Persp default "user" viewport camera. - viewer.set_camera(pos=wp.vec3(0.0, 0.0, 0.0), pitch=-35.0, yaw=45.0) - viewer.begin_frame(dt) - viewer.log_state(state) - viewer.end_frame() - viewer._frame_camera_on_model() + ex, ey, ez = self.cfg.camera_eye + lx, ly, lz = self.cfg.camera_lookat + dx, dy, dz = lx - ex, ly - ey, lz - ez + length = math.sqrt(dx**2 + dy**2 + dz**2) + dx, dy, dz = dx / length, dy / length, dz / length + pitch = math.degrees(math.asin(max(-1.0, min(1.0, dz)))) + yaw = math.degrees(math.atan2(dy, dx)) + + # Kit's /OmniverseKit_Persp uses a *horizontal* FOV of 60° (derived + # from its default focal_length=18.15 mm / horizontal_aperture=20.955 mm). + # pyglet / Newton GL use *vertical* FOV. Convert so both cameras see + # the same scene extent. 
+ aspect = self.cfg.gl_viewer_width / self.cfg.gl_viewer_height + kit_h_fov_rad = math.radians(60.0) + v_fov_deg = math.degrees(2.0 * math.atan(math.tan(kit_h_fov_rad / 2.0) / aspect)) + viewer.camera.fov = v_fov_deg # ≈ 36° for 1280×720 + viewer.set_camera(pos=wp.vec3(ex, ey, ez), pitch=pitch, yaw=yaw) except Exception as frame_exc: - logger.warning("[VideoRecorder] GL viewer camera framing failed: %s", frame_exc) + logger.warning("[VideoRecorder] GL viewer camera setup failed: %s", frame_exc) logger.info( "[VideoRecorder] Newton GL perspective viewer ready (%dx%d, max_worlds=%s).", diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py b/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py index b0d0b57cbfc..a39923f6334 100644 --- a/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py @@ -146,6 +146,26 @@ class VideoRecorderCfg: the scene. """ + camera_eye: tuple[float, float, float] = (7.5, 7.5, 7.5) + """World-space position of the Newton GL perspective camera (in metres). + + Defaults to ``(7.5, 7.5, 7.5)`` — the same value as :attr:`~isaaclab.envs.common.ViewerCfg.eye` + — so the Newton GL video matches the Kit ``/OmniverseKit_Persp`` viewport exactly. + + Override to reposition the camera for tasks with a very different scene scale:: + + self.video_recorder.camera_eye = (20.0, 20.0, 20.0) + self.video_recorder.camera_lookat = (0.0, 0.0, 0.0) + + Only used by Newton backends in perspective mode. + """ + + camera_lookat: tuple[float, float, float] = (0.0, 0.0, 0.0) + """World-space point the Newton GL perspective camera looks at (in metres). + + Defaults to ``(0.0, 0.0, 0.0)`` — the same as :attr:`~isaaclab.envs.common.ViewerCfg.lookat`. + """ + gl_viewer_width: int = 1280 """Width in pixels of the Newton OpenGL perspective video frame. 
From ed55c9e39063c3e565b05abdbd955c0616f7e8f8 Mon Sep 17 00:00:00 2001 From: Brian Dilinila Date: Thu, 12 Mar 2026 20:16:24 -0700 Subject: [PATCH 06/11] Refactor video recording: move OV camera into VideoRecorder, clean up routing --- .../isaaclab/isaaclab/envs/direct_marl_env.py | 47 +-- .../isaaclab/envs/direct_marl_env_cfg.py | 13 +- .../isaaclab/isaaclab/envs/direct_rl_env.py | 50 +-- .../isaaclab/envs/direct_rl_env_cfg.py | 13 +- .../isaaclab/envs/manager_based_env_cfg.py | 13 +- .../isaaclab/envs/manager_based_rl_env.py | 48 +-- .../isaaclab/envs/utils/video_recorder.py | 309 ++++++------------ .../isaaclab/envs/utils/video_recorder_cfg.py | 165 +++------- 8 files changed, 164 insertions(+), 494 deletions(-) diff --git a/source/isaaclab/isaaclab/envs/direct_marl_env.py b/source/isaaclab/isaaclab/envs/direct_marl_env.py index 1bb2400d94d..412228b5c74 100644 --- a/source/isaaclab/isaaclab/envs/direct_marl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_marl_env.py @@ -176,6 +176,8 @@ def _init_sim(self, render_mode: str | None = None, **kwargs): # Forward render_mode so VideoRecorder only spawns fallback cameras when --video is active. if self.cfg.video_recorder is not None: self.cfg.video_recorder.render_mode = render_mode + self.cfg.video_recorder.kit_cam_prim_path = self.cfg.viewer.cam_prim_path + self.cfg.video_recorder.kit_resolution = self.cfg.viewer.resolution self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( self.cfg.video_recorder, self.scene ) @@ -535,50 +537,7 @@ def render(self, recompute: bool = False) -> np.ndarray | None: if self.render_mode == "human" or self.render_mode is None: return None elif self.render_mode == "rgb_array": - # Prefer TiledCamera in tiled mode; works for all backends and produces - # consistent per-agent frames. In perspective mode the recorder returns None - # intentionally (bypassing TiledCamera entirely) so we always reach the - # omni.replicator Kit-viewport path below. 
- if self.video_recorder is not None: - frame = self.video_recorder.render_rgb_array() - if frame is not None: - return frame - _perspective_mode = ( - self.video_recorder is not None - and self.cfg.video_recorder is not None - and getattr(self.cfg.video_recorder, "video_mode", "tiled") == "perspective" - ) - if not _perspective_mode and not self.sim.has_gui and not self.sim.has_offscreen_render: - raise RuntimeError( - "Cannot render 'rgb_array': no TiledCamera sensor with RGB output was found in" - " the scene, and neither GUI nor offscreen rendering is available." - " Add a TiledCamera sensor to the scene configuration to enable video recording." - ) - # Kit-based fallback: use an omni.replicator annotator on the viewer camera. - # /OmniverseKit_Persp is NOT an RTX sensor, so the guard above may have skipped - # sim.render() when has_rtx_sensors=True (e.g., vision tasks with TiledCamera). - # Force a render pass here so the annotator receives non-empty data. - if self.has_rtx_sensors: - self.sim.render() - if not hasattr(self, "_rgb_annotator"): - import omni.replicator.core as rep - - # create render product - self._render_product = rep.create.render_product( - self.cfg.viewer.cam_prim_path, self.cfg.viewer.resolution - ) - # create rgb annotator -- used to read data from the render product - self._rgb_annotator = rep.AnnotatorRegistry.get_annotator("rgb", device="cpu") - self._rgb_annotator.attach([self._render_product]) - # obtain the rgb data - rgb_data = self._rgb_annotator.get_data() - # convert to numpy array - rgb_data = np.frombuffer(rgb_data, dtype=np.uint8).reshape(*rgb_data.shape) - # note: initially the renderer is warming up and returns empty data - if rgb_data.size == 0: - return np.zeros((self.cfg.viewer.resolution[1], self.cfg.viewer.resolution[0], 3), dtype=np.uint8) - else: - return rgb_data[:, :, :3] + return self.video_recorder.render_rgb_array() else: raise NotImplementedError( f"Render mode '{self.render_mode}' is not supported. 
Please use: {self.metadata['render_modes']}." diff --git a/source/isaaclab/isaaclab/envs/direct_marl_env_cfg.py b/source/isaaclab/isaaclab/envs/direct_marl_env_cfg.py index 962c1ffb99b..d697c7fad93 100644 --- a/source/isaaclab/isaaclab/envs/direct_marl_env_cfg.py +++ b/source/isaaclab/isaaclab/envs/direct_marl_env_cfg.py @@ -237,14 +237,9 @@ class DirectMARLEnvCfg: """Directory for logging experiment artifacts. Defaults to None, in which case no specific log directory is set.""" video_recorder: VideoRecorderCfg = VideoRecorderCfg() - """Configuration for the viewport recorder used when ``render_mode="rgb_array"``. + """Configuration for video recording when ``render_mode="rgb_array"`` (i.e. ``--video``). - Defaults to a :class:`~isaaclab.envs.VideoRecorderCfg` that captures all environments - in a square tile-grid using :class:`~isaaclab.envs.VideoRecorder`. - - Set :attr:`~isaaclab.envs.VideoRecorderCfg.class_type` to a custom subclass to swap the - capture implementation without modifying environment code. Set to ``None`` to disable - TiledCamera-based recording entirely and fall back to the Kit-based omni.replicator path. - - CLI example: ``env.video_recorder.video_num_tiles=9`` + See :class:`~isaaclab.envs.VideoRecorderCfg` for available options including + ``video_mode`` (``"perspective"`` or ``"tiled"``), ``camera_eye``/``camera_lookat``, + and ``video_num_tiles``. Set to ``None`` to disable the recorder entirely. """ diff --git a/source/isaaclab/isaaclab/envs/direct_rl_env.py b/source/isaaclab/isaaclab/envs/direct_rl_env.py index f2812062e69..1115d64415b 100644 --- a/source/isaaclab/isaaclab/envs/direct_rl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_rl_env.py @@ -181,6 +181,8 @@ def _init_sim(self, render_mode: str | None = None, **kwargs): # Forward render_mode so VideoRecorder only spawns fallback cameras when --video is active. 
if self.cfg.video_recorder is not None: self.cfg.video_recorder.render_mode = render_mode + self.cfg.video_recorder.kit_cam_prim_path = self.cfg.viewer.cam_prim_path + self.cfg.video_recorder.kit_resolution = self.cfg.viewer.resolution self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( self.cfg.video_recorder, self.scene ) @@ -503,53 +505,7 @@ def render(self, recompute: bool = False) -> np.ndarray | None: if self.render_mode == "human" or self.render_mode is None: return None elif self.render_mode == "rgb_array": - # Prefer TiledCamera in tiled mode; works for all backends and produces - # consistent per-agent frames. In perspective mode the recorder returns None - # intentionally (bypassing TiledCamera entirely) so we always reach the - # omni.replicator Kit-viewport path below. - if self.video_recorder is not None: - frame = self.video_recorder.render_rgb_array() - if frame is not None: - return frame - # In perspective mode the recorder returns None intentionally so we fall through - # to the omni.replicator viewport path below. Skip the has_offscreen_render guard - # in that case; the annotator works in non-headless Kit sessions too. - _perspective_mode = ( - self.video_recorder is not None - and self.cfg.video_recorder is not None - and getattr(self.cfg.video_recorder, "video_mode", "tiled") == "perspective" - ) - if not _perspective_mode and not self.sim.has_gui and not self.sim.has_offscreen_render: - raise RuntimeError( - "Cannot render 'rgb_array': no TiledCamera sensor with RGB output was found in" - " the scene, and neither GUI nor offscreen rendering is available." - " Add a TiledCamera sensor to the scene configuration to enable video recording." - ) - # Kit-based fallback: use an omni.replicator annotator on the viewer camera. - # /OmniverseKit_Persp is NOT an RTX sensor, so the guard above may have skipped - # sim.render() when has_rtx_sensors=True (e.g., vision tasks with TiledCamera). 
- # Force a render pass here so the annotator receives non-empty data. - if self.has_rtx_sensors: - self.sim.render() - if not hasattr(self, "_rgb_annotator"): - import omni.replicator.core as rep - - # create render product - self._render_product = rep.create.render_product( - self.cfg.viewer.cam_prim_path, self.cfg.viewer.resolution - ) - # create rgb annotator -- used to read data from the render product - self._rgb_annotator = rep.AnnotatorRegistry.get_annotator("rgb", device="cpu") - self._rgb_annotator.attach([self._render_product]) - # obtain the rgb data - rgb_data = self._rgb_annotator.get_data() - # convert to numpy array - rgb_data = np.frombuffer(rgb_data, dtype=np.uint8).reshape(*rgb_data.shape) - # note: initially the renderer is warming up and returns empty data - if rgb_data.size == 0: - return np.zeros((self.cfg.viewer.resolution[1], self.cfg.viewer.resolution[0], 3), dtype=np.uint8) - else: - return rgb_data[:, :, :3] + return self.video_recorder.render_rgb_array() else: raise NotImplementedError( f"Render mode '{self.render_mode}' is not supported. Please use: {self.metadata['render_modes']}." diff --git a/source/isaaclab/isaaclab/envs/direct_rl_env_cfg.py b/source/isaaclab/isaaclab/envs/direct_rl_env_cfg.py index c7c11bdb2e9..acc597dd3dd 100644 --- a/source/isaaclab/isaaclab/envs/direct_rl_env_cfg.py +++ b/source/isaaclab/isaaclab/envs/direct_rl_env_cfg.py @@ -257,14 +257,9 @@ class DirectRLEnvCfg: """Directory for logging experiment artifacts. Defaults to None, in which case no specific log directory is set.""" video_recorder: VideoRecorderCfg = VideoRecorderCfg() - """Configuration for the viewport recorder used when ``render_mode="rgb_array"``. + """Configuration for video recording when ``render_mode="rgb_array"`` (i.e. ``--video``). - Defaults to a :class:`~isaaclab.envs.VideoRecorderCfg` that captures all environments - in a square tile-grid using :class:`~isaaclab.envs.VideoRecorder`. 
- - Set :attr:`~isaaclab.envs.VideoRecorderCfg.class_type` to a custom subclass to swap the - capture implementation without modifying environment code. Set to ``None`` to disable - TiledCamera-based recording entirely and fall back to the Kit-based omni.replicator path. - - CLI example: ``env.video_recorder.video_num_tiles=9`` + See :class:`~isaaclab.envs.VideoRecorderCfg` for available options including + ``video_mode`` (``"perspective"`` or ``"tiled"``), ``camera_eye``/``camera_lookat``, + and ``video_num_tiles``. Set to ``None`` to disable the recorder entirely. """ diff --git a/source/isaaclab/isaaclab/envs/manager_based_env_cfg.py b/source/isaaclab/isaaclab/envs/manager_based_env_cfg.py index b231d278e44..2df177f2238 100644 --- a/source/isaaclab/isaaclab/envs/manager_based_env_cfg.py +++ b/source/isaaclab/isaaclab/envs/manager_based_env_cfg.py @@ -166,14 +166,9 @@ class ManagerBasedEnvCfg: """Directory for logging experiment artifacts. Defaults to None, in which case no specific log directory is set.""" video_recorder: VideoRecorderCfg = VideoRecorderCfg() - """Configuration for the viewport recorder used when ``render_mode="rgb_array"``. + """Configuration for video recording when ``render_mode="rgb_array"`` (i.e. ``--video``). - Defaults to a :class:`~isaaclab.envs.VideoRecorderCfg` that captures all environments - in a square tile-grid using :class:`~isaaclab.envs.VideoRecorder`. - - Set :attr:`~isaaclab.envs.VideoRecorderCfg.class_type` to a custom subclass to swap the - capture implementation without modifying environment code. Set to ``None`` to disable - TiledCamera-based recording entirely and fall back to the Kit-based omni.replicator path. - - CLI example: ``env.video_recorder.video_num_tiles=9`` + See :class:`~isaaclab.envs.VideoRecorderCfg` for available options including + ``video_mode`` (``"perspective"`` or ``"tiled"``), ``camera_eye``/``camera_lookat``, + and ``video_num_tiles``. Set to ``None`` to disable the recorder entirely. 
""" diff --git a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py index 68f9be35aac..669ac93032f 100644 --- a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py +++ b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py @@ -81,6 +81,8 @@ def __init__(self, cfg: ManagerBasedRLEnvCfg, render_mode: str | None = None, ** # so fallback cameras are only spawned when --video is active (render_mode="rgb_array"). if cfg.video_recorder is not None: cfg.video_recorder.render_mode = render_mode + cfg.video_recorder.kit_cam_prim_path = cfg.viewer.cam_prim_path + cfg.video_recorder.kit_resolution = cfg.viewer.resolution # initialize the base class to setup the scene. super().__init__(cfg=cfg) @@ -92,7 +94,6 @@ def __init__(self, cfg: ManagerBasedRLEnvCfg, render_mode: str | None = None, ** # produced video matches the simulation self.metadata["render_fps"] = 1 / self.step_dt self.has_rtx_sensors = self.sim.get_setting("/isaaclab/render/rtx_sensors") - print("[INFO]: Completed setting up the environment...") """ @@ -277,50 +278,7 @@ def render(self, recompute: bool = False) -> np.ndarray | None: if self.render_mode == "human" or self.render_mode is None: return None elif self.render_mode == "rgb_array": - # Prefer TiledCamera in tiled mode; works for all backends and produces - # consistent per-agent frames. In perspective mode the recorder returns None - # intentionally (bypassing TiledCamera entirely) so we always reach the - # omni.replicator Kit-viewport path below. 
- if self.video_recorder is not None: - frame = self.video_recorder.render_rgb_array() - if frame is not None: - return frame - _perspective_mode = ( - self.video_recorder is not None - and self.cfg.video_recorder is not None - and getattr(self.cfg.video_recorder, "video_mode", "tiled") == "perspective" - ) - if not _perspective_mode and not self.sim.has_gui and not self.sim.has_offscreen_render: - raise RuntimeError( - "Cannot render 'rgb_array': no TiledCamera sensor with RGB output was found in" - " the scene, and neither GUI nor offscreen rendering is available." - " Add a TiledCamera sensor to the scene configuration to enable video recording." - ) - # Kit-based fallback: use an omni.replicator annotator on the viewer camera. - # /OmniverseKit_Persp is NOT an RTX sensor, so the guard above may have skipped - # sim.render() when has_rtx_sensors=True (e.g., vision tasks with TiledCamera). - # Force a render pass here so the annotator receives non-empty data. - if self.has_rtx_sensors: - self.sim.render() - if not hasattr(self, "_rgb_annotator"): - import omni.replicator.core as rep - - # create render product - self._render_product = rep.create.render_product( - self.cfg.viewer.cam_prim_path, self.cfg.viewer.resolution - ) - # create rgb annotator -- used to read data from the render product - self._rgb_annotator = rep.AnnotatorRegistry.get_annotator("rgb", device="cpu") - self._rgb_annotator.attach([self._render_product]) - # obtain the rgb data - rgb_data = self._rgb_annotator.get_data() - # convert to numpy array - rgb_data = np.frombuffer(rgb_data, dtype=np.uint8).reshape(*rgb_data.shape) - # note: initially the renderer is warming up and returns empty data - if rgb_data.size == 0: - return np.zeros((self.cfg.viewer.resolution[1], self.cfg.viewer.resolution[0], 3), dtype=np.uint8) - else: - return rgb_data[:, :, :3] + return self.video_recorder.render_rgb_array() else: raise NotImplementedError( f"Render mode '{self.render_mode}' is not supported. 
Please use: {self.metadata['render_modes']}." diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder.py b/source/isaaclab/isaaclab/envs/utils/video_recorder.py index 14faf241c58..325b174bb06 100644 --- a/source/isaaclab/isaaclab/envs/utils/video_recorder.py +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder.py @@ -3,8 +3,17 @@ # # SPDX-License-Identifier: BSD-3-Clause -"""Video recorder for capturing video frames from either a Newton OpenGL perspective -viewer or a :class:`~isaaclab.sensors.camera.TiledCamera` sensor.""" +"""Video recorder implementation. + +* **Perspective view** (``video_mode="perspective"``) — captures a single wide-angle + view of the scene using the Newton GL viewer (Newton backends) or the Kit viewport + camera ``/OmniverseKit_Persp`` via ``omni.replicator.core`` (Kit backends). +* **Camera sensor / tiled** (``video_mode="tiled"``) — reads pixel data from a + :class:`~isaaclab.sensors.camera.TiledCamera` sensor, producing a grid of per-agent + views. + +See :mod:`video_recorder_cfg` for configuration and full mode descriptions. +""" from __future__ import annotations @@ -24,40 +33,11 @@ class VideoRecorder: """Records video frames from the scene's active renderer. - The recording strategy is determined by :attr:`~VideoRecorderCfg.video_mode`: - - **``video_mode = "perspective"`` (default)** - - The TiledCamera is **bypassed** entirely, even when one is in the scene. - - * **Newton backends** - headless :class:`newton.viewer.ViewerGL` renders an isometric - wide-angle view of all environments (limited to ``video_num_tiles`` when set). - * **Kit backends** - returns ``None`` so that the environment's ``render()`` method - falls through to the ``omni.replicator.core`` Kit viewport camera path - (``/OmniverseKit_Persp``). - - **``video_mode = "tiled"``** - - Frame sources are tried in priority order on every :meth:`render_rgb_array` call: - - 1. 
**Observation** :class:`~isaaclab.sensors.camera.TiledCamera` already present in - the scene; vision-based env path. Reuses the agent's own camera sensor at zero - extra cost and produces a square tile-grid of per-agent views. - - 2. **Newton OpenGL perspective viewer** - Newton backends with no observation - ``TiledCamera``. A headless :class:`newton.viewer.ViewerGL` is lazy-initialised - on the first call and renders an isometric perspective of all environments - (limited to ``video_num_tiles`` when that field is set). - - 3. **Fallback** :class:`~isaaclab.sensors.camera.TiledCamera` - state-based env path - with Kit-based backends. A camera prim is spawned per environment before - ``sim.reset()``. - - For fallback cameras to initialise correctly they **must** be created before - ``sim.reset()`` is called; the environment base classes handle this. + See :class:`~isaaclab.envs.utils.video_recorder_cfg.VideoRecorderCfg` for the full + description of ``video_mode`` and the fallback priority chain. Args: - cfg: Configuration for this recorder. + cfg: Recorder configuration. scene: The interactive scene that owns the sensors. """ @@ -65,15 +45,11 @@ def __init__(self, cfg: VideoRecorderCfg, scene: InteractiveScene): self.cfg = cfg self._scene = scene self._fallback_tiled_camera = None - - # Newton GL perspective viewer; lazy-initialised on first render call. self._gl_viewer = None - self._gl_viewer_initialized = False # True once _try_init_gl_viewer() has run + self._gl_viewer_init_attempted = False if cfg.render_mode == "rgb_array": - # Enable EGL-backed headless rendering for pyglet before ViewerGL is ever - # imported. Must be set before the first 'import pyglet.window'. This is a - # no-op when pyglet is not installed (GL viewer simply stays None). + # enable EGL headless rendering for pyglet before any pyglet.window import. 
try: import pyglet @@ -82,99 +58,40 @@ def __init__(self, cfg: VideoRecorderCfg, scene: InteractiveScene): except ImportError: pass - # Skip spawning fallback TiledCameras when: - # (a) a Newton backend is active; the GL perspective viewer handles state-based - # rendering so creating per-env camera prims would waste GPU resources, or - # (b) perspective mode is requested; TiledCamera is not used in that path. - _newton_backend = self._is_newton_backend() - if cfg.fallback_camera_cfg is not None and not _newton_backend and cfg.video_mode == "tiled": + # pre-spawn fallback TiledCamera; must exist in USD stage before physics initialises. + # whether it is actually used is decided lazily in _find_video_camera(). + if cfg.fallback_camera_cfg is not None and cfg.video_mode == "tiled": self._fallback_tiled_camera = self._spawn_fallback_cameras(cfg, scene) - # ------------------------------------------------------------------ - # Public API - # ------------------------------------------------------------------ - def render_rgb_array(self) -> np.ndarray | None: - """Return an RGB frame for video recording, or ``None`` when unavailable. - - The frame source depends on :attr:`~VideoRecorderCfg.video_mode`: - - **``"tiled"`` mode** (default): - - * Source 1 - observation :class:`~isaaclab.sensors.camera.TiledCamera`: - returns a square tile-grid ``(G*H, G*W, 3)`` uint8 array, - where ``G = ceil(sqrt(video_num_tiles))``. - * Source 2 - Newton GL perspective viewer (state-based + Newton backend): - returns ``(gl_viewer_height, gl_viewer_width, 3)`` uint8. - * Source 3 - fallback :class:`~isaaclab.sensors.camera.TiledCamera` - (state-based + Kit backend): same tile-grid shape as source 1. - - **``"perspective"`` mode**: - - * Newton backends: Newton GL perspective viewer (same shape as source 2). - * Kit backends: returns ``None`` so the environment's ``render()`` method - falls through to the ``omni.replicator.core`` viewport camera path. 
- """ + """Return an RGB frame for video recording, or ``None`` on transient Kit warmup.""" if self.cfg.video_mode == "perspective": - # Perspective mode: bypass TiledCamera entirely. - # Newton backends → GL viewer; Kit backends → return None (env render() continues). - if not self._gl_viewer_initialized: + if not self._gl_viewer_init_attempted: self._try_init_gl_viewer() if self._gl_viewer is not None: return self._render_newton_gl_rgb_array() - # No GL viewer (Kit backend) → signal the env to use its Kit perspective path. - return None - - # --- Tiled mode (default) - priority chain. --------------------------------- + return self._render_kit_perspective_rgb_array() - # Source 1: observation TiledCamera (vision-based path). - # _find_video_camera() sets self._video_camera and caches grid constants. + # tiled mode: use observation TiledCamera if available, then fallback. video_camera = self._find_video_camera() - has_obs_camera = video_camera is not None and video_camera is not self._fallback_tiled_camera - if has_obs_camera: - return self._render_tiled_camera_rgb_array() - - # Source 2: Newton GL perspective viewer (state-based + Newton backend). - if not self._gl_viewer_initialized: - self._try_init_gl_viewer() - if self._gl_viewer is not None: - return self._render_newton_gl_rgb_array() - - # Source 3: fallback TiledCamera (state-based + Kit backend). if video_camera is None: - return None + raise RuntimeError( + "Cannot record video in tiled mode: no TiledCamera sensor with RGB output was found" + " in the scene. Add a TiledCamera sensor or switch to perspective mode (--video=perspective)." 
+ ) + if video_camera is not self._fallback_tiled_camera: + logger.debug("[VideoRecorder] tiled source: observation TiledCamera") + else: + logger.debug("[VideoRecorder] tiled source: fallback TiledCamera") return self._render_tiled_camera_rgb_array() - # ------------------------------------------------------------------ - # Internal helpers - Newton GL viewer - # ------------------------------------------------------------------ - - @staticmethod - def _is_newton_backend() -> bool: - """Return ``True`` when the active scene data provider is Newton-based. - - Detected by duck-typing: Newton providers expose ``get_newton_model()``, - while PhysX providers do not. Safe to call before ``sim.reset()`` since - the provider is registered during scene setup. - """ - try: - from isaaclab.sim import SimulationContext - - sdp = SimulationContext.instance().initialize_scene_data_provider() - return hasattr(sdp, "get_newton_model") - except Exception: - return False - def _try_init_gl_viewer(self) -> None: - """Lazy-initialise the Newton OpenGL perspective viewer. + """Lazy-initialise the Newton GL viewer on the first render call. - Called once on the first :meth:`render_rgb_array` invocation, at which point - ``sim.reset()`` has already been called so the Newton model is fully built. - On failure the viewer stays ``None`` and the caller falls through to the next - source: source 3 (fallback TiledCamera) in tiled mode, or ``None`` (Kit - viewport path) in perspective mode. + Called after ``sim.reset()`` so the Newton model is fully built. + Leaves ``_gl_viewer`` as ``None`` on failure so callers fall through gracefully. 
""" - self._gl_viewer_initialized = True + self._gl_viewer_init_attempted = True try: from isaaclab.sim import SimulationContext @@ -192,22 +109,13 @@ def _try_init_gl_viewer(self) -> None: None if self.cfg.video_num_tiles < 0 else min(self.cfg.video_num_tiles, model.world_count) ) - viewer = ViewerGL( - width=self.cfg.gl_viewer_width, - height=self.cfg.gl_viewer_height, - headless=True, - ) - # set_model() auto-computes per-world visual offsets from body positions. + viewer = ViewerGL(width=self.cfg.gl_viewer_width, height=self.cfg.gl_viewer_height, headless=True) viewer.set_model(model, max_worlds=max_worlds) - # Zero additional spacing - world positions are already in model body_q. - viewer.set_world_offsets((0.0, 0.0, 0.0)) + viewer.set_world_offsets((0.0, 0.0, 0.0)) # world positions already in body_q viewer.up_axis = 2 # Z-up - self._gl_viewer = viewer - # Position the camera to match the Kit /OmniverseKit_Persp viewport. - # Convert cfg.camera_eye / cfg.camera_lookat (same defaults as ViewerCfg) - # into Newton GL pitch/yaw (Z-up convention, degrees). + # place camera to match Kit /OmniverseKit_Persp (same eye/lookat as ViewerCfg). try: import warp as wp @@ -219,20 +127,16 @@ def _try_init_gl_viewer(self) -> None: pitch = math.degrees(math.asin(max(-1.0, min(1.0, dz)))) yaw = math.degrees(math.atan2(dy, dx)) - # Kit's /OmniverseKit_Persp uses a *horizontal* FOV of 60° (derived - # from its default focal_length=18.15 mm / horizontal_aperture=20.955 mm). - # pyglet / Newton GL use *vertical* FOV. Convert so both cameras see - # the same scene extent. + # Kit uses horizontal FOV (60°); pyglet/Newton GL uses vertical FOV. 
aspect = self.cfg.gl_viewer_width / self.cfg.gl_viewer_height - kit_h_fov_rad = math.radians(60.0) - v_fov_deg = math.degrees(2.0 * math.atan(math.tan(kit_h_fov_rad / 2.0) / aspect)) + v_fov_deg = math.degrees(2.0 * math.atan(math.tan(math.radians(60.0) / 2.0) / aspect)) viewer.camera.fov = v_fov_deg # ≈ 36° for 1280×720 viewer.set_camera(pos=wp.vec3(ex, ey, ez), pitch=pitch, yaw=yaw) - except Exception as frame_exc: - logger.warning("[VideoRecorder] GL viewer camera setup failed: %s", frame_exc) + except Exception as exc: + logger.warning("[VideoRecorder] GL viewer camera setup failed: %s", exc) logger.info( - "[VideoRecorder] Newton GL perspective viewer ready (%dx%d, max_worlds=%s).", + "[VideoRecorder] Newton GL viewer ready (%dx%d, max_worlds=%s).", self.cfg.gl_viewer_width, self.cfg.gl_viewer_height, max_worlds, @@ -241,49 +145,61 @@ def _try_init_gl_viewer(self) -> None: logger.warning("[VideoRecorder] Newton GL viewer unavailable: %s", exc) def _render_newton_gl_rgb_array(self) -> np.ndarray | None: - """Render one perspective frame from the Newton OpenGL viewer. - - Returns: - RGB array of shape ``(gl_viewer_height, gl_viewer_width, 3)`` and - dtype ``uint8``, or ``None`` on error. - """ + """Return one RGB frame from the Newton GL viewer, or ``None`` on error.""" try: from isaaclab.sim import SimulationContext sim = SimulationContext.instance() sdp = sim.initialize_scene_data_provider() state = sdp.get_newton_state() - - # Use the actual physics timestep so that the viewer does not treat - # dt=0 as a no-op and skip drawing geometry on frames after the first. 
dt = sim.get_physics_dt()
 
             viewer = self._gl_viewer
             viewer.begin_frame(dt)
             viewer.log_state(state)
-            viewer.end_frame()  # renders scene geometry to the off-screen FBO
-            frame = viewer.get_frame()  # wp.array (H, W, 3) uint8 - GPU readback via PBO
-            return frame.numpy()
+            viewer.end_frame()
+            return viewer.get_frame().numpy()
         except Exception as exc:
             logger.warning("[VideoRecorder] GL frame capture failed: %s", exc)
             return None
 
-    # ------------------------------------------------------------------
-    # Internal helpers - TiledCamera (sources 1 and 3)
-    # ------------------------------------------------------------------
+    def _render_kit_perspective_rgb_array(self) -> np.ndarray | None:
+        """Return one RGB frame from the Kit /OmniverseKit_Persp camera via omni.replicator.
+
+        Returns a blank (black) frame while the renderer is warming up, or ``None`` if capture fails.
+        """
+        try:
+            import omni.replicator.core as rep
+
+            from isaaclab.sim import SimulationContext
+
+            # /OmniverseKit_Persp is not an RTX sensor; always force a render pass for fresh data. 
+ SimulationContext.instance().render() + + if not hasattr(self, "_rgb_annotator"): + self._render_product = rep.create.render_product( + self.cfg.kit_cam_prim_path, self.cfg.kit_resolution + ) + self._rgb_annotator = rep.AnnotatorRegistry.get_annotator("rgb", device="cpu") + self._rgb_annotator.attach([self._render_product]) + + rgb_data = self._rgb_annotator.get_data() + rgb_data = np.frombuffer(rgb_data, dtype=np.uint8).reshape(*rgb_data.shape) + if rgb_data.size == 0: + # renderer is warming up; return blank frame + h, w = self.cfg.kit_resolution[1], self.cfg.kit_resolution[0] + return np.zeros((h, w, 3), dtype=np.uint8) + return rgb_data[:, :, :3] + except Exception as exc: + logger.warning("[VideoRecorder] Kit perspective capture failed: %s", exc) + return None @staticmethod def _spawn_fallback_cameras(cfg: VideoRecorderCfg, scene: InteractiveScene): - """Spawn one video camera prim per environment (up to ``cfg.video_num_tiles``) and - return a single :class:`~isaaclab.sensors.camera.TiledCamera` covering all of them. - - Camera prims are spawned at ``/World/envs/env_{i}/VideoCamera`` for - ``i in range(n_cameras)``, then a ``TiledCamera`` with the regex prim path - ``/World/envs/env_.*/VideoCamera`` is created so that all spawned prims are - discovered and rendered as tiles. + """Spawn one video camera prim per environment and return a single TiledCamera. - This must be called **before** ``sim.reset()`` so the prims exist in the USD stage - and the ``TiledCamera`` can register for the ``PHYSICS_READY`` callback. + Must be called **before** ``sim.reset()`` so the prims exist when the TiledCamera + registers for its ``PHYSICS_READY`` callback. """ import torch @@ -291,56 +207,37 @@ def _spawn_fallback_cameras(cfg: VideoRecorderCfg, scene: InteractiveScene): from isaaclab.utils.math import convert_camera_frame_orientation_convention camera_cfg = cfg.fallback_camera_cfg - - # Pre-compute the OpenGL rotation offset (mirrors Camera.__init__ logic). 
n_total_envs = scene.num_envs + rot = torch.tensor(camera_cfg.offset.rot, dtype=torch.float32, device="cpu").unsqueeze(0) rot_offset = convert_camera_frame_orientation_convention( rot, origin=camera_cfg.offset.convention, target="opengl" - ) - rot_offset = rot_offset.squeeze(0).cpu().numpy() + ).squeeze(0).cpu().numpy() - # Ensure vertical_aperture is set before calling the spawn func. spawn_cfg = camera_cfg.spawn if spawn_cfg.vertical_aperture is None: spawn_cfg = spawn_cfg.replace( vertical_aperture=spawn_cfg.horizontal_aperture * camera_cfg.height / camera_cfg.width ) - # TiledCamera requires exactly one camera prim per environment (count == num_envs). - # We must therefore spawn cameras for ALL environments, not just video_num_tiles of them. - # The video_num_tiles limit is applied at render time in _render_tiled_camera_rgb_array, - # which only reads the first N tiles - the same behaviour as vision-based observation cameras. for i in range(n_total_envs): - prim_path_i = f"/World/envs/env_{i}/VideoCamera" - spawn_cfg.func(prim_path_i, spawn_cfg, translation=camera_cfg.offset.pos, orientation=rot_offset) - - # Create one TiledCamera that discovers all spawned prims via the regex path. - # spawn=None tells Camera.__init__ to skip re-spawning; it will verify the prims exist. - tiled_cfg = camera_cfg.replace( - prim_path="/World/envs/env_.*/VideoCamera", - spawn=None, - ) + spawn_cfg.func(f"/World/envs/env_{i}/VideoCamera", spawn_cfg, + translation=camera_cfg.offset.pos, orientation=rot_offset) + + tiled_cfg = camera_cfg.replace(prim_path="/World/envs/env_.*/VideoCamera", spawn=None) return TiledCamera(tiled_cfg) def _find_video_camera(self): """Locate and cache the TiledCamera to use for video recording. - Search order: - 1. Observation TiledCamera already in the scene (vision-based env path, zero extra cost). - 2. Dedicated fallback TiledCamera from ``cfg.fallback_camera_cfg`` (state-based env path). - - Returns ``None`` if neither source is available. 
- - Previously used the omni.replicator viewer camera which had RGB output only for - Kit-based backends (``physx`` / ``newton,isaacsim_rtx_renderer``). + Priority: (1) observation TiledCamera already in the scene, (2) fallback camera. + Returns ``None`` if neither is available. """ if not hasattr(self, "_video_camera"): from isaaclab.sensors.camera import TiledCamera self._video_camera = None - # Priority 1: observation TiledCamera in the scene (vision-based env path). for sensor in self._scene.sensors.values(): if isinstance(sensor, TiledCamera): output = sensor.data.output @@ -348,14 +245,12 @@ def _find_video_camera(self): self._video_camera = sensor break - # Priority 2: fallback video camera (state-based env path). if self._video_camera is None and self._fallback_tiled_camera is not None: if self._fallback_tiled_camera.is_initialized: output = self._fallback_tiled_camera.data.output if "rgb" in output or "rgba" in output: self._video_camera = self._fallback_tiled_camera - # Cache all grid constants - these are fixed for the lifetime of the env. if self._video_camera is not None: output = self._video_camera.data.output self._video_rgb_key = "rgb" if "rgb" in output else "rgba" @@ -363,48 +258,28 @@ def _find_video_camera(self): n_envs = n_total if self.cfg.video_num_tiles < 0 else min(self.cfg.video_num_tiles, n_total) self._video_n_envs = n_envs self._video_grid_size = math.ceil(math.sqrt(n_envs)) - n_slots = self._video_grid_size * self._video_grid_size + n_slots = self._video_grid_size ** 2 H = int(output[self._video_rgb_key].shape[1]) W = int(output[self._video_rgb_key].shape[2]) self._video_H = H self._video_W = W - # Pre-allocate the black padding block (zero-copy when pad == 0). pad = n_slots - n_envs self._video_pad = np.zeros((pad, H, W, 3), dtype=np.uint8) if pad > 0 else None return self._video_camera def _render_tiled_camera_rgb_array(self) -> np.ndarray: - """Return a square tile-grid of RGB frames from the TiledCamera. 
- - Create a square grid of tiles. This method reads directly from the - TiledCamera sensor buffer to generate the tiles. - - If using the dedicated fallback video cameras (not observation sensors), - this method calls ``update()`` on them first to trigger a fresh render pass. - Observation TiledCameras are updated by ``scene.update()`` during the - environment step and do not need an extra update here. - - Returns: - RGB image of shape ``(G*H, G*W, 3)`` and dtype ``uint8``, where - ``G = ceil(sqrt(num_envs))`` and ``(H, W)`` is the per-tile resolution. - """ - # Fallback cameras are not updated by scene.update(), so drive them manually. + """Return a square tile-grid ``(G*H, G*W, 3)`` from the cached TiledCamera.""" if self._video_camera is self._fallback_tiled_camera: self._fallback_tiled_camera.update(dt=0.0, force_recompute=True) rgb_all = self._video_camera.data.output[self._video_rgb_key] - # Drop alpha channel once on GPU before the CPU transfer. if self._video_rgb_key == "rgba": rgb_all = rgb_all[..., :3] - # .contiguous() ensures the reshape below returns a zero-copy view. - tiles = rgb_all[: self._video_n_envs].contiguous().cpu().numpy() # [n_envs, H, W, 3] + tiles = rgb_all[: self._video_n_envs].contiguous().cpu().numpy() if self._video_pad is not None: tiles = np.concatenate([tiles, self._video_pad], axis=0) - # [grid_size, grid_size, H, W, 3] → [grid_size*H, grid_size*W, 3] + g, H, W = self._video_grid_size, self._video_H, self._video_W - grid = tiles.reshape(g, g, H, W, 3) - grid = grid.transpose(0, 2, 1, 3, 4) - # after transpose the strides are non-standard; reshape must copy here. 
- return grid.reshape(g * H, g * W, 3) + return tiles.reshape(g, g, H, W, 3).transpose(0, 2, 1, 3, 4).reshape(g * H, g * W, 3) diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py b/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py index a39923f6334..3436b2a333a 100644 --- a/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py @@ -3,7 +3,17 @@ # # SPDX-License-Identifier: BSD-3-Clause -"""Configuration for :class:`~isaaclab.envs.utils.video_recorder.VideoRecorder`.""" +"""Configuration for :class:`~isaaclab.envs.utils.video_recorder.VideoRecorder`. + +Two recording modes are supported (set via :attr:`VideoRecorderCfg.video_mode`): + +* **Perspective view** (``"perspective"``, default) - a single wide-angle viewport + camera. Uses the Newton GL viewer on Newton backends; falls back to the Kit + ``/OmniverseKit_Persp`` camera via ``omni.replicator.core`` on Kit backends. +* **Camera sensor / tiled** (``"tiled"``) - reads pixel data from a + :class:`~isaaclab.sensors.camera.TiledCamera` sensor and arranges the per-agent + frames into a square grid. +""" from __future__ import annotations @@ -14,7 +24,7 @@ from .video_recorder import VideoRecorder -DEFAULT_VIDEO_FALLBACK_CAMERA_CFG = TiledCameraCfg( +DEFAULT_TILED_RECORDING_CAMERA_CFG = TiledCameraCfg( prim_path="/World/envs/env_0/VideoCamera", update_period=0.0, height=480, @@ -28,19 +38,12 @@ ), offset=TiledCameraCfg.OffsetCfg(pos=(-7.0, 0.0, 3.0), rot=(0.0, 0.1045, 0.0, 0.9945), convention="world"), ) -"""Default fallback :class:`~isaaclab.sensors.camera.TiledCameraCfg` for state-based video recording. +"""Default :class:`~isaaclab.sensors.camera.TiledCameraCfg` for tiled state-based video recording. -Places a pinhole camera at ``/World/envs/env_0/VideoCamera`` offset ``(-7, 0, 3)`` from -env_0's origin, angled ~12° downward in the world frame. 
This matches the camera position used -by ``Isaac-Cartpole-RGB-v0`` and gives a reasonable side view for medium-scale environments -(env spacing ~4 m). +Places a pinhole camera at ``(-7, 0, 3)`` m relative to env_0's origin, angled ~12° downward. +Only spawned when ``--video=tiled`` is active and no observation TiledCamera exists in the scene. -This is the **default** value of :attr:`VideoRecorderCfg.fallback_camera_cfg`. No action is -needed in task configs - fallback cameras are automatically available for all state-based -environments. Spawning only occurs when :attr:`VideoRecorderCfg.render_mode` is ``"rgb_array"`` -(i.e. ``--video`` is active), so ordinary training runs incur zero overhead. - -To customise the pose for a different environment scale, override in the task's ``__post_init__``:: +Override pose in ``__post_init__`` for tasks with different scene scales:: self.video_recorder.fallback_camera_cfg = self.video_recorder.fallback_camera_cfg.replace( offset=TiledCameraCfg.OffsetCfg(pos=(-3.0, 0.0, 2.0), rot=(0.0, 0.1045, 0.0, 0.9945), convention="world"), @@ -50,137 +53,71 @@ @configclass class VideoRecorderCfg: - """Configuration for :class:`~isaaclab.envs.utils.video_recorder.VideoRecorder`. - - Set :attr:`class_type` to a custom subclass of - :class:`~isaaclab.envs.utils.video_recorder.VideoRecorder` to swap the - video-capture implementation (e.g. an Option-B pipeline that only renders - ``video_num_tiles`` cameras on the GPU) without modifying any environment code. - """ + """Configuration for :class:`~isaaclab.envs.utils.video_recorder.VideoRecorder`.""" class_type: type = VideoRecorder - """The recorder class to instantiate. Must accept ``(cfg, scene)`` as constructor arguments. - Defaults to :class:`~isaaclab.envs.utils.video_recorder.VideoRecorder`. - """ + """Recorder class to instantiate; must accept ``(cfg, scene)``.""" render_mode: str | None = None - """The render mode forwarded from the environment constructor. 
- - Populated automatically by the environment base classes from the ``render_mode`` argument - passed to :func:`gymnasium.make` (or the environment constructor directly). User code - should not set this field manually. + """Render mode forwarded from the environment constructor (``"rgb_array"`` when ``--video`` is active). - When ``None`` (the default, i.e. ``--video`` was **not** passed), :class:`VideoRecorder` - skips spawning any fallback cameras so that state-based runs incur zero overhead. - Only when this is ``"rgb_array"`` does the recorder allocate GPU resources for the - fallback camera grid. + Set automatically by the environment base classes; do not set manually. """ video_mode: str = "perspective" - """Video recording mode. One of ``"tiled"`` or ``"perspective"``. - - * ``"perspective"`` *(default)* - captures a single wide-angle isometric view of the - scene. - - * **Newton backends** (Newton Warp or OVRTX renderer): a headless - :class:`newton.viewer.ViewerGL` renders an isometric perspective of all - environments (or the first ``video_num_tiles`` when that field is set). - * **Kit backends** (PhysX + RTX renderer): the Kit viewport camera - ``/OmniverseKit_Persp`` is captured via ``omni.replicator.core``. - - The TiledCamera sensor is **bypassed** entirely, even when one is present in the - scene (e.g. vision-based tasks), giving a human-readable view instead of the - agent's raw pixel observations. - - * ``"tiled"`` - reads pixel data from a - :class:`~isaaclab.sensors.camera.TiledCamera`. On vision-based tasks the agent's - own observation camera is reused at zero extra cost and the output is a square - tile-grid of per-agent views. On state-based tasks with Kit-based backends a - fallback :class:`~isaaclab.sensors.camera.TiledCamera` (``fallback_camera_cfg``) is - spawned. On Newton backends the Newton OpenGL perspective viewer is used instead. 
- - Set via the ``--video`` CLI flag (``--video=perspective`` / ``--video=tiled``), or - as a Hydra override: ``env.video_recorder.video_mode=tiled``. - """ + """Recording mode: ``"perspective"`` (default) or ``"tiled"``. - video_num_tiles: int = -1 - """Number of environment tiles to include in each video frame when using ``render_mode="rgb_array"``. - Defaults to -1, which renders all environments. + * ``"perspective"`` - single wide-angle view of the scene. Newton backends use the Newton GL + viewer; Kit backends use ``/OmniverseKit_Persp`` via ``omni.replicator.core``. TiledCamera + is bypassed even when present. + * ``"tiled"`` - square tile-grid from a :class:`~isaaclab.sensors.camera.TiledCamera`. + Reuses the observation camera on vision-based tasks; spawns ``fallback_camera_cfg`` on + state-based Kit tasks; uses the Newton GL viewer on Newton backends. - Environments are arranged into a square grid of size - ``ceil(sqrt(video_num_tiles)) * ceil(sqrt(video_num_tiles))``, with unused slots filled with - black. For example: + Set via CLI: ``--video=perspective`` / ``--video=tiled``. + """ - * ``-1``: all environments (default) - * ``1``: single environment (1*1) - * ``4``: first 4 environments (2*2 grid) - * ``9``: first 9 environments (3*3 grid) + video_num_tiles: int = -1 + """Max environments to include per frame (``-1`` = all). + Tiles are arranged into a ``ceil(sqrt(N)) × ceil(sqrt(N))`` grid with black padding. CLI example: ``env.video_recorder.video_num_tiles=9`` """ - fallback_camera_cfg: object = DEFAULT_VIDEO_FALLBACK_CAMERA_CFG - """Optional :class:`~isaaclab.sensors.camera.TiledCameraCfg` used to spawn a dedicated - video-only camera for state-based environments (no observation ``TiledCamera`` in the scene). + fallback_camera_cfg: object = DEFAULT_TILED_RECORDING_CAMERA_CFG + """Side-view :class:`~isaaclab.sensors.camera.TiledCameraCfg` for tiled state-based recording. 
 
-    Defaults to :data:`DEFAULT_VIDEO_FALLBACK_CAMERA_CFG` - a pinhole camera placed at
-    ``(-7, 0, 3)`` relative to env_0's origin, giving a reasonable side view for environments
-    with ~4 m spacing. Set to ``None`` to disable fallback cameras entirely (e.g. for
-    vision-based tasks that already have an observation :class:`~isaaclab.sensors.camera.TiledCamera`).
-
-    Spawning is **gated on** :attr:`render_mode` ``== "rgb_array"`` (i.e. ``--video`` must be
-    active), so the default value causes zero overhead during ordinary training runs.
-
-    For Newton-based backends (Newton Warp or OVRTX renderer), the Newton OpenGL perspective
-    viewer is used instead of fallback TiledCameras - see :attr:`gl_viewer_width`.
-
-    To customise the pose for a different environment scale, override in the task's ``__post_init__``::
-
-        self.video_recorder.fallback_camera_cfg = self.video_recorder.fallback_camera_cfg.replace(
-            offset=TiledCameraCfg.OffsetCfg(pos=(-3.0, 0.0, 2.0), rot=(0.0, 0.1045, 0.0, 0.9945), convention="world"),
-        )
-
-    .. note::
-        The prim path in the cfg must start with ``/World/envs/env_0/`` so that the OVRTX
-        renderer path check succeeds and ``TiledCamera`` correctly infers ``num_envs`` from
-        the scene.
+    Pre-spawned whenever tiled-mode recording is active; an observation TiledCamera, if present, takes priority.
+    Set to ``None`` to disable.
     """
 
    camera_eye: tuple[float, float, float] = (7.5, 7.5, 7.5)
-    """World-space position of the Newton GL perspective camera (in metres).
-
-    Defaults to ``(7.5, 7.5, 7.5)`` — the same value as :attr:`~isaaclab.envs.common.ViewerCfg.eye`
-    — so the Newton GL video matches the Kit ``/OmniverseKit_Persp`` viewport exactly.
+    """Newton GL perspective camera position in world space (metres).
 
-    Override to reposition the camera for tasks with a very different scene scale::
-
-        self.video_recorder.camera_eye = (20.0, 20.0, 20.0)
-        self.video_recorder.camera_lookat = (0.0, 0.0, 0.0)
-
-    Only used by Newton backends in perspective mode. 
+ Matches :attr:`~isaaclab.envs.common.ViewerCfg.eye` so the Newton GL video aligns with + the Kit ``/OmniverseKit_Persp`` viewport. Only used by Newton backends in perspective mode. """ camera_lookat: tuple[float, float, float] = (0.0, 0.0, 0.0) - """World-space point the Newton GL perspective camera looks at (in metres). + """Newton GL perspective camera look-at point in world space (metres). - Defaults to ``(0.0, 0.0, 0.0)`` — the same as :attr:`~isaaclab.envs.common.ViewerCfg.lookat`. + Matches :attr:`~isaaclab.envs.common.ViewerCfg.lookat`. Only used by Newton backends in perspective mode. """ gl_viewer_width: int = 1280 - """Width in pixels of the Newton OpenGL perspective video frame. + """Width in pixels of the Newton GL perspective frame. Only active when ``--video`` is set.""" - Only used when the active physics/renderer backend exposes a Newton model - (i.e. Newton Warp or OVRTX renderer presets). In that case :class:`VideoRecorder` - spawns a headless :class:`newton.viewer.ViewerGL` instance that renders an isometric - perspective view of all environments (limited to :attr:`video_num_tiles` when set), - replacing the fallback :class:`~isaaclab.sensors.camera.TiledCamera` grid. + gl_viewer_height: int = 720 + """Height in pixels of the Newton GL perspective frame. Only active when ``--video`` is set.""" - This perspective path is activated only when ``render_mode == "rgb_array"`` - (i.e. ``--video`` is active). Regular training runs are unaffected. + kit_cam_prim_path: str = "/OmniverseKit_Persp" + """USD prim path of the Kit viewport camera used for perspective recording on Kit backends. + + Set automatically from :attr:`~isaaclab.envs.common.ViewerCfg.cam_prim_path`; do not set manually. """ - gl_viewer_height: int = 720 - """Height in pixels of the Newton OpenGL perspective video frame. + kit_resolution: tuple[int, int] = (1280, 720) + """Resolution ``(width, height)`` of the Kit perspective frame. - See :attr:`gl_viewer_width` for full description. 
+ Set automatically from :attr:`~isaaclab.envs.common.ViewerCfg.resolution`; do not set manually. """ From 9988962fc1d757f05a6ad3419240f50337b04576 Mon Sep 17 00:00:00 2001 From: Brian Dilinila Date: Thu, 12 Mar 2026 20:29:30 -0700 Subject: [PATCH 07/11] Clean up video recorder: drop redundant kit cfg injection, debug logs, and stale comments --- source/isaaclab/isaaclab/envs/direct_marl_env.py | 2 -- source/isaaclab/isaaclab/envs/direct_rl_env.py | 2 -- .../isaaclab/envs/manager_based_rl_env.py | 2 -- .../isaaclab/envs/utils/video_recorder.py | 12 ++++-------- .../isaaclab/envs/utils/video_recorder_cfg.py | 15 ++------------- 5 files changed, 6 insertions(+), 27 deletions(-) diff --git a/source/isaaclab/isaaclab/envs/direct_marl_env.py b/source/isaaclab/isaaclab/envs/direct_marl_env.py index 412228b5c74..fa067df4b42 100644 --- a/source/isaaclab/isaaclab/envs/direct_marl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_marl_env.py @@ -176,8 +176,6 @@ def _init_sim(self, render_mode: str | None = None, **kwargs): # Forward render_mode so VideoRecorder only spawns fallback cameras when --video is active. if self.cfg.video_recorder is not None: self.cfg.video_recorder.render_mode = render_mode - self.cfg.video_recorder.kit_cam_prim_path = self.cfg.viewer.cam_prim_path - self.cfg.video_recorder.kit_resolution = self.cfg.viewer.resolution self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( self.cfg.video_recorder, self.scene ) diff --git a/source/isaaclab/isaaclab/envs/direct_rl_env.py b/source/isaaclab/isaaclab/envs/direct_rl_env.py index 1115d64415b..237f94e3f61 100644 --- a/source/isaaclab/isaaclab/envs/direct_rl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_rl_env.py @@ -181,8 +181,6 @@ def _init_sim(self, render_mode: str | None = None, **kwargs): # Forward render_mode so VideoRecorder only spawns fallback cameras when --video is active. 
if self.cfg.video_recorder is not None: self.cfg.video_recorder.render_mode = render_mode - self.cfg.video_recorder.kit_cam_prim_path = self.cfg.viewer.cam_prim_path - self.cfg.video_recorder.kit_resolution = self.cfg.viewer.resolution self.video_recorder: VideoRecorder = self.cfg.video_recorder.class_type( self.cfg.video_recorder, self.scene ) diff --git a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py index 669ac93032f..db4ff7a13de 100644 --- a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py +++ b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py @@ -81,8 +81,6 @@ def __init__(self, cfg: ManagerBasedRLEnvCfg, render_mode: str | None = None, ** # so fallback cameras are only spawned when --video is active (render_mode="rgb_array"). if cfg.video_recorder is not None: cfg.video_recorder.render_mode = render_mode - cfg.video_recorder.kit_cam_prim_path = cfg.viewer.cam_prim_path - cfg.video_recorder.kit_resolution = cfg.viewer.resolution # initialize the base class to setup the scene. super().__init__(cfg=cfg) diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder.py b/source/isaaclab/isaaclab/envs/utils/video_recorder.py index 325b174bb06..20492ba0585 100644 --- a/source/isaaclab/isaaclab/envs/utils/video_recorder.py +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder.py @@ -79,17 +79,14 @@ def render_rgb_array(self) -> np.ndarray | None: "Cannot record video in tiled mode: no TiledCamera sensor with RGB output was found" " in the scene. Add a TiledCamera sensor or switch to perspective mode (--video=perspective)." ) - if video_camera is not self._fallback_tiled_camera: - logger.debug("[VideoRecorder] tiled source: observation TiledCamera") - else: - logger.debug("[VideoRecorder] tiled source: fallback TiledCamera") return self._render_tiled_camera_rgb_array() def _try_init_gl_viewer(self) -> None: """Lazy-initialise the Newton GL viewer on the first render call. 
Called after ``sim.reset()`` so the Newton model is fully built. - Leaves ``_gl_viewer`` as ``None`` on failure so callers fall through gracefully. + Leaves ``_gl_viewer`` as ``None`` on Kit backends; ``render_rgb_array`` then + calls ``_render_kit_perspective_rgb_array`` instead. """ self._gl_viewer_init_attempted = True try: @@ -178,7 +175,7 @@ def _render_kit_perspective_rgb_array(self) -> np.ndarray | None: if not hasattr(self, "_rgb_annotator"): self._render_product = rep.create.render_product( - self.cfg.kit_cam_prim_path, self.cfg.kit_resolution + "/OmniverseKit_Persp", (1280, 720) ) self._rgb_annotator = rep.AnnotatorRegistry.get_annotator("rgb", device="cpu") self._rgb_annotator.attach([self._render_product]) @@ -187,8 +184,7 @@ def _render_kit_perspective_rgb_array(self) -> np.ndarray | None: rgb_data = np.frombuffer(rgb_data, dtype=np.uint8).reshape(*rgb_data.shape) if rgb_data.size == 0: # renderer is warming up; return blank frame - h, w = self.cfg.kit_resolution[1], self.cfg.kit_resolution[0] - return np.zeros((h, w, 3), dtype=np.uint8) + return np.zeros((720, 1280, 3), dtype=np.uint8) return rgb_data[:, :, :3] except Exception as exc: logger.warning("[VideoRecorder] Kit perspective capture failed: %s", exc) diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py b/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py index 3436b2a333a..501df00a5a8 100644 --- a/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder_cfg.py @@ -71,8 +71,8 @@ class VideoRecorderCfg: viewer; Kit backends use ``/OmniverseKit_Persp`` via ``omni.replicator.core``. TiledCamera is bypassed even when present. * ``"tiled"`` - square tile-grid from a :class:`~isaaclab.sensors.camera.TiledCamera`. - Reuses the observation camera on vision-based tasks; spawns ``fallback_camera_cfg`` on - state-based Kit tasks; uses the Newton GL viewer on Newton backends. 
+ Reuses the observation camera on vision-based tasks; spawns ``fallback_camera_cfg`` for + state-based tasks. Raises ``RuntimeError`` if no TiledCamera is available. Set via CLI: ``--video=perspective`` / ``--video=tiled``. """ @@ -110,14 +110,3 @@ class VideoRecorderCfg: gl_viewer_height: int = 720 """Height in pixels of the Newton GL perspective frame. Only active when ``--video`` is set.""" - kit_cam_prim_path: str = "/OmniverseKit_Persp" - """USD prim path of the Kit viewport camera used for perspective recording on Kit backends. - - Set automatically from :attr:`~isaaclab.envs.common.ViewerCfg.cam_prim_path`; do not set manually. - """ - - kit_resolution: tuple[int, int] = (1280, 720) - """Resolution ``(width, height)`` of the Kit perspective frame. - - Set automatically from :attr:`~isaaclab.envs.common.ViewerCfg.resolution`; do not set manually. - """ From ad35aa007969ed53e06ba97f6b4326a721baade5 Mon Sep 17 00:00:00 2001 From: Brian Dilinila Date: Thu, 12 Mar 2026 20:34:11 -0700 Subject: [PATCH 08/11] greptile fix: fix video recorder bugs --- scripts/reinforcement_learning/rlinf/play.py | 2 +- .../isaaclab/isaaclab/envs/direct_marl_env.py | 2 + .../isaaclab/isaaclab/envs/direct_rl_env.py | 2 + .../isaaclab/envs/manager_based_rl_env.py | 2 + .../isaaclab/envs/utils/video_recorder.py | 75 ++++++++++--------- 5 files changed, 47 insertions(+), 36 deletions(-) diff --git a/scripts/reinforcement_learning/rlinf/play.py b/scripts/reinforcement_learning/rlinf/play.py index c3782567617..dcade9a3237 100644 --- a/scripts/reinforcement_learning/rlinf/play.py +++ b/scripts/reinforcement_learning/rlinf/play.py @@ -56,7 +56,7 @@ const="perspective", default=None, metavar="MODE", - help="Enable video recording. MODE is 'tiled' (default) or 'perspective' (not yet supported for rlinf).", + help="Enable video recording. 
MODE is 'perspective' (default) or 'tiled'.", ) cli_args.add_rlinf_args(parser) args_cli = parser.parse_args() diff --git a/source/isaaclab/isaaclab/envs/direct_marl_env.py b/source/isaaclab/isaaclab/envs/direct_marl_env.py index fa067df4b42..33541c3cd44 100644 --- a/source/isaaclab/isaaclab/envs/direct_marl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_marl_env.py @@ -535,6 +535,8 @@ def render(self, recompute: bool = False) -> np.ndarray | None: if self.render_mode == "human" or self.render_mode is None: return None elif self.render_mode == "rgb_array": + if self.video_recorder is None: + return None return self.video_recorder.render_rgb_array() else: raise NotImplementedError( diff --git a/source/isaaclab/isaaclab/envs/direct_rl_env.py b/source/isaaclab/isaaclab/envs/direct_rl_env.py index 237f94e3f61..58456e72fb2 100644 --- a/source/isaaclab/isaaclab/envs/direct_rl_env.py +++ b/source/isaaclab/isaaclab/envs/direct_rl_env.py @@ -503,6 +503,8 @@ def render(self, recompute: bool = False) -> np.ndarray | None: if self.render_mode == "human" or self.render_mode is None: return None elif self.render_mode == "rgb_array": + if self.video_recorder is None: + return None return self.video_recorder.render_rgb_array() else: raise NotImplementedError( diff --git a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py index db4ff7a13de..132fa4d97fb 100644 --- a/source/isaaclab/isaaclab/envs/manager_based_rl_env.py +++ b/source/isaaclab/isaaclab/envs/manager_based_rl_env.py @@ -276,6 +276,8 @@ def render(self, recompute: bool = False) -> np.ndarray | None: if self.render_mode == "human" or self.render_mode is None: return None elif self.render_mode == "rgb_array": + if self.video_recorder is None: + return None return self.video_recorder.render_rgb_array() else: raise NotImplementedError( diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder.py b/source/isaaclab/isaaclab/envs/utils/video_recorder.py index 
20492ba0585..ab59707e384 100644 --- a/source/isaaclab/isaaclab/envs/utils/video_recorder.py +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder.py @@ -64,7 +64,7 @@ def __init__(self, cfg: VideoRecorderCfg, scene: InteractiveScene): self._fallback_tiled_camera = self._spawn_fallback_cameras(cfg, scene) def render_rgb_array(self) -> np.ndarray | None: - """Return an RGB frame for video recording, or ``None`` on transient Kit warmup.""" + """Return an RGB frame for video recording, or ``None`` when no GL viewer and no Kit runtime.""" if self.cfg.video_mode == "perspective": if not self._gl_viewer_init_attempted: self._try_init_gl_viewer() @@ -188,7 +188,7 @@ def _render_kit_perspective_rgb_array(self) -> np.ndarray | None: return rgb_data[:, :, :3] except Exception as exc: logger.warning("[VideoRecorder] Kit perspective capture failed: %s", exc) - return None + return np.zeros((720, 1280, 3), dtype=np.uint8) @staticmethod def _spawn_fallback_cameras(cfg: VideoRecorderCfg, scene: InteractiveScene): @@ -227,41 +227,46 @@ def _find_video_camera(self): """Locate and cache the TiledCamera to use for video recording. Priority: (1) observation TiledCamera already in the scene, (2) fallback camera. - Returns ``None`` if neither is available. + Returns ``None`` if neither is available yet (retried on the next call). 
""" - if not hasattr(self, "_video_camera"): - from isaaclab.sensors.camera import TiledCamera - - self._video_camera = None - - for sensor in self._scene.sensors.values(): - if isinstance(sensor, TiledCamera): - output = sensor.data.output - if "rgb" in output or "rgba" in output: - self._video_camera = sensor - break - - if self._video_camera is None and self._fallback_tiled_camera is not None: - if self._fallback_tiled_camera.is_initialized: - output = self._fallback_tiled_camera.data.output - if "rgb" in output or "rgba" in output: - self._video_camera = self._fallback_tiled_camera - - if self._video_camera is not None: - output = self._video_camera.data.output - self._video_rgb_key = "rgb" if "rgb" in output else "rgba" - n_total = int(output[self._video_rgb_key].shape[0]) - n_envs = n_total if self.cfg.video_num_tiles < 0 else min(self.cfg.video_num_tiles, n_total) - self._video_n_envs = n_envs - self._video_grid_size = math.ceil(math.sqrt(n_envs)) - n_slots = self._video_grid_size ** 2 - H = int(output[self._video_rgb_key].shape[1]) - W = int(output[self._video_rgb_key].shape[2]) - self._video_H = H - self._video_W = W - pad = n_slots - n_envs - self._video_pad = np.zeros((pad, H, W, 3), dtype=np.uint8) if pad > 0 else None + if hasattr(self, "_video_camera"): + return self._video_camera + + from isaaclab.sensors.camera import TiledCamera + + camera = None + + for sensor in self._scene.sensors.values(): + if isinstance(sensor, TiledCamera): + output = sensor.data.output + if "rgb" in output or "rgba" in output: + camera = sensor + break + + if camera is None and self._fallback_tiled_camera is not None: + if self._fallback_tiled_camera.is_initialized: + output = self._fallback_tiled_camera.data.output + if "rgb" in output or "rgba" in output: + camera = self._fallback_tiled_camera + + if camera is None: + return None + # cache only once a camera is confirmed available. 
+ self._video_camera = camera + output = camera.data.output + self._video_rgb_key = "rgb" if "rgb" in output else "rgba" + n_total = int(output[self._video_rgb_key].shape[0]) + n_envs = n_total if self.cfg.video_num_tiles < 0 else min(self.cfg.video_num_tiles, n_total) + self._video_n_envs = n_envs + self._video_grid_size = math.ceil(math.sqrt(n_envs)) + n_slots = self._video_grid_size ** 2 + H = int(output[self._video_rgb_key].shape[1]) + W = int(output[self._video_rgb_key].shape[2]) + self._video_H = H + self._video_W = W + pad = n_slots - n_envs + self._video_pad = np.zeros((pad, H, W, 3), dtype=np.uint8) if pad > 0 else None return self._video_camera def _render_tiled_camera_rgb_array(self) -> np.ndarray: From 9714614d9cf6c26442d984ef9552339951f76802 Mon Sep 17 00:00:00 2001 From: Brian Dilinila Date: Thu, 12 Mar 2026 20:45:53 -0700 Subject: [PATCH 09/11] fix: return blank frame instead of None from _render_newton_gl_rgb_array on error --- source/isaaclab/isaaclab/envs/utils/video_recorder.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/isaaclab/isaaclab/envs/utils/video_recorder.py b/source/isaaclab/isaaclab/envs/utils/video_recorder.py index ab59707e384..837563b4c9b 100644 --- a/source/isaaclab/isaaclab/envs/utils/video_recorder.py +++ b/source/isaaclab/isaaclab/envs/utils/video_recorder.py @@ -64,7 +64,7 @@ def __init__(self, cfg: VideoRecorderCfg, scene: InteractiveScene): self._fallback_tiled_camera = self._spawn_fallback_cameras(cfg, scene) def render_rgb_array(self) -> np.ndarray | None: - """Return an RGB frame for video recording, or ``None`` when no GL viewer and no Kit runtime.""" + """Return an RGB frame for video recording, or ``None`` when neither GL viewer nor Kit runtime is available.""" if self.cfg.video_mode == "perspective": if not self._gl_viewer_init_attempted: self._try_init_gl_viewer() @@ -141,8 +141,8 @@ def _try_init_gl_viewer(self) -> None: except Exception as exc: logger.warning("[VideoRecorder] 
Newton GL viewer unavailable: %s", exc) - def _render_newton_gl_rgb_array(self) -> np.ndarray | None: - """Return one RGB frame from the Newton GL viewer, or ``None`` on error.""" + def _render_newton_gl_rgb_array(self) -> np.ndarray: + """Return one RGB frame from the Newton GL viewer, or a blank frame on error.""" try: from isaaclab.sim import SimulationContext @@ -158,7 +158,7 @@ def _render_newton_gl_rgb_array(self) -> np.ndarray | None: return viewer.get_frame().numpy() except Exception as exc: logger.warning("[VideoRecorder] GL frame capture failed: %s", exc) - return None + return np.zeros((self.cfg.gl_viewer_height, self.cfg.gl_viewer_width, 3), dtype=np.uint8) def _render_kit_perspective_rgb_array(self) -> np.ndarray | None: """Return one RGB frame from the Kit /OmniverseKit_Persp camera via omni.replicator. From e1a3b640ae514a2d949c5d34b1ed8ef43d032a15 Mon Sep 17 00:00:00 2001 From: Brian Dilinila Date: Thu, 12 Mar 2026 20:51:14 -0700 Subject: [PATCH 10/11] fix: clarify --video help text in rlinf/play.py; mode selection not yet supported --- scripts/reinforcement_learning/rlinf/play.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/reinforcement_learning/rlinf/play.py b/scripts/reinforcement_learning/rlinf/play.py index dcade9a3237..5d5e9682c9f 100644 --- a/scripts/reinforcement_learning/rlinf/play.py +++ b/scripts/reinforcement_learning/rlinf/play.py @@ -56,7 +56,7 @@ const="perspective", default=None, metavar="MODE", - help="Enable video recording. MODE is 'perspective' (default) or 'tiled'.", + help="Enable video recording. MODE is 'perspective' (default) or 'tiled'. 
Note: mode selection is not yet supported for rlinf; any non-None value enables recording.", ) cli_args.add_rlinf_args(parser) args_cli = parser.parse_args() From 2fee9dc2392d54f8c4d908ebd69713bb006899df Mon Sep 17 00:00:00 2001 From: Brian Dilinila Date: Thu, 12 Mar 2026 21:00:49 -0700 Subject: [PATCH 11/11] Added unit tests --- .../envs/utils/test_video_recorder.py | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 source/isaaclab/isaaclab/envs/utils/test_video_recorder.py diff --git a/source/isaaclab/isaaclab/envs/utils/test_video_recorder.py b/source/isaaclab/isaaclab/envs/utils/test_video_recorder.py new file mode 100644 index 00000000000..398dc0ee045 --- /dev/null +++ b/source/isaaclab/isaaclab/envs/utils/test_video_recorder.py @@ -0,0 +1,116 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +"""Unit tests for VideoRecorder.""" +import importlib.util, pathlib, sys +from types import SimpleNamespace +from unittest.mock import MagicMock, patch +import numpy as np +import pytest + +_spec = importlib.util.spec_from_file_location("_vr", pathlib.Path(__file__).parent / "video_recorder.py") +_module = importlib.util.module_from_spec(_spec); _spec.loader.exec_module(_module); VideoRecorder = _module.VideoRecorder + +_BLANK_720p = np.zeros((720, 1280, 3), dtype=np.uint8) +_DEFAULT_CFG = dict( + render_mode="rgb_array", video_mode="perspective", fallback_camera_cfg=None, + video_num_tiles=-1, camera_eye=(7.5, 7.5, 7.5), camera_lookat=(0.0, 0.0, 0.0), + gl_viewer_width=1280, gl_viewer_height=720, +) + + +def _create_recorder(**kw): + """Return a VideoRecorder with __init__ bypassed and all deps mocked out.""" + recorder = object.__new__(VideoRecorder) + recorder.cfg = SimpleNamespace(**{**_DEFAULT_CFG, **kw}) + recorder._scene = MagicMock(); recorder._scene.sensors = {} + 
recorder._fallback_tiled_camera = None + recorder._gl_viewer = None + recorder._gl_viewer_init_attempted = False + return recorder + + +def test_init_perspective_mode_does_not_spawn_fallback(): + """In perspective mode, __init__ never spawns a TiledCamera fallback.""" + scene = MagicMock(); scene.sensors = {}; scene.num_envs = 1 + cfg = SimpleNamespace(**{**_DEFAULT_CFG, "fallback_camera_cfg": MagicMock()}) + with patch.dict(sys.modules, {"pyglet": MagicMock()}): + with patch.object(VideoRecorder, "_spawn_fallback_cameras") as mock_spawn: + VideoRecorder(cfg, scene) + mock_spawn.assert_not_called() + + +def test_init_tiled_mode_spawns_fallback_when_configured(): + """In tiled mode with a fallback_camera_cfg, __init__ calls _spawn_fallback_cameras.""" + scene = MagicMock(); scene.sensors = {}; scene.num_envs = 1 + cfg = SimpleNamespace(**{**_DEFAULT_CFG, "video_mode": "tiled", "fallback_camera_cfg": MagicMock()}) + with patch.dict(sys.modules, {"pyglet": MagicMock()}): + with patch.object(VideoRecorder, "_spawn_fallback_cameras", return_value=MagicMock()) as mock_spawn: + VideoRecorder(cfg, scene) + mock_spawn.assert_called_once() + + +def test_render_rgb_array_perspective_uses_gl_viewer_when_available(): + """Perspective mode returns a GL viewer frame when _gl_viewer is set.""" + recorder = _create_recorder() + recorder._gl_viewer = MagicMock(); recorder._gl_viewer_init_attempted = True + with patch.object(recorder, "_render_newton_gl_rgb_array", return_value=_BLANK_720p) as mock_gl: + result = recorder.render_rgb_array() + mock_gl.assert_called_once() + assert result.shape == (720, 1280, 3) + + +def test_render_rgb_array_perspective_falls_through_to_kit_when_no_gl_viewer(): + """Kit capture path is used when no GL viewer is available (Kit backend).""" + recorder = _create_recorder(); recorder._gl_viewer_init_attempted = True + with patch.object(recorder, "_render_kit_perspective_rgb_array", return_value=_BLANK_720p) as mock_kit: + recorder.render_rgb_array() + 
mock_kit.assert_called_once() + + +def test_render_rgb_array_tiled_raises_when_no_camera(): + """Tiled mode with no TiledCamera raises RuntimeError with a descriptive message.""" + recorder = _create_recorder(video_mode="tiled") + with patch.object(recorder, "_find_video_camera", return_value=None): + with pytest.raises(RuntimeError, match="tiled mode"): + recorder.render_rgb_array() + + +def test_gl_exception_returns_blank_ndarray_not_none(): + """GL renderer crash must return a blank ndarray, never None, so RecordVideo never sees None.""" + recorder = _create_recorder(); recorder._gl_viewer = MagicMock(); recorder._gl_viewer_init_attempted = True + with patch.dict(sys.modules, {"isaaclab.sim": MagicMock(SimulationContext=MagicMock(instance=MagicMock(side_effect=RuntimeError)))}): + frame = recorder._render_newton_gl_rgb_array() + assert isinstance(frame, np.ndarray) and frame.shape == (720, 1280, 3) + + +def test_find_video_camera_does_not_cache_none(): + """A None result is not cached, allowing retry on the next call.""" + recorder = _create_recorder(video_mode="tiled") + FakeTiledCamera = type("TiledCamera", (), {}) + with patch.dict(sys.modules, {"isaaclab": MagicMock(), "isaaclab.sensors": MagicMock(), "isaaclab.sensors.camera": MagicMock(TiledCamera=FakeTiledCamera)}): + result = recorder._find_video_camera() + assert result is None and not hasattr(recorder, "_video_camera") + + +def test_find_video_camera_caches_result_when_found(): + """A found camera is cached so the scene is not re-scanned on subsequent calls.""" + recorder = _create_recorder(video_mode="tiled") + FakeTiledCamera = type("TiledCamera", (), {}) + camera = MagicMock(); camera.__class__ = FakeTiledCamera + camera.is_initialized = True; camera.data.output = {"rgb": MagicMock(shape=(4, 64, 64, 3))} + recorder._scene.sensors = {"cam": camera} + with patch.dict(sys.modules, {"isaaclab": MagicMock(), "isaaclab.sensors": MagicMock(), "isaaclab.sensors.camera": 
MagicMock(TiledCamera=FakeTiledCamera)}): + result = recorder._find_video_camera() + assert result is camera and hasattr(recorder, "_video_camera") + + +def test_gl_viewer_init_attempted_only_once(): + """_try_init_gl_viewer is called at most once regardless of render call count.""" + recorder = _create_recorder(); recorder._gl_viewer_init_attempted = False + def _set_flag(): recorder._gl_viewer_init_attempted = True + with patch.object(recorder, "_try_init_gl_viewer", side_effect=_set_flag) as mock_init, \ + patch.object(recorder, "_render_kit_perspective_rgb_array", return_value=_BLANK_720p): + for _ in range(3): recorder.render_rgb_array() + mock_init.assert_called_once()