From 7535af1e5cc6f3538a1b9b64585037a72174219f Mon Sep 17 00:00:00 2001 From: Antoine Richard Date: Fri, 20 Mar 2026 13:25:02 +0100 Subject: [PATCH 01/12] WIP --- docs/source/testing/benchmarks.rst | 98 +++++- scripts/benchmarks/benchmark_startup.py | 301 ++++++++++++++++++ scripts/benchmarks/startup_whitelist.yaml | 21 ++ scripts/benchmarks/utils.py | 164 ++++++++++ source/isaaclab/config/extension.toml | 2 +- source/isaaclab/docs/CHANGELOG.rst | 14 + .../isaaclab/test/benchmark/backends.py | 27 ++ 7 files changed, 623 insertions(+), 4 deletions(-) create mode 100644 scripts/benchmarks/benchmark_startup.py create mode 100644 scripts/benchmarks/startup_whitelist.yaml diff --git a/docs/source/testing/benchmarks.rst b/docs/source/testing/benchmarks.rst index 0225569f1759..82d071a88bd7 100644 --- a/docs/source/testing/benchmarks.rst +++ b/docs/source/testing/benchmarks.rst @@ -154,6 +154,95 @@ Measure asset method and property performance using mock interfaces: For detailed documentation on micro-benchmarks, including available benchmark files, input modes, and how to add new benchmarks, see :ref:`testing_micro_benchmarks`. +Startup Profiling Benchmark +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Profile the startup sequence of an IsaacLab environment using ``cProfile``. Each +startup stage is wrapped in its own profiling session and the top functions by +own-time are reported. This is useful for investigating startup regressions and +understanding where time is spent during initialization. + +.. code-block:: bash + + # Basic usage — reports top 30 functions per phase + ./isaaclab.sh -p scripts/benchmarks/benchmark_startup.py \ + --task Isaac-Ant-v0 \ + --num_envs 4096 \ + --headless \ + --benchmark_backend summary + +The script profiles four phases independently: + +- **app_launch**: ``launch_simulation()`` context entry (Kit/USD/PhysX init) +- **python_imports**: importing gymnasium, torch, isaaclab_tasks, etc. 
+- **env_creation**: ``gym.make()`` + ``env.reset()`` (scene creation, sim start) +- **first_step**: a single ``env.step()`` call + +Each phase records a wall-clock time plus per-function own-time and cumulative +time as ``SingleMeasurement`` entries. Only IsaacLab functions and first-level +calls into external libraries are included (deep internals of torch, USD, etc. +are filtered out). + +**Whitelist mode** — For dashboard time-series comparisons across runs, use a +YAML whitelist config to report a fixed set of functions instead of top-N. +Patterns use ``fnmatch`` syntax (``*`` and ``?`` wildcards): + +.. code-block:: yaml + + # startup_whitelist.yaml + app_launch: + - "isaaclab.utils.configclass:_custom_post_init" + - "isaaclab.sim.*:__init__" + env_creation: + - "isaaclab.cloner.*:usd_replicate" + - "isaaclab.cloner.*:filter_collisions" + - "isaaclab.scene.*:_init_scene" + first_step: + - "isaaclab.actuators.*:compute" + - "warp.*:launch" + +.. code-block:: bash + + ./isaaclab.sh -p scripts/benchmarks/benchmark_startup.py \ + --task Isaac-Ant-v0 \ + --num_envs 4096 \ + --headless \ + --benchmark_backend omniperf \ + --whitelist_config scripts/benchmarks/startup_whitelist.yaml + +Phases listed in the YAML use the whitelist; phases not listed fall back to +``--top_n`` (default: 5 in whitelist mode, 30 otherwise). Patterns that match +no profiled function emit ``0.0`` placeholders so the output always contains +the same keys. + +A default whitelist is provided at ``scripts/benchmarks/startup_whitelist.yaml``. + +.. 
list-table:: + :header-rows: 1 + :widths: 25 15 60 + + * - Argument + - Default + - Description + * - ``--task`` + - required + - Environment task name + * - ``--num_envs`` + - from config + - Number of parallel environments + * - ``--top_n`` + - 30 (5 with whitelist) + - Max functions per non-whitelisted phase + * - ``--whitelist_config`` + - None + - Path to YAML whitelist file + * - ``--benchmark_backend`` + - ``omniperf`` + - Output backend (``json``, ``osmo``, ``omniperf``, ``summary``) + * - ``--output_path`` + - ``.`` + - Directory for output files + Command Line Arguments ---------------------- @@ -399,9 +488,12 @@ Output structure: Summary Backend ~~~~~~~~~~~~~~~ -Human-readable console report plus JSON file. Prints a formatted summary (runtime, -startup, train, frametime, and system info) to the terminal while also writing -the same data as JSON. Use when you want a quick readout without opening the JSON: +Human-readable console report plus JSON file. Prints a formatted summary to the +terminal while also writing the same data as JSON. Standard phases (runtime, +startup, train, frametime, system info) are rendered with specialized formatting; +any additional phases (e.g., from the startup profiling benchmark) are rendered +automatically with their ``SingleMeasurement`` and ``StatisticalMeasurement`` +entries. Use when you want a quick readout without opening the JSON: .. code-block:: bash diff --git a/scripts/benchmarks/benchmark_startup.py b/scripts/benchmarks/benchmark_startup.py new file mode 100644 index 000000000000..e7870d5e92be --- /dev/null +++ b/scripts/benchmarks/benchmark_startup.py @@ -0,0 +1,301 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Script to profile IsaacLab startup phases with cProfile. 
+ +Each startup stage (app launch, python imports, env creation, first step) is +wrapped in its own cProfile session. The top functions by own-time are emitted +as SingleMeasurement entries via the standard benchmark backend. +""" + +"""Launch Isaac Sim Simulator first.""" + +import argparse +import cProfile +import os +import sys +import time + +from isaaclab.app import AppLauncher + +# -- CLI arguments ----------------------------------------------------------- + +parser = argparse.ArgumentParser(description="Profile IsaacLab startup phases.") +parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.") +parser.add_argument("--task", type=str, default=None, help="Name of the task.") +parser.add_argument("--seed", type=int, default=None, help="Seed used for the environment") +parser.add_argument( + "--top_n", + type=int, + default=None, + help="Number of top functions per phase (default: 30, or 5 with --whitelist_config).", +) +parser.add_argument( + "--benchmark_backend", + type=str, + default="omniperf", + choices=[ + "json", + "osmo", + "omniperf", + "summary", + "LocalLogMetrics", + "JSONFileMetrics", + "OsmoKPIFile", + "OmniPerfKPIFile", + ], + help="Benchmarking backend options, defaults omniperf", +) +parser.add_argument("--output_path", type=str, default=".", help="Path to output benchmark results.") +parser.add_argument( + "--whitelist_config", + type=str, + default=None, + help="Path to YAML file with per-phase function whitelist patterns. Overrides --top_n for listed phases.", +) + +# append AppLauncher cli args (provides --device, --headless, etc.) 
+AppLauncher.add_app_launcher_args(parser) +# parse the arguments +args_cli, hydra_args = parser.parse_known_args() + +# clear out sys.argv for Hydra +sys.argv = [sys.argv[0]] + hydra_args + +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../..")) + +from isaaclab.test.benchmark import BaseIsaacLabBenchmark, SingleMeasurement +from isaaclab.utils.timer import Timer, TimerError + +from scripts.benchmarks.utils import ( + get_backend_type, + get_preset_string, + parse_cprofile_stats, +) + +# -- Phase 1: Python imports (profiled) -------------------------------------- + +imports_profile = cProfile.Profile() +imports_time_begin = time.perf_counter_ns() +imports_profile.enable() + +import gymnasium as gym # noqa: E402 +import torch # noqa: E402 + +from isaaclab.envs import DirectMARLEnvCfg, DirectRLEnvCfg, ManagerBasedRLEnvCfg # noqa: E402 + +from isaaclab_tasks.utils import launch_simulation, resolve_task_config # noqa: E402 + +imports_profile.disable() + +if torch.cuda.is_available() and torch.cuda.is_initialized(): + torch.cuda.synchronize() +imports_time_end = time.perf_counter_ns() + +# -- Resolve task config (outside profiling) --------------------------------- + +env_cfg, _agent_cfg = resolve_task_config(args_cli.task, None) + +# -- Detect IsaacLab source prefixes for filtering --------------------------- + +_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +_ISAACLAB_PREFIXES = [ + os.path.join(_REPO_ROOT, "source", d) + for d in os.listdir(os.path.join(_REPO_ROOT, "source")) + if os.path.isdir(os.path.join(_REPO_ROOT, "source", d)) +] + +# -- Load whitelist config if provided --------------------------------------- + +_WHITELIST: dict[str, list[str]] = {} +if args_cli.whitelist_config is not None: + import yaml + + with open(args_cli.whitelist_config) as f: + _WHITELIST = yaml.safe_load(f) or {} + +# Resolve top_n default: 5 when using whitelist (fallback phases stay compact), 30 otherwise +if 
args_cli.top_n is None: + args_cli.top_n = 5 if _WHITELIST else 30 + +# -- Create the benchmark instance ------------------------------------------ + +backend_type = get_backend_type(args_cli.benchmark_backend) +benchmark = BaseIsaacLabBenchmark( + benchmark_name="benchmark_startup", + backend_type=backend_type, + output_path=args_cli.output_path, + use_recorders=True, + frametime_recorders=False, + output_prefix=f"benchmark_startup_{args_cli.task}", + workflow_metadata={ + "metadata": [ + {"name": "task", "data": args_cli.task}, + {"name": "seed", "data": args_cli.seed}, + {"name": "num_envs", "data": args_cli.num_envs}, + {"name": "top_n", "data": args_cli.top_n}, + {"name": "presets", "data": get_preset_string(hydra_args)}, + ] + }, +) + + +# -- Main profiling logic --------------------------------------------------- + + +def main( + env_cfg: ManagerBasedRLEnvCfg | DirectRLEnvCfg | DirectMARLEnvCfg, + app_launch_profile: cProfile.Profile, + app_launch_wall_ms: float, +): + """Run env creation and first step with profiling. + + Args: + env_cfg: Resolved environment configuration for the task. + app_launch_profile: cProfile session from the app-launch phase. + app_launch_wall_ms: Wall-clock duration of the app-launch phase [ms]. 
+ """ + + # Override config with CLI args + env_cfg.scene.num_envs = args_cli.num_envs if args_cli.num_envs is not None else env_cfg.scene.num_envs + env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device + env_cfg.seed = args_cli.seed + + # -- Phase 3: Env creation (gym.make + env.reset) profiled --------------- + + env_creation_profile = cProfile.Profile() + env_creation_time_begin = time.perf_counter_ns() + env_creation_profile.enable() + + env = gym.make(args_cli.task, cfg=env_cfg) + env.reset() + + env_creation_profile.disable() + + if torch.cuda.is_available() and torch.cuda.is_initialized(): + torch.cuda.synchronize() + env_creation_time_end = time.perf_counter_ns() + + # -- Phase 4: First step profiled ---------------------------------------- + + # Sample random actions + actions = ( + torch.rand(env.unwrapped.num_envs, env.unwrapped.single_action_space.shape[0], device=env.unwrapped.device) + * 2.0 + - 1.0 + ) + + first_step_profile = cProfile.Profile() + first_step_time_begin = time.perf_counter_ns() + first_step_profile.enable() + + env.step(actions) + + first_step_profile.disable() + + if torch.cuda.is_available() and torch.cuda.is_initialized(): + torch.cuda.synchronize() + first_step_time_end = time.perf_counter_ns() + + # -- Parse all profiles and log measurements ----------------------------- + + imports_wall_ms = (imports_time_end - imports_time_begin) / 1e6 + env_creation_wall_ms = (env_creation_time_end - env_creation_time_begin) / 1e6 + first_step_wall_ms = (first_step_time_end - first_step_time_begin) / 1e6 + + # Collect Timer-based sub-timings for env_creation phase (may not exist for all backends) + try: + scene_creation_ms = Timer.get_timer_info("scene_creation") * 1000 + except TimerError: + scene_creation_ms = 0.0 + try: + simulation_start_ms = Timer.get_timer_info("simulation_start") * 1000 + except TimerError: + simulation_start_ms = 0.0 + + phases = { + "app_launch": { + "profile": 
app_launch_profile, + "wall_clock_ms": app_launch_wall_ms, + "extra_measurements": [], + }, + "python_imports": { + "profile": imports_profile, + "wall_clock_ms": imports_wall_ms, + "extra_measurements": [], + }, + "env_creation": { + "profile": env_creation_profile, + "wall_clock_ms": env_creation_wall_ms, + "extra_measurements": [ + (name, val) + for name, val in [ + ("Scene Creation Time", scene_creation_ms), + ("Simulation Start Time", simulation_start_ms), + ] + if val > 0.0 + ], + }, + "first_step": { + "profile": first_step_profile, + "wall_clock_ms": first_step_wall_ms, + "extra_measurements": [], + }, + } + + # Parse profiles and log measurements to benchmark + for phase_name, phase_data in phases.items(): + phase_whitelist = _WHITELIST.get(phase_name) + functions = parse_cprofile_stats( + phase_data["profile"], _ISAACLAB_PREFIXES, top_n=args_cli.top_n, whitelist=phase_whitelist + ) + wall_ms = phase_data["wall_clock_ms"] + extras = phase_data["extra_measurements"] + + # Log wall-clock time + benchmark.add_measurement( + phase_name, measurement=SingleMeasurement(name="Wall Clock Time", value=wall_ms, unit="ms") + ) + + # Log extra sub-timings + for extra_name, extra_val in extras: + benchmark.add_measurement( + phase_name, measurement=SingleMeasurement(name=extra_name, value=extra_val, unit="ms") + ) + + # Log per-function measurements (tottime + cumtime) + for label, tottime_ms, cumtime_ms in functions: + benchmark.add_measurement( + phase_name, measurement=SingleMeasurement(name=label, value=round(tottime_ms, 2), unit="ms") + ) + benchmark.add_measurement( + phase_name, + measurement=SingleMeasurement(name=f"{label} (cumtime)", value=round(cumtime_ms, 2), unit="ms"), + ) + + # Finalize benchmark output + benchmark.update_manual_recorders() + benchmark._finalize_impl() + + # Close the simulator + env.close() + + +if __name__ == "__main__": + # -- Phase 2: App launch (profiled) -------------------------------------- + + app_launch_profile = 
cProfile.Profile() + app_launch_time_begin = time.perf_counter_ns() + app_launch_profile.enable() + + with launch_simulation(env_cfg, args_cli): + app_launch_profile.disable() + + if torch.cuda.is_available() and torch.cuda.is_initialized(): + torch.cuda.synchronize() + app_launch_time_end = time.perf_counter_ns() + + app_launch_wall_ms = (app_launch_time_end - app_launch_time_begin) / 1e6 + main(env_cfg, app_launch_profile, app_launch_wall_ms) diff --git a/scripts/benchmarks/startup_whitelist.yaml b/scripts/benchmarks/startup_whitelist.yaml new file mode 100644 index 000000000000..3da31b5db82a --- /dev/null +++ b/scripts/benchmarks/startup_whitelist.yaml @@ -0,0 +1,21 @@ +app_launch: + - "isaaclab.utils.configclass:_wrap_resolvable_strings" + - "isaaclab.utils.configclass:_custom_post_init" + - "isaaclab.utils.configclass:_field_module_dir" + +env_creation: + - "isaaclab.cloner.*:usd_replicate" + - "isaaclab.cloner.*:filter_collisions" + - "isaaclab_physx.cloner.*:attach_end_fn" + - "isaaclab.scene.*:_init_scene" + - "isaaclab.envs.mdp.observations:*" + - "isaaclab.utils.assets:_find_usd_dependencies" + - "humanoid.mdp.observations:*" + +first_step: + - "isaaclab.envs.mdp.rewards:*" + - "isaaclab.envs.mdp.terminations:*" + - "isaaclab.envs.mdp.observations:*" + - "isaaclab.actuators.*:compute" + - "warp.*:launch" + - "warp.*:to_torch" diff --git a/scripts/benchmarks/utils.py b/scripts/benchmarks/utils.py index ff3ab78efb68..851617c2bc80 100644 --- a/scripts/benchmarks/utils.py +++ b/scripts/benchmarks/utils.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: BSD-3-Clause +import cProfile import glob import os import statistics @@ -235,3 +236,166 @@ def log_convergence( benchmark.add_measurement( "train", SingleMeasurement(name="Convergence Passed", value=int(result["passed"]), unit="bool") ) + + +def parse_cprofile_stats( + profile: cProfile.Profile, + isaaclab_prefixes: list[str], + top_n: int = 30, + whitelist: list[str] | None = None, +) -> list[tuple[str, float, 
float]]: + """Parse cProfile stats, filtering to IsaacLab + first-level external calls. + + Walks the pstats data and keeps functions that are either (a) inside an + IsaacLab source directory, or (b) directly called by an IsaacLab function. + Results are sorted by own-time (tottime) descending. + + When *whitelist* is provided, only functions whose labels match at least one + ``fnmatch`` pattern are returned. Patterns that match no profiled function + emit a ``(pattern, 0.0, 0.0)`` placeholder so dashboards always receive + consistent keys. The *top_n* parameter is ignored in whitelist mode. + + Args: + profile: A completed cProfile.Profile instance (after .disable()). + isaaclab_prefixes: Absolute file path prefixes identifying IsaacLab source + (e.g. ["/home/user/IsaacLab/source/isaaclab", ...]). + top_n: Maximum number of functions to return per phase. Ignored when + *whitelist* is provided. + whitelist: Optional list of ``fnmatch`` patterns to select specific + functions (e.g. ``["isaaclab.cloner.*:usd_replicate"]``). + + Returns: + List of (function_label, tottime_ms, cumtime_ms) tuples sorted by + tottime descending. + """ + import io + import pstats + + stats = pstats.Stats(profile, stream=io.StringIO()) + + def _is_isaaclab(filename: str) -> bool: + return any(filename.startswith(prefix) for prefix in isaaclab_prefixes) + + def _make_label(filename: str, funcname: str) -> str: + # For builtins/C-extensions the filename is something like "~" or "" + if not filename or filename.startswith("<") or filename == "~": + return funcname + # Convert absolute path to dotted module-style label + for prefix in isaaclab_prefixes: + if filename.startswith(prefix): + rel = os.path.relpath(filename, prefix) + # Strip .py, replace os.sep with dot + rel = rel.replace(os.sep, ".").removesuffix(".py") + return f"{rel}:{funcname}" + # External function — try to find the top-level package name + # e.g. 
".../site-packages/torch/nn/modules/linear.py" -> "torch.nn.modules.linear" + parts = filename.replace(os.sep, "/").removesuffix(".py").split("/") + # Find "site-packages" anchor or fall back to last 3 components + try: + sp_idx = parts.index("site-packages") + short = ".".join(parts[sp_idx + 1 :]) + except ValueError: + short = ".".join(parts[-3:]) if len(parts) >= 3 else ".".join(parts) + return f"{short}:{funcname}" + + # stats.stats is dict[(filename, lineno, funcname)] -> (ncalls, totcalls, tottime, cumtime, callers) + # callers is dict[(filename, lineno, funcname)] -> (ncalls, totcalls, tottime, cumtime) + results = [] + for func_key, (nc, cc, tottime, cumtime, callers) in stats.stats.items(): + filename, lineno, funcname = func_key + if _is_isaaclab(filename): + label = _make_label(filename, funcname) + results.append((label, tottime * 1000.0, cumtime * 1000.0)) + else: + # Check if any direct caller is an IsaacLab function + for caller_key in callers: + caller_filename = caller_key[0] + if _is_isaaclab(caller_filename): + label = _make_label(filename, funcname) + results.append((label, tottime * 1000.0, cumtime * 1000.0)) + break + + # Sort by tottime (own-time) descending + results.sort(key=lambda x: x[1], reverse=True) + + if whitelist is None: + return results[:top_n] + + # Whitelist mode: filter by fnmatch patterns, emit placeholders for unmatched patterns + import fnmatch + + matched: dict[str, tuple[str, float, float]] = {} + matched_patterns: set[str] = set() + for label, tottime, cumtime in results: + for pattern in whitelist: + if fnmatch.fnmatch(label, pattern): + if label not in matched: + matched[label] = (label, tottime, cumtime) + matched_patterns.add(pattern) + + # Add 0.0 placeholders for patterns that matched nothing + for pattern in whitelist: + if pattern not in matched_patterns: + matched[pattern] = (pattern, 0.0, 0.0) + + filtered = list(matched.values()) + filtered.sort(key=lambda x: x[1], reverse=True) + return filtered + + +def 
print_startup_summary( + phase_results: dict[str, dict], +) -> None: + """Print a human-readable startup profile summary to stdout. + + Args: + phase_results: Dict mapping phase name to a dict with keys: + - "wall_clock_ms": float, total wall-clock time for the phase. + - "functions": list of (label, tottime_ms, cumtime_ms) tuples. + - "extra_measurements": optional list of (name, value_ms) for + sub-timings like Scene Creation Time. + """ + width = 90 + sep = "|" + "-" * (width - 2) + "|" + + def box_line(text: str) -> None: + inner = width - 4 + if not text: + print(f"| {' ' * inner} |") + return + # Wrap long lines + while len(text) > inner: + print(f"| {text[:inner]} |") + text = text[inner:] + print(f"| {text.ljust(inner)} |") + + print() + print(sep) + box_line("Startup Profile Summary".center(width - 4)) + print(sep) + + for phase_name, data in phase_results.items(): + wall_ms = data["wall_clock_ms"] + functions = data["functions"] + extras = data.get("extra_measurements", []) + + box_line(f"Phase: {phase_name} (wall clock: {wall_ms:.1f} ms)") + print(sep) + + for name, val_ms in extras: + box_line(f" {name}: {val_ms:.1f} ms") + + if functions: + # Header + box_line(f" {'Function':<58} {'Own (ms)':>10} {'Cum (ms)':>10}") + box_line(f" {'-' * 58} {'-' * 10} {'-' * 10}") + for label, tottime, cumtime in functions: + # Truncate long labels + short_label = label if len(label) <= 58 else label[:55] + "..." 
+ box_line(f" {short_label:<58} {tottime:>10.1f} {cumtime:>10.1f}") + else: + box_line(" (no IsaacLab functions captured)") + + print(sep) + + print() diff --git a/source/isaaclab/config/extension.toml b/source/isaaclab/config/extension.toml index d6e20b221327..59b139eda365 100644 --- a/source/isaaclab/config/extension.toml +++ b/source/isaaclab/config/extension.toml @@ -1,7 +1,7 @@ [package] # Note: Semantic Versioning is used: https://semver.org/ -version = "4.5.22" +version = "4.5.23" # Description title = "Isaac Lab framework for Robot Learning" diff --git a/source/isaaclab/docs/CHANGELOG.rst b/source/isaaclab/docs/CHANGELOG.rst index 11524c3b3182..deec52aa04a5 100644 --- a/source/isaaclab/docs/CHANGELOG.rst +++ b/source/isaaclab/docs/CHANGELOG.rst @@ -1,6 +1,20 @@ Changelog --------- +4.5.23 (2026-03-20) +~~~~~~~~~~~~~~~~~~~ + +Changed +^^^^^^^ + +* Changed :class:`~isaaclab.test.benchmark.backends.SummaryMetrics` to + dynamically render unknown benchmark phases. Previously only hard-coded phase + names (``startup``, ``runtime``, ``train``, ``frametime``) were printed in the + summary report; any other phases were silently dropped. Unknown phases now + render their ``SingleMeasurement`` and ``StatisticalMeasurement`` entries + automatically. + + 4.5.22 (2026-03-16) ~~~~~~~~~~~~~~~~~~~ diff --git a/source/isaaclab/isaaclab/test/benchmark/backends.py b/source/isaaclab/isaaclab/test/benchmark/backends.py index f9187bf5b181..6b6c0acacb10 100644 --- a/source/isaaclab/isaaclab/test/benchmark/backends.py +++ b/source/isaaclab/isaaclab/test/benchmark/backends.py @@ -276,6 +276,33 @@ def _print_summary(self) -> None: self._print_box_line(f"{label}: {value}") self._print_box_separator() + # Render any phases not handled above (e.g. 
profiling phases from benchmark_startup) + known_phases = { + "benchmark_info", + "runtime", + "startup", + "train", + "frametime", + "hardware_info", + "version_info", + } + for phase_name, phase in phases.items(): + if phase_name in known_phases or not phase.measurements: + continue + self._print_box_line(f"Phase: {phase_name}") + for measurement in phase.measurements: + label = measurement.name + if isinstance(measurement, StatisticalMeasurement): + unit_str = f" {measurement.unit.strip()}" if (measurement.unit and measurement.unit.strip()) else "" + value = f"{self._format_scalar(measurement.mean)}{unit_str}" + elif isinstance(measurement, SingleMeasurement): + unit_str = f" {measurement.unit.strip()}" if (measurement.unit and measurement.unit.strip()) else "" + value = f"{self._format_scalar(measurement.value)}{unit_str}" + else: + continue + self._print_box_line(f"{label}: {value}") + self._print_box_separator() + if hardware_meta: self._print_box_line("System:") self._print_box_kv("cpu_name", hardware_meta.get("cpu_name")) From 10016c2a06dc05e7a079c62f9c07264a196ca1b0 Mon Sep 17 00:00:00 2001 From: Antoine Richard Date: Fri, 20 Mar 2026 13:29:37 +0100 Subject: [PATCH 02/12] Improve error handling in startup benchmark - Add error handling for YAML whitelist loading (OSError, YAMLError, type validation) - Use None instead of 0.0 for missing Timer sub-timings with info log - Warn when IsaacLab source directory is missing (empty prefix list) - Warn on unmatched whitelist patterns to catch typos - Use try/finally for cProfile disable to handle exceptions cleanly --- scripts/benchmarks/benchmark_startup.py | 62 +++++++++++++++++-------- scripts/benchmarks/utils.py | 6 +++ 2 files changed, 49 insertions(+), 19 deletions(-) diff --git a/scripts/benchmarks/benchmark_startup.py b/scripts/benchmarks/benchmark_startup.py index e7870d5e92be..bf0d9fa618b5 100644 --- a/scripts/benchmarks/benchmark_startup.py +++ b/scripts/benchmarks/benchmark_startup.py @@ -101,11 +101,14 
@@ # -- Detect IsaacLab source prefixes for filtering --------------------------- _REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) -_ISAACLAB_PREFIXES = [ - os.path.join(_REPO_ROOT, "source", d) - for d in os.listdir(os.path.join(_REPO_ROOT, "source")) - if os.path.isdir(os.path.join(_REPO_ROOT, "source", d)) -] +_source_dir = os.path.join(_REPO_ROOT, "source") +if os.path.isdir(_source_dir): + _ISAACLAB_PREFIXES = [ + os.path.join(_source_dir, d) for d in os.listdir(_source_dir) if os.path.isdir(os.path.join(_source_dir, d)) + ] +else: + print(f"[WARNING] IsaacLab source directory not found at '{_source_dir}'. Function-level profiling will be empty.") + _ISAACLAB_PREFIXES = [] # -- Load whitelist config if provided --------------------------------------- @@ -113,8 +116,26 @@ if args_cli.whitelist_config is not None: import yaml - with open(args_cli.whitelist_config) as f: - _WHITELIST = yaml.safe_load(f) or {} + try: + with open(args_cli.whitelist_config) as f: + raw = yaml.safe_load(f) + except OSError as e: + print(f"[ERROR] Cannot read whitelist config '{args_cli.whitelist_config}': {e}") + sys.exit(1) + except yaml.YAMLError as e: + print(f"[ERROR] Invalid YAML in whitelist config '{args_cli.whitelist_config}': {e}") + sys.exit(1) + + if raw is None: + _WHITELIST = {} + elif not isinstance(raw, dict): + print( + f"[ERROR] Whitelist config must be a YAML mapping (got {type(raw).__name__})." 
+ " Expected format: phase_name: [pattern, ...]" + ) + sys.exit(1) + else: + _WHITELIST = raw # Resolve top_n default: 5 when using whitelist (fallback phases stay compact), 30 otherwise if args_cli.top_n is None: @@ -168,11 +189,11 @@ def main( env_creation_profile = cProfile.Profile() env_creation_time_begin = time.perf_counter_ns() env_creation_profile.enable() - - env = gym.make(args_cli.task, cfg=env_cfg) - env.reset() - - env_creation_profile.disable() + try: + env = gym.make(args_cli.task, cfg=env_cfg) + env.reset() + finally: + env_creation_profile.disable() if torch.cuda.is_available() and torch.cuda.is_initialized(): torch.cuda.synchronize() @@ -190,10 +211,10 @@ def main( first_step_profile = cProfile.Profile() first_step_time_begin = time.perf_counter_ns() first_step_profile.enable() - - env.step(actions) - - first_step_profile.disable() + try: + env.step(actions) + finally: + first_step_profile.disable() if torch.cuda.is_available() and torch.cuda.is_initialized(): torch.cuda.synchronize() @@ -206,14 +227,17 @@ def main( first_step_wall_ms = (first_step_time_end - first_step_time_begin) / 1e6 # Collect Timer-based sub-timings for env_creation phase (may not exist for all backends) + scene_creation_ms = None try: scene_creation_ms = Timer.get_timer_info("scene_creation") * 1000 except TimerError: - scene_creation_ms = 0.0 + print("[INFO] Timer 'scene_creation' not available; sub-timing will be omitted.") + + simulation_start_ms = None try: simulation_start_ms = Timer.get_timer_info("simulation_start") * 1000 except TimerError: - simulation_start_ms = 0.0 + print("[INFO] Timer 'simulation_start' not available; sub-timing will be omitted.") phases = { "app_launch": { @@ -235,7 +259,7 @@ def main( ("Scene Creation Time", scene_creation_ms), ("Simulation Start Time", simulation_start_ms), ] - if val > 0.0 + if val is not None ], }, "first_step": { diff --git a/scripts/benchmarks/utils.py b/scripts/benchmarks/utils.py index 851617c2bc80..511f12808c11 100644 
--- a/scripts/benchmarks/utils.py +++ b/scripts/benchmarks/utils.py @@ -334,8 +334,14 @@ def _make_label(filename: str, funcname: str) -> str: matched_patterns.add(pattern) # Add 0.0 placeholders for patterns that matched nothing + import warnings + for pattern in whitelist: if pattern not in matched_patterns: + warnings.warn( + f"Whitelist pattern '{pattern}' matched no profiled functions. " + "Check for typos or verify the function ran during this phase." + ) matched[pattern] = (pattern, 0.0, 0.0) filtered = list(matched.values()) From 8339a49ecc32a36b831ee7d90527242232d5d127 Mon Sep 17 00:00:00 2001 From: Antoine Richard Date: Fri, 20 Mar 2026 13:31:14 +0100 Subject: [PATCH 03/12] Address PR review findings - Fix orphaned docstring (convert to comment) - Improve main() and module docstrings for accuracy - Remove phase numbering from section comments (avoid confusion) - Fix Timer comment wording (backends -> environment types) - Fix top_n docstring (remove "per phase") - Remove dead print_startup_summary function --- scripts/benchmarks/benchmark_startup.py | 17 +++---- scripts/benchmarks/utils.py | 60 +------------------------ 2 files changed, 10 insertions(+), 67 deletions(-) diff --git a/scripts/benchmarks/benchmark_startup.py b/scripts/benchmarks/benchmark_startup.py index bf0d9fa618b5..b9e6f90a4762 100644 --- a/scripts/benchmarks/benchmark_startup.py +++ b/scripts/benchmarks/benchmark_startup.py @@ -7,10 +7,11 @@ Each startup stage (app launch, python imports, env creation, first step) is wrapped in its own cProfile session. The top functions by own-time are emitted -as SingleMeasurement entries via the standard benchmark backend. +as SingleMeasurement entries (both own-time and cumulative time) via the +standard benchmark backend. """ -"""Launch Isaac Sim Simulator first.""" +# Launch Isaac Sim Simulator first. 
import argparse import cProfile @@ -75,7 +76,7 @@ parse_cprofile_stats, ) -# -- Phase 1: Python imports (profiled) -------------------------------------- +# -- Python imports (profiled) ------------------------------------------------ imports_profile = cProfile.Profile() imports_time_begin = time.perf_counter_ns() @@ -171,7 +172,7 @@ def main( app_launch_profile: cProfile.Profile, app_launch_wall_ms: float, ): - """Run env creation and first step with profiling. + """Profile env creation and first step, then log all phase measurements. Args: env_cfg: Resolved environment configuration for the task. @@ -184,7 +185,7 @@ def main( env_cfg.sim.device = args_cli.device if args_cli.device is not None else env_cfg.sim.device env_cfg.seed = args_cli.seed - # -- Phase 3: Env creation (gym.make + env.reset) profiled --------------- + # -- Env creation (gym.make + env.reset) profiled --------------------------- env_creation_profile = cProfile.Profile() env_creation_time_begin = time.perf_counter_ns() @@ -199,7 +200,7 @@ def main( torch.cuda.synchronize() env_creation_time_end = time.perf_counter_ns() - # -- Phase 4: First step profiled ---------------------------------------- + # -- First step profiled ---------------------------------------------------- # Sample random actions actions = ( @@ -226,7 +227,7 @@ def main( env_creation_wall_ms = (env_creation_time_end - env_creation_time_begin) / 1e6 first_step_wall_ms = (first_step_time_end - first_step_time_begin) / 1e6 - # Collect Timer-based sub-timings for env_creation phase (may not exist for all backends) + # Collect Timer-based sub-timings for env_creation phase (may not exist for all environment types) scene_creation_ms = None try: scene_creation_ms = Timer.get_timer_info("scene_creation") * 1000 @@ -308,7 +309,7 @@ def main( if __name__ == "__main__": - # -- Phase 2: App launch (profiled) -------------------------------------- + # -- App launch (profiled) -------------------------------------------------- 
app_launch_profile = cProfile.Profile() app_launch_time_begin = time.perf_counter_ns() diff --git a/scripts/benchmarks/utils.py b/scripts/benchmarks/utils.py index 511f12808c11..191e6a4532ea 100644 --- a/scripts/benchmarks/utils.py +++ b/scripts/benchmarks/utils.py @@ -259,7 +259,7 @@ def parse_cprofile_stats( profile: A completed cProfile.Profile instance (after .disable()). isaaclab_prefixes: Absolute file path prefixes identifying IsaacLab source (e.g. ["/home/user/IsaacLab/source/isaaclab", ...]). - top_n: Maximum number of functions to return per phase. Ignored when + top_n: Maximum number of functions to return. Ignored when *whitelist* is provided. whitelist: Optional list of ``fnmatch`` patterns to select specific functions (e.g. ``["isaaclab.cloner.*:usd_replicate"]``). @@ -347,61 +347,3 @@ def _make_label(filename: str, funcname: str) -> str: filtered = list(matched.values()) filtered.sort(key=lambda x: x[1], reverse=True) return filtered - - -def print_startup_summary( - phase_results: dict[str, dict], -) -> None: - """Print a human-readable startup profile summary to stdout. - - Args: - phase_results: Dict mapping phase name to a dict with keys: - - "wall_clock_ms": float, total wall-clock time for the phase. - - "functions": list of (label, tottime_ms, cumtime_ms) tuples. - - "extra_measurements": optional list of (name, value_ms) for - sub-timings like Scene Creation Time. 
- """ - width = 90 - sep = "|" + "-" * (width - 2) + "|" - - def box_line(text: str) -> None: - inner = width - 4 - if not text: - print(f"| {' ' * inner} |") - return - # Wrap long lines - while len(text) > inner: - print(f"| {text[:inner]} |") - text = text[inner:] - print(f"| {text.ljust(inner)} |") - - print() - print(sep) - box_line("Startup Profile Summary".center(width - 4)) - print(sep) - - for phase_name, data in phase_results.items(): - wall_ms = data["wall_clock_ms"] - functions = data["functions"] - extras = data.get("extra_measurements", []) - - box_line(f"Phase: {phase_name} (wall clock: {wall_ms:.1f} ms)") - print(sep) - - for name, val_ms in extras: - box_line(f" {name}: {val_ms:.1f} ms") - - if functions: - # Header - box_line(f" {'Function':<58} {'Own (ms)':>10} {'Cum (ms)':>10}") - box_line(f" {'-' * 58} {'-' * 10} {'-' * 10}") - for label, tottime, cumtime in functions: - # Truncate long labels - short_label = label if len(label) <= 58 else label[:55] + "..." - box_line(f" {short_label:<58} {tottime:>10.1f} {cumtime:>10.1f}") - else: - box_line(" (no IsaacLab functions captured)") - - print(sep) - - print() From 8f9239ea30a51d0a3e18019090f93f01d3b32e9d Mon Sep 17 00:00:00 2001 From: Antoine Richard Date: Fri, 20 Mar 2026 13:31:55 +0100 Subject: [PATCH 04/12] Fix whitelist YAML: add license header, fix humanoid pattern The humanoid observations pattern needs a leading wildcard to match the full module path (isaaclab_tasks.manager_based.classic.humanoid...). --- scripts/benchmarks/startup_whitelist.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/benchmarks/startup_whitelist.yaml b/scripts/benchmarks/startup_whitelist.yaml index 3da31b5db82a..8844e4339c50 100644 --- a/scripts/benchmarks/startup_whitelist.yaml +++ b/scripts/benchmarks/startup_whitelist.yaml @@ -1,3 +1,8 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). 
+# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + app_launch: - "isaaclab.utils.configclass:_wrap_resolvable_strings" - "isaaclab.utils.configclass:_custom_post_init" @@ -10,7 +15,7 @@ env_creation: - "isaaclab.scene.*:_init_scene" - "isaaclab.envs.mdp.observations:*" - "isaaclab.utils.assets:_find_usd_dependencies" - - "humanoid.mdp.observations:*" + - "*humanoid.mdp.observations:*" first_step: - "isaaclab.envs.mdp.rewards:*" From 619b8680b833a7b112bc376645da27aea5558a8e Mon Sep 17 00:00:00 2001 From: Antoine Richard Date: Fri, 20 Mar 2026 14:14:02 +0100 Subject: [PATCH 05/12] Address PR review findings - Make --task required to fail early on missing argument - Validate whitelist YAML values are list[str] per phase - Warn on unknown phase names in whitelist config - Wrap post-env-creation code in try/finally for env.close() - Remove [UNMATCHED] prefix from placeholder labels to keep dashboard keys stable (matches docstring/RST contract) - Move fnmatch import to function top, replace warnings.warn with print("[WARNING]") for consistency - Remove stale "Launch Isaac Sim Simulator first" comment - Use _ for unused lineno in pstats tuple unpacking --- docs/source/testing/benchmarks.rst | 2 +- scripts/benchmarks/benchmark_startup.py | 210 +++++++++++++----------- scripts/benchmarks/utils.py | 17 +- source/isaaclab/docs/CHANGELOG.rst | 2 +- 4 files changed, 120 insertions(+), 111 deletions(-) diff --git a/docs/source/testing/benchmarks.rst b/docs/source/testing/benchmarks.rst index 82d071a88bd7..03522bfa6e78 100644 --- a/docs/source/testing/benchmarks.rst +++ b/docs/source/testing/benchmarks.rst @@ -189,7 +189,7 @@ Patterns use ``fnmatch`` syntax (``*`` and ``?`` wildcards): .. 
code-block:: yaml - # startup_whitelist.yaml + # Example whitelist config app_launch: - "isaaclab.utils.configclass:_custom_post_init" - "isaaclab.sim.*:__init__" diff --git a/scripts/benchmarks/benchmark_startup.py b/scripts/benchmarks/benchmark_startup.py index b9e6f90a4762..8b44d61c0923 100644 --- a/scripts/benchmarks/benchmark_startup.py +++ b/scripts/benchmarks/benchmark_startup.py @@ -11,8 +11,6 @@ standard benchmark backend. """ -# Launch Isaac Sim Simulator first. - import argparse import cProfile import os @@ -25,7 +23,7 @@ parser = argparse.ArgumentParser(description="Profile IsaacLab startup phases.") parser.add_argument("--num_envs", type=int, default=None, help="Number of environments to simulate.") -parser.add_argument("--task", type=str, default=None, help="Name of the task.") +parser.add_argument("--task", type=str, required=True, help="Name of the task.") parser.add_argument("--seed", type=int, default=None, help="Seed used for the environment") parser.add_argument( "--top_n", @@ -136,6 +134,20 @@ ) sys.exit(1) else: + _VALID_PHASES = {"app_launch", "python_imports", "env_creation", "first_step"} + unknown_phases = set(raw.keys()) - _VALID_PHASES + if unknown_phases: + print( + f"[WARNING] Whitelist config contains unknown phase(s): {unknown_phases}. " + f"Valid phases: {_VALID_PHASES}. Check for typos." + ) + for phase_name, patterns in raw.items(): + if not isinstance(patterns, list) or not all(isinstance(p, str) for p in patterns): + print( + f"[ERROR] Whitelist phase '{phase_name}' must be a list of strings, " + f"got {type(patterns).__name__}. Check YAML formatting (use '- pattern' syntax)." 
+ ) + sys.exit(1) _WHITELIST = raw # Resolve top_n default: 5 when using whitelist (fallback phases stay compact), 30 otherwise @@ -200,112 +212,112 @@ def main( torch.cuda.synchronize() env_creation_time_end = time.perf_counter_ns() - # -- First step profiled ---------------------------------------------------- - - # Sample random actions - actions = ( - torch.rand(env.unwrapped.num_envs, env.unwrapped.single_action_space.shape[0], device=env.unwrapped.device) - * 2.0 - - 1.0 - ) - - first_step_profile = cProfile.Profile() - first_step_time_begin = time.perf_counter_ns() - first_step_profile.enable() try: - env.step(actions) - finally: - first_step_profile.disable() + # -- First step profiled ------------------------------------------------ - if torch.cuda.is_available() and torch.cuda.is_initialized(): - torch.cuda.synchronize() - first_step_time_end = time.perf_counter_ns() - - # -- Parse all profiles and log measurements ----------------------------- - - imports_wall_ms = (imports_time_end - imports_time_begin) / 1e6 - env_creation_wall_ms = (env_creation_time_end - env_creation_time_begin) / 1e6 - first_step_wall_ms = (first_step_time_end - first_step_time_begin) / 1e6 - - # Collect Timer-based sub-timings for env_creation phase (may not exist for all environment types) - scene_creation_ms = None - try: - scene_creation_ms = Timer.get_timer_info("scene_creation") * 1000 - except TimerError: - print("[INFO] Timer 'scene_creation' not available; sub-timing will be omitted.") - - simulation_start_ms = None - try: - simulation_start_ms = Timer.get_timer_info("simulation_start") * 1000 - except TimerError: - print("[INFO] Timer 'simulation_start' not available; sub-timing will be omitted.") - - phases = { - "app_launch": { - "profile": app_launch_profile, - "wall_clock_ms": app_launch_wall_ms, - "extra_measurements": [], - }, - "python_imports": { - "profile": imports_profile, - "wall_clock_ms": imports_wall_ms, - "extra_measurements": [], - }, - "env_creation": { 
- "profile": env_creation_profile, - "wall_clock_ms": env_creation_wall_ms, - "extra_measurements": [ - (name, val) - for name, val in [ - ("Scene Creation Time", scene_creation_ms), - ("Simulation Start Time", simulation_start_ms), - ] - if val is not None - ], - }, - "first_step": { - "profile": first_step_profile, - "wall_clock_ms": first_step_wall_ms, - "extra_measurements": [], - }, - } - - # Parse profiles and log measurements to benchmark - for phase_name, phase_data in phases.items(): - phase_whitelist = _WHITELIST.get(phase_name) - functions = parse_cprofile_stats( - phase_data["profile"], _ISAACLAB_PREFIXES, top_n=args_cli.top_n, whitelist=phase_whitelist + # Sample random actions + actions = ( + torch.rand(env.unwrapped.num_envs, env.unwrapped.single_action_space.shape[0], device=env.unwrapped.device) + * 2.0 + - 1.0 ) - wall_ms = phase_data["wall_clock_ms"] - extras = phase_data["extra_measurements"] - # Log wall-clock time - benchmark.add_measurement( - phase_name, measurement=SingleMeasurement(name="Wall Clock Time", value=wall_ms, unit="ms") - ) + first_step_profile = cProfile.Profile() + first_step_time_begin = time.perf_counter_ns() + first_step_profile.enable() + try: + env.step(actions) + finally: + first_step_profile.disable() - # Log extra sub-timings - for extra_name, extra_val in extras: - benchmark.add_measurement( - phase_name, measurement=SingleMeasurement(name=extra_name, value=extra_val, unit="ms") + if torch.cuda.is_available() and torch.cuda.is_initialized(): + torch.cuda.synchronize() + first_step_time_end = time.perf_counter_ns() + + # -- Parse all profiles and log measurements ---------------------------- + + imports_wall_ms = (imports_time_end - imports_time_begin) / 1e6 + env_creation_wall_ms = (env_creation_time_end - env_creation_time_begin) / 1e6 + first_step_wall_ms = (first_step_time_end - first_step_time_begin) / 1e6 + + # Collect Timer-based sub-timings for env_creation phase (may not exist for all environment types) + 
scene_creation_ms = None + try: + scene_creation_ms = Timer.get_timer_info("scene_creation") * 1000 + except TimerError: + print("[INFO] Timer 'scene_creation' not available; sub-timing will be omitted.") + + simulation_start_ms = None + try: + simulation_start_ms = Timer.get_timer_info("simulation_start") * 1000 + except TimerError: + print("[INFO] Timer 'simulation_start' not available; sub-timing will be omitted.") + + phases = { + "app_launch": { + "profile": app_launch_profile, + "wall_clock_ms": app_launch_wall_ms, + "extra_measurements": [], + }, + "python_imports": { + "profile": imports_profile, + "wall_clock_ms": imports_wall_ms, + "extra_measurements": [], + }, + "env_creation": { + "profile": env_creation_profile, + "wall_clock_ms": env_creation_wall_ms, + "extra_measurements": [ + (name, val) + for name, val in [ + ("Scene Creation Time", scene_creation_ms), + ("Simulation Start Time", simulation_start_ms), + ] + if val is not None + ], + }, + "first_step": { + "profile": first_step_profile, + "wall_clock_ms": first_step_wall_ms, + "extra_measurements": [], + }, + } + + # Parse profiles and log measurements to benchmark + for phase_name, phase_data in phases.items(): + phase_whitelist = _WHITELIST.get(phase_name) + functions = parse_cprofile_stats( + phase_data["profile"], _ISAACLAB_PREFIXES, top_n=args_cli.top_n, whitelist=phase_whitelist ) + wall_ms = phase_data["wall_clock_ms"] + extras = phase_data["extra_measurements"] - # Log per-function measurements (tottime + cumtime) - for label, tottime_ms, cumtime_ms in functions: - benchmark.add_measurement( - phase_name, measurement=SingleMeasurement(name=label, value=round(tottime_ms, 2), unit="ms") - ) + # Log wall-clock time benchmark.add_measurement( - phase_name, - measurement=SingleMeasurement(name=f"{label} (cumtime)", value=round(cumtime_ms, 2), unit="ms"), + phase_name, measurement=SingleMeasurement(name="Wall Clock Time", value=wall_ms, unit="ms") ) - # Finalize benchmark output - 
benchmark.update_manual_recorders() - benchmark._finalize_impl() - - # Close the simulator - env.close() + # Log extra sub-timings + for extra_name, extra_val in extras: + benchmark.add_measurement( + phase_name, measurement=SingleMeasurement(name=extra_name, value=extra_val, unit="ms") + ) + + # Log per-function measurements (tottime + cumtime) + for label, tottime_ms, cumtime_ms in functions: + benchmark.add_measurement( + phase_name, measurement=SingleMeasurement(name=label, value=round(tottime_ms, 2), unit="ms") + ) + benchmark.add_measurement( + phase_name, + measurement=SingleMeasurement(name=f"{label} (cumtime)", value=round(cumtime_ms, 2), unit="ms"), + ) + + # Finalize benchmark output + benchmark.update_manual_recorders() + benchmark._finalize_impl() + finally: + env.close() if __name__ == "__main__": diff --git a/scripts/benchmarks/utils.py b/scripts/benchmarks/utils.py index 191e6a4532ea..41487b676a10 100644 --- a/scripts/benchmarks/utils.py +++ b/scripts/benchmarks/utils.py @@ -268,6 +268,7 @@ def parse_cprofile_stats( List of (function_label, tottime_ms, cumtime_ms) tuples sorted by tottime descending. 
""" + import fnmatch import io import pstats @@ -298,11 +299,11 @@ def _make_label(filename: str, funcname: str) -> str: short = ".".join(parts[-3:]) if len(parts) >= 3 else ".".join(parts) return f"{short}:{funcname}" - # stats.stats is dict[(filename, lineno, funcname)] -> (ncalls, totcalls, tottime, cumtime, callers) - # callers is dict[(filename, lineno, funcname)] -> (ncalls, totcalls, tottime, cumtime) + # stats.stats: dict[(filename, lineno, funcname)] -> (cc, nc, tottime, cumtime, callers) + # callers: dict[(filename, lineno, funcname)] -> (cc, nc, tottime, cumtime) results = [] - for func_key, (nc, cc, tottime, cumtime, callers) in stats.stats.items(): - filename, lineno, funcname = func_key + for func_key, (_, _, tottime, cumtime, callers) in stats.stats.items(): + filename, _, funcname = func_key if _is_isaaclab(filename): label = _make_label(filename, funcname) results.append((label, tottime * 1000.0, cumtime * 1000.0)) @@ -322,8 +323,6 @@ def _make_label(filename: str, funcname: str) -> str: return results[:top_n] # Whitelist mode: filter by fnmatch patterns, emit placeholders for unmatched patterns - import fnmatch - matched: dict[str, tuple[str, float, float]] = {} matched_patterns: set[str] = set() for label, tottime, cumtime in results: @@ -334,12 +333,10 @@ def _make_label(filename: str, funcname: str) -> str: matched_patterns.add(pattern) # Add 0.0 placeholders for patterns that matched nothing - import warnings - for pattern in whitelist: if pattern not in matched_patterns: - warnings.warn( - f"Whitelist pattern '{pattern}' matched no profiled functions. " + print( + f"[WARNING] Whitelist pattern '{pattern}' matched no profiled functions. " "Check for typos or verify the function ran during this phase." 
) matched[pattern] = (pattern, 0.0, 0.0) diff --git a/source/isaaclab/docs/CHANGELOG.rst b/source/isaaclab/docs/CHANGELOG.rst index deec52aa04a5..52d10951b3ed 100644 --- a/source/isaaclab/docs/CHANGELOG.rst +++ b/source/isaaclab/docs/CHANGELOG.rst @@ -7,7 +7,7 @@ Changelog Changed ^^^^^^^ -* Changed :class:`~isaaclab.test.benchmark.backends.SummaryMetrics` to +* Changed ``SummaryMetrics`` backend to dynamically render unknown benchmark phases. Previously only hard-coded phase names (``startup``, ``runtime``, ``train``, ``frametime``) were printed in the summary report; any other phases were silently dropped. Unknown phases now From 9809268280eb6b249c4a8edb4b2e5f513e295aba Mon Sep 17 00:00:00 2001 From: Antoine Richard Date: Fri, 20 Mar 2026 14:15:31 +0100 Subject: [PATCH 06/12] Fix pstats comment to use pcalls/ncalls per Python docs --- scripts/benchmarks/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/benchmarks/utils.py b/scripts/benchmarks/utils.py index 41487b676a10..1e76829fe268 100644 --- a/scripts/benchmarks/utils.py +++ b/scripts/benchmarks/utils.py @@ -299,8 +299,8 @@ def _make_label(filename: str, funcname: str) -> str: short = ".".join(parts[-3:]) if len(parts) >= 3 else ".".join(parts) return f"{short}:{funcname}" - # stats.stats: dict[(filename, lineno, funcname)] -> (cc, nc, tottime, cumtime, callers) - # callers: dict[(filename, lineno, funcname)] -> (cc, nc, tottime, cumtime) + # stats.stats: dict[(filename, lineno, funcname)] -> (pcalls, ncalls, tottime, cumtime, callers) + # callers: dict[(filename, lineno, funcname)] -> (pcalls, ncalls, tottime, cumtime) results = [] for func_key, (_, _, tottime, cumtime, callers) in stats.stats.items(): filename, _, funcname = func_key From 118289d60c4990f5f9d2657cfb6a7db1a99cc9ba Mon Sep 17 00:00:00 2001 From: Antoine Richard Date: Fri, 20 Mar 2026 14:18:57 +0100 Subject: [PATCH 07/12] Profile resolve_task_config as its own task_config phase --- 
docs/source/testing/benchmarks.rst | 3 ++- scripts/benchmarks/benchmark_startup.py | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/docs/source/testing/benchmarks.rst b/docs/source/testing/benchmarks.rst index 03522bfa6e78..0127f87a3976 100644 --- a/docs/source/testing/benchmarks.rst +++ b/docs/source/testing/benchmarks.rst @@ -171,10 +171,11 @@ understanding where time is spent during initialization. --headless \ --benchmark_backend summary -The script profiles four phases independently: +The script profiles five phases independently: - **app_launch**: ``launch_simulation()`` context entry (Kit/USD/PhysX init) - **python_imports**: importing gymnasium, torch, isaaclab_tasks, etc. +- **task_config**: ``resolve_task_config()`` (Hydra config resolution) - **env_creation**: ``gym.make()`` + ``env.reset()`` (scene creation, sim start) - **first_step**: a single ``env.step()`` call diff --git a/scripts/benchmarks/benchmark_startup.py b/scripts/benchmarks/benchmark_startup.py index 8b44d61c0923..80efae5881a3 100644 --- a/scripts/benchmarks/benchmark_startup.py +++ b/scripts/benchmarks/benchmark_startup.py @@ -93,10 +93,17 @@ torch.cuda.synchronize() imports_time_end = time.perf_counter_ns() -# -- Resolve task config (outside profiling) --------------------------------- +# -- Resolve task config (profiled) ------------------------------------------ + +task_config_profile = cProfile.Profile() +task_config_time_begin = time.perf_counter_ns() +task_config_profile.enable() env_cfg, _agent_cfg = resolve_task_config(args_cli.task, None) +task_config_profile.disable() +task_config_time_end = time.perf_counter_ns() + # -- Detect IsaacLab source prefixes for filtering --------------------------- _REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) @@ -134,7 +141,7 @@ ) sys.exit(1) else: - _VALID_PHASES = {"app_launch", "python_imports", "env_creation", "first_step"} + _VALID_PHASES = {"app_launch", "python_imports", 
"task_config", "env_creation", "first_step"} unknown_phases = set(raw.keys()) - _VALID_PHASES if unknown_phases: print( @@ -237,6 +244,7 @@ def main( # -- Parse all profiles and log measurements ---------------------------- imports_wall_ms = (imports_time_end - imports_time_begin) / 1e6 + task_config_wall_ms = (task_config_time_end - task_config_time_begin) / 1e6 env_creation_wall_ms = (env_creation_time_end - env_creation_time_begin) / 1e6 first_step_wall_ms = (first_step_time_end - first_step_time_begin) / 1e6 @@ -264,6 +272,11 @@ def main( "wall_clock_ms": imports_wall_ms, "extra_measurements": [], }, + "task_config": { + "profile": task_config_profile, + "wall_clock_ms": task_config_wall_ms, + "extra_measurements": [], + }, "env_creation": { "profile": env_creation_profile, "wall_clock_ms": env_creation_wall_ms, From 93a68f615acc241aee870c81c794894d357f7728 Mon Sep 17 00:00:00 2001 From: Antoine Richard Date: Fri, 20 Mar 2026 14:19:20 +0100 Subject: [PATCH 08/12] Remove env-specific humanoid pattern from default whitelist --- scripts/benchmarks/startup_whitelist.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/benchmarks/startup_whitelist.yaml b/scripts/benchmarks/startup_whitelist.yaml index 8844e4339c50..121718d36b40 100644 --- a/scripts/benchmarks/startup_whitelist.yaml +++ b/scripts/benchmarks/startup_whitelist.yaml @@ -15,7 +15,6 @@ env_creation: - "isaaclab.scene.*:_init_scene" - "isaaclab.envs.mdp.observations:*" - "isaaclab.utils.assets:_find_usd_dependencies" - - "*humanoid.mdp.observations:*" first_step: - "isaaclab.envs.mdp.rewards:*" From ffc90a7b98eff24f317c77ff3f6a9e7e6a510c80 Mon Sep 17 00:00:00 2001 From: Antoine Richard Date: Fri, 20 Mar 2026 14:57:03 +0100 Subject: [PATCH 09/12] Fix env resource leak when env.reset() raises after gym.make() Move the try/finally guarding env.close() to wrap everything after gym.make() succeeds, so env.reset() failures also trigger cleanup. 
--- scripts/benchmarks/benchmark_startup.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/scripts/benchmarks/benchmark_startup.py b/scripts/benchmarks/benchmark_startup.py index 80efae5881a3..7ccaf26f2f54 100644 --- a/scripts/benchmarks/benchmark_startup.py +++ b/scripts/benchmarks/benchmark_startup.py @@ -211,15 +211,19 @@ def main( env_creation_profile.enable() try: env = gym.make(args_cli.task, cfg=env_cfg) - env.reset() - finally: + except Exception: env_creation_profile.disable() - - if torch.cuda.is_available() and torch.cuda.is_initialized(): - torch.cuda.synchronize() - env_creation_time_end = time.perf_counter_ns() + raise try: + try: + env.reset() + finally: + env_creation_profile.disable() + + if torch.cuda.is_available() and torch.cuda.is_initialized(): + torch.cuda.synchronize() + env_creation_time_end = time.perf_counter_ns() # -- First step profiled ------------------------------------------------ # Sample random actions From bf0d4ec8761850b6be4ad2fef4368b958c883598 Mon Sep 17 00:00:00 2001 From: Antoine Richard Date: Fri, 20 Mar 2026 14:57:47 +0100 Subject: [PATCH 10/12] Document reliance on internal pstats.Stats.stats dict --- scripts/benchmarks/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/benchmarks/utils.py b/scripts/benchmarks/utils.py index 1e76829fe268..0a9dffd4f701 100644 --- a/scripts/benchmarks/utils.py +++ b/scripts/benchmarks/utils.py @@ -299,6 +299,10 @@ def _make_label(filename: str, funcname: str) -> str: short = ".".join(parts[-3:]) if len(parts) >= 3 else ".".join(parts) return f"{short}:{funcname}" + # NOTE: stats.stats is an internal CPython dict, not part of the public pstats API. + # The public get_stats_profile() (Python 3.9+) doesn't expose caller info, which + # we need for the first-level external call filter. If a future Python release + # breaks this, switch to get_stats_profile() and drop the caller-based filtering. 
# stats.stats: dict[(filename, lineno, funcname)] -> (pcalls, ncalls, tottime, cumtime, callers) # callers: dict[(filename, lineno, funcname)] -> (pcalls, ncalls, tottime, cumtime) results = [] From 0667af8bcb382e57a10496c01f2c61ea34999254 Mon Sep 17 00:00:00 2001 From: Antoine RICHARD Date: Fri, 20 Mar 2026 15:27:33 +0100 Subject: [PATCH 11/12] Update scripts/benchmarks/benchmark_startup.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> Signed-off-by: Antoine RICHARD --- scripts/benchmarks/benchmark_startup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/benchmarks/benchmark_startup.py b/scripts/benchmarks/benchmark_startup.py index 7ccaf26f2f54..e406cf72c559 100644 --- a/scripts/benchmarks/benchmark_startup.py +++ b/scripts/benchmarks/benchmark_startup.py @@ -169,7 +169,7 @@ backend_type=backend_type, output_path=args_cli.output_path, use_recorders=True, - frametime_recorders=False, + env_cfg.seed = args_cli.seed if args_cli.seed is not None else env_cfg.seed output_prefix=f"benchmark_startup_{args_cli.task}", workflow_metadata={ "metadata": [ From 8abd1d132fd627f93e8474fbd89b8ee03888ad93 Mon Sep 17 00:00:00 2001 From: Antoine Richard Date: Fri, 20 Mar 2026 15:30:46 +0100 Subject: [PATCH 12/12] Catch BaseException when disabling profiler on gym.make() failure except Exception misses KeyboardInterrupt and SystemExit, leaving the profiler enabled for the rest of the process lifetime. 
---
 scripts/benchmarks/benchmark_startup.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scripts/benchmarks/benchmark_startup.py b/scripts/benchmarks/benchmark_startup.py
index e406cf72c559..b09bcec35ed2 100644
--- a/scripts/benchmarks/benchmark_startup.py
+++ b/scripts/benchmarks/benchmark_startup.py
@@ -163,13 +163,15 @@
 
 # -- Create the benchmark instance ------------------------------------------
 
+env_cfg.seed = args_cli.seed if args_cli.seed is not None else env_cfg.seed
+
 backend_type = get_backend_type(args_cli.benchmark_backend)
 benchmark = BaseIsaacLabBenchmark(
     benchmark_name="benchmark_startup",
     backend_type=backend_type,
     output_path=args_cli.output_path,
     use_recorders=True,
-    env_cfg.seed = args_cli.seed if args_cli.seed is not None else env_cfg.seed
+    frametime_recorders=False,
     output_prefix=f"benchmark_startup_{args_cli.task}",
     workflow_metadata={
         "metadata": [
@@ -211,7 +213,7 @@ def main(
     env_creation_profile.enable()
     try:
         env = gym.make(args_cli.task, cfg=env_cfg)
-    except Exception:
+    except BaseException:
         env_creation_profile.disable()
         raise