From e2171a3a966d1025880bee10c96cc0abac4803ad Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Wed, 22 Apr 2026 19:31:47 -0400 Subject: [PATCH 1/9] feat: ROCm path resolution (auto-detect, MAD_ROCM_PATH, TheRock markers) Add rocm_path_resolver and shared therock_markers; wire Context, run CLI, validators, and orchestrator. Document in README, configuration, and usage; update CHANGELOG. Extend unit and integration tests. Adjust therock_detector (sys.path helper) without ADR docs in this commit. Made-with: Cursor --- CHANGELOG.md | 1 + README.md | 6 +- docs/configuration.md | 15 +- docs/usage.md | 29 +- src/madengine/cli/commands/run.py | 2 +- src/madengine/cli/validators.py | 14 + src/madengine/core/constants.py | 15 +- src/madengine/core/context.py | 13 +- .../orchestration/run_orchestrator.py | 7 +- .../scripts/common/tools/therock_detector.py | 30 +- src/madengine/utils/rocm_path_resolver.py | 273 ++++++++++++++++++ src/madengine/utils/therock_markers.py | 37 +++ tests/integration/test_gpu_management.py | 24 +- tests/unit/test_rocm_path.py | 168 ++++++++++- tests/unit/test_therock_markers.py | 32 ++ 15 files changed, 623 insertions(+), 43 deletions(-) create mode 100644 src/madengine/utils/rocm_path_resolver.py create mode 100644 src/madengine/utils/therock_markers.py create mode 100644 tests/unit/test_therock_markers.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 436c6557..cc19b7d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed +- **ROCm path**: Auto-detect host ROCm root by default (traditional `/opt/rocm`, versioned `/opt/rocm-*`, TheRock `rocm-sdk` + markers, `rocminfo` / `amd-smi` / `rocm-smi` on `PATH`). **Host** overrides: top-level `MAD_ROCM_PATH` in `--additional-context`, then `--rocm-path` (alias). **Container** override: `docker_env_vars.MAD_ROCM_PATH` (sets in-container `ROCM_PATH`); if omitted, host path is mirrored. Set `MAD_AUTO_ROCM_PATH=0` to use legacy `ROCM_PATH` / `/opt/rocm` only without scanning. Code: `madengine.utils.rocm_path_resolver`, shared TheRock file markers in `madengine.utils.therock_markers`. **Design doc**: [ADR 0001](docs/adr/0001-rocm-path-resolution.md). - **Profiling**: `rocm_trace_lite` now sets `RTL_MODE=lite` explicitly; added tool `rocm_trace_lite_default` with `RTL_MODE=default` for A/B overhead comparison. `rtl_trace_wrapper.sh` passes `rtl trace --mode โ€ฆ` when `RTL_MODE` is set. ## [2.0.0] - 2026-04-09 diff --git a/README.md b/README.md index 966ce761..78b15538 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ madengine is a modern CLI tool for running Large Language Models (LLMs) and Deep - **๐ŸŽฏ Simple Deployment** - Run locally or deploy to Kubernetes/SLURM via configuration - **๐Ÿ”ง Distributed Launchers** - Full support for torchrun, DeepSpeed, Megatron-LM, TorchTitan, Primus, vLLM, SGLang - **๐Ÿณ Container-Native** - Docker-based execution with GPU support (ROCm, CUDA) -- **๐Ÿ“‚ ROCm Path** - Support for non-default ROCm installs via `--rocm-path` or `ROCM_PATH` (e.g. Rock, pip) +- **๐Ÿ“‚ ROCm Path** - Auto-detect host/container roots; overrides via `MAD_ROCM_PATH` in `--additional-context` and `--rocm-path` (host alias) - **๐Ÿ“Š Performance Tools** - Integrated profiling with rocprof/rocprofv3, [rocm-trace-lite](https://github.com/sunway513/rocm-trace-lite) (RTL), rocblas, MIOpen, RCCL tracing - **๐ŸŽฏ ROCprofv3 Profiles** - 8 pre-configured profiles for compute/memory/communication bottleneck analysis - **๐Ÿ” Environment Validation** - TheRock ROCm detection and validation tools @@ -71,7 +71,7 @@ madengine run --tags dummy \ > **Note**: For build operations, `gpu_vendor` defaults to `AMD` and `guest_os` defaults to `UBUNTU` if not specified. For production deployments or non-AMD/Ubuntu environments, explicitly specify these values. -If ROCm is not installed under `/opt/rocm` (e.g. Rock or pip install), use `--rocm-path` or set `ROCM_PATH`: +If auto-detection does not find your **host** ROCm root, set top-level `MAD_ROCM_PATH` in `--additional-context` or use `--rocm-path` (host only). For a different path **inside the container**, set `docker_env_vars.MAD_ROCM_PATH`. You can also set `ROCM_PATH` / `MAD_AUTO_ROCM_PATH=0` as documented in [docs/configuration.md](docs/configuration.md): ```bash madengine run --tags dummy --rocm-path /path/to/rocm @@ -619,7 +619,7 @@ madengine run --tags model --keep-alive madengine build --tags model --clean-docker-cache --verbose ``` -**ROCm not in /opt/rocm:** If you use a custom ROCm location (e.g. [TheRock](https://github.com/ROCm/TheRock) or pip), set `ROCM_PATH` or pass `--rocm-path` to `madengine run` so GPU detection and container env use the correct paths. +**ROCm not in /opt/rocm:** Use top-level `MAD_ROCM_PATH` or `--rocm-path` for the **host**; `docker_env_vars.MAD_ROCM_PATH` for the **container**; or rely on auto-detect. See [Configuration](docs/configuration.md#rocm-path-run-only). **Common Issues:** - **False failures with profiling**: If models show FAILURE but have performance metrics, see [Profiling Troubleshooting](docs/profiling.md#false-failure-detection-with-rocprof) diff --git a/docs/configuration.md b/docs/configuration.md index f6ae79a1..1309c74f 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -131,12 +131,19 @@ Disabling the scan does **not** change performance metric extraction from the lo ### ROCm path (run only) -When ROCm is not installed under `/opt/rocm` (e.g. [TheRock](https://github.com/ROCm/TheRock) or pip), set the ROCm root so GPU detection and container environment use the correct paths. Use the **run** command option or environment variable (not JSON context): +Design rationale and precedence are recorded in [ADR 0001: ROCm path resolution](adr/0001-rocm-path-resolution.md). -- **CLI:** `madengine run --rocm-path /path/to/rocm ...` -- **Environment:** `export ROCM_PATH=/path/to/rocm` +**Host** (where `madengine` runs validation): by default, the ROCm root is **auto-detected** (traditional `/opt/rocm`, [TheRock](https://github.com/ROCm/TheRock) `rocm-sdk` / manifest layout, or `ROCM_PATH`-like env hints). Set `MAD_AUTO_ROCM_PATH=0` to skip auto and use only legacy resolution (`ROCM_PATH` then `/opt/rocm`). -Resolution order: `--rocm-path` โ†’ `ROCM_PATH` โ†’ `/opt/rocm`. This applies only to the run phase; build does not perform GPU detection. +**Overrides** (recommended for CI): + +- **CLI (host only):** `madengine run --rocm-path /path/to/rocm` โ€” same meaning as top-level `MAD_ROCM_PATH` in additional context. +- **Additional context (host):** top-level `"MAD_ROCM_PATH": "/path/to/host/rocm"`. +- **Additional context (container):** `"docker_env_vars": { "MAD_ROCM_PATH": "/path/inside/image" }` โ€” sets the in-container `ROCM_PATH` passed to workloads; if omitted, the **host**-resolved path is mirrored into the container by default. + +Precedence (host): top-level `MAD_ROCM_PATH` โ†’ `--rocm-path` โ†’ auto-detect (unless disabled) โ†’ `ROCM_PATH` โ†’ `/opt/rocm`. + +This applies to the run phase; build uses build-only context (no GPU detection) but still honors `MAD_ROCM_PATH` in context when set. ## Build Configuration diff --git a/docs/usage.md b/docs/usage.md index 2fd71bea..8e76eae9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -309,21 +309,29 @@ madengine run --tags model \ - `gpu_vendor`: "AMD", "NVIDIA" - `guest_os`: "UBUNTU", "CENTOS" -### ROCm path (non-default installs) +### ROCm path (host and container) -When ROCm is not installed under `/opt/rocm` (e.g. [TheRock](https://github.com/ROCm/TheRock) or pip), set the ROCm root so GPU detection and container environment use the correct paths: +By default, **madengine** auto-detects the **host** ROCm root (apt under `/opt/rocm`, TheRock `rocm-sdk` layout, etc.). Disable scanning with `MAD_AUTO_ROCM_PATH=0` (then `ROCM_PATH` / `/opt/rocm` only). -```bash -# Via environment variable -export ROCM_PATH=/path/to/rocm -madengine run --tags model --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' +**Host** overrides: top-level `MAD_ROCM_PATH` in `--additional-context`, or `--rocm-path` (same meaning as top-level `MAD_ROCM_PATH`). + +**Container** override: `{"docker_env_vars": {"MAD_ROCM_PATH": "/path/inside/image"}}` โ€” otherwise the host-resolved path is mirrored into `ROCM_PATH` in the container. -# Via CLI (overrides ROCM_PATH) -madengine run --tags model --rocm-path /path/to/rocm \ +```bash +# Host path via CLI (alias for top-level MAD_ROCM_PATH) +madengine run --tags model --rocm-path /path/to/host/rocm \ --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Or in additional context (host + optional container) +madengine run --tags model --additional-context '{ + "gpu_vendor": "AMD", + "guest_os": "UBUNTU", + "MAD_ROCM_PATH": "/path/to/host/rocm", + "docker_env_vars": {"MAD_ROCM_PATH": "/opt/rocm"} +}' ``` -`--rocm-path` applies only to the **run** command (not build). See [CLI Reference - run](cli-reference.md#run---execute-models). +`--rocm-path` applies only to the **run** command (not build). See [Configuration - ROCm path](configuration.md#rocm-path-run-only). ### Deploy to Kubernetes @@ -616,7 +624,8 @@ madengine build --tags model --clean-docker-cache --verbose | Variable | Description | Example | |----------|-------------|---------| | `MODEL_DIR` | MAD package directory | `/path/to/MAD` | -| `ROCM_PATH` | ROCm installation root (used when `--rocm-path` not set). Use when ROCm is not in `/opt/rocm` (e.g. Rock, pip). | `/path/to/rocm` | +| `ROCM_PATH` | Legacy host ROCm root when not using `MAD_ROCM_PATH` / `--rocm-path` and when auto is off or as a fallback. | `/path/to/rocm` | +| `MAD_AUTO_ROCM_PATH` | Set to `0` to disable host auto-detect (use `ROCM_PATH` then `/opt/rocm` only). Default: on. | `0` | | `MAD_VERBOSE_CONFIG` | Verbose config logging | `"true"` | | `MAD_DOCKERHUB_USER` | Docker Hub username | `"myusername"` | | `MAD_DOCKERHUB_PASSWORD` | Docker Hub password | `"mytoken"` | diff --git a/src/madengine/cli/commands/run.py b/src/madengine/cli/commands/run.py index a684973d..e1472076 100644 --- a/src/madengine/cli/commands/run.py +++ b/src/madengine/cli/commands/run.py @@ -156,7 +156,7 @@ def run( Optional[str], typer.Option( "--rocm-path", - help="ROCm installation path (overrides ROCM_PATH env; default: /opt/rocm). Use when ROCm is not under /opt/rocm (e.g. Rock tar/whl).", + help="Host ROCm root (alias for top-level MAD_ROCM_PATH in --additional-context). Auto-detect when omitted (set MAD_AUTO_ROCM_PATH=0 to disable).", ), ] = None, ) -> None: diff --git a/src/madengine/cli/validators.py b/src/madengine/cli/validators.py index d99e87f7..0d36ac21 100644 --- a/src/madengine/cli/validators.py +++ b/src/madengine/cli/validators.py @@ -124,6 +124,15 @@ def validate_additional_context_structure(context: Dict[str, Any]) -> None: ): _fail_structure("docker_env_vars", "an object") + dev = context.get("docker_env_vars") + if isinstance(dev, dict) and "MAD_ROCM_PATH" in dev: + v = dev["MAD_ROCM_PATH"] + if not isinstance(v, (str, int, float, type(None))): + _fail_structure( + "docker_env_vars['MAD_ROCM_PATH']", + "a string (container ROCm root override)", + ) + if "docker_mounts" in context and not isinstance(context["docker_mounts"], dict): _fail_structure("docker_mounts", "an object") @@ -170,6 +179,11 @@ def validate_additional_context_structure(context: Dict[str, Any]) -> None: if "guest_os" in context and not isinstance(context["guest_os"], str): _fail_structure("guest_os", "a string") + if "MAD_ROCM_PATH" in context and not isinstance( + context["MAD_ROCM_PATH"], (str, type(None)) + ): + _fail_structure("MAD_ROCM_PATH", "a string (host ROCm root override)") + if "log_error_pattern_scan" in context and not isinstance( context["log_error_pattern_scan"], (bool, str, int, float, type(None)) ): diff --git a/src/madengine/core/constants.py b/src/madengine/core/constants.py index 8ca10483..4371f8d1 100644 --- a/src/madengine/core/constants.py +++ b/src/madengine/core/constants.py @@ -197,10 +197,13 @@ def _get_public_github_rocm_key(): def get_rocm_path(override=None): - """Return ROCm installation root directory. + """Return ROCm installation root (legacy, no automatic filesystem scan). - Resolution order: override (e.g. from CLI) -> ROCM_PATH env -> default /opt/rocm. - Path is normalized to absolute form with no trailing slash. + For full resolution (MAD_ROCM_PATH, --rocm-path, auto-detect) use + :func:`madengine.utils.rocm_path_resolver.resolve_host_rocm_path` in + :class:`~madengine.core.context.Context`. + + Resolution: ``override`` -> :envvar:`ROCM_PATH` -> ``/opt/rocm``. Args: override: Optional path overriding env and default. @@ -208,6 +211,6 @@ def get_rocm_path(override=None): Returns: str: Absolute ROCm root path. """ - raw = override if override else os.environ.get("ROCM_PATH", "/opt/rocm") - path = os.path.abspath(os.path.expanduser(str(raw).strip())) - return path.rstrip(os.sep) + from madengine.utils.rocm_path_resolver import get_rocm_path_legacy + + return get_rocm_path_legacy(override) diff --git a/src/madengine/core/context.py b/src/madengine/core/context.py index 24763588..3c19851e 100644 --- a/src/madengine/core/context.py +++ b/src/madengine/core/context.py @@ -22,7 +22,10 @@ # third-party modules from madengine.core.console import Console -from madengine.core.constants import get_rocm_path +from madengine.utils.rocm_path_resolver import ( + resolve_container_rocm_path, + resolve_host_rocm_path, +) from madengine.utils.gpu_validator import validate_rocm_installation, GPUInstallationError, GPUVendor from madengine.utils.gpu_tool_factory import get_gpu_tool_manager from madengine.utils.gpu_tool_manager import BaseGPUToolManager @@ -90,12 +93,11 @@ def __init__( additional_context: The additional context. additional_context_file: The additional context file. build_only_mode: Whether running in build-only mode (no GPU detection). - rocm_path: Optional ROCm installation path (overrides ROCM_PATH env; default /opt/rocm). + rocm_path: Host ROCm root (``--rocm-path``), same as top-level ``MAD_ROCM_PATH``. Raises: RuntimeError: If GPU detection fails and not in build-only mode. """ - self._rocm_path = get_rocm_path(rocm_path) # Initialize the console self.console = Console() self._gpu_context_initialized = False @@ -130,6 +132,9 @@ def __init__( dict_additional_context = ast.literal_eval(additional_context) update_dict(self.ctx, dict_additional_context) + # Host ROCm path after merge (top-level MAD_ROCM_PATH, then --rocm-path, then auto) + self._rocm_path = resolve_host_rocm_path(self.ctx, rocm_path) + # Initialize context based on mode # User-provided contexts will not be overridden by detection if not build_only_mode: @@ -255,7 +260,7 @@ def init_gpu_context(self) -> None: self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] = self.ctx["gpu_vendor"] self.ctx["rocm_path"] = self._rocm_path - self.ctx["docker_env_vars"]["ROCM_PATH"] = self._rocm_path + resolve_container_rocm_path(self.ctx["docker_env_vars"], self._rocm_path) if "MAD_SYSTEM_NGPUS" not in self.ctx["docker_env_vars"]: self.ctx["docker_env_vars"][ diff --git a/src/madengine/orchestration/run_orchestrator.py b/src/madengine/orchestration/run_orchestrator.py index 6725a457..56170033 100644 --- a/src/madengine/orchestration/run_orchestrator.py +++ b/src/madengine/orchestration/run_orchestrator.py @@ -30,7 +30,6 @@ create_error_context, handle_error, ) -from madengine.core.constants import get_rocm_path from madengine.utils.session_tracker import SessionTracker from madengine.orchestration.image_filtering import ( filter_images_by_gpu_compatibility as _filter_by_gpu_compat, @@ -113,11 +112,10 @@ def _init_runtime_context(self): else: context_string = None - rocm_path = get_rocm_path(getattr(self.args, "rocm_path", None)) self.context = Context( additional_context=context_string, build_only_mode=False, - rocm_path=rocm_path, + rocm_path=getattr(self.args, "rocm_path", None), ) # Initialize data provider if data config exists @@ -423,11 +421,10 @@ def _create_manifest_from_local_image( # Initialize build-only context for manifest generation # (we need context structure, but skip GPU detection since we're not building) context_string = repr(self.additional_context) if self.additional_context else None - rocm_path = get_rocm_path(getattr(self.args, "rocm_path", None)) build_context = Context( additional_context=context_string, build_only_mode=True, - rocm_path=rocm_path, + rocm_path=getattr(self.args, "rocm_path", None), ) # Create manifest structure diff --git a/src/madengine/scripts/common/tools/therock_detector.py b/src/madengine/scripts/common/tools/therock_detector.py index 441a0204..557ba55d 100755 --- a/src/madengine/scripts/common/tools/therock_detector.py +++ b/src/madengine/scripts/common/tools/therock_detector.py @@ -21,6 +21,32 @@ from pathlib import Path from typing import Dict, List, Optional +_MADENGINE_THEROCK_MARKER = Path("madengine") / "utils" / "therock_markers.py" + + +def _prepend_madengine_to_sys_path() -> None: + """Add the directory that contains the ``madengine`` package to ``sys.path`` if found.""" + for p in Path(__file__).resolve().parents: + if (p / _MADENGINE_THEROCK_MARKER).is_file(): + s = str(p) + if s not in sys.path: + sys.path.insert(0, s) + return + + +_prepend_madengine_to_sys_path() +try: + from madengine.utils.therock_markers import ( + therock_dist_info_path, + therock_manifest_path, + ) +except ImportError: # pragma: no cover โ€” script copied outside a package tree + def therock_manifest_path(path: Path) -> Path: # keep in sync with therock_markers + return path / "share" / "therock" / "therock_manifest.json" + + def therock_dist_info_path(path: Path) -> Path: + return path / "share" / "therock" / "dist_info.json" + class TherockDetector: """Detects TheRock ROCm installations on the system.""" @@ -80,7 +106,7 @@ def _is_therock_installation(self, path: Path) -> Optional[Dict]: details = {} # Marker 1: therock_manifest.json - manifest_path = path / "share" / "therock" / "therock_manifest.json" + manifest_path = therock_manifest_path(path) if manifest_path.exists(): self.log(f"Found therock_manifest.json at {manifest_path}") try: @@ -94,7 +120,7 @@ def _is_therock_installation(self, path: Path) -> Optional[Dict]: self.log(f"Error reading manifest: {e}") # Marker 2: dist_info.json - dist_info_path = path / "share" / "therock" / "dist_info.json" + dist_info_path = therock_dist_info_path(path) if dist_info_path.exists(): self.log(f"Found dist_info.json at {dist_info_path}") try: diff --git a/src/madengine/utils/rocm_path_resolver.py b/src/madengine/utils/rocm_path_resolver.py new file mode 100644 index 00000000..31b79678 --- /dev/null +++ b/src/madengine/utils/rocm_path_resolver.py @@ -0,0 +1,273 @@ +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +r"""ROCm root resolution: automatic discovery (TheRock + traditional) and overrides. + +This module exposes a :class:`RocmPathResolver` (single place for detection order and +injectable dependencies for tests) plus small module-level functions for backward +compatibility. The resolution pipeline is **not** the historical ``rocEnvTool``/v1 +``RocmPathResolver`` from madenginev1; it reuses the same *ideas* in a self-contained +helper for madengine (versioned ``/opt/rocm-*``, TheRock markers, ``PATH`` tools). +See **ADR 0001** in ``docs/adr/0001-rocm-path-resolution.md`` for the full design. + +Versioned installs (e.g. ``/opt/rocm-7.13.0`` with ``bin/amd-smi`` and ``bin/rocm-smi``) +are detected after ``/opt/rocm``. ``which(amd-smi)`` / ``which(rocm-smi)`` infer +the root when that tree validates (skipping a Python-only ``/opt/python`` stub that +fails the root check). + +**Host** overrides (precedence): + +1. Top-level ``MAD_ROCM_PATH`` in additional context. +2. ``--rocm-path`` (CLI) โ€” host-only; same meaning as (1). +3. Auto-detection (when ``MAD_AUTO_ROCM_PATH`` is not ``"0"``). +4. ``ROCM_PATH`` environment variable. +5. Default ``/opt/rocm``. + +**Container** (``docker_env_vars``): + +* ``MAD_ROCM_PATH`` โ€” container ROCm root; sets ``ROCM_PATH`` for Docker; key is + consumed and not passed to the workload. +* If only ``ROCM_PATH`` is set in ``docker_env_vars`` by the user, it is kept. +* Otherwise the host-resolved path is mirrored into ``ROCM_PATH``. +""" + +from __future__ import annotations + +import os +import shutil +import subprocess +from collections.abc import Mapping +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional + +from .therock_markers import is_therock_tree + +#: Key in top-level context and ``docker_env_vars`` for ROCm root overrides. +MAD_ROCM_PATH = "MAD_ROCM_PATH" + +OptionalPathStr = Optional[str] +WhichFn = Callable[..., Optional[str]] + + +def normalize_rocm_path(path: str) -> str: + """Return an absolute path with no trailing separator.""" + p = os.path.abspath(os.path.expanduser((path or "").strip())) + return p.rstrip(os.sep) or os.sep + + +def _environ_get(environ: Mapping[str, str], key: str, default: str = "") -> str: + return (environ.get(key) or default).strip() + + +class RocmPathResolver: + """Resolves a ROCm installation root using a fixed, documented probe order. + + * ``environ`` defaults to :data:`os.environ` for ``ROCM_*`` / ``HIP_*`` checks. + * ``which`` defaults to :func:`shutil.which` and is used to locate tools on ``PATH``; + tests may pass a custom callable to simulate ``which`` without touching the + process environment. + """ + + __slots__ = ("_environ", "_which") + + def __init__( + self, + environ: Optional[Mapping[str, str]] = None, + which: Optional[WhichFn] = None, + ) -> None: + self._environ: Mapping[str, str] = ( + os.environ if environ is None else environ + ) + self._which: WhichFn = which if which is not None else shutil.which + + @staticmethod + def rocm_root_from_bin_tool(tool_path: str) -> Optional[Path]: + """Map ...//bin/{rocminfo,amd-smi,rocm-smi} to .""" + p = Path(tool_path).resolve() + if p.name not in ("rocminfo", "amd-smi", "rocm-smi"): + return None + if p.parent.name != "bin": + return None + return p.parent.parent + + def looks_like_rocm_root(self, root: Path) -> bool: + if not root.is_dir(): + return False + if is_therock_tree(root): + return True + if (root / "bin" / "rocminfo").is_file(): + return True + # Versioned apt/tar layouts (e.g. /opt/rocm-7.13.0) and many TheRock images + if (root / "bin" / "amd-smi").is_file() and (root / "bin" / "rocm-smi").is_file(): + return True + if (root / "bin" / "rocm-smi").is_file() and (root / ".info" / "version").is_file(): + return True + if (root / "bin" / "amd-smi").is_file() and (root / ".info" / "version").is_file(): + return True + if (root / ".info" / "version").is_file(): + return True + return False + + def versioned_opt_rocm_dirs(self) -> List[Path]: + """Return ``/opt/rocm-*`` versioned directories (e.g. ``/opt/rocm-7.13.0``).""" + try: + opt = Path("/opt") + if not opt.is_dir(): + return [] + return sorted(p for p in opt.glob("rocm-*") if p.is_dir()) + except OSError: + return [] + + def infer_from_path_tools(self) -> OptionalPathStr: + """Use ``which`` on rocminfo, amd-smi, rocm-smi; return first plausible root.""" + from madengine.utils import rocm_path_resolver as m # same module; for patch hooks + + for name in ("rocminfo", "amd-smi", "rocm-smi"): + w = self._which(name) # type: ignore[operator] + if not w: + continue + inferred = self.rocm_root_from_bin_tool(w) + if inferred is not None and m._looks_like_rocm_root(inferred): + return normalize_rocm_path(str(inferred)) + return None + + def auto_detect(self) -> OptionalPathStr: + """Heuristic search for a usable ROCm installation (see class doc + module doc).""" + from madengine.utils import rocm_path_resolver as m # same module; for patch hooks + + opt = Path("/opt/rocm") + if m._looks_like_rocm_root(opt): + return normalize_rocm_path(str(opt)) + + for cand in m._versioned_opt_rocm_dirs(): + if m._looks_like_rocm_root(cand): + return normalize_rocm_path(str(cand)) + + if self._which("rocm-sdk"): + try: + r = subprocess.run( + ["rocm-sdk", "path", "--root"], + capture_output=True, + text=True, + timeout=8, + ) + if r.returncode == 0 and r.stdout.strip(): + root = Path(r.stdout.strip()) + if m._looks_like_rocm_root(root): + return normalize_rocm_path(str(root)) + except (OSError, subprocess.SubprocessError): + pass + + for var in ("ROCM_PATH", "ROCM_HOME", "HIP_PATH"): + raw = _environ_get(self._environ, var, "") + if not raw: + continue + root = Path(os.path.expanduser(raw)) + if not root.is_dir(): + continue + if m._looks_like_rocm_root(root): + return normalize_rocm_path(str(root)) + for _ in range(4): + if m._looks_like_rocm_root(root): + return normalize_rocm_path(str(root)) + if root == root.parent: + break + root = root.parent + + candidates: List[Path] = [ + Path("/usr/local/rocm"), + Path.home() / "rocm", + Path.home() / "therock", + ] + for c in candidates: + if m._looks_like_rocm_root(c): + return normalize_rocm_path(str(c)) + + found = m._infer_root_from_path_tools() + if found: + return found + + return None + + +# --- Module-level hooks: tests may monkeypatch ``_looks_like_rocm_root``, +# ``_versioned_opt_rocm_dirs``, ``_infer_root_from_path_tools``, and related names. +# Implementation delegates to :class:`RocmPathResolver` (default instance) for the logic. + + +def _looks_like_rocm_root(root: Path) -> bool: + return RocmPathResolver().looks_like_rocm_root(root) + + +def _rocm_root_from_bin_tool(tool_path: str) -> Optional[Path]: + return RocmPathResolver.rocm_root_from_bin_tool(tool_path) + + +def _versioned_opt_rocm_dirs() -> List[Path]: + return RocmPathResolver().versioned_opt_rocm_dirs() + + +def _infer_root_from_path_tools() -> OptionalPathStr: + return RocmPathResolver().infer_from_path_tools() + + +def auto_detect_rocm_path() -> OptionalPathStr: + return RocmPathResolver().auto_detect() + + +def get_rocm_path_legacy(override: Optional[str] = None) -> str: + """No auto-detect: CLI override, then ``ROCM_PATH`` env, then default.""" + if override and str(override).strip(): + return normalize_rocm_path(str(override).strip()) + e = os.environ.get("ROCM_PATH", "").strip() + if e: + return normalize_rocm_path(e) + return normalize_rocm_path("/opt/rocm") + + +def resolve_host_rocm_path( + ctx: Optional[Dict[str, Any]] = None, + cli_rocm_path: Optional[str] = None, +) -> str: + """ + Resolve the host ROCm installation root for madengine (GPU tools, validation). + + See module docstring for precedence. + """ + ctx = ctx or {} + mad = ctx.get(MAD_ROCM_PATH) + if isinstance(mad, str) and mad.strip(): + return normalize_rocm_path(mad) + if cli_rocm_path and str(cli_rocm_path).strip(): + return normalize_rocm_path(str(cli_rocm_path).strip()) + + if os.environ.get("MAD_AUTO_ROCM_PATH", "1").strip() == "0": + return get_rocm_path_legacy(None) + + found = auto_detect_rocm_path() + if found: + return found + + return get_rocm_path_legacy(None) + + +def resolve_container_rocm_path(docker_env: Dict[str, Any], host_path: str) -> str: + """ + Set ``docker_env['ROCM_PATH']`` and return the container ROCm root. + + * If ``MAD_ROCM_PATH`` is in ``docker_env``, use it, remove the key, set ``ROCM_PATH``. + * Elif ``ROCM_PATH`` already set, normalize and return (no host mirror). + * Else mirror ``host_path``. + """ + d = docker_env + if MAD_ROCM_PATH in d and d[MAD_ROCM_PATH] is not None: + s = d.pop(MAD_ROCM_PATH) + if isinstance(s, (int, float)): + s = str(s) + if not isinstance(s, str) or not str(s).strip(): + s = host_path + croot = normalize_rocm_path(str(s).strip()) + elif d.get("ROCM_PATH") not in (None, ""): + croot = normalize_rocm_path(str(d["ROCM_PATH"]).strip()) + else: + croot = host_path + d["ROCM_PATH"] = croot + return croot diff --git a/src/madengine/utils/therock_markers.py b/src/madengine/utils/therock_markers.py new file mode 100644 index 00000000..a7c94dba --- /dev/null +++ b/src/madengine/utils/therock_markers.py @@ -0,0 +1,37 @@ +# Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +r"""TheRock on-disk markers under a ROCm (or TheRock) root. + +These paths are the **minimal** "file marker" test used for ROCm path resolution +(:mod:`madengine.utils.rocm_path_resolver`) and for richer detection in +``scripts/common/tools/therock_detector.py``. TheRock may also be inferable +from symlinks, binaries, or Python packages; those are handled elsewhere. + +**Canonical tree** (at least one file present): + +* ``/share/therock/therock_manifest.json`` +* ``/share/therock/dist_info.json`` +""" +from __future__ import annotations + +from pathlib import Path +from typing import Final + +# Relative to a putative ROCm / TheRock install root; keep in sync with ADR 0001. +THEROCK_SHARE_DIR: Final = Path("share") / "therock" +THEROCK_MANIFEST_RELPATH: Final = THEROCK_SHARE_DIR / "therock_manifest.json" +THEROCK_DIST_INFO_RELPATH: Final = THEROCK_SHARE_DIR / "dist_info.json" + + +def therock_manifest_path(root: Path) -> Path: + return root / THEROCK_MANIFEST_RELPATH + + +def therock_dist_info_path(root: Path) -> Path: + return root / THEROCK_DIST_INFO_RELPATH + + +def is_therock_tree(root: Path) -> bool: + """True if the tree has standard TheRock ``share/therock`` file markers.""" + m = therock_manifest_path(root) + d = therock_dist_info_path(root) + return m.is_file() or d.is_file() diff --git a/tests/integration/test_gpu_management.py b/tests/integration/test_gpu_management.py index ef18a810..5ae55c60 100644 --- a/tests/integration/test_gpu_management.py +++ b/tests/integration/test_gpu_management.py @@ -267,9 +267,12 @@ def test_different_vendors_different_instances(self): def test_auto_detect_vendor(self): """Test auto-detection of GPU vendor.""" - with patch('madengine.utils.gpu_validator.detect_gpu_vendor', return_value=GPUVendor.AMD): + with patch( + "madengine.utils.gpu_tool_factory.detect_gpu_vendor", + return_value=GPUVendor.AMD, + ): manager = get_gpu_tool_manager(vendor=None) - + assert isinstance(manager, ROCmToolManager) def test_unknown_vendor_raises_error(self): @@ -445,11 +448,20 @@ class TestGetGpuRenderDNodesIntegration: @pytest.mark.skipif(is_amd_gpu(), reason="Test requires non-AMD GPU or no GPU") def test_returns_none_for_non_amd_gpu(self): """Test that the function returns None for non-AMD GPUs.""" - context = Context() - + from unittest.mock import patch + + with patch.object(Context, "get_gpu_vendor", return_value="NVIDIA"), \ + patch.object(Context, "get_system_ngpus", return_value=0), \ + patch.object(Context, "get_system_gpu_architecture", return_value=""), \ + patch.object(Context, "get_system_gpu_product_name", return_value=""), \ + patch.object(Context, "get_system_hip_version", return_value="5.0"), \ + patch.object(Context, "get_docker_gpus", return_value="0"), \ + patch.object(Context, "get_gpu_renderD_nodes", return_value=None): + context = Context() + # Should return None for non-AMD GPUs - if context.ctx['docker_env_vars']['MAD_GPU_VENDOR'] != 'AMD': - assert context.ctx['gpu_renderDs'] is None + if context.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] != "AMD": + assert context.ctx["gpu_renderDs"] is None @pytest.mark.skipif(not is_amd_gpu(), reason="Test requires AMD GPU") def test_returns_list_for_amd_gpu(self): diff --git a/tests/unit/test_rocm_path.py b/tests/unit/test_rocm_path.py index c0e3dbd9..95f334c1 100644 --- a/tests/unit/test_rocm_path.py +++ b/tests/unit/test_rocm_path.py @@ -8,6 +8,15 @@ import pytest from madengine.core.constants import get_rocm_path +from madengine.utils import rocm_path_resolver as rpr +from madengine.utils.rocm_path_resolver import ( + MAD_ROCM_PATH, + auto_detect_rocm_path, + get_rocm_path_legacy, + normalize_rocm_path, + resolve_container_rocm_path, + resolve_host_rocm_path, +) @pytest.mark.unit @@ -54,6 +63,7 @@ def test_context_runtime_includes_rocm_path_in_ctx(self): """Context in runtime mode includes rocm_path and ROCM_PATH in docker_env_vars.""" from madengine.core.context import Context from unittest.mock import patch + from madengine.utils.rocm_path_resolver import normalize_rocm_path with patch.object(Context, "get_gpu_vendor", return_value="AMD"), \ patch.object(Context, "get_system_ngpus", return_value=2), \ @@ -63,8 +73,37 @@ def test_context_runtime_includes_rocm_path_in_ctx(self): patch.object(Context, "get_docker_gpus", return_value="0-1"), \ patch.object(Context, "get_gpu_renderD_nodes", return_value=None): ctx = Context(rocm_path="/my/rocm") - assert ctx.ctx.get("rocm_path") == "/my/rocm" - assert ctx.ctx["docker_env_vars"].get("ROCM_PATH") == "/my/rocm" + exp = normalize_rocm_path("/my/rocm") + assert ctx.ctx.get("rocm_path") == exp + assert ctx.ctx["docker_env_vars"].get("ROCM_PATH") == exp + + def test_context_container_mad_overrides_mirror(self): + """docker_env_vars MAD_ROCM_PATH sets in-container ROCM_PATH (not host path).""" + from madengine.core.context import Context + from unittest.mock import patch + from madengine.utils.rocm_path_resolver import normalize_rocm_path + + ac = repr( + { + "docker_env_vars": {MAD_ROCM_PATH: "/in/image"}, + } + ) + with patch.object(Context, "get_gpu_vendor", return_value="AMD"), \ + patch.object(Context, "get_system_ngpus", return_value=2), \ + patch.object(Context, "get_system_gpu_architecture", return_value="gfx90a"), \ + patch.object(Context, "get_system_gpu_product_name", return_value="MI250"), \ + patch.object(Context, "get_system_hip_version", return_value="5.4"), \ + patch.object(Context, "get_docker_gpus", return_value="0-1"), \ + patch.object(Context, "get_gpu_renderD_nodes", return_value=None): + ctx = Context( + additional_context=ac, + rocm_path="/on/host", + ) + assert ctx._rocm_path == normalize_rocm_path("/on/host") + assert ctx.ctx["docker_env_vars"].get("ROCM_PATH") == normalize_rocm_path( + "/in/image" + ) + assert MAD_ROCM_PATH not in ctx.ctx["docker_env_vars"] @pytest.mark.unit @@ -95,3 +134,128 @@ def test_run_help_includes_rocm_path(self): result = runner.invoke(app, ["run", "--help"]) assert result.exit_code == 0 assert "--rocm-path" in result.output + + +@pytest.mark.unit +class TestResolveHostContainerRocmPath: + """Test madengine rocm_path_resolver (MAD_ROCM_PATH / auto).""" + + def test_resolve_host_mad_takes_precedence_over_cli(self): + """Top-level MAD_ROCM_PATH in context wins over --rocm-path.""" + p = resolve_host_rocm_path({MAD_ROCM_PATH: "/from/context"}, "/from/cli") + assert os.path.isabs(p) + assert p == os.path.abspath("/from/context").rstrip(os.sep) + + def test_resolve_host_cli_when_no_mad(self): + """--rocm-path (cli) is used when MAD_ROCM_PATH is absent.""" + p = resolve_host_rocm_path({}, "/only/cli") + assert p == os.path.abspath("/only/cli").rstrip(os.sep) + + def test_resolve_host_legacy_when_auto_off(self, monkeypatch): + """MAD_AUTO_ROCM_PATH=0 uses ROCM_PATH env then default.""" + monkeypatch.setenv("MAD_AUTO_ROCM_PATH", "0") + monkeypatch.delenv("ROCM_PATH", raising=False) + p = resolve_host_rocm_path({}, None) + assert p == os.path.abspath("/opt/rocm").rstrip(os.sep) + monkeypatch.setenv("ROCM_PATH", "/env/ro") + p2 = resolve_host_rocm_path({}, None) + assert p2 == os.path.abspath("/env/ro").rstrip(os.sep) + + def test_resolve_container_mad_consumes_key(self): + d = {MAD_ROCM_PATH: "/in/container"} + host = "/on/host" + r = resolve_container_rocm_path(d, host) + assert r == os.path.abspath("/in/container").rstrip(os.sep) + assert d.get("ROCM_PATH") == r + assert MAD_ROCM_PATH not in d + + def test_resolve_container_mirrors_host(self): + d = {} + r = resolve_container_rocm_path(d, "/hostpath") + assert r == os.path.abspath("/hostpath").rstrip(os.sep) + assert d["ROCM_PATH"] == r + + def test_get_rocm_path_legacy_alias(self, monkeypatch): + monkeypatch.delenv("ROCM_PATH", raising=False) + g = get_rocm_path_legacy(None) + assert g == os.path.abspath("/opt/rocm").rstrip(os.sep) + + +@pytest.mark.unit +class TestTheRockVersionedContainerLayout: + """ + TheRock / CI images often use /opt/rocm-7.x.y instead of /opt/rocm, with + amd-smi and rocm-smi under that tree (or duplicate copies under /opt/python). + """ + + def test_looks_like_root_with_amd_smi_and_rocm_smi(self, tmp_path): + from madengine.utils.rocm_path_resolver import _looks_like_rocm_root + + root = tmp_path / "rocm-7.13.0" + (root / "bin").mkdir(parents=True) + for n in ("amd-smi", "rocm-smi"): + f = root / "bin" / n + f.write_text("#!/bin/sh\necho\n", encoding="utf-8") + f.chmod(0o755) + assert _looks_like_rocm_root(root) + + def test_rocm_root_from_bin_tool_amd_smi(self, tmp_path): + from madengine.utils.rocm_path_resolver import _rocm_root_from_bin_tool + + root = tmp_path / "rocm-7.13.0" + (root / "bin").mkdir(parents=True) + smi = root / "bin" / "amd-smi" + smi.write_text("x", encoding="utf-8") + smi.chmod(0o755) + assert _rocm_root_from_bin_tool(str(smi.resolve())) == root.resolve() + + def test_auto_detect_finds_injected_versioned_opt_rocm( + self, monkeypatch, tmp_path + ): + """Simulate /opt/rocm-7.13.0 without depending on the host /opt tree.""" + vroot = (tmp_path / "rocm-7.13.0").resolve() + (vroot / "bin").mkdir(parents=True) + for n in ("amd-smi", "rocm-smi"): + f = vroot / "bin" / n + f.write_text("#!/bin/sh\necho\n", encoding="utf-8") + f.chmod(0o755) + + real_looks = rpr._looks_like_rocm_root + + def merged_looks(p): + r = p.resolve() + if r == vroot: + return real_looks(r) + if str(r) == "/opt/rocm": + return False + return real_looks(p) + + monkeypatch.setattr(rpr, "_looks_like_rocm_root", merged_looks) + monkeypatch.setattr( + rpr, "_versioned_opt_rocm_dirs", lambda: [vroot] + ) + out = auto_detect_rocm_path() + assert out == normalize_rocm_path(str(vroot)) + + def test_infer_root_from_path_tools_amd_smi( + self, monkeypatch, tmp_path + ): + """`which(amd-smi)` โ†’ .../rocm-7.13.0/bin/amd-smi` yields root with both smi tools.""" + vroot = (tmp_path / "rocm-7.13.0").resolve() + (vroot / "bin").mkdir(parents=True) + for n in ("amd-smi", "rocm-smi"): + f = vroot / "bin" / n + f.write_text("#!/bin/sh\necho\n", encoding="utf-8") + f.chmod(0o755) + + def fake_which(name, path=None): + if name == "rocminfo": + return None + if name in ("amd-smi", "rocm-smi"): + return str(vroot / "bin" / name) + return None + + monkeypatch.setattr(rpr.shutil, "which", fake_which) + from madengine.utils.rocm_path_resolver import _infer_root_from_path_tools + + assert _infer_root_from_path_tools() == normalize_rocm_path(str(vroot)) diff --git a/tests/unit/test_therock_markers.py b/tests/unit/test_therock_markers.py new file mode 100644 index 00000000..c819562e --- /dev/null +++ b/tests/unit/test_therock_markers.py @@ -0,0 +1,32 @@ +""" +Unit tests for TheRock on-disk marker helpers. +""" + +import pytest +from pathlib import Path + +from madengine.utils.therock_markers import ( + is_therock_tree, + therock_dist_info_path, + therock_manifest_path, +) + + +@pytest.mark.unit +def test_is_therock_tree_manifest_only(tmp_path: Path) -> None: + p = tmp_path / "share" / "therock" / "therock_manifest.json" + p.parent.mkdir(parents=True) + p.write_text("{}", encoding="utf-8") + assert is_therock_tree(tmp_path) + + +@pytest.mark.unit +def test_is_therock_tree_false_without_markers(tmp_path: Path) -> None: + assert not is_therock_tree(tmp_path) + + +@pytest.mark.unit +def test_relpath_helpers_match_expected_layout() -> None: + root = Path("/opt/rocm-7.0.0") + assert therock_manifest_path(root) == root / "share" / "therock" / "therock_manifest.json" + assert therock_dist_info_path(root) == root / "share" / "therock" / "dist_info.json" From c0d836a7a410e5871027ba1877ded5f4a9b25db5 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Wed, 22 Apr 2026 19:56:05 -0500 Subject: [PATCH 2/9] Fixed the unit test --- tests/unit/test_rocm_path.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_rocm_path.py b/tests/unit/test_rocm_path.py index 95f334c1..acd1f259 100644 --- a/tests/unit/test_rocm_path.py +++ b/tests/unit/test_rocm_path.py @@ -226,7 +226,15 @@ def merged_looks(p): r = p.resolve() if r == vroot: return real_looks(r) - if str(r) == "/opt/rocm": + # Suppress any real /opt/rocm installation on the host (including + # when /opt/rocm is a symlink, e.g. /opt/rocm -> /opt/rocm-6.4.2, + # so p.resolve() won't equal Path("/opt/rocm")). + p_str = str(p) + r_str = str(r) + if p_str == "/opt/rocm" or r_str == "/opt/rocm": + return False + # Also suppress versioned host installs reachable via /opt/rocm symlink + if p_str.startswith("/opt/rocm-") and r_str != str(vroot): return False return real_looks(p) From 0387174cc23d05796a665b5bc4ef078e5f1e6e29 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Wed, 22 Apr 2026 21:35:49 -0500 Subject: [PATCH 3/9] fix: TheRock container compatibility for rocEnvTool and run phase GPU checks - csv_parser.py: resolve rocm-smi via shutil.which() so TheRock images (where tools live in a Python venv, not /opt/rocm/bin/) are detected correctly; accept path_resolver to get ROCm version without hardcoding /opt/rocm/.info/version; add bounds check in NVIDIA GPU info parser - rocenv_tool.py: pass path_resolver to CSVParser so version resolution uses RocmPathResolver.get_version() for both TheRock and traditional installs - container_runner.py: replace host-resolved hardcoded amd-smi/rocm-smi paths in container exec with PATH-based resolution; add run phase environment table showing host vs container installation type, ROCm/CUDA root, and version side-by-side (supports apt install and therock layouts) - docs: document run phase environment table in usage.md and configuration.md Co-Authored-By: Claude Sonnet 4 --- docs/configuration.md | 2 + docs/usage.md | 25 +++ src/madengine/execution/container_runner.py | 143 ++++++++++++++++-- .../pre_scripts/rocEnvTool/csv_parser.py | 25 ++- .../pre_scripts/rocEnvTool/rocenv_tool.py | 2 +- 5 files changed, 176 insertions(+), 21 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 1309c74f..38426578 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -145,6 +145,8 @@ Precedence (host): top-level `MAD_ROCM_PATH` โ†’ `--rocm-path` โ†’ auto-detect ( This applies to the run phase; build uses build-only context (no GPU detection) but still honors `MAD_ROCM_PATH` in context when set. +At the start of each container run, a **Run Phase Environment** table is printed showing host vs container installation type (`apt install` or `therock`), ROCm/CUDA root, and version side-by-side. See [Run phase environment table](usage.md#run-phase-environment-table). + ## Build Configuration ### Batch Manifest diff --git a/docs/usage.md b/docs/usage.md index 8e76eae9..37d59b3f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -333,6 +333,31 @@ madengine run --tags model --additional-context '{ `--rocm-path` applies only to the **run** command (not build). See [Configuration - ROCm path](configuration.md#rocm-path-run-only). +### Run phase environment table + +At the start of each container run, madengine prints a side-by-side environment summary for the **host** and the **container**: + +``` +๐Ÿ–ฅ๏ธ RUN PHASE ENVIRONMENT +================================================================================ + HOST CONTAINER + โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + GPU Vendor AMD AMD + Installation Type apt install therock + ROCm Root /opt/rocm /opt/python/lib/python3.13/site-packages/_rocm_sdk_devel + ROCm Version 6.4.0 7.13.0a20260415 + โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +================================================================================ +``` + +| Field | AMD values | NVIDIA values | +|---|---|---| +| Installation Type | `apt install` (traditional `/opt/rocm`) or `therock` (Python-package layout) | `CUDA toolkit` | +| ROCm / CUDA Root | Resolved via `RocmPathResolver` (host) or `rocm-sdk path --root` (container) | `nvcc` binary location | +| ROCm / CUDA Version | From `amd-smi` / `rocm-sdk version` / `.info/version` | From `nvcc --version` / `nvidia-smi` | + +The host column uses the same resolution as `--rocm-path` / `MAD_ROCM_PATH`. The container column queries the container's own `PATH` at runtime, so it correctly reflects TheRock images where tools live in a Python venv rather than `/opt/rocm/bin/`. + ### Deploy to Kubernetes ```bash diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index d5f27cf0..f8b32a09 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -20,7 +20,6 @@ from madengine.core.console import Console from madengine.core.context import Context from madengine.core.docker import Docker -from madengine.core.constants import get_rocm_path from madengine.core.timeout import Timeout from madengine.core.dataprovider import Data from madengine.utils.ops import PythonicTee, file_print @@ -34,6 +33,7 @@ from madengine.utils.config_parser import ConfigParser from madengine.utils.path_utils import scripts_base_dir_from from madengine.utils.run_details import get_build_number, get_pipeline +from madengine.utils.therock_markers import is_therock_tree from madengine.execution.container_runner_helpers import ( log_text_has_error_pattern, make_run_log_file_path, @@ -42,6 +42,123 @@ ) +def _print_run_env_table( + gpu_vendor: str, + context, + model_docker, + rich_console, +) -> None: + """Print a side-by-side environment table for host and container. + + Covers AMD (installation type, ROCm root, ROCm version) and NVIDIA + (installation type, CUDA root, CUDA version) for both sides. + Follows container_runner.py convention: rich_console for styled borders, + plain print() for data rows (so they appear in the run log file). + """ + from pathlib import Path + + COL_W = 36 # width of each data column + + def row(label: str, host_val: str, container_val: str) -> str: + return f" {label:<26} {host_val:<{COL_W}} {container_val:<{COL_W}}" + + def separator() -> str: + return " " + "-" * (26 + 2 + COL_W + 2 + COL_W) + + def _sh(cmd: str) -> str: + """Run a command in the container and return stripped output.""" + try: + return (model_docker.sh(cmd) or "").strip() + except Exception: + return "N/A" + + rich_console.print("\n[bold blue]๐Ÿ–ฅ๏ธ RUN PHASE ENVIRONMENT[/bold blue]") + rich_console.print(f"[dim]{'=' * 80}[/dim]") + print(row("", "HOST", "CONTAINER")) + print(separator()) + + if "AMD" in gpu_vendor: + # โ”€โ”€ Host side โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + host_rocm_root = getattr(context, "_rocm_path", None) or "unknown" + host_install_type = ( + "therock" + if is_therock_tree(Path(host_rocm_root)) + else "apt install" + ) + try: + host_rocm_ver = context._get_tool_manager().get_version() or "unknown" + except Exception: + host_rocm_ver = "unknown" + + # โ”€โ”€ Container side โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # Installation type: check for TheRock share markers first, then + # fall back to looking for the traditional .info/version file. + ctr_install_type = _sh( + "if [ -f \"$(rocm-sdk path --root 2>/dev/null)/share/therock/therock_manifest.json\" ] " + "|| [ -f \"$(rocm-sdk path --root 2>/dev/null)/share/therock/dist_info.json\" ] " + "2>/dev/null; then echo therock; " + "elif [ -f /opt/rocm/.info/version ]; then echo apt install; " + "else echo unknown; fi 2>/dev/null || echo unknown" + ) + + # ROCm root: prefer rocm-sdk, then ROCM_PATH env, then /opt/rocm + ctr_rocm_root = _sh( + "rocm-sdk path --root 2>/dev/null " + "|| echo \"${ROCM_PATH:-/opt/rocm}\"" + ) + + # ROCm version: prefer rocm-sdk, then .info/version, then rocminfo + ctr_rocm_ver = _sh( + "rocm-sdk version 2>/dev/null " + "|| cat \"${ROCM_PATH:-/opt/rocm}/.info/version\" 2>/dev/null " + "|| rocminfo 2>/dev/null | grep -i 'ROCm Version' | head -n1 | grep -oP '[\\d.]+' " + "|| echo unknown" + ) + + print(row("GPU Vendor", "AMD", "AMD")) + print(row("Installation Type", host_install_type, ctr_install_type)) + print(row("ROCm Root", host_rocm_root, ctr_rocm_root)) + print(row("ROCm Version", host_rocm_ver, ctr_rocm_ver)) + + elif "NVIDIA" in gpu_vendor: + # โ”€โ”€ Host side โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + def _host_sh(cmd: str) -> str: + try: + return subprocess.check_output(cmd, shell=True, stderr=subprocess.DEVNULL, text=True).strip() + except Exception: + return "unknown" + + host_cuda_root = _host_sh( + "nvcc --version 2>/dev/null | grep -oP 'release \\K[\\d.]+' | head -1 | " + "xargs -I{} dirname $(which nvcc 2>/dev/null) 2>/dev/null | xargs dirname 2>/dev/null " + "|| echo \"${CUDA_PATH:-${CUDA_HOME:-/usr/local/cuda}}\"" + ) + host_cuda_ver = _host_sh( + "nvcc --version 2>/dev/null | grep -oP 'release \\K[\\d.]+' | head -1 " + "|| nvidia-smi 2>/dev/null | grep -oP 'CUDA Version: \\K[\\d.]+' | head -1 " + "|| echo unknown" + ) + + # โ”€โ”€ Container side โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + ctr_cuda_root = _sh( + "dirname $(which nvcc 2>/dev/null) 2>/dev/null | xargs dirname 2>/dev/null " + "|| echo \"${CUDA_PATH:-${CUDA_HOME:-/usr/local/cuda}}\"" + ) + ctr_cuda_ver = _sh( + "nvcc --version 2>/dev/null | grep -oP 'release \\K[\\d.]+' | head -1 " + "|| nvidia-smi 2>/dev/null | grep -oP 'CUDA Version: \\K[\\d.]+' | head -1 " + "|| echo unknown" + ) + + print(row("GPU Vendor", "NVIDIA", "NVIDIA")) + print(row("Installation Type", "CUDA toolkit", "CUDA toolkit")) + print(row("CUDA Root", host_cuda_root, ctr_cuda_root)) + print(row("CUDA Version", host_cuda_ver, ctr_cuda_ver)) + + print(separator()) + rich_console.print(f"[dim]{'=' * 80}[/dim]\n") + + def _resolve_multiple_results_path(multiple_results: str, model_dir: str) -> typing.Optional[str]: """Resolve multiple_results CSV path: try cwd then model_dir. Return first that exists.""" if not multiple_results: @@ -1025,25 +1142,23 @@ def run_container( whoami = model_docker.sh("whoami") print(f"๐Ÿ‘ค Running as user: {whoami}") - # Show GPU info with version-aware tool selection (PR #54) + # Show GPU info โ€” let the container resolve its own tool paths via + # PATH rather than using host-resolved paths, which break on + # TheRock images where amd-smi/rocm-smi live in a Python venv + # (e.g. /opt/python/bin/) rather than /opt/rocm/bin/. if gpu_vendor.find("AMD") != -1: print(f"๐ŸŽฎ Checking AMD GPU status...") - rocm_path = self.context.ctx.get("rocm_path") or get_rocm_path() - amd_smi_path = os.path.join(rocm_path, "bin", "amd-smi") - rocm_smi_path = os.path.join(rocm_path, "bin", "rocm-smi") - try: - tool_manager = self.context._get_tool_manager() - preferred_tool = tool_manager.get_preferred_smi_tool() - if preferred_tool == "amd-smi": - model_docker.sh(f"{amd_smi_path} || {rocm_smi_path} || true") - else: - model_docker.sh(f"{rocm_smi_path} || {amd_smi_path} || true") - except Exception: - model_docker.sh(f"{amd_smi_path} || {rocm_smi_path} || true") + model_docker.sh( + "amd-smi 2>/dev/null || rocm-smi 2>/dev/null || " + "echo 'GPU SMI tool not available in container PATH'" + ) elif gpu_vendor.find("NVIDIA") != -1: print(f"๐ŸŽฎ Checking NVIDIA GPU status...") model_docker.sh("/usr/bin/nvidia-smi || true") + # Print host vs container environment summary table + _print_run_env_table(gpu_vendor, self.context, model_docker, self.rich_console) + # Prepare model directory model_dir = "run_directory" if "url" in model_info and model_info["url"] != "": diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py b/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py index db504803..fd8f66fc 100644 --- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py @@ -3,6 +3,7 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ import os +import shutil from console import Console ''' @@ -21,21 +22,25 @@ numa_balancing ''' class CSVParser: - def __init__(self, filename, sys_config_files_path, tags): + def __init__(self, filename, sys_config_files_path, tags, path_resolver=None): self.filename = filename self.path = sys_config_files_path self.tags = tags self.sys_config_info_list = [] + self.path_resolver = path_resolver self.gpu_device_type = self.determine_gpu_device_type() def determine_gpu_device_type(self): console = Console() gpu_device_type = "" - rocm_smi_out = console.sh("/opt/rocm/bin/rocm-smi || true") + # Use PATH resolution first so TheRock images (where rocm-smi lives + # under a Python venv, not /opt/rocm/bin/) are handled correctly. + rocm_smi_cmd = shutil.which("rocm-smi") or "/opt/rocm/bin/rocm-smi" + rocm_smi_out = console.sh(f"{rocm_smi_cmd} || true") nv_smi_out = console.sh("nvidia-smi -L || true") - if not "not found" in rocm_smi_out: + if "not found" not in rocm_smi_out and "No such file" not in rocm_smi_out: gpu_device_type = "AMD" - if not "not found" in nv_smi_out: + if "not found" not in nv_smi_out and "No such file" not in nv_smi_out: gpu_device_type = "NVIDIA" return gpu_device_type @@ -116,6 +121,8 @@ def dump_gpu_information_in_csv(self, gpu_log_path, device_type): if "GPU" in line: num_gpu += 1 values = line.split(":") + if len(values) < 3: + continue name = values[1].split("(UUID")[0] uuid = values[2] info_list.append("Name|" + name) @@ -154,8 +161,14 @@ def dump_rocm_information_in_csv(self, rocm_info_path): lines = self.get_log_file_data(rocm_info_path) info_list = [] info_list.append(lines[0].rstrip()) - version_path = os.path.join("/opt/rocm", ".info", "version") - rocm_version = open(version_path).read().rstrip() + if self.path_resolver is not None: + rocm_version = self.path_resolver.get_version() + else: + version_path = os.path.join("/opt/rocm", ".info", "version") + try: + rocm_version = open(version_path).read().rstrip() + except FileNotFoundError: + rocm_version = "unknown" info_list.append("ROCm version|" + rocm_version) return info_list diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py b/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py index 50202081..b2288dea 100644 --- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py @@ -769,7 +769,7 @@ def main(): if args.dump_csv or args.print_csv: csv_file = args.output_name + ".csv" out_dir = "." + args.output_name - csv_parser = CSVParser(csv_file, out_dir, configs) + csv_parser = CSVParser(csv_file, out_dir, configs, path_resolver=path_resolver) csv_parser.dump_csv_output() if args.print_csv: csv_parser.print_csv_output() From f6bee02c943119f12c080f406df209c1d8182d3a Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Wed, 22 Apr 2026 21:48:31 -0500 Subject: [PATCH 4/9] fix: correct container installation type detection in run phase env table The previous shell command embedded \$(rocm-sdk path --root) inside [ -f "..." ], which broke quoting when passed via docker exec bash -c "..." causing the test to always fall through to 'unknown'. Replace with a quoting-safe check: if rocm-sdk command exists and returns a root path, report 'therock'; otherwise check /opt/rocm/.info/version for traditional apt install layout. Co-Authored-By: Claude Sonnet 4 --- src/madengine/execution/container_runner.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index f8b32a09..844daf57 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -91,14 +91,15 @@ def _sh(cmd: str) -> str: host_rocm_ver = "unknown" # โ”€โ”€ Container side โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - # Installation type: check for TheRock share markers first, then - # fall back to looking for the traditional .info/version file. + # Installation type: if rocm-sdk resolves a root it is TheRock; + # otherwise fall back to the traditional .info/version marker. + # Avoids nested quoting issues by not embedding $(...) inside [ -f "..." ]. ctr_install_type = _sh( - "if [ -f \"$(rocm-sdk path --root 2>/dev/null)/share/therock/therock_manifest.json\" ] " - "|| [ -f \"$(rocm-sdk path --root 2>/dev/null)/share/therock/dist_info.json\" ] " - "2>/dev/null; then echo therock; " - "elif [ -f /opt/rocm/.info/version ]; then echo apt install; " - "else echo unknown; fi 2>/dev/null || echo unknown" + "if command -v rocm-sdk >/dev/null 2>&1 " + "&& rocm-sdk path --root >/dev/null 2>&1; " + "then echo therock; " + "elif [ -f /opt/rocm/.info/version ]; then echo 'apt install'; " + "else echo unknown; fi" ) # ROCm root: prefer rocm-sdk, then ROCM_PATH env, then /opt/rocm From 4801f2078f8877e60c793d81950dba2fa81e48fb Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Thu, 23 Apr 2026 22:09:09 -0400 Subject: [PATCH 5/9] fix(rocm): resolve in-container ROCM_PATH without host mirroring - Apply docker_env_vars overrides in context; finalize in run_container (AMD) from OCI image env, then docker run probe, then /opt/rocm with warning - Update docs, README, CLI reference, CHANGELOG, and unit tests - ADR 0001 left out of this commit (untracked) Made-with: Cursor --- CHANGELOG.md | 2 +- README.md | 6 +- docs/cli-reference.md | 5 +- docs/configuration.md | 4 +- docs/installation.md | 4 +- docs/usage.md | 6 +- src/madengine/core/context.py | 8 +- src/madengine/execution/container_runner.py | 11 + src/madengine/utils/rocm_path_resolver.py | 237 +++++++++++++++++++- tests/unit/test_rocm_path.py | 54 ++++- 10 files changed, 304 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc19b7d6..72105f6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- **ROCm path**: Auto-detect host ROCm root by default (traditional `/opt/rocm`, versioned `/opt/rocm-*`, TheRock `rocm-sdk` + markers, `rocminfo` / `amd-smi` / `rocm-smi` on `PATH`). **Host** overrides: top-level `MAD_ROCM_PATH` in `--additional-context`, then `--rocm-path` (alias). **Container** override: `docker_env_vars.MAD_ROCM_PATH` (sets in-container `ROCM_PATH`); if omitted, host path is mirrored. Set `MAD_AUTO_ROCM_PATH=0` to use legacy `ROCM_PATH` / `/opt/rocm` only without scanning. Code: `madengine.utils.rocm_path_resolver`, shared TheRock file markers in `madengine.utils.therock_markers`. **Design doc**: [ADR 0001](docs/adr/0001-rocm-path-resolution.md). +- **ROCm path**: Auto-detect host ROCm root by default (traditional `/opt/rocm`, versioned `/opt/rocm-*`, TheRock `rocm-sdk` + markers, `rocminfo` / `amd-smi` / `rocm-smi` on `PATH`). **Host** overrides: top-level `MAD_ROCM_PATH` in `--additional-context`, then `--rocm-path` (alias). **Container** `ROCM_PATH` for Docker runs (AMD): `docker_env_vars.MAD_ROCM_PATH` if set; else `ROCM_PATH` / `ROCM_HOME` from the image OCI config (`docker image inspect`); else an in-image shell probe (`docker run --rm`); else default `/opt/rocm` with a warning. The host-resolved path is no longer mirrored into the container by default. Set `MAD_AUTO_ROCM_PATH=0` to use legacy host `ROCM_PATH` / `/opt/rocm` only without scanning. Code: `madengine.utils.rocm_path_resolver`, shared TheRock file markers in `madengine.utils.therock_markers`. **Design doc**: [ADR 0001](docs/adr/0001-rocm-path-resolution.md). - **Profiling**: `rocm_trace_lite` now sets `RTL_MODE=lite` explicitly; added tool `rocm_trace_lite_default` with `RTL_MODE=default` for A/B overhead comparison. `rtl_trace_wrapper.sh` passes `rtl trace --mode โ€ฆ` when `RTL_MODE` is set. ## [2.0.0] - 2026-04-09 diff --git a/README.md b/README.md index 78b15538..db273e8c 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ madengine is a modern CLI tool for running Large Language Models (LLMs) and Deep - **๐ŸŽฏ Simple Deployment** - Run locally or deploy to Kubernetes/SLURM via configuration - **๐Ÿ”ง Distributed Launchers** - Full support for torchrun, DeepSpeed, Megatron-LM, TorchTitan, Primus, vLLM, SGLang - **๐Ÿณ Container-Native** - Docker-based execution with GPU support (ROCm, CUDA) -- **๐Ÿ“‚ ROCm Path** - Auto-detect host/container roots; overrides via `MAD_ROCM_PATH` in `--additional-context` and `--rocm-path` (host alias) +- **๐Ÿ“‚ ROCm Path** - Auto-detect **host** ROCm root; optional `docker_env_vars.MAD_ROCM_PATH`; in-container `ROCM_PATH` is resolved at Docker run (image OCI env + in-image probe, not host mirroring) โ€” see [Configuration](docs/configuration.md#rocm-path-run-only) - **๐Ÿ“Š Performance Tools** - Integrated profiling with rocprof/rocprofv3, [rocm-trace-lite](https://github.com/sunway513/rocm-trace-lite) (RTL), rocblas, MIOpen, RCCL tracing - **๐ŸŽฏ ROCprofv3 Profiles** - 8 pre-configured profiles for compute/memory/communication bottleneck analysis - **๐Ÿ” Environment Validation** - TheRock ROCm detection and validation tools @@ -71,7 +71,7 @@ madengine run --tags dummy \ > **Note**: For build operations, `gpu_vendor` defaults to `AMD` and `guest_os` defaults to `UBUNTU` if not specified. For production deployments or non-AMD/Ubuntu environments, explicitly specify these values. -If auto-detection does not find your **host** ROCm root, set top-level `MAD_ROCM_PATH` in `--additional-context` or use `--rocm-path` (host only). For a different path **inside the container**, set `docker_env_vars.MAD_ROCM_PATH`. You can also set `ROCM_PATH` / `MAD_AUTO_ROCM_PATH=0` as documented in [docs/configuration.md](docs/configuration.md): +If auto-detection does not find your **host** ROCm root, set top-level `MAD_ROCM_PATH` in `--additional-context` or use `--rocm-path` (host only). For a different ROCm root **inside the container**, set `docker_env_vars.MAD_ROCM_PATH` in additional context. If you omit it, madengine derives in-container `ROCM_PATH` when running Docker (from the imageโ€™s baked-in env, then an in-container probe, then `/opt/rocm` โ€” it does **not** copy the host path). You can also set `ROCM_PATH` / `MAD_AUTO_ROCM_PATH=0` for **host** behavior as documented in [docs/configuration.md](docs/configuration.md): ```bash madengine run --tags dummy --rocm-path /path/to/rocm @@ -619,7 +619,7 @@ madengine run --tags model --keep-alive madengine build --tags model --clean-docker-cache --verbose ``` -**ROCm not in /opt/rocm:** Use top-level `MAD_ROCM_PATH` or `--rocm-path` for the **host**; `docker_env_vars.MAD_ROCM_PATH` for the **container**; or rely on auto-detect. See [Configuration](docs/configuration.md#rocm-path-run-only). +**ROCm not in /opt/rocm:** Use top-level `MAD_ROCM_PATH` or `--rocm-path` for the **host**; for **in-container** paths, set `docker_env_vars.MAD_ROCM_PATH`, or let madengine resolve `ROCM_PATH` at run from the image and probe (see [Configuration](docs/configuration.md#rocm-path-run-only)). **Common Issues:** - **False failures with profiling**: If models show FAILURE but have performance metrics, see [Profiling Troubleshooting](docs/profiling.md#false-failure-detection-with-rocprof) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index d23d3b87..ab80aeba 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -211,7 +211,7 @@ madengine run [OPTIONS] |--------|-------|------|---------|-------------| | `--tags` | `-t` | TEXT | `[]` | Model tags to run (can specify multiple) | | `--manifest-file` | `-m` | TEXT | `""` | Build manifest file path (for pre-built images) | -| `--rocm-path` | | TEXT | `None` | ROCm installation root (default: `ROCM_PATH` env or `/opt/rocm`). Use when ROCm is not in `/opt/rocm` (e.g. Rock, pip). | +| `--rocm-path` | | TEXT | `None` | **Host** ROCm installation root only (default: `ROCM_PATH` env or `/opt/rocm`). Use when the **runnerโ€™s** ROCm is not in `/opt/rocm` (e.g. TheRock, pip). Does not set the in-container `ROCM_PATH` for Docker; that is resolved at `run` from the image, probe, or `docker_env_vars` โ€” see [Configuration โ€” ROCm path](configuration.md#rocm-path-run-only). | | `--registry` | `-r` | TEXT | `None` | Docker registry URL | | `--timeout` | | INT | `-1` | Timeout in seconds (-1=default 7200s, 0=no timeout) | | `--additional-context` | `-c` | TEXT | `"{}"` | Additional context as JSON string | @@ -617,7 +617,8 @@ madengine recognizes these environment variables: | Variable | Description | Default | |----------|-------------|---------| | `MODEL_DIR` | Path to MAD package directory | Auto-detected | -| `ROCM_PATH` | ROCm installation root (used when `--rocm-path` not set) | `/opt/rocm` | +| `ROCM_PATH` | **Host** ROCm installation root (used when `--rocm-path` not set and for host auto-detect / legacy). In-container `ROCM_PATH` for Docker is not taken from this variable unless the image or run logic supplies it. | `/opt/rocm` | +| `MAD_AUTO_ROCM_PATH` | Set to `0` to disable **host** auto-detect (`ROCM_PATH` then `/opt/rocm` on the host). | (default: scan on) | | `MAD_VERBOSE_CONFIG` | Enable verbose configuration logging | `false` | | `MAD_DOCKERHUB_USER` | Docker Hub username | None | | `MAD_DOCKERHUB_PASSWORD` | Docker Hub password/token | None | diff --git a/docs/configuration.md b/docs/configuration.md index 1309c74f..dc24fa76 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -139,10 +139,12 @@ Design rationale and precedence are recorded in [ADR 0001: ROCm path resolution] - **CLI (host only):** `madengine run --rocm-path /path/to/rocm` โ€” same meaning as top-level `MAD_ROCM_PATH` in additional context. - **Additional context (host):** top-level `"MAD_ROCM_PATH": "/path/to/host/rocm"`. -- **Additional context (container):** `"docker_env_vars": { "MAD_ROCM_PATH": "/path/inside/image" }` โ€” sets the in-container `ROCM_PATH` passed to workloads; if omitted, the **host**-resolved path is mirrored into the container by default. +- **Additional context (container):** `"docker_env_vars": { "MAD_ROCM_PATH": "/path/inside/image" }` โ€” sets the in-container `ROCM_PATH` for Docker runs. If omitted, at `run` time madengine uses the image OCI `Env` (`ROCM_PATH` / `ROCM_HOME`) if present, then an in-container probe, then defaults to `/opt/rocm` (see [ADR 0001](adr/0001-rocm-path-resolution.md)). The host-resolved path is **not** mirrored into the container. Precedence (host): top-level `MAD_ROCM_PATH` โ†’ `--rocm-path` โ†’ auto-detect (unless disabled) โ†’ `ROCM_PATH` โ†’ `/opt/rocm`. +Precedence (container, **local Docker `run`**, **AMD**): `docker_env_vars.MAD_ROCM_PATH` (maps to `ROCM_PATH` for the workload) or explicit `ROCM_PATH` in `docker_env_vars` โ†’ image OCI `Env` (`ROCM_PATH` / `ROCM_HOME`) โ†’ in-image probe โ†’ default `/opt/rocm` with a warning. Implemented in `ContainerRunner.run_container` after the run image is resolved; see [ADR 0001](adr/0001-rocm-path-resolution.md). + This applies to the run phase; build uses build-only context (no GPU detection) but still honors `MAD_ROCM_PATH` in context when set. ## Build Configuration diff --git a/docs/installation.md b/docs/installation.md index fd0c43ab..e553537a 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -83,7 +83,7 @@ madengine run --tags dummy \ --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' ``` -**Non-default ROCm location:** If ROCm is not under `/opt/rocm` (e.g. [TheRock](https://github.com/ROCm/TheRock) or pip install), set `ROCM_PATH` or use `madengine run --rocm-path /path/to/rocm` so GPU detection and container env use the correct paths. +**Non-default ROCm location (host):** If ROCm is not under `/opt/rocm` (e.g. [TheRock](https://github.com/ROCm/TheRock) or pip install), set `ROCM_PATH` on the **host** or use `madengine run --rocm-path /path/to/rocm` so **host** GPU checks (`amd-smi`, etc.) resolve correctly. **In-container** `ROCM_PATH` for Docker workloads is set separately at run (image OCI env, in-image probe, or `docker_env_vars.MAD_ROCM_PATH`); it is not copied from the host. See [Configuration โ€” ROCm path](configuration.md#rocm-path-run-only). ### NVIDIA CUDA @@ -140,7 +140,7 @@ rocm-smi ls -la /dev/kfd /dev/dri ``` -If ROCm is installed in a non-default path (e.g. Rock or pip), set `export ROCM_PATH=/path/to/rocm` or use `madengine run --rocm-path /path/to/rocm`. +If ROCm is installed in a non-default path on the **host** (e.g. TheRock or pip), set `export ROCM_PATH=/path/to/rocm` or use `madengine run --rocm-path /path/to/rocm` (host validation only; see [ROCm path (run only)](configuration.md#rocm-path-run-only) for in-container behavior). ### MAD Package Not Found diff --git a/docs/usage.md b/docs/usage.md index 8e76eae9..31cbc61f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -315,7 +315,7 @@ By default, **madengine** auto-detects the **host** ROCm root (apt under `/opt/r **Host** overrides: top-level `MAD_ROCM_PATH` in `--additional-context`, or `--rocm-path` (same meaning as top-level `MAD_ROCM_PATH`). -**Container** override: `{"docker_env_vars": {"MAD_ROCM_PATH": "/path/inside/image"}}` โ€” otherwise the host-resolved path is mirrored into `ROCM_PATH` in the container. +**In-container** `ROCM_PATH` (AMD Docker runs) is **not** copied from the host. If you do not set `docker_env_vars.MAD_ROCM_PATH` (or a literal `ROCM_PATH` in `docker_env_vars`), madengine sets it at **run** time from, in order: the image OCI `Env` (`ROCM_PATH` / `ROCM_HOME` via `docker image inspect`), an in-container probe (`docker run --rm`), or `/opt/rocm` with a warning. Override explicitly with `{"docker_env_vars": {"MAD_ROCM_PATH": "/path/inside/image"}}` when the image needs a fixed root. Details: [Configuration โ€” ROCm path](configuration.md#rocm-path-run-only), [ADR 0001](adr/0001-rocm-path-resolution.md). ```bash # Host path via CLI (alias for top-level MAD_ROCM_PATH) @@ -624,8 +624,8 @@ madengine build --tags model --clean-docker-cache --verbose | Variable | Description | Example | |----------|-------------|---------| | `MODEL_DIR` | MAD package directory | `/path/to/MAD` | -| `ROCM_PATH` | Legacy host ROCm root when not using `MAD_ROCM_PATH` / `--rocm-path` and when auto is off or as a fallback. | `/path/to/rocm` | -| `MAD_AUTO_ROCM_PATH` | Set to `0` to disable host auto-detect (use `ROCM_PATH` then `/opt/rocm` only). Default: on. | `0` | +| `ROCM_PATH` | **Host** ROCm root when not using top-level `MAD_ROCM_PATH` / `--rocm-path`, and (with `MAD_AUTO_ROCM_PATH=0` or as fallback) for host-side detection and tools. In-container `ROCM_PATH` for Docker is set separately at run; see [ROCm path (host and container)](#rocm-path-host-and-container). | `/path/to/rocm` | +| `MAD_AUTO_ROCM_PATH` | Set to `0` to disable **host** auto-detect (use `ROCM_PATH` then `/opt/rocm` only on the host). Default: on. | `0` | | `MAD_VERBOSE_CONFIG` | Verbose config logging | `"true"` | | `MAD_DOCKERHUB_USER` | Docker Hub username | `"myusername"` | | `MAD_DOCKERHUB_PASSWORD` | Docker Hub password | `"mytoken"` | diff --git a/src/madengine/core/context.py b/src/madengine/core/context.py index 3c19851e..24adf16c 100644 --- a/src/madengine/core/context.py +++ b/src/madengine/core/context.py @@ -23,7 +23,7 @@ # third-party modules from madengine.core.console import Console from madengine.utils.rocm_path_resolver import ( - resolve_container_rocm_path, + apply_container_rocm_path_overrides, resolve_host_rocm_path, ) from madengine.utils.gpu_validator import validate_rocm_installation, GPUInstallationError, GPUVendor @@ -260,7 +260,11 @@ def init_gpu_context(self) -> None: self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] = self.ctx["gpu_vendor"] self.ctx["rocm_path"] = self._rocm_path - resolve_container_rocm_path(self.ctx["docker_env_vars"], self._rocm_path) + # In-container ``ROCM_PATH`` for Docker is finalized in ``ContainerRunner.run_container`` + # (OCI env + in-image probe). Here we only map ``MAD_ROCM_PATH`` / user ``ROCM_PATH``. + apply_container_rocm_path_overrides( + self.ctx["docker_env_vars"], self._rocm_path + ) if "MAD_SYSTEM_NGPUS" not in self.ctx["docker_env_vars"]: self.ctx["docker_env_vars"][ diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index d5f27cf0..191c4c49 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -935,6 +935,17 @@ def run_container( if merged_count > 0: print(f"โ„น๏ธ Merged {merged_count} environment variables from additional_context") + if self.context and str(self.context.ctx.get("gpu_vendor", "")).upper().find( + "AMD" + ) != -1: + from madengine.utils.rocm_path_resolver import finalize_container_rocm_path + + finalize_container_rocm_path( + self.context.ctx["docker_env_vars"], + docker_image, + self.context._rocm_path, + ) + if "data" in model_info and model_info["data"] != "" and self.data: mount_datapaths = self.data.get_mountpaths(model_info["data"]) model_dataenv = self.data.get_env(model_info["data"]) diff --git a/src/madengine/utils/rocm_path_resolver.py b/src/madengine/utils/rocm_path_resolver.py index 31b79678..e82433bf 100644 --- a/src/madengine/utils/rocm_path_resolver.py +++ b/src/madengine/utils/rocm_path_resolver.py @@ -21,16 +21,23 @@ 4. ``ROCM_PATH`` environment variable. 5. Default ``/opt/rocm``. -**Container** (``docker_env_vars``): - -* ``MAD_ROCM_PATH`` โ€” container ROCm root; sets ``ROCM_PATH`` for Docker; key is - consumed and not passed to the workload. -* If only ``ROCM_PATH`` is set in ``docker_env_vars`` by the user, it is kept. -* Otherwise the host-resolved path is mirrored into ``ROCM_PATH``. +**Container** (``docker_env_vars``) โ€” set when the run uses Docker (``run_container``): + +* ``MAD_ROCM_PATH`` โ€” in-image ROCm root; the key is consumed and mapped to ``ROCM_PATH``. +* If the user set ``ROCM_PATH`` in ``docker_env_vars`` in additional context, it is kept + (normalized) until the run; it wins over OCI / probe in :func:`finalize_container_rocm_path` + if already present after :func:`apply_container_rocm_path_overrides`. +* Full in-container resolution order is implemented in + :func:`finalize_container_rocm_path` (not host mirroring): (1) existing ``ROCM_PATH`` + in ``docker_env`` after overrides, (2) ``ROCM_PATH`` / ``ROCM_HOME`` from + ``docker image inspect`` OCI ``Config.Env``, (3) ``docker run --rm`` with an + in-image shell probe aligned with :class:`RocmPathResolver` heuristics, (4) default + ``/opt/rocm`` with a warning. """ from __future__ import annotations +import json import os import shutil import subprocess @@ -249,13 +256,139 @@ def resolve_host_rocm_path( return get_rocm_path_legacy(None) -def resolve_container_rocm_path(docker_env: Dict[str, Any], host_path: str) -> str: +# POSIX ``/bin/sh`` script run *inside* the target image to infer ROCm root (TheRock + +# traditional trees). Kept in sync with :class:`RocmPathResolver` intent. +_CONTAINER_PROBE_SH = r"""try_root() { + d="$1" + [ -d "$d" ] || return 1 + { [ -f "$d/bin/rocminfo" ] || [ -f "$d/bin/amd-smi" ] || [ -f "$d/share/therock/therock_manifest.json" ] || [ -f "$d/share/therock/dist_info.json" ]; } && return 0 + return 1 +} +if command -v rocm-sdk >/dev/null 2>&1; then + r=$(rocm-sdk path --root 2>/dev/null) + if [ -n "$r" ] && try_root "$r"; then + printf %s "$r" + exit 0 + fi +fi +if try_root /opt/rocm; then + printf %s /opt/rocm + exit 0 +fi +for d in /opt/rocm-*/; do + d=${d%/} + [ -d "$d" ] && try_root "$d" && { printf %s "$d"; exit 0; } +done +for d in /opt/*/; do + d=${d%/} + case $d in /opt/rocm) continue ;; esac + [ -d "$d" ] && try_root "$d" && { printf %s "$d"; exit 0; } +done +if [ -d /usr/local/rocm ] && try_root /usr/local/rocm; then + printf %s /usr/local/rocm + exit 0 +fi +exit 1 +""" + + +def rocm_path_from_docker_image_config(docker_image: str) -> Optional[str]: + """Read ``ROCM_PATH`` (else ``ROCM_HOME``) from the image OCI config via ``docker image inspect``.""" + if not (docker_image or "").strip(): + return None + try: + proc = subprocess.run( + [ + "docker", + "image", + "inspect", + docker_image.strip(), + "--format", + "{{json .Config.Env}}", + ], + capture_output=True, + text=True, + check=True, + timeout=60, + ) + except (OSError, subprocess.CalledProcessError, subprocess.SubprocessError): + return None + try: + env_list: Any = json.loads((proc.stdout or "").strip() or "[]") + except json.JSONDecodeError: + return None + if not isinstance(env_list, list): + return None + for key in ("ROCM_PATH", "ROCM_HOME"): + for entry in env_list: + if not isinstance(entry, str): + continue + if entry.startswith(f"{key}="): + val = entry.split("=", 1)[1].strip() + if val: + return val + return None + + +def infer_rocm_path_via_docker_run( + docker_image: str, + log: Optional[Callable[[str], None]] = None, +) -> Optional[str]: + """ + One-shot ``docker run --rm`` in-image probe using :data:`_CONTAINER_PROBE_SH`. """ - Set ``docker_env['ROCM_PATH']`` and return the container ROCm root. + _print = log or print + if not (docker_image or "").strip(): + return None + cmd = [ + "docker", + "run", + "--rm", + "--entrypoint", + "/bin/sh", + docker_image.strip(), + "-c", + _CONTAINER_PROBE_SH, + ] + try: + proc = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=120, + ) + except (OSError, subprocess.SubprocessError) as e: + _print( + f"Warning: in-container ROCm path probe (docker run) failed for " + f"'{docker_image}': {e}" + ) + return None + if proc.returncode != 0: + err = (proc.stderr or "").strip() + if err: + _print( + f"Warning: in-container ROCm path probe failed (exit {proc.returncode}): " + f"{err[:500]}" + ) + return None + out = (proc.stdout or "").strip() + if not out: + return None + line = out.splitlines()[0].strip() + if not line: + return None + return line + - * If ``MAD_ROCM_PATH`` is in ``docker_env``, use it, remove the key, set ``ROCM_PATH``. - * Elif ``ROCM_PATH`` already set, normalize and return (no host mirror). - * Else mirror ``host_path``. +def apply_container_rocm_path_overrides( + docker_env: Dict[str, Any], host_path: str +) -> Optional[str]: + """ + Map ``MAD_ROCM_PATH`` to ``ROCM_PATH`` and normalize a user ``ROCM_PATH`` in + ``docker_env``. Does **not** set ``ROCM_PATH`` from the host; see + :func:`finalize_container_rocm_path` for run-time resolution. + + Returns the resolved path if one was set, else ``None``. """ d = docker_env if MAD_ROCM_PATH in d and d[MAD_ROCM_PATH] is not None: @@ -268,6 +401,86 @@ def resolve_container_rocm_path(docker_env: Dict[str, Any], host_path: str) -> s elif d.get("ROCM_PATH") not in (None, ""): croot = normalize_rocm_path(str(d["ROCM_PATH"]).strip()) else: - croot = host_path + return None d["ROCM_PATH"] = croot return croot + + +def finalize_container_rocm_path( + docker_env: Dict[str, Any], + docker_image: str, + host_path: str, + *, + log: Callable[[str], None] = print, +) -> str: + """ + Set ``docker_env['ROCM_PATH']`` for container workloads (AMD Docker runs). + + Precedence: + + 1. If ``ROCM_PATH`` is already in ``docker_env`` (e.g. from + :func:`apply_container_rocm_path_overrides`), use it. + 2. Else if OCI :envvar:`ROCM_PATH` / :envvar:`ROCM_HOME` is set on the image, use + that (``docker image inspect``). + 3. Else log a short warning, run the in-image shell probe, use its output if + successful. + 4. Else default to ``/opt/rocm`` and log a warning. + + ``host_path`` is reserved for callers that need a fallback; it is **not** used + for mirroring unless future policy adds an explicit opt-in. + """ + d = docker_env + # Re-run overrides so ``docker_env_vars`` merged in ``run_container`` (late) still + # maps ``MAD_ROCM_PATH`` to ``ROCM_PATH`` and consumes the ``MAD_`` key. + apply_container_rocm_path_overrides(d, host_path) + + existing = d.get("ROCM_PATH") + if existing not in (None, ""): + croot = normalize_rocm_path(str(existing).strip()) + d["ROCM_PATH"] = croot + return croot + + oci = rocm_path_from_docker_image_config(docker_image) + if oci: + croot = normalize_rocm_path(oci) + d["ROCM_PATH"] = croot + log( + f"ROCm container ROCM_PATH from image OCI config ({docker_image}): {croot}" + ) + return croot + + log( + f"Warning: image {docker_image!r} has no ROCM_PATH/ROCM_HOME in OCI Config.Env; " + f"probing in-container for ROCm root (docker run --rm). " + f"Set docker_env_vars.MAD_ROCM_PATH to skip." + ) + probed = infer_rocm_path_via_docker_run(docker_image, log=log) + if probed: + croot = normalize_rocm_path(probed) + d["ROCM_PATH"] = croot + log(f"ROCm container ROCM_PATH from in-image probe: {croot}") + return croot + + croot = normalize_rocm_path("/opt/rocm") + d["ROCM_PATH"] = croot + log( + "Warning: could not infer ROCm in container; defaulting ROCM_PATH to " + f"{croot} (set docker_env_vars.MAD_ROCM_PATH if wrong)." + ) + return croot + + +def resolve_container_rocm_path(docker_env: Dict[str, Any], host_path: str) -> str: + """ + Apply :func:`apply_container_rocm_path_overrides` only (used from + :class:`~madengine.core.context.Context` init). Does not mirror the host + or run Docker inspect/probe. Run-time finalization is + :func:`finalize_container_rocm_path`. + + For backward compatibility, returns the in-``docker_env`` ``ROCM_PATH`` if set, + else an empty string. + """ + out = apply_container_rocm_path_overrides(docker_env, host_path) + if out is not None: + return out + return "" diff --git a/tests/unit/test_rocm_path.py b/tests/unit/test_rocm_path.py index 95f334c1..02bb71b5 100644 --- a/tests/unit/test_rocm_path.py +++ b/tests/unit/test_rocm_path.py @@ -11,7 +11,9 @@ from madengine.utils import rocm_path_resolver as rpr from madengine.utils.rocm_path_resolver import ( MAD_ROCM_PATH, + apply_container_rocm_path_overrides, auto_detect_rocm_path, + finalize_container_rocm_path, get_rocm_path_legacy, normalize_rocm_path, resolve_container_rocm_path, @@ -60,7 +62,7 @@ def test_context_build_only_stores_rocm_path(self): assert ctx._rocm_path == "/opt/rocm" def test_context_runtime_includes_rocm_path_in_ctx(self): - """Context in runtime mode includes rocm_path and ROCM_PATH in docker_env_vars.""" + """Context stores host rocm_path; in-container ROCM_PATH is set at run time.""" from madengine.core.context import Context from unittest.mock import patch from madengine.utils.rocm_path_resolver import normalize_rocm_path @@ -75,9 +77,10 @@ def test_context_runtime_includes_rocm_path_in_ctx(self): ctx = Context(rocm_path="/my/rocm") exp = normalize_rocm_path("/my/rocm") assert ctx.ctx.get("rocm_path") == exp - assert ctx.ctx["docker_env_vars"].get("ROCM_PATH") == exp + # Host path is not mirrored into docker_env_vars.ROCM_PATH at init. + assert ctx.ctx["docker_env_vars"].get("ROCM_PATH") in (None, "") - def test_context_container_mad_overrides_mirror(self): + def test_context_container_mad_overrides_host(self): """docker_env_vars MAD_ROCM_PATH sets in-container ROCM_PATH (not host path).""" from madengine.core.context import Context from unittest.mock import patch @@ -169,17 +172,54 @@ def test_resolve_container_mad_consumes_key(self): assert d.get("ROCM_PATH") == r assert MAD_ROCM_PATH not in d - def test_resolve_container_mirrors_host(self): + def test_apply_overrides_does_not_mirror_host(self): d = {} - r = resolve_container_rocm_path(d, "/hostpath") - assert r == os.path.abspath("/hostpath").rstrip(os.sep) - assert d["ROCM_PATH"] == r + r = apply_container_rocm_path_overrides(d, "/hostpath") + assert r is None + assert "ROCM_PATH" not in d + assert resolve_container_rocm_path(d, "/hostpath") == "" def test_get_rocm_path_legacy_alias(self, monkeypatch): monkeypatch.delenv("ROCM_PATH", raising=False) g = get_rocm_path_legacy(None) assert g == os.path.abspath("/opt/rocm").rstrip(os.sep) + def test_finalize_prefers_oci(self, monkeypatch): + monkeypatch.setattr( + "madengine.utils.rocm_path_resolver.rocm_path_from_docker_image_config", + lambda _img: "/opt/from/oci", + ) + d = {} + p = finalize_container_rocm_path(d, "x:latest", "/h", log=lambda _s: None) + assert p == normalize_rocm_path("/opt/from/oci") + assert d["ROCM_PATH"] == p + + def test_finalize_uses_probe_when_no_oci(self, monkeypatch): + monkeypatch.setattr( + "madengine.utils.rocm_path_resolver.rocm_path_from_docker_image_config", + lambda _img: None, + ) + monkeypatch.setattr( + "madengine.utils.rocm_path_resolver.infer_rocm_path_via_docker_run", + lambda _img, log=None: "/opt/probed", + ) + d = {} + p = finalize_container_rocm_path(d, "x:latest", "/h", log=lambda _s: None) + assert p == normalize_rocm_path("/opt/probed") + + def test_finalize_defaults(self, monkeypatch): + monkeypatch.setattr( + "madengine.utils.rocm_path_resolver.rocm_path_from_docker_image_config", + lambda _img: None, + ) + monkeypatch.setattr( + "madengine.utils.rocm_path_resolver.infer_rocm_path_via_docker_run", + lambda _img, log=None: None, + ) + d = {} + p = finalize_container_rocm_path(d, "x:latest", "/h", log=lambda _s: None) + assert p == normalize_rocm_path("/opt/rocm") + @pytest.mark.unit class TestTheRockVersionedContainerLayout: From c155cb263c9533f4720884c897c360c11c748a92 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 24 Apr 2026 03:08:26 +0000 Subject: [PATCH 6/9] test: add gfx950 to dummy_skip_gpu_arch skip list MI350X (gfx950) was missing from the skip_gpu_arch fixture, causing test_commandline_argument_skip_gpu_arch to fail on this machine. Co-Authored-By: Claude Sonnet 4 --- tests/fixtures/dummy/models.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixtures/dummy/models.json b/tests/fixtures/dummy/models.json index fab99fae..140779ab 100644 --- a/tests/fixtures/dummy/models.json +++ b/tests/fixtures/dummy/models.json @@ -128,7 +128,7 @@ "dummies" ], "args": "", - "skip_gpu_arch": "gfx908, gfx90a, gfx1100, gfx940, gfx941, gfx942, A100, H100" + "skip_gpu_arch": "gfx908, gfx90a, gfx1100, gfx940, gfx941, gfx942, gfx950, A100, H100" }, { "name": "dummy_data_local", From 76798cb059b235af1d574c26bd5850afea34a813 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 24 Apr 2026 18:59:55 +0000 Subject: [PATCH 7/9] refactor(rocm): remove --rocm-path CLI option in favor of MAD_ROCM_PATH in --additional-context The --rocm-path flag was an alias for top-level MAD_ROCM_PATH but its help text implied it could set both host and container paths, risking confusion when environments differ. Users should set host and container ROCm roots independently via --additional-context: Host: {"MAD_ROCM_PATH": "/path/to/host/rocm"} Container: {"docker_env_vars": {"MAD_ROCM_PATH": "/path/in/container"}} Also fixes datetime.utcnow() deprecation warnings in mongodb.py (replaced with datetime.now(timezone.utc)). Co-Authored-By: Claude Sonnet 4 --- README.md | 11 ++-- docs/cli-reference.md | 13 ++-- docs/configuration.md | 7 ++- docs/installation.md | 4 +- docs/usage.md | 18 +++--- src/madengine/cli/commands/run.py | 9 --- src/madengine/core/constants.py | 2 +- src/madengine/core/context.py | 8 +-- src/madengine/database/mongodb.py | 6 +- .../orchestration/run_orchestrator.py | 2 - src/madengine/utils/rocm_path_resolver.py | 10 +--- tests/unit/test_rocm_path.py | 60 ++++++++----------- 12 files changed, 65 insertions(+), 85 deletions(-) diff --git a/README.md b/README.md index db273e8c..43a3764e 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ madengine is a modern CLI tool for running Large Language Models (LLMs) and Deep - **๐ŸŽฏ Simple Deployment** - Run locally or deploy to Kubernetes/SLURM via configuration - **๐Ÿ”ง Distributed Launchers** - Full support for torchrun, DeepSpeed, Megatron-LM, TorchTitan, Primus, vLLM, SGLang - **๐Ÿณ Container-Native** - Docker-based execution with GPU support (ROCm, CUDA) -- **๐Ÿ“‚ ROCm Path** - Auto-detect **host** ROCm root; optional `docker_env_vars.MAD_ROCM_PATH`; in-container `ROCM_PATH` is resolved at Docker run (image OCI env + in-image probe, not host mirroring) โ€” see [Configuration](docs/configuration.md#rocm-path-run-only) +- **๐Ÿ“‚ ROCm Path** - Auto-detect **host** ROCm root (override with top-level `MAD_ROCM_PATH`); in-container `ROCM_PATH` is set independently via `docker_env_vars.MAD_ROCM_PATH` and resolved at Docker run (image OCI env + in-image probe, not host mirroring) โ€” see [Configuration](docs/configuration.md#rocm-path-run-only) - **๐Ÿ“Š Performance Tools** - Integrated profiling with rocprof/rocprofv3, [rocm-trace-lite](https://github.com/sunway513/rocm-trace-lite) (RTL), rocblas, MIOpen, RCCL tracing - **๐ŸŽฏ ROCprofv3 Profiles** - 8 pre-configured profiles for compute/memory/communication bottleneck analysis - **๐Ÿ” Environment Validation** - TheRock ROCm detection and validation tools @@ -71,11 +71,14 @@ madengine run --tags dummy \ > **Note**: For build operations, `gpu_vendor` defaults to `AMD` and `guest_os` defaults to `UBUNTU` if not specified. For production deployments or non-AMD/Ubuntu environments, explicitly specify these values. -If auto-detection does not find your **host** ROCm root, set top-level `MAD_ROCM_PATH` in `--additional-context` or use `--rocm-path` (host only). For a different ROCm root **inside the container**, set `docker_env_vars.MAD_ROCM_PATH` in additional context. If you omit it, madengine derives in-container `ROCM_PATH` when running Docker (from the imageโ€™s baked-in env, then an in-container probe, then `/opt/rocm` โ€” it does **not** copy the host path). You can also set `ROCM_PATH` / `MAD_AUTO_ROCM_PATH=0` for **host** behavior as documented in [docs/configuration.md](docs/configuration.md): +If auto-detection does not find your **host** ROCm root, set top-level `MAD_ROCM_PATH` in `--additional-context`. For a different ROCm root **inside the container**, set `docker_env_vars.MAD_ROCM_PATH` in additional context. If you omit it, madengine derives in-container `ROCM_PATH` when running Docker (from the imageโ€™s baked-in env, then an in-container probe, then `/opt/rocm` โ€” it does **not** copy the host path). You can also set `ROCM_PATH` / `MAD_AUTO_ROCM_PATH=0` for **host** behavior as documented in [docs/configuration.md](docs/configuration.md): ```bash -madengine run --tags dummy --rocm-path /path/to/rocm +# Override host ROCm root: +madengine run --tags dummy --additional-context โ€˜{"MAD_ROCM_PATH": "/path/to/rocm"}โ€™ # or: export ROCM_PATH=/path/to/rocm && madengine run --tags dummy +# Override in-container ROCm root independently: +madengine run --tags dummy --additional-context โ€˜{"docker_env_vars": {"MAD_ROCM_PATH": "/path/in/container"}}โ€™ ``` **Results:** Performance data is written to `perf.csv` (and optionally `perf_entry.csv`). The file is created automatically if missing. Failed runs (including pre-run setup failures) are recorded with status `FAILURE` so every attempted model appears in the table. See [Exit Codes](docs/cli-reference.md#exit-codes) for CI/script usage. @@ -619,7 +622,7 @@ madengine run --tags model --keep-alive madengine build --tags model --clean-docker-cache --verbose ``` -**ROCm not in /opt/rocm:** Use top-level `MAD_ROCM_PATH` or `--rocm-path` for the **host**; for **in-container** paths, set `docker_env_vars.MAD_ROCM_PATH`, or let madengine resolve `ROCM_PATH` at run from the image and probe (see [Configuration](docs/configuration.md#rocm-path-run-only)). +**ROCm not in /opt/rocm:** Set top-level `MAD_ROCM_PATH` in `--additional-context` for the **host**; for **in-container** paths, set `docker_env_vars.MAD_ROCM_PATH`, or let madengine resolve `ROCM_PATH` at run from the image and probe (see [Configuration](docs/configuration.md#rocm-path-run-only)). **Common Issues:** - **False failures with profiling**: If models show FAILURE but have performance metrics, see [Profiling Troubleshooting](docs/profiling.md#false-failure-detection-with-rocprof) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index ab80aeba..9340ae1c 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -211,7 +211,6 @@ madengine run [OPTIONS] |--------|-------|------|---------|-------------| | `--tags` | `-t` | TEXT | `[]` | Model tags to run (can specify multiple) | | `--manifest-file` | `-m` | TEXT | `""` | Build manifest file path (for pre-built images) | -| `--rocm-path` | | TEXT | `None` | **Host** ROCm installation root only (default: `ROCM_PATH` env or `/opt/rocm`). Use when the **runnerโ€™s** ROCm is not in `/opt/rocm` (e.g. TheRock, pip). Does not set the in-container `ROCM_PATH` for Docker; that is resolved at `run` from the image, probe, or `docker_env_vars` โ€” see [Configuration โ€” ROCm path](configuration.md#rocm-path-run-only). | | `--registry` | `-r` | TEXT | `None` | Docker registry URL | | `--timeout` | | INT | `-1` | Timeout in seconds (-1=default 7200s, 0=no timeout) | | `--additional-context` | `-c` | TEXT | `"{}"` | Additional context as JSON string | @@ -240,9 +239,13 @@ madengine run [OPTIONS] madengine run --tags dummy \ --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' -# Custom ROCm path (when ROCm is not in /opt/rocm, e.g. Rock or pip install) -madengine run --tags dummy --rocm-path /path/to/rocm \ - --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' +# Custom host ROCm path (when ROCm is not in /opt/rocm, e.g. TheRock or pip install) +madengine run --tags dummy \ + --additional-context '{"MAD_ROCM_PATH": "/path/to/rocm", "gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Custom in-container ROCm path (independent from host) +madengine run --tags dummy \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU", "docker_env_vars": {"MAD_ROCM_PATH": "/path/in/container"}}' # Run with pre-built images (manifest-based) madengine run --manifest-file build_manifest.json @@ -617,7 +620,7 @@ madengine recognizes these environment variables: | Variable | Description | Default | |----------|-------------|---------| | `MODEL_DIR` | Path to MAD package directory | Auto-detected | -| `ROCM_PATH` | **Host** ROCm installation root (used when `--rocm-path` not set and for host auto-detect / legacy). In-container `ROCM_PATH` for Docker is not taken from this variable unless the image or run logic supplies it. | `/opt/rocm` | +| `ROCM_PATH` | **Host** ROCm installation root (fallback when `MAD_ROCM_PATH` is not set in additional context and auto-detect is disabled or finds nothing). In-container `ROCM_PATH` for Docker is not taken from this variable; set `docker_env_vars.MAD_ROCM_PATH` in additional context instead. | `/opt/rocm` | | `MAD_AUTO_ROCM_PATH` | Set to `0` to disable **host** auto-detect (`ROCM_PATH` then `/opt/rocm` on the host). | (default: scan on) | | `MAD_VERBOSE_CONFIG` | Enable verbose configuration logging | `false` | | `MAD_DOCKERHUB_USER` | Docker Hub username | None | diff --git a/docs/configuration.md b/docs/configuration.md index 4c14931b..d7868715 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -137,11 +137,12 @@ Design rationale and precedence are recorded in [ADR 0001: ROCm path resolution] **Overrides** (recommended for CI): -- **CLI (host only):** `madengine run --rocm-path /path/to/rocm` โ€” same meaning as top-level `MAD_ROCM_PATH` in additional context. -- **Additional context (host):** top-level `"MAD_ROCM_PATH": "/path/to/host/rocm"`. +- **Additional context (host):** top-level `"MAD_ROCM_PATH": "/path/to/host/rocm"` โ€” controls where madengine looks for host GPU tools (`rocminfo`, `amd-smi`, etc.). - **Additional context (container):** `"docker_env_vars": { "MAD_ROCM_PATH": "/path/inside/image" }` โ€” sets the in-container `ROCM_PATH` for Docker runs. If omitted, at `run` time madengine uses the image OCI `Env` (`ROCM_PATH` / `ROCM_HOME`) if present, then an in-container probe, then defaults to `/opt/rocm` (see [ADR 0001](adr/0001-rocm-path-resolution.md)). The host-resolved path is **not** mirrored into the container. -Precedence (host): top-level `MAD_ROCM_PATH` โ†’ `--rocm-path` โ†’ auto-detect (unless disabled) โ†’ `ROCM_PATH` โ†’ `/opt/rocm`. +These two keys are independent, allowing host and container to use different ROCm installations without confusion. + +Precedence (host): top-level `MAD_ROCM_PATH` โ†’ auto-detect (unless disabled) โ†’ `ROCM_PATH` โ†’ `/opt/rocm`. Precedence (container, **local Docker `run`**, **AMD**): `docker_env_vars.MAD_ROCM_PATH` (maps to `ROCM_PATH` for the workload) or explicit `ROCM_PATH` in `docker_env_vars` โ†’ image OCI `Env` (`ROCM_PATH` / `ROCM_HOME`) โ†’ in-image probe โ†’ default `/opt/rocm` with a warning. Implemented in `ContainerRunner.run_container` after the run image is resolved; see [ADR 0001](adr/0001-rocm-path-resolution.md). diff --git a/docs/installation.md b/docs/installation.md index e553537a..6aa3e830 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -83,7 +83,7 @@ madengine run --tags dummy \ --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' ``` -**Non-default ROCm location (host):** If ROCm is not under `/opt/rocm` (e.g. [TheRock](https://github.com/ROCm/TheRock) or pip install), set `ROCM_PATH` on the **host** or use `madengine run --rocm-path /path/to/rocm` so **host** GPU checks (`amd-smi`, etc.) resolve correctly. **In-container** `ROCM_PATH` for Docker workloads is set separately at run (image OCI env, in-image probe, or `docker_env_vars.MAD_ROCM_PATH`); it is not copied from the host. See [Configuration โ€” ROCm path](configuration.md#rocm-path-run-only). +**Non-default ROCm location (host):** If ROCm is not under `/opt/rocm` (e.g. [TheRock](https://github.com/ROCm/TheRock) or pip install), set `ROCM_PATH` on the **host** or set top-level `MAD_ROCM_PATH` in `--additional-context` so **host** GPU checks (`amd-smi`, etc.) resolve correctly. **In-container** `ROCM_PATH` for Docker workloads is set separately at run (image OCI env, in-image probe, or `docker_env_vars.MAD_ROCM_PATH`); it is not copied from the host. See [Configuration โ€” ROCm path](configuration.md#rocm-path-run-only). ### NVIDIA CUDA @@ -140,7 +140,7 @@ rocm-smi ls -la /dev/kfd /dev/dri ``` -If ROCm is installed in a non-default path on the **host** (e.g. TheRock or pip), set `export ROCM_PATH=/path/to/rocm` or use `madengine run --rocm-path /path/to/rocm` (host validation only; see [ROCm path (run only)](configuration.md#rocm-path-run-only) for in-container behavior). +If ROCm is installed in a non-default path on the **host** (e.g. TheRock or pip), set `export ROCM_PATH=/path/to/rocm` or pass `MAD_ROCM_PATH` in `--additional-context` (host validation only; see [ROCm path (run only)](configuration.md#rocm-path-run-only) for in-container behavior). ### MAD Package Not Found diff --git a/docs/usage.md b/docs/usage.md index 653e2d7f..5558479d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -313,16 +313,18 @@ madengine run --tags model \ By default, **madengine** auto-detects the **host** ROCm root (apt under `/opt/rocm`, TheRock `rocm-sdk` layout, etc.). Disable scanning with `MAD_AUTO_ROCM_PATH=0` (then `ROCM_PATH` / `/opt/rocm` only). -**Host** overrides: top-level `MAD_ROCM_PATH` in `--additional-context`, or `--rocm-path` (same meaning as top-level `MAD_ROCM_PATH`). +**Host** override: set top-level `MAD_ROCM_PATH` in `--additional-context` to tell madengine where host GPU tools live (`rocminfo`, `amd-smi`, etc.). **In-container** `ROCM_PATH` (AMD Docker runs) is **not** copied from the host. If you do not set `docker_env_vars.MAD_ROCM_PATH` (or a literal `ROCM_PATH` in `docker_env_vars`), madengine sets it at **run** time from, in order: the image OCI `Env` (`ROCM_PATH` / `ROCM_HOME` via `docker image inspect`), an in-container probe (`docker run --rm`), or `/opt/rocm` with a warning. Override explicitly with `{"docker_env_vars": {"MAD_ROCM_PATH": "/path/inside/image"}}` when the image needs a fixed root. Details: [Configuration โ€” ROCm path](configuration.md#rocm-path-run-only), [ADR 0001](adr/0001-rocm-path-resolution.md). +The two keys are independent โ€” host and container can point to different ROCm installations: + ```bash -# Host path via CLI (alias for top-level MAD_ROCM_PATH) -madengine run --tags model --rocm-path /path/to/host/rocm \ - --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' +# Override host ROCm root only +madengine run --tags model \ + --additional-context '{"MAD_ROCM_PATH": "/path/to/host/rocm", "gpu_vendor": "AMD", "guest_os": "UBUNTU"}' -# Or in additional context (host + optional container) +# Override host and container paths independently madengine run --tags model --additional-context '{ "gpu_vendor": "AMD", "guest_os": "UBUNTU", @@ -331,7 +333,7 @@ madengine run --tags model --additional-context '{ }' ``` -`--rocm-path` applies only to the **run** command (not build). See [Configuration - ROCm path](configuration.md#rocm-path-run-only). +See [Configuration - ROCm path](configuration.md#rocm-path-run-only). ### Run phase environment table @@ -356,7 +358,7 @@ At the start of each container run, madengine prints a side-by-side environment | ROCm / CUDA Root | Resolved via `RocmPathResolver` (host) or `rocm-sdk path --root` (container) | `nvcc` binary location | | ROCm / CUDA Version | From `amd-smi` / `rocm-sdk version` / `.info/version` | From `nvcc --version` / `nvidia-smi` | -The host column uses the same resolution as `--rocm-path` / `MAD_ROCM_PATH`. The container column queries the container's own `PATH` at runtime, so it correctly reflects TheRock images where tools live in a Python venv rather than `/opt/rocm/bin/`. +The host column uses the same resolution as top-level `MAD_ROCM_PATH` in additional context. The container column queries the container's own `PATH` at runtime, so it correctly reflects TheRock images where tools live in a Python venv rather than `/opt/rocm/bin/`. ### Deploy to Kubernetes @@ -649,7 +651,7 @@ madengine build --tags model --clean-docker-cache --verbose | Variable | Description | Example | |----------|-------------|---------| | `MODEL_DIR` | MAD package directory | `/path/to/MAD` | -| `ROCM_PATH` | **Host** ROCm root when not using top-level `MAD_ROCM_PATH` / `--rocm-path`, and (with `MAD_AUTO_ROCM_PATH=0` or as fallback) for host-side detection and tools. In-container `ROCM_PATH` for Docker is set separately at run; see [ROCm path (host and container)](#rocm-path-host-and-container). | `/path/to/rocm` | +| `ROCM_PATH` | **Host** ROCm root fallback when `MAD_ROCM_PATH` is not set in additional context and auto-detect is disabled or finds nothing. In-container `ROCM_PATH` for Docker is set separately at run; see [ROCm path (host and container)](#rocm-path-host-and-container). | `/path/to/rocm` | | `MAD_AUTO_ROCM_PATH` | Set to `0` to disable **host** auto-detect (use `ROCM_PATH` then `/opt/rocm` only on the host). Default: on. | `0` | | `MAD_VERBOSE_CONFIG` | Verbose config logging | `"true"` | | `MAD_DOCKERHUB_USER` | Docker Hub username | `"myusername"` | diff --git a/src/madengine/cli/commands/run.py b/src/madengine/cli/commands/run.py index e1472076..c3a6c4de 100644 --- a/src/madengine/cli/commands/run.py +++ b/src/madengine/cli/commands/run.py @@ -152,13 +152,6 @@ def run( help="Remove intermediate perf_entry files after run (keeps perf.csv and perf_super files)", ), ] = False, - rocm_path: Annotated[ - Optional[str], - typer.Option( - "--rocm-path", - help="Host ROCm root (alias for top-level MAD_ROCM_PATH in --additional-context). Auto-detect when omitted (set MAD_AUTO_ROCM_PATH=0 to disable).", - ), - ] = None, ) -> None: """ ๐Ÿš€ Run model containers in distributed scenarios. @@ -237,7 +230,6 @@ def run( disable_skip_gpu_arch=disable_skip_gpu_arch, verbose=verbose, cleanup_perf=cleanup_perf, - rocm_path=rocm_path, skip_model_run=skip_model_run, _separate_phases=True, ) @@ -342,7 +334,6 @@ def run( disable_skip_gpu_arch=disable_skip_gpu_arch, verbose=verbose, cleanup_perf=cleanup_perf, - rocm_path=rocm_path, skip_model_run=skip_model_run, _separate_phases=False, # Full workflow uses .live.log (not .run.live.log) ) diff --git a/src/madengine/core/constants.py b/src/madengine/core/constants.py index 4371f8d1..d1afa4c9 100644 --- a/src/madengine/core/constants.py +++ b/src/madengine/core/constants.py @@ -199,7 +199,7 @@ def _get_public_github_rocm_key(): def get_rocm_path(override=None): """Return ROCm installation root (legacy, no automatic filesystem scan). - For full resolution (MAD_ROCM_PATH, --rocm-path, auto-detect) use + For full resolution (MAD_ROCM_PATH, auto-detect) use :func:`madengine.utils.rocm_path_resolver.resolve_host_rocm_path` in :class:`~madengine.core.context.Context`. diff --git a/src/madengine/core/context.py b/src/madengine/core/context.py index 24adf16c..46fa065d 100644 --- a/src/madengine/core/context.py +++ b/src/madengine/core/context.py @@ -85,7 +85,6 @@ def __init__( additional_context: str = None, additional_context_file: str = None, build_only_mode: bool = False, - rocm_path: str = None, ) -> None: """Constructor of the Context class. @@ -93,7 +92,6 @@ def __init__( additional_context: The additional context. additional_context_file: The additional context file. build_only_mode: Whether running in build-only mode (no GPU detection). - rocm_path: Host ROCm root (``--rocm-path``), same as top-level ``MAD_ROCM_PATH``. Raises: RuntimeError: If GPU detection fails and not in build-only mode. @@ -132,8 +130,8 @@ def __init__( dict_additional_context = ast.literal_eval(additional_context) update_dict(self.ctx, dict_additional_context) - # Host ROCm path after merge (top-level MAD_ROCM_PATH, then --rocm-path, then auto) - self._rocm_path = resolve_host_rocm_path(self.ctx, rocm_path) + # Host ROCm path: top-level MAD_ROCM_PATH in context, then auto-detect + self._rocm_path = resolve_host_rocm_path(self.ctx) # Initialize context based on mode # User-provided contexts will not be overridden by detection @@ -396,7 +394,7 @@ def get_gpu_vendor(self) -> str: print(f"Warning: nvidia-smi check failed: {e}") # Check AMD - try amd-smi first, fallback to rocm-smi (PR #54) - # Use configurable ROCm path (ROCM_PATH / --rocm-path) for non-default installs + # Use configurable ROCm path (MAD_ROCM_PATH / ROCM_PATH) for non-default installs amd_smi_paths = [ os.path.join(self._rocm_path, "bin", "amd-smi"), "/usr/local/bin/amd-smi", diff --git a/src/madengine/database/mongodb.py b/src/madengine/database/mongodb.py index 0b36ae60..7713e991 100644 --- a/src/madengine/database/mongodb.py +++ b/src/madengine/database/mongodb.py @@ -11,7 +11,7 @@ import logging from abc import ABC, abstractmethod from dataclasses import dataclass, field -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Optional from enum import Enum @@ -303,11 +303,11 @@ def _add_metadata(self, doc: Dict[str, Any]) -> Dict[str, Any]: # Add upload timestamp if not present if f"{prefix}_uploaded_at" not in doc: - doc[f"{prefix}_uploaded_at"] = datetime.utcnow() + doc[f"{prefix}_uploaded_at"] = datetime.now(timezone.utc) # Preserve original created_date if present if "created_date" not in doc: - doc["created_date"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + doc["created_date"] = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") return doc diff --git a/src/madengine/orchestration/run_orchestrator.py b/src/madengine/orchestration/run_orchestrator.py index 56170033..4a3b3f8d 100644 --- a/src/madengine/orchestration/run_orchestrator.py +++ b/src/madengine/orchestration/run_orchestrator.py @@ -115,7 +115,6 @@ def _init_runtime_context(self): self.context = Context( additional_context=context_string, build_only_mode=False, - rocm_path=getattr(self.args, "rocm_path", None), ) # Initialize data provider if data config exists @@ -424,7 +423,6 @@ def _create_manifest_from_local_image( build_context = Context( additional_context=context_string, build_only_mode=True, - rocm_path=getattr(self.args, "rocm_path", None), ) # Create manifest structure diff --git a/src/madengine/utils/rocm_path_resolver.py b/src/madengine/utils/rocm_path_resolver.py index e82433bf..291660b3 100644 --- a/src/madengine/utils/rocm_path_resolver.py +++ b/src/madengine/utils/rocm_path_resolver.py @@ -16,10 +16,9 @@ **Host** overrides (precedence): 1. Top-level ``MAD_ROCM_PATH`` in additional context. -2. ``--rocm-path`` (CLI) โ€” host-only; same meaning as (1). -3. Auto-detection (when ``MAD_AUTO_ROCM_PATH`` is not ``"0"``). -4. ``ROCM_PATH`` environment variable. -5. Default ``/opt/rocm``. +2. Auto-detection (when ``MAD_AUTO_ROCM_PATH`` is not ``"0"``). +3. ``ROCM_PATH`` environment variable. +4. Default ``/opt/rocm``. **Container** (``docker_env_vars``) โ€” set when the run uses Docker (``run_container``): @@ -232,7 +231,6 @@ def get_rocm_path_legacy(override: Optional[str] = None) -> str: def resolve_host_rocm_path( ctx: Optional[Dict[str, Any]] = None, - cli_rocm_path: Optional[str] = None, ) -> str: """ Resolve the host ROCm installation root for madengine (GPU tools, validation). @@ -243,8 +241,6 @@ def resolve_host_rocm_path( mad = ctx.get(MAD_ROCM_PATH) if isinstance(mad, str) and mad.strip(): return normalize_rocm_path(mad) - if cli_rocm_path and str(cli_rocm_path).strip(): - return normalize_rocm_path(str(cli_rocm_path).strip()) if os.environ.get("MAD_AUTO_ROCM_PATH", "1").strip() == "0": return get_rocm_path_legacy(None) diff --git a/tests/unit/test_rocm_path.py b/tests/unit/test_rocm_path.py index 65f03e59..4ce8e86b 100644 --- a/tests/unit/test_rocm_path.py +++ b/tests/unit/test_rocm_path.py @@ -1,5 +1,5 @@ """ -Unit tests for ROCm path (ROCM_PATH / --rocm-path) support. +Unit tests for ROCm path (ROCM_PATH / MAD_ROCM_PATH) support. Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ @@ -54,12 +54,21 @@ def test_get_rocm_path_env(self, monkeypatch): class TestContextRocmPath: """Test Context stores and uses rocm_path.""" - def test_context_build_only_stores_rocm_path(self): - """Context with build_only_mode=True and rocm_path sets _rocm_path.""" + def test_context_build_only_default_rocm_path(self): + """Context with build_only_mode=True resolves _rocm_path via auto-detect or default.""" from madengine.core.context import Context - ctx = Context(build_only_mode=True, rocm_path="/opt/rocm") - assert ctx._rocm_path == "/opt/rocm" + ctx = Context(build_only_mode=True) + assert os.path.isabs(ctx._rocm_path) + + def test_context_build_only_mad_rocm_path(self): + """Context with build_only_mode=True and top-level MAD_ROCM_PATH sets _rocm_path.""" + from madengine.core.context import Context + from madengine.utils.rocm_path_resolver import normalize_rocm_path + + ac = repr({MAD_ROCM_PATH: "/opt/rocm-test"}) + ctx = Context(build_only_mode=True, additional_context=ac) + assert ctx._rocm_path == normalize_rocm_path("/opt/rocm-test") def test_context_runtime_includes_rocm_path_in_ctx(self): """Context stores host rocm_path; in-container ROCM_PATH is set at run time.""" @@ -67,6 +76,7 @@ def test_context_runtime_includes_rocm_path_in_ctx(self): from unittest.mock import patch from madengine.utils.rocm_path_resolver import normalize_rocm_path + ac = repr({MAD_ROCM_PATH: "/my/rocm"}) with patch.object(Context, "get_gpu_vendor", return_value="AMD"), \ patch.object(Context, "get_system_ngpus", return_value=2), \ patch.object(Context, "get_system_gpu_architecture", return_value="gfx90a"), \ @@ -74,20 +84,21 @@ def test_context_runtime_includes_rocm_path_in_ctx(self): patch.object(Context, "get_system_hip_version", return_value="5.4"), \ patch.object(Context, "get_docker_gpus", return_value="0-1"), \ patch.object(Context, "get_gpu_renderD_nodes", return_value=None): - ctx = Context(rocm_path="/my/rocm") + ctx = Context(additional_context=ac) exp = normalize_rocm_path("/my/rocm") assert ctx.ctx.get("rocm_path") == exp # Host path is not mirrored into docker_env_vars.ROCM_PATH at init. assert ctx.ctx["docker_env_vars"].get("ROCM_PATH") in (None, "") def test_context_container_mad_overrides_host(self): - """docker_env_vars MAD_ROCM_PATH sets in-container ROCM_PATH (not host path).""" + """docker_env_vars MAD_ROCM_PATH sets in-container ROCM_PATH independently.""" from madengine.core.context import Context from unittest.mock import patch from madengine.utils.rocm_path_resolver import normalize_rocm_path ac = repr( { + MAD_ROCM_PATH: "/on/host", "docker_env_vars": {MAD_ROCM_PATH: "/in/image"}, } ) @@ -98,10 +109,7 @@ def test_context_container_mad_overrides_host(self): patch.object(Context, "get_system_hip_version", return_value="5.4"), \ patch.object(Context, "get_docker_gpus", return_value="0-1"), \ patch.object(Context, "get_gpu_renderD_nodes", return_value=None): - ctx = Context( - additional_context=ac, - rocm_path="/on/host", - ) + ctx = Context(additional_context=ac) assert ctx._rocm_path == normalize_rocm_path("/on/host") assert ctx.ctx["docker_env_vars"].get("ROCM_PATH") == normalize_rocm_path( "/in/image" @@ -124,44 +132,24 @@ def test_rocm_tool_manager_paths_under_rocm_path(self): assert manager.ROCM_VERSION_FILE == "/custom/rocm/.info/version" -@pytest.mark.unit -class TestRunCommandRocmPath: - """Test run command exposes --rocm-path.""" - - def test_run_help_includes_rocm_path(self): - """madengine run --help mentions --rocm-path.""" - from typer.testing import CliRunner - from madengine.cli import app - - runner = CliRunner() - result = runner.invoke(app, ["run", "--help"]) - assert result.exit_code == 0 - assert "--rocm-path" in result.output - - @pytest.mark.unit class TestResolveHostContainerRocmPath: """Test madengine rocm_path_resolver (MAD_ROCM_PATH / auto).""" - def test_resolve_host_mad_takes_precedence_over_cli(self): - """Top-level MAD_ROCM_PATH in context wins over --rocm-path.""" - p = resolve_host_rocm_path({MAD_ROCM_PATH: "/from/context"}, "/from/cli") + def test_resolve_host_mad_context_takes_precedence(self): + """Top-level MAD_ROCM_PATH in context is used for host path.""" + p = resolve_host_rocm_path({MAD_ROCM_PATH: "/from/context"}) assert os.path.isabs(p) assert p == os.path.abspath("/from/context").rstrip(os.sep) - def test_resolve_host_cli_when_no_mad(self): - """--rocm-path (cli) is used when MAD_ROCM_PATH is absent.""" - p = resolve_host_rocm_path({}, "/only/cli") - assert p == os.path.abspath("/only/cli").rstrip(os.sep) - def test_resolve_host_legacy_when_auto_off(self, monkeypatch): """MAD_AUTO_ROCM_PATH=0 uses ROCM_PATH env then default.""" monkeypatch.setenv("MAD_AUTO_ROCM_PATH", "0") monkeypatch.delenv("ROCM_PATH", raising=False) - p = resolve_host_rocm_path({}, None) + p = resolve_host_rocm_path({}) assert p == os.path.abspath("/opt/rocm").rstrip(os.sep) monkeypatch.setenv("ROCM_PATH", "/env/ro") - p2 = resolve_host_rocm_path({}, None) + p2 = resolve_host_rocm_path({}) assert p2 == os.path.abspath("/env/ro").rstrip(os.sep) def test_resolve_container_mad_consumes_key(self): From aac1c47f11c42d047b35b35b25ac0b86bf38c05b Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 24 Apr 2026 16:33:38 -0500 Subject: [PATCH 8/9] fix: address Copilot review comments in PR #110 - validators.py: restrict docker_env_vars.MAD_ROCM_PATH to str|None only (was allowing int/float, inconsistent with error message) - rocm_path_resolver.py: always pop MAD_ROCM_PATH from docker_env even when None, preventing the key from leaking into docker run env - rocm_path_resolver.py: add bin/rocm-smi to _CONTAINER_PROBE_SH try_root() predicate to match Python-side looks_like_rocm_root() - container_runner.py: clear ROCM_PATH before finalize_container_rocm_path each run so multi-model runs re-resolve per docker_image instead of reusing image 1's path for all subsequent images - csv_parser.py: shlex.quote() the rocm-smi path before shell interpolation; only increment num_gpu after successful NVIDIA line parse (git-ignored path, committed separately if needed) - README.md: replace curly/smart quotes with ASCII quotes in bash examples - CHANGELOG.md: remove stale --rocm-path alias reference (flag was removed); remove broken docs/adr/ link Co-Authored-By: Claude Sonnet 4 --- CHANGELOG.md | 2 +- README.md | 6 +++--- src/madengine/cli/validators.py | 2 +- src/madengine/execution/container_runner.py | 6 ++++++ .../scripts/common/pre_scripts/rocEnvTool/csv_parser.py | 5 +++-- src/madengine/utils/rocm_path_resolver.py | 8 +++++--- 6 files changed, 19 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4cbb7266..1ad08fa2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- **ROCm path**: Auto-detect host ROCm root by default (traditional `/opt/rocm`, versioned `/opt/rocm-*`, TheRock `rocm-sdk` + markers, `rocminfo` / `amd-smi` / `rocm-smi` on `PATH`). **Host** overrides: top-level `MAD_ROCM_PATH` in `--additional-context`, then `--rocm-path` (alias). **Container** `ROCM_PATH` for Docker runs (AMD): `docker_env_vars.MAD_ROCM_PATH` if set; else `ROCM_PATH` / `ROCM_HOME` from the image OCI config (`docker image inspect`); else an in-image shell probe (`docker run --rm`); else default `/opt/rocm` with a warning. The host-resolved path is no longer mirrored into the container by default. Set `MAD_AUTO_ROCM_PATH=0` to use legacy host `ROCM_PATH` / `/opt/rocm` only without scanning. Code: `madengine.utils.rocm_path_resolver`, shared TheRock file markers in `madengine.utils.therock_markers`. **Design doc**: [ADR 0001](docs/adr/0001-rocm-path-resolution.md). +- **ROCm path**: Auto-detect host ROCm root by default (traditional `/opt/rocm`, versioned `/opt/rocm-*`, TheRock `rocm-sdk` + markers, `rocminfo` / `amd-smi` / `rocm-smi` on `PATH`). **Host** overrides: top-level `MAD_ROCM_PATH` in `--additional-context`. **Container** `ROCM_PATH` for Docker runs (AMD): `docker_env_vars.MAD_ROCM_PATH` if set; else `ROCM_PATH` / `ROCM_HOME` from the image OCI config (`docker image inspect`); else an in-image shell probe (`docker run --rm`); else default `/opt/rocm` with a warning. The host-resolved path is no longer mirrored into the container by default. Set `MAD_AUTO_ROCM_PATH=0` to use legacy host `ROCM_PATH` / `/opt/rocm` only without scanning. Code: `madengine.utils.rocm_path_resolver`, shared TheRock file markers in `madengine.utils.therock_markers`. - **Profiling**: `rocm_trace_lite` now sets `RTL_MODE=lite` explicitly; added tool `rocm_trace_lite_default` with `RTL_MODE=default` for A/B overhead comparison. `rtl_trace_wrapper.sh` passes `rtl trace --mode โ€ฆ` when `RTL_MODE` is set. ## [2.0.1] - 2026-04-23 diff --git a/README.md b/README.md index 43a3764e..ea3fbbb5 100644 --- a/README.md +++ b/README.md @@ -71,14 +71,14 @@ madengine run --tags dummy \ > **Note**: For build operations, `gpu_vendor` defaults to `AMD` and `guest_os` defaults to `UBUNTU` if not specified. For production deployments or non-AMD/Ubuntu environments, explicitly specify these values. -If auto-detection does not find your **host** ROCm root, set top-level `MAD_ROCM_PATH` in `--additional-context`. For a different ROCm root **inside the container**, set `docker_env_vars.MAD_ROCM_PATH` in additional context. If you omit it, madengine derives in-container `ROCM_PATH` when running Docker (from the imageโ€™s baked-in env, then an in-container probe, then `/opt/rocm` โ€” it does **not** copy the host path). You can also set `ROCM_PATH` / `MAD_AUTO_ROCM_PATH=0` for **host** behavior as documented in [docs/configuration.md](docs/configuration.md): +If auto-detection does not find your **host** ROCm root, set top-level `MAD_ROCM_PATH` in `--additional-context`. For a different ROCm root **inside the container**, set `docker_env_vars.MAD_ROCM_PATH` in additional context. If you omit it, madengine derives in-container `ROCM_PATH` when running Docker (from the image's baked-in env, then an in-container probe, then `/opt/rocm` โ€” it does **not** copy the host path). You can also set `ROCM_PATH` / `MAD_AUTO_ROCM_PATH=0` for **host** behavior as documented in [docs/configuration.md](docs/configuration.md): ```bash # Override host ROCm root: -madengine run --tags dummy --additional-context โ€˜{"MAD_ROCM_PATH": "/path/to/rocm"}โ€™ +madengine run --tags dummy --additional-context '{"MAD_ROCM_PATH": "/path/to/rocm"}' # or: export ROCM_PATH=/path/to/rocm && madengine run --tags dummy # Override in-container ROCm root independently: -madengine run --tags dummy --additional-context โ€˜{"docker_env_vars": {"MAD_ROCM_PATH": "/path/in/container"}}โ€™ +madengine run --tags dummy --additional-context '{"docker_env_vars": {"MAD_ROCM_PATH": "/path/in/container"}}' ``` **Results:** Performance data is written to `perf.csv` (and optionally `perf_entry.csv`). The file is created automatically if missing. Failed runs (including pre-run setup failures) are recorded with status `FAILURE` so every attempted model appears in the table. See [Exit Codes](docs/cli-reference.md#exit-codes) for CI/script usage. diff --git a/src/madengine/cli/validators.py b/src/madengine/cli/validators.py index 0d36ac21..8fdb1ae4 100644 --- a/src/madengine/cli/validators.py +++ b/src/madengine/cli/validators.py @@ -127,7 +127,7 @@ def validate_additional_context_structure(context: Dict[str, Any]) -> None: dev = context.get("docker_env_vars") if isinstance(dev, dict) and "MAD_ROCM_PATH" in dev: v = dev["MAD_ROCM_PATH"] - if not isinstance(v, (str, int, float, type(None))): + if not isinstance(v, (str, type(None))): _fail_structure( "docker_env_vars['MAD_ROCM_PATH']", "a string (container ROCm root override)", diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 3d0d21e6..21352de9 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -1059,6 +1059,12 @@ def run_container( ) != -1: from madengine.utils.rocm_path_resolver import finalize_container_rocm_path + # Clear any ROCM_PATH left from a previous model run so finalize always + # re-resolves per docker_image (OCI config โ†’ probe โ†’ default). + # If the user supplied MAD_ROCM_PATH via additional_context it was merged + # into docker_env_vars just above and finalize will consume it correctly. + self.context.ctx["docker_env_vars"].pop("ROCM_PATH", None) + finalize_container_rocm_path( self.context.ctx["docker_env_vars"], docker_image, diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py b/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py index fd8f66fc..7c1599ab 100644 --- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/csv_parser.py @@ -3,6 +3,7 @@ Copyright (c) Advanced Micro Devices, Inc. All rights reserved. """ import os +import shlex import shutil from console import Console @@ -36,7 +37,7 @@ def determine_gpu_device_type(self): # Use PATH resolution first so TheRock images (where rocm-smi lives # under a Python venv, not /opt/rocm/bin/) are handled correctly. rocm_smi_cmd = shutil.which("rocm-smi") or "/opt/rocm/bin/rocm-smi" - rocm_smi_out = console.sh(f"{rocm_smi_cmd} || true") + rocm_smi_out = console.sh(shlex.quote(rocm_smi_cmd) + " || true") nv_smi_out = console.sh("nvidia-smi -L || true") if "not found" not in rocm_smi_out and "No such file" not in rocm_smi_out: gpu_device_type = "AMD" @@ -119,10 +120,10 @@ def dump_gpu_information_in_csv(self, gpu_log_path, device_type): for j in range(1, len(lines)): line = lines[j].rstrip() if "GPU" in line: - num_gpu += 1 values = line.split(":") if len(values) < 3: continue + num_gpu += 1 name = values[1].split("(UUID")[0] uuid = values[2] info_list.append("Name|" + name) diff --git a/src/madengine/utils/rocm_path_resolver.py b/src/madengine/utils/rocm_path_resolver.py index 291660b3..f62e73c6 100644 --- a/src/madengine/utils/rocm_path_resolver.py +++ b/src/madengine/utils/rocm_path_resolver.py @@ -257,7 +257,7 @@ def resolve_host_rocm_path( _CONTAINER_PROBE_SH = r"""try_root() { d="$1" [ -d "$d" ] || return 1 - { [ -f "$d/bin/rocminfo" ] || [ -f "$d/bin/amd-smi" ] || [ -f "$d/share/therock/therock_manifest.json" ] || [ -f "$d/share/therock/dist_info.json" ]; } && return 0 + { [ -f "$d/bin/rocminfo" ] || [ -f "$d/bin/amd-smi" ] || [ -f "$d/bin/rocm-smi" ] || [ -f "$d/share/therock/therock_manifest.json" ] || [ -f "$d/share/therock/dist_info.json" ]; } && return 0 return 1 } if command -v rocm-sdk >/dev/null 2>&1; then @@ -387,12 +387,14 @@ def apply_container_rocm_path_overrides( Returns the resolved path if one was set, else ``None``. """ d = docker_env - if MAD_ROCM_PATH in d and d[MAD_ROCM_PATH] is not None: - s = d.pop(MAD_ROCM_PATH) + if MAD_ROCM_PATH in d: + s = d.pop(MAD_ROCM_PATH) # always consume, even when None if isinstance(s, (int, float)): s = str(s) if not isinstance(s, str) or not str(s).strip(): s = host_path + if not s: + return None croot = normalize_rocm_path(str(s).strip()) elif d.get("ROCM_PATH") not in (None, ""): croot = normalize_rocm_path(str(d["ROCM_PATH"]).strip()) From a5c0850b3ce2cde6d097a13568941e1df09eabf5 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 24 Apr 2026 20:50:11 -0500 Subject: [PATCH 9/9] fix(rocm): remove ADR links, dead code, and duplicated path resolution logic - Remove broken docs/adr/ links from configuration.md, usage.md, rocm_path_resolver.py docstring, and therock_markers.py comment - Remove resolve_container_rocm_path() (thin wrapper with no unique logic) - Remove early apply_container_rocm_path_overrides() call in init_gpu_context: it set ROCM_PATH only for run_container to immediately pop it; container ROCM_PATH finalization now happens exclusively in finalize_container_rocm_path - Fix auto_detect() inner loop: first iteration was a duplicate _looks_like_rocm_root(root) check already done before the loop; walk to root.parent first, then check - Silence grep -oP stderr in container ROCm version command (BusyBox images) - Guard whitespace-only ROCM_PATH in apply_container_rocm_path_overrides - Update tests to match: MAD_ROCM_PATH preserved in docker_env_vars at context init, consumed at run time by finalize_container_rocm_path All 383 unit tests pass. Co-Authored-By: Claude Sonnet 4 --- docs/configuration.md | 6 ++-- docs/usage.md | 2 +- src/madengine/core/context.py | 13 +++---- src/madengine/execution/container_runner.py | 7 ++-- src/madengine/utils/rocm_path_resolver.py | 38 ++++++--------------- src/madengine/utils/therock_markers.py | 2 +- tests/unit/test_rocm_path.py | 14 ++++---- 7 files changed, 29 insertions(+), 53 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index d7868715..034ac6d8 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -131,20 +131,18 @@ Disabling the scan does **not** change performance metric extraction from the lo ### ROCm path (run only) -Design rationale and precedence are recorded in [ADR 0001: ROCm path resolution](adr/0001-rocm-path-resolution.md). - **Host** (where `madengine` runs validation): by default, the ROCm root is **auto-detected** (traditional `/opt/rocm`, [TheRock](https://github.com/ROCm/TheRock) `rocm-sdk` / manifest layout, or `ROCM_PATH`-like env hints). Set `MAD_AUTO_ROCM_PATH=0` to skip auto and use only legacy resolution (`ROCM_PATH` then `/opt/rocm`). **Overrides** (recommended for CI): - **Additional context (host):** top-level `"MAD_ROCM_PATH": "/path/to/host/rocm"` โ€” controls where madengine looks for host GPU tools (`rocminfo`, `amd-smi`, etc.). -- **Additional context (container):** `"docker_env_vars": { "MAD_ROCM_PATH": "/path/inside/image" }` โ€” sets the in-container `ROCM_PATH` for Docker runs. If omitted, at `run` time madengine uses the image OCI `Env` (`ROCM_PATH` / `ROCM_HOME`) if present, then an in-container probe, then defaults to `/opt/rocm` (see [ADR 0001](adr/0001-rocm-path-resolution.md)). The host-resolved path is **not** mirrored into the container. +- **Additional context (container):** `"docker_env_vars": { "MAD_ROCM_PATH": "/path/inside/image" }` โ€” sets the in-container `ROCM_PATH` for Docker runs. If omitted, at `run` time madengine uses the image OCI `Env` (`ROCM_PATH` / `ROCM_HOME`) if present, then an in-container probe, then defaults to `/opt/rocm`. The host-resolved path is **not** mirrored into the container. These two keys are independent, allowing host and container to use different ROCm installations without confusion. Precedence (host): top-level `MAD_ROCM_PATH` โ†’ auto-detect (unless disabled) โ†’ `ROCM_PATH` โ†’ `/opt/rocm`. -Precedence (container, **local Docker `run`**, **AMD**): `docker_env_vars.MAD_ROCM_PATH` (maps to `ROCM_PATH` for the workload) or explicit `ROCM_PATH` in `docker_env_vars` โ†’ image OCI `Env` (`ROCM_PATH` / `ROCM_HOME`) โ†’ in-image probe โ†’ default `/opt/rocm` with a warning. Implemented in `ContainerRunner.run_container` after the run image is resolved; see [ADR 0001](adr/0001-rocm-path-resolution.md). +Precedence (container, **local Docker `run`**, **AMD**): `docker_env_vars.MAD_ROCM_PATH` (maps to `ROCM_PATH` for the workload) or explicit `ROCM_PATH` in `docker_env_vars` โ†’ image OCI `Env` (`ROCM_PATH` / `ROCM_HOME`) โ†’ in-image probe โ†’ default `/opt/rocm` with a warning. Implemented in `ContainerRunner.run_container` after the run image is resolved. This applies to the run phase; build uses build-only context (no GPU detection) but still honors `MAD_ROCM_PATH` in context when set. diff --git a/docs/usage.md b/docs/usage.md index 5558479d..010a6b6c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -315,7 +315,7 @@ By default, **madengine** auto-detects the **host** ROCm root (apt under `/opt/r **Host** override: set top-level `MAD_ROCM_PATH` in `--additional-context` to tell madengine where host GPU tools live (`rocminfo`, `amd-smi`, etc.). -**In-container** `ROCM_PATH` (AMD Docker runs) is **not** copied from the host. If you do not set `docker_env_vars.MAD_ROCM_PATH` (or a literal `ROCM_PATH` in `docker_env_vars`), madengine sets it at **run** time from, in order: the image OCI `Env` (`ROCM_PATH` / `ROCM_HOME` via `docker image inspect`), an in-container probe (`docker run --rm`), or `/opt/rocm` with a warning. Override explicitly with `{"docker_env_vars": {"MAD_ROCM_PATH": "/path/inside/image"}}` when the image needs a fixed root. Details: [Configuration โ€” ROCm path](configuration.md#rocm-path-run-only), [ADR 0001](adr/0001-rocm-path-resolution.md). +**In-container** `ROCM_PATH` (AMD Docker runs) is **not** copied from the host. If you do not set `docker_env_vars.MAD_ROCM_PATH` (or a literal `ROCM_PATH` in `docker_env_vars`), madengine sets it at **run** time from, in order: the image OCI `Env` (`ROCM_PATH` / `ROCM_HOME` via `docker image inspect`), an in-container probe (`docker run --rm`), or `/opt/rocm` with a warning. Override explicitly with `{"docker_env_vars": {"MAD_ROCM_PATH": "/path/inside/image"}}` when the image needs a fixed root. Details: [Configuration โ€” ROCm path](configuration.md#rocm-path-run-only). The two keys are independent โ€” host and container can point to different ROCm installations: diff --git a/src/madengine/core/context.py b/src/madengine/core/context.py index 46fa065d..d4c97005 100644 --- a/src/madengine/core/context.py +++ b/src/madengine/core/context.py @@ -22,10 +22,7 @@ # third-party modules from madengine.core.console import Console -from madengine.utils.rocm_path_resolver import ( - apply_container_rocm_path_overrides, - resolve_host_rocm_path, -) +from madengine.utils.rocm_path_resolver import resolve_host_rocm_path from madengine.utils.gpu_validator import validate_rocm_installation, GPUInstallationError, GPUVendor from madengine.utils.gpu_tool_factory import get_gpu_tool_manager from madengine.utils.gpu_tool_manager import BaseGPUToolManager @@ -258,11 +255,9 @@ def init_gpu_context(self) -> None: self.ctx["docker_env_vars"]["MAD_GPU_VENDOR"] = self.ctx["gpu_vendor"] self.ctx["rocm_path"] = self._rocm_path - # In-container ``ROCM_PATH`` for Docker is finalized in ``ContainerRunner.run_container`` - # (OCI env + in-image probe). Here we only map ``MAD_ROCM_PATH`` / user ``ROCM_PATH``. - apply_container_rocm_path_overrides( - self.ctx["docker_env_vars"], self._rocm_path - ) + # In-container ROCM_PATH is finalized at run time in run_container + # via finalize_container_rocm_path (OCI env โ†’ probe โ†’ default). + # MAD_ROCM_PATH in docker_env_vars is consumed there, not here. if "MAD_SYSTEM_NGPUS" not in self.ctx["docker_env_vars"]: self.ctx["docker_env_vars"][ diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 21352de9..4d831728 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -113,7 +113,7 @@ def _sh(cmd: str) -> str: ctr_rocm_ver = _sh( "rocm-sdk version 2>/dev/null " "|| cat \"${ROCM_PATH:-/opt/rocm}/.info/version\" 2>/dev/null " - "|| rocminfo 2>/dev/null | grep -i 'ROCm Version' | head -n1 | grep -oP '[\\d.]+' " + "|| rocminfo 2>/dev/null | grep -i 'ROCm Version' | head -n1 | grep -oP '[\\d.]+' 2>/dev/null " "|| echo unknown" ) @@ -1061,8 +1061,9 @@ def run_container( # Clear any ROCM_PATH left from a previous model run so finalize always # re-resolves per docker_image (OCI config โ†’ probe โ†’ default). - # If the user supplied MAD_ROCM_PATH via additional_context it was merged - # into docker_env_vars just above and finalize will consume it correctly. + # ROCM_PATH is program-managed; users override via MAD_ROCM_PATH in + # additional_context.docker_env_vars, which was merged just above and + # will be consumed by finalize_container_rocm_path correctly. self.context.ctx["docker_env_vars"].pop("ROCM_PATH", None) finalize_container_rocm_path( diff --git a/src/madengine/utils/rocm_path_resolver.py b/src/madengine/utils/rocm_path_resolver.py index f62e73c6..acf9ac07 100644 --- a/src/madengine/utils/rocm_path_resolver.py +++ b/src/madengine/utils/rocm_path_resolver.py @@ -6,7 +6,7 @@ compatibility. The resolution pipeline is **not** the historical ``rocEnvTool``/v1 ``RocmPathResolver`` from madenginev1; it reuses the same *ideas* in a self-contained helper for madengine (versioned ``/opt/rocm-*``, TheRock markers, ``PATH`` tools). -See **ADR 0001** in ``docs/adr/0001-rocm-path-resolution.md`` for the full design. +See the module docstring and inline comments for design details. Versioned installs (e.g. ``/opt/rocm-7.13.0`` with ``bin/amd-smi`` and ``bin/rocm-smi``) are detected after ``/opt/rocm``. ``which(amd-smi)`` / ``which(rocm-smi)`` infer @@ -22,16 +22,14 @@ **Container** (``docker_env_vars``) โ€” set when the run uses Docker (``run_container``): -* ``MAD_ROCM_PATH`` โ€” in-image ROCm root; the key is consumed and mapped to ``ROCM_PATH``. -* If the user set ``ROCM_PATH`` in ``docker_env_vars`` in additional context, it is kept - (normalized) until the run; it wins over OCI / probe in :func:`finalize_container_rocm_path` - if already present after :func:`apply_container_rocm_path_overrides`. +* ``MAD_ROCM_PATH`` in ``docker_env_vars`` โ€” in-image ROCm root; the key is consumed and + mapped to ``ROCM_PATH`` inside the container. ``ROCM_PATH`` itself is program-managed + and should not be set directly by users. * Full in-container resolution order is implemented in - :func:`finalize_container_rocm_path` (not host mirroring): (1) existing ``ROCM_PATH`` - in ``docker_env`` after overrides, (2) ``ROCM_PATH`` / ``ROCM_HOME`` from - ``docker image inspect`` OCI ``Config.Env``, (3) ``docker run --rm`` with an - in-image shell probe aligned with :class:`RocmPathResolver` heuristics, (4) default - ``/opt/rocm`` with a warning. + :func:`finalize_container_rocm_path` (no host mirroring): (1) ``MAD_ROCM_PATH`` override, + (2) ``ROCM_PATH`` / ``ROCM_HOME`` from ``docker image inspect`` OCI ``Config.Env``, + (3) ``docker run --rm`` with an in-image shell probe aligned with + :class:`RocmPathResolver` heuristics, (4) default ``/opt/rocm`` with a warning. """ from __future__ import annotations @@ -172,11 +170,11 @@ def auto_detect(self) -> OptionalPathStr: if m._looks_like_rocm_root(root): return normalize_rocm_path(str(root)) for _ in range(4): - if m._looks_like_rocm_root(root): - return normalize_rocm_path(str(root)) if root == root.parent: break root = root.parent + if m._looks_like_rocm_root(root): + return normalize_rocm_path(str(root)) candidates: List[Path] = [ Path("/usr/local/rocm"), @@ -396,7 +394,7 @@ def apply_container_rocm_path_overrides( if not s: return None croot = normalize_rocm_path(str(s).strip()) - elif d.get("ROCM_PATH") not in (None, ""): + elif d.get("ROCM_PATH") not in (None, "") and str(d.get("ROCM_PATH", "")).strip(): croot = normalize_rocm_path(str(d["ROCM_PATH"]).strip()) else: return None @@ -468,17 +466,3 @@ def finalize_container_rocm_path( return croot -def resolve_container_rocm_path(docker_env: Dict[str, Any], host_path: str) -> str: - """ - Apply :func:`apply_container_rocm_path_overrides` only (used from - :class:`~madengine.core.context.Context` init). Does not mirror the host - or run Docker inspect/probe. Run-time finalization is - :func:`finalize_container_rocm_path`. - - For backward compatibility, returns the in-``docker_env`` ``ROCM_PATH`` if set, - else an empty string. - """ - out = apply_container_rocm_path_overrides(docker_env, host_path) - if out is not None: - return out - return "" diff --git a/src/madengine/utils/therock_markers.py b/src/madengine/utils/therock_markers.py index a7c94dba..fc791f96 100644 --- a/src/madengine/utils/therock_markers.py +++ b/src/madengine/utils/therock_markers.py @@ -16,7 +16,7 @@ from pathlib import Path from typing import Final -# Relative to a putative ROCm / TheRock install root; keep in sync with ADR 0001. +# Relative to a putative ROCm / TheRock install root. THEROCK_SHARE_DIR: Final = Path("share") / "therock" THEROCK_MANIFEST_RELPATH: Final = THEROCK_SHARE_DIR / "therock_manifest.json" THEROCK_DIST_INFO_RELPATH: Final = THEROCK_SHARE_DIR / "dist_info.json" diff --git a/tests/unit/test_rocm_path.py b/tests/unit/test_rocm_path.py index 4ce8e86b..e89b5ed8 100644 --- a/tests/unit/test_rocm_path.py +++ b/tests/unit/test_rocm_path.py @@ -16,7 +16,6 @@ finalize_container_rocm_path, get_rocm_path_legacy, normalize_rocm_path, - resolve_container_rocm_path, resolve_host_rocm_path, ) @@ -91,7 +90,7 @@ def test_context_runtime_includes_rocm_path_in_ctx(self): assert ctx.ctx["docker_env_vars"].get("ROCM_PATH") in (None, "") def test_context_container_mad_overrides_host(self): - """docker_env_vars MAD_ROCM_PATH sets in-container ROCM_PATH independently.""" + """docker_env_vars MAD_ROCM_PATH is preserved at context init; consumed at run time.""" from madengine.core.context import Context from unittest.mock import patch from madengine.utils.rocm_path_resolver import normalize_rocm_path @@ -111,10 +110,10 @@ def test_context_container_mad_overrides_host(self): patch.object(Context, "get_gpu_renderD_nodes", return_value=None): ctx = Context(additional_context=ac) assert ctx._rocm_path == normalize_rocm_path("/on/host") - assert ctx.ctx["docker_env_vars"].get("ROCM_PATH") == normalize_rocm_path( - "/in/image" - ) - assert MAD_ROCM_PATH not in ctx.ctx["docker_env_vars"] + # ROCM_PATH is set at run time by finalize_container_rocm_path, not at context init. + assert ctx.ctx["docker_env_vars"].get("ROCM_PATH") is None + # MAD_ROCM_PATH stays in docker_env_vars until consumed by finalize at run time. + assert ctx.ctx["docker_env_vars"].get(MAD_ROCM_PATH) == "/in/image" @pytest.mark.unit @@ -155,7 +154,7 @@ def test_resolve_host_legacy_when_auto_off(self, monkeypatch): def test_resolve_container_mad_consumes_key(self): d = {MAD_ROCM_PATH: "/in/container"} host = "/on/host" - r = resolve_container_rocm_path(d, host) + r = apply_container_rocm_path_overrides(d, host) assert r == os.path.abspath("/in/container").rstrip(os.sep) assert d.get("ROCM_PATH") == r assert MAD_ROCM_PATH not in d @@ -165,7 +164,6 @@ def test_apply_overrides_does_not_mirror_host(self): r = apply_container_rocm_path_overrides(d, "/hostpath") assert r is None assert "ROCM_PATH" not in d - assert resolve_container_rocm_path(d, "/hostpath") == "" def test_get_rocm_path_legacy_alias(self, monkeypatch): monkeypatch.delenv("ROCM_PATH", raising=False)