From 78b51107efca09f7c26f6227934d7b7afe3b8d6f Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Thu, 23 Apr 2026 22:56:56 -0500 Subject: [PATCH] refactor(gpu-arch): auto-detect MAD_SYSTEM_GPU_ARCHITECTURE for local full-run mode In full workflow (madengine run --tags), the build phase ran with build_only_mode=True and skipped GPU detection, leaving Dockerfiles that declare ARG MAD_SYSTEM_GPU_ARCHITECTURE without a default built with an empty value. Users had to manually pass --additional-context every time. - Context.__init__: add detect_local_gpu_arch param (default False), thread it to init_build_context() - Context.init_build_context: add detect_gpu_arch param; when True, reuse detect_gpu_vendor() + get_gpu_tool_manager() + normalize_architecture_name() to detect and inject MAD_SYSTEM_GPU_ARCHITECTURE into docker_build_arg before the build; user-provided value is never overridden; fails gracefully - BuildOrchestrator.__init__: accept and forward detect_local_gpu_arch to Context; add resolved-arch confirmation print in execute() - RunOrchestrator._build_phase: pass detect_local_gpu_arch=True so only the full-workflow build path auto-detects; standalone madengine build is unaffected (flag defaults to False) Co-Authored-By: Claude Sonnet 4 --- src/madengine/core/context.py | 37 ++++++++++++++++--- .../orchestration/build_orchestrator.py | 17 ++++++++- .../orchestration/run_orchestrator.py | 11 +++++- 3 files changed, 57 insertions(+), 8 deletions(-) diff --git a/src/madengine/core/context.py b/src/madengine/core/context.py index 24763588..d67e8c6a 100644 --- a/src/madengine/core/context.py +++ b/src/madengine/core/context.py @@ -83,6 +83,7 @@ def __init__( additional_context_file: str = None, build_only_mode: bool = False, rocm_path: str = None, + detect_local_gpu_arch: bool = False, ) -> None: """Constructor of the Context class. @@ -91,6 +92,9 @@ def __init__( additional_context_file: The additional context file. build_only_mode: Whether running in build-only mode (no GPU detection). rocm_path: Optional ROCm installation path (overrides ROCM_PATH env; default /opt/rocm). + detect_local_gpu_arch: When True and in build_only_mode, attempt to auto-detect + MAD_SYSTEM_GPU_ARCHITECTURE from the local node and inject it into docker_build_arg. + Has no effect when build_only_mode=False (runtime mode detects it via init_gpu_context). Raises: RuntimeError: If GPU detection fails and not in build-only mode. @@ -100,6 +104,7 @@ def __init__( self.console = Console() self._gpu_context_initialized = False self._build_only_mode = build_only_mode + self._detect_local_gpu_arch = detect_local_gpu_arch self._system_context_initialized = False self._gpu_tool_manager = None # Lazy initialization @@ -137,17 +142,22 @@ def __init__( self.init_runtime_context() else: # For build-only mode, only initialize what's needed for building - self.init_build_context() + self.init_build_context(detect_gpu_arch=self._detect_local_gpu_arch) ## ADD MORE CONTEXTS HERE ## - def init_build_context(self) -> None: + def init_build_context(self, detect_gpu_arch: bool = False) -> None: """Initialize build-specific context. This method sets up only the context needed for Docker builds, avoiding GPU detection that would fail on build-only nodes. System-specific contexts (host_os, numa_balancing, etc.) should be provided via --additional-context for build-only nodes if needed. + + Args: + detect_gpu_arch: When True, attempt to auto-detect MAD_SYSTEM_GPU_ARCHITECTURE + from the local node and inject it into docker_build_arg. Fails gracefully + if no GPU is present (e.g., on a pure CI build node). """ print("Initializing build-only context...") @@ -168,9 +178,26 @@ def init_build_context(self) -> None: "Consider providing host_os via --additional-context if needed for build" ) - # Don't detect GPU-specific contexts in build-only mode - # These should be provided via additional_context if needed for build args - # (GPU arch guidance is emitted in BuildOrchestrator after model/Dockerfile discovery.) + # Optionally auto-detect GPU architecture for local full-workflow builds (build+run). + # Skipped for standalone `madengine build` on non-GPU/CI nodes (detect_gpu_arch=False). + if detect_gpu_arch and "MAD_SYSTEM_GPU_ARCHITECTURE" not in self.ctx.get("docker_build_arg", {}): + try: + from madengine.utils.gpu_validator import detect_gpu_vendor + from madengine.execution.dockerfile_utils import normalize_architecture_name + + vendor = detect_gpu_vendor(self._rocm_path) + if vendor in (GPUVendor.AMD, GPUVendor.NVIDIA): + manager = get_gpu_tool_manager(vendor, self._rocm_path) + raw_arch = manager.get_gpu_architecture() + arch = normalize_architecture_name(raw_arch) or raw_arch.strip() + self.ctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"] = arch + print(f"Auto-detected GPU architecture for build: {arch}") + else: + print("Warning: No supported GPU detected; MAD_SYSTEM_GPU_ARCHITECTURE will not be set automatically.") + print("Consider providing it via --additional-context if needed for build args.") + except Exception as e: + print(f"Warning: Could not auto-detect GPU architecture for build: {e}") + print("Consider providing MAD_SYSTEM_GPU_ARCHITECTURE via --additional-context if needed for build args.") # Don't initialize NUMA balancing check for build-only nodes # This is runtime-specific and should be handled on execution nodes diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index da06f91f..0825ab16 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -46,13 +46,17 @@ class BuildOrchestrator: - Save deployment_config from --additional-context """ - def __init__(self, args, additional_context: Optional[Dict] = None): + def __init__(self, args, additional_context: Optional[Dict] = None, detect_local_gpu_arch: bool = False): """ Initialize build orchestrator. Args: args: CLI arguments namespace additional_context: Dict from --additional-context (merged with args if present) + detect_local_gpu_arch: When True, auto-detect MAD_SYSTEM_GPU_ARCHITECTURE from the + local node before building. Intended for full workflow (build+run) on a local + single node. Has no effect if the user already provided the value via + --additional-context. Default False preserves existing standalone-build behavior. """ self.args = args self.console = Console(live_output=getattr(args, "live_output", True)) @@ -120,7 +124,9 @@ def __init__(self, args, additional_context: Optional[Dict] = None): )) self.rich_console.print() - # Initialize context in build-only mode (no GPU detection) + # Initialize context in build-only mode (no GPU detection by default). + # Pass detect_local_gpu_arch so Context.init_build_context() can optionally + # auto-detect MAD_SYSTEM_GPU_ARCHITECTURE for full workflow (build+run) runs. # Context expects additional_context as a string representation of Python dict # Use repr() instead of json.dumps() because Context uses ast.literal_eval() # Use self.additional_context (post-ConfigLoader), not pre-defaults merged_context @@ -128,6 +134,7 @@ def __init__(self, args, additional_context: Optional[Dict] = None): self.context = Context( additional_context=context_string, build_only_mode=True, + detect_local_gpu_arch=detect_local_gpu_arch, ) # Load credentials if available @@ -288,6 +295,12 @@ def execute( ) self._warn_if_mad_arch_unresolved_for_dockerfiles(models, builder) + resolved_arch = self.context.ctx.get("docker_build_arg", {}).get("MAD_SYSTEM_GPU_ARCHITECTURE") + if resolved_arch: + self.rich_console.print( + f"[green]✓ MAD_SYSTEM_GPU_ARCHITECTURE resolved: {resolved_arch}[/green]\n" + ) + # Step 3: Build Docker images self.rich_console.print("[bold cyan]🏗️ Building Docker images...[/bold cyan]") diff --git a/src/madengine/orchestration/run_orchestrator.py b/src/madengine/orchestration/run_orchestrator.py index 6725a457..67749514 100644 --- a/src/madengine/orchestration/run_orchestrator.py +++ b/src/madengine/orchestration/run_orchestrator.py @@ -345,7 +345,16 @@ def _build_phase(self, tags: list, registry: Optional[str] = None) -> str: # Update args with tags self.args.tags = tags - build_orch = BuildOrchestrator(self.args, self.additional_context) + # detect_local_gpu_arch=True: full workflow on a local single node — auto-detect + # MAD_SYSTEM_GPU_ARCHITECTURE before the build so Dockerfiles that require it + # (ARG MAD_SYSTEM_GPU_ARCHITECTURE with no default) are built correctly without + # requiring the user to manually pass --additional-context. + # The user's explicitly provided value (if any) is still respected and not overridden. + build_orch = BuildOrchestrator( + self.args, + self.additional_context, + detect_local_gpu_arch=True, + ) manifest_file = build_orch.execute( registry=registry, clean_cache=getattr(self.args, "clean_docker_cache", False),