-
Notifications
You must be signed in to change notification settings - Fork 13
refactor(gpu-arch): auto-detect MAD_SYSTEM_GPU_ARCHITECTURE for local full-run mode #113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,13 +46,17 @@ class BuildOrchestrator: | |
| - Save deployment_config from --additional-context | ||
| """ | ||
|
|
||
| def __init__(self, args, additional_context: Optional[Dict] = None): | ||
| def __init__(self, args, additional_context: Optional[Dict] = None, detect_local_gpu_arch: bool = False): | ||
| """ | ||
| Initialize build orchestrator. | ||
|
|
||
| Args: | ||
| args: CLI arguments namespace | ||
| additional_context: Dict from --additional-context (merged with args if present) | ||
| detect_local_gpu_arch: When True, auto-detect MAD_SYSTEM_GPU_ARCHITECTURE from the | ||
| local node before building. Intended for full workflow (build+run) on a local | ||
| single node. Has no effect if the user already provided the value via | ||
| --additional-context. Default False preserves existing standalone-build behavior. | ||
| """ | ||
| self.args = args | ||
| self.console = Console(live_output=getattr(args, "live_output", True)) | ||
|
|
@@ -120,14 +124,17 @@ def __init__(self, args, additional_context: Optional[Dict] = None): | |
| )) | ||
| self.rich_console.print() | ||
|
|
||
| # Initialize context in build-only mode (no GPU detection) | ||
| # Initialize context in build-only mode (no GPU detection by default). | ||
| # Pass detect_local_gpu_arch so Context.init_build_context() can optionally | ||
| # auto-detect MAD_SYSTEM_GPU_ARCHITECTURE for full workflow (build+run) runs. | ||
| # Context expects additional_context as a string representation of Python dict | ||
| # Use repr() instead of json.dumps() because Context uses ast.literal_eval() | ||
| # Use self.additional_context (post-ConfigLoader), not pre-defaults merged_context | ||
| context_string = repr(self.additional_context) | ||
| self.context = Context( | ||
| additional_context=context_string, | ||
| build_only_mode=True, | ||
| detect_local_gpu_arch=detect_local_gpu_arch, | ||
| ) | ||
|
Comment on lines
133
to
138
|
||
|
|
||
| # Load credentials if available | ||
|
|
@@ -288,6 +295,12 @@ def execute( | |
| ) | ||
| self._warn_if_mad_arch_unresolved_for_dockerfiles(models, builder) | ||
|
|
||
| resolved_arch = self.context.ctx.get("docker_build_arg", {}).get("MAD_SYSTEM_GPU_ARCHITECTURE") | ||
| if resolved_arch: | ||
| self.rich_console.print( | ||
| f"[green]✓ MAD_SYSTEM_GPU_ARCHITECTURE resolved: {resolved_arch}[/green]\n" | ||
| ) | ||
|
|
||
| # Step 3: Build Docker images | ||
| self.rich_console.print("[bold cyan]🏗️ Building Docker images...[/bold cyan]") | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -345,7 +345,16 @@ def _build_phase(self, tags: list, registry: Optional[str] = None) -> str: | |
| # Update args with tags | ||
| self.args.tags = tags | ||
|
|
||
| build_orch = BuildOrchestrator(self.args, self.additional_context) | ||
| # detect_local_gpu_arch=True: full workflow on a local single node — auto-detect | ||
| # MAD_SYSTEM_GPU_ARCHITECTURE before the build so Dockerfiles that require it | ||
| # (ARG MAD_SYSTEM_GPU_ARCHITECTURE with no default) are built correctly without | ||
| # requiring the user to manually pass --additional-context. | ||
| # The user's explicitly provided value (if any) is still respected and not overridden. | ||
| build_orch = BuildOrchestrator( | ||
| self.args, | ||
| self.additional_context, | ||
| detect_local_gpu_arch=True, | ||
| ) | ||
|
Comment on lines
+348
to
+357
|
||
| manifest_file = build_orch.execute( | ||
| registry=registry, | ||
| clean_cache=getattr(self.args, "clean_docker_cache", False), | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new build-only auto-detection path (
detect_gpu_arch=True) is currently untested. SinceContextalready has unit coverage, please add a unit test that patchesdetect_gpu_vendor()/get_gpu_tool_manager().get_gpu_architecture()and asserts thatctx["docker_build_arg"]["MAD_SYSTEM_GPU_ARCHITECTURE"]is injected only when absent, and that failures are handled without raising in build-only mode.