From 284f7d2c6cb5cfe9f8ec8ca6e77a3adf678a552d Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Thu, 5 Feb 2026 07:33:40 +0000 Subject: [PATCH 01/12] Updates to slurm launcher --- src/madengine/cli/commands/build.py | 18 + src/madengine/deployment/slurm.py | 343 +++++++++++++++++- src/madengine/execution/container_runner.py | 203 ++++++++++- .../orchestration/build_orchestrator.py | 283 +++++++++++++++ 4 files changed, 839 insertions(+), 8 deletions(-) diff --git a/src/madengine/cli/commands/build.py b/src/madengine/cli/commands/build.py index 5b10a65c..f359432d 100644 --- a/src/madengine/cli/commands/build.py +++ b/src/madengine/cli/commands/build.py @@ -55,6 +55,20 @@ def build( "--batch-manifest", help="Input batch.json file for batch build mode" ), ] = None, + use_image: Annotated[ + Optional[str], + typer.Option( + "--use-image", + help="Skip Docker build and use pre-built image (e.g., lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x)" + ), + ] = None, + build_on_compute: Annotated[ + bool, + typer.Option( + "--build-on-compute", + help="Build Docker images on SLURM compute node instead of login node" + ), + ] = False, additional_context: Annotated[ str, typer.Option( @@ -191,6 +205,8 @@ def build( verbose=verbose, _separate_phases=True, batch_build_metadata=batch_build_metadata if batch_build_metadata else None, + use_image=use_image, + build_on_compute=build_on_compute, ) # Initialize orchestrator in build-only mode @@ -211,6 +227,8 @@ def build( clean_cache=clean_docker_cache, manifest_output=manifest_output, batch_build_metadata=batch_build_metadata, + use_image=use_image, + build_on_compute=build_on_compute, ) # Load build summary for display diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 0083c2d4..7e1f2249 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -68,6 +68,7 @@ def __init__(self, config: DeploymentConfig): self.gpus_per_node = self.slurm_config.get("gpus_per_node", 8) 
self.time_limit = self.slurm_config.get("time", "24:00:00") self.output_dir = Path(self.slurm_config.get("output_dir", "./slurm_results")) + self.reservation = self.slurm_config.get("reservation", None) # Setup Jinja2 template engine template_dir = Path(__file__).parent / "templates" / "slurm" @@ -76,6 +77,115 @@ def __init__(self, config: DeploymentConfig): # Generated script path self.script_path = None + # ========== OPTION 2: Detect existing SLURM allocation ========== + # If SLURM_JOB_ID exists, we're inside an salloc allocation + self.inside_allocation = os.environ.get("SLURM_JOB_ID") is not None + self.existing_job_id = os.environ.get("SLURM_JOB_ID", "") + self.allocation_nodes = self._get_allocation_node_count() + + if self.inside_allocation: + self.console.print( + f"[cyan]✓ Detected existing SLURM allocation: Job {self.existing_job_id}[/cyan]" + ) + self.console.print( + f" Allocation has {self.allocation_nodes} nodes available" + ) + + def _get_allocation_node_count(self) -> int: + """ + Get number of nodes in current SLURM allocation. + + Note: SLURM_NNODES reflects the current job step, not the full allocation. + We query the job directly using scontrol to get the actual node count. + """ + if not self.inside_allocation: + return 0 + + job_id = self.existing_job_id + + # Query the actual job's node count using scontrol (most accurate) + try: + result = subprocess.run( + ["scontrol", "show", "job", job_id], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + # Parse NumNodes=X from output + for line in result.stdout.split("\n"): + if "NumNodes=" in line: + # Format: "NumNodes=3 NumCPUs=..." 
+ for part in line.split(): + if part.startswith("NumNodes="): + try: + return int(part.split("=")[1]) + except (ValueError, IndexError): + pass + except Exception: + pass + + # Fallback: Try SLURM_JOB_NUM_NODES (full job node count, if set) + job_num_nodes = os.environ.get("SLURM_JOB_NUM_NODES") + if job_num_nodes: + try: + return int(job_num_nodes) + except ValueError: + pass + + # Fallback: SLURM_NNODES (may be step-specific, not full allocation) + nnodes = os.environ.get("SLURM_NNODES") + if nnodes: + try: + return int(nnodes) + except ValueError: + pass + + # Last resort: count nodes in SLURM_NODELIST + nodelist = os.environ.get("SLURM_NODELIST") + if nodelist: + try: + result = subprocess.run( + ["scontrol", "show", "hostname", nodelist], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + return len(result.stdout.strip().split("\n")) + except Exception: + pass + + return 0 + + def _validate_allocation_nodes(self) -> tuple[bool, str]: + """ + Validate that existing allocation has enough nodes for the job. + + Returns: + Tuple of (is_valid, error_message) + """ + if not self.inside_allocation: + return True, "" + + requested_nodes = self.nodes + available_nodes = self.allocation_nodes + + if available_nodes < requested_nodes: + return False, ( + f"Insufficient nodes in current allocation. " + f"Requested: {requested_nodes}, Available: {available_nodes}. " + f"Either reduce nodes in config or use a larger allocation." 
+ ) + + if available_nodes > requested_nodes: + self.console.print( + f"[yellow]⚠ Note: Using {requested_nodes} of {available_nodes} " + f"available nodes in allocation[/yellow]" + ) + + return True, "" + def validate(self) -> bool: """Validate SLURM commands are available locally.""" # Check required SLURM CLI tools @@ -177,11 +287,6 @@ def prepare(self) -> bool: """Generate sbatch script from template.""" # Validate environment BEFORE generating job scripts self.console.print("\n[bold]Validating submission environment...[/bold]") - if not self._validate_cli_availability(): - self.console.print( - "\n[yellow]⚠ Tip: Compute nodes inherit your submission environment[/yellow]" - ) - return False try: self.output_dir.mkdir(parents=True, exist_ok=True) @@ -194,6 +299,23 @@ def prepare(self) -> bool: model_key = model_keys[0] model_info = self.manifest["built_models"][model_key] + # Check if this is a baremetal launcher (sglang-disagg, vllm-disagg) + launcher_type = self.distributed_config.get("launcher", "torchrun") + launcher_normalized = launcher_type.lower().replace("_", "-") + + if launcher_normalized in ["sglang-disagg", "vllm-disagg"]: + # For disagg launchers, generate simple wrapper script + # that runs the model's .slurm script directly on baremetal + self.console.print(f"[cyan]Detected baremetal launcher: {launcher_type}[/cyan]") + return self._prepare_baremetal_script(model_info) + + # Standard flow: validate madengine availability for complex job template + if not self._validate_cli_availability(): + self.console.print( + "\n[yellow]⚠ Tip: Compute nodes inherit your submission environment[/yellow]" + ) + return False + # Prepare template context context = self._prepare_template_context(model_info) @@ -222,6 +344,114 @@ def _normalize_nodelist(nodelist: Optional[str]) -> Optional[str]: return None return ",".join(n.strip() for n in nodelist.split(",") if n.strip()) + def _prepare_baremetal_script(self, model_info: Dict) -> bool: + """ + Generate a simple 
wrapper script for baremetal launchers (sglang-disagg, vllm-disagg). + + These launchers run the model's .slurm script directly on baremetal, + which then manages Docker containers via srun. No madengine wrapper needed. + """ + # Get the model's script path + model_script = model_info.get("scripts", "") + if not model_script: + self.console.print("[red]✗ No scripts defined in model_info[/red]") + return False + + # Get manifest directory (where the model script is relative to) + manifest_dir = Path(self.config.manifest_file).parent.absolute() + model_script_path = manifest_dir / model_script + + if not model_script_path.exists(): + self.console.print(f"[red]✗ Model script not found: {model_script_path}[/red]") + return False + + # Get environment variables + env_vars = {} + + # From model_info.env_vars + if "env_vars" in model_info: + env_vars.update(model_info["env_vars"]) + + # From additional_context.env_vars + if "env_vars" in self.config.additional_context: + env_vars.update(self.config.additional_context["env_vars"]) + + # From distributed config + sglang_disagg_config = self.distributed_config.get("sglang_disagg", {}) + if sglang_disagg_config: + env_vars["xP"] = str(sglang_disagg_config.get("prefill_nodes", 1)) + env_vars["yD"] = str(sglang_disagg_config.get("decode_nodes", 1)) + + # Get model args + model_args = model_info.get("args", "") + + # Generate simple wrapper script + # IMPORTANT: SBATCH directives MUST be at the top, right after #!/bin/bash + script_lines = [ + "#!/bin/bash", + f"#SBATCH --job-name=madengine-{model_info['name']}", + f"#SBATCH --output={self.output_dir}/madengine-{model_info['name']}_%j.out", + f"#SBATCH --error={self.output_dir}/madengine-{model_info['name']}_%j.err", + f"#SBATCH --partition={self.partition}", + f"#SBATCH --nodes={self.nodes}", + f"#SBATCH --ntasks={self.nodes}", + f"#SBATCH --gpus-per-node={self.gpus_per_node}", + f"#SBATCH --time={self.time_limit}", + "#SBATCH --exclusive", + ] + + # Add reservation if 
specified + if self.reservation: + script_lines.append(f"#SBATCH --reservation={self.reservation}") + + script_lines.extend([ + "", + f"# Baremetal launcher script for {model_info['name']}", + f"# Generated by madengine for sglang-disagg", + "", + "set -e", + "", + "# Environment variables", + ]) + + for key, value in env_vars.items(): + script_lines.append(f"export {key}=\"{value}\"") + + script_lines.append("") + script_lines.extend([ + "echo '=========================================='", + "echo 'Baremetal Launcher - SGLang Disaggregated'", + "echo '=========================================='", + f"echo 'Model: {model_info['name']}'", + f"echo 'Script: {model_script_path}'", + "echo 'SLURM_JOB_ID:' $SLURM_JOB_ID", + "echo 'SLURM_NNODES:' $SLURM_NNODES", + "echo 'SLURM_NODELIST:' $SLURM_NODELIST", + "echo ''", + "", + "# Change to script directory", + f"cd {model_script_path.parent}", + "", + "# Run the model script directly on baremetal", + f"echo 'Executing: bash {model_script_path.name} {model_args}'", + f"bash {model_script_path.name} {model_args}", + "", + "echo ''", + "echo 'Script completed.'", + ]) + + script_content = "\n".join(script_lines) + + # Save script + self.script_path = self.output_dir / f"madengine_{model_info['name']}.sh" + self.script_path.write_text(script_content) + self.script_path.chmod(0o755) + + self.console.print(f"[green]✓ Generated baremetal script: {self.script_path}[/green]") + self.console.print(f" Model script: {model_script_path}") + self.console.print(f" Environment: {len(env_vars)} variables") + + return True def _prepare_template_context(self, model_info: Dict) -> Dict[str, Any]: """Prepare context for Jinja2 template rendering.""" # Use hierarchical GPU resolution: runtime > deployment > model > default @@ -661,7 +891,12 @@ def _generate_basic_env_command( # Model script should handle launcher invocation''' def deploy(self) -> DeploymentResult: - """Submit sbatch script to SLURM scheduler (locally).""" + """ + Deploy to 
SLURM - either via sbatch (new job) or bash (existing allocation). + + If SLURM_JOB_ID is set (inside salloc), runs script directly with bash. + Otherwise, submits a new job via sbatch. + """ if not self.script_path or not self.script_path.exists(): return DeploymentResult( status=DeploymentStatus.FAILED, @@ -669,6 +904,85 @@ def deploy(self) -> DeploymentResult: message="Script not generated. Run prepare() first.", ) + # ========== BRANCH: Inside allocation vs new job ========== + if self.inside_allocation: + return self._run_inside_existing_allocation() + else: + return self._submit_new_job() + + def _run_inside_existing_allocation(self) -> DeploymentResult: + """ + Run script directly inside existing salloc allocation using bash. + + The script will use the nodes already allocated to the current job. + SLURM environment variables (SLURM_NODELIST, etc.) are inherited. + """ + # Validate node count before running + is_valid, error_msg = self._validate_allocation_nodes() + if not is_valid: + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=self.existing_job_id, + message=error_msg, + ) + + self.console.print( + f"\n[bold cyan]Running inside existing SLURM allocation[/bold cyan]" + ) + self.console.print(f" Job ID: {self.existing_job_id}") + self.console.print(f" Using {self.nodes} of {self.allocation_nodes} allocated nodes") + self.console.print(f" GPUs per node: {self.gpus_per_node}") + self.console.print(f" Script: {self.script_path}") + self.console.print(f"\n[dim]Executing: bash {self.script_path}[/dim]\n") + + try: + # Run script directly with bash (synchronous, blocks until done) + # Don't capture output - let it stream directly to console + result = subprocess.run( + ["bash", str(self.script_path)], + timeout=self.config.timeout if self.config.timeout > 0 else None, + ) + + if result.returncode == 0: + self.console.print( + f"\n[green]✓ Script completed successfully in allocation {self.existing_job_id}[/green]" + ) + return 
DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=self.existing_job_id, + message=f"Completed inside existing allocation {self.existing_job_id}", + logs_path=str(self.output_dir), + ) + else: + self.console.print( + f"\n[red]✗ Script failed with exit code {result.returncode}[/red]" + ) + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=self.existing_job_id, + message=f"Script failed with exit code {result.returncode}", + logs_path=str(self.output_dir), + ) + + except subprocess.TimeoutExpired: + self.console.print( + f"\n[red]✗ Script timed out after {self.config.timeout}s[/red]" + ) + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=self.existing_job_id, + message=f"Script timed out after {self.config.timeout}s", + ) + except Exception as e: + self.console.print(f"\n[red]✗ Execution error: {e}[/red]") + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=self.existing_job_id, + message=f"Execution error: {str(e)}", + ) + + def _submit_new_job(self) -> DeploymentResult: + """Submit new SLURM job via sbatch (original behavior).""" # ==================== PREFLIGHT NODE SELECTION ==================== # For single- and multi-node jobs, check for clean nodes and exclude bad ones. # Single-node: we still run the check so bad nodes (e.g. 
Docker broken) get excluded; @@ -780,6 +1094,15 @@ def deploy(self) -> DeploymentResult: def monitor(self, deployment_id: str) -> DeploymentResult: """Check SLURM job status (locally).""" + # If we ran inside an existing allocation, script already completed synchronously + # No need to poll - just return success (deploy() already handled the result) + if self.inside_allocation: + return DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=deployment_id, + message=f"Completed (ran inside existing allocation {deployment_id})", + ) + try: # Query job status using squeue (runs locally) result = subprocess.run( @@ -1482,6 +1805,14 @@ def _collect_results_parse_perf_csv( def cleanup(self, deployment_id: str) -> bool: """Cancel SLURM job if still running (locally).""" + # CRITICAL: Never cancel an existing allocation we're running inside! + # The user's salloc session should not be terminated by madengine + if self.inside_allocation: + self.console.print( + f"[dim]Skipping cleanup - running inside existing allocation {deployment_id}[/dim]" + ) + return True + try: subprocess.run( ["scancel", deployment_id], capture_output=True, timeout=10 diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 0b561f09..2c504cbd 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -14,6 +14,13 @@ import json import typing import warnings + +BAREMETAL_LAUNCHERS = [ + "sglang-disagg", + "sglang_disagg", + "vllm-disagg", + "vllm_disagg", +] from rich.console import Console as RichConsole from contextlib import redirect_stdout, redirect_stderr from madengine.core.console import Console @@ -668,6 +675,151 @@ def apply_tools( else: print(f" Note: Command '{cmd}' already added by another tool, skipping duplicate.") + def _run_on_baremetal( + self, + model_info: typing.Dict, + build_info: typing.Dict, + log_file_path: str, + timeout: int, + run_results: typing.Dict, + 
pre_encapsulate_post_scripts: typing.Dict, + run_env: typing.Dict, + ) -> typing.Dict: + """ + Run script directly on baremetal (not inside Docker). + + Used for launchers like sglang-disagg that manage their own Docker containers + via SLURM srun commands. The script is executed directly on the node. + + Args: + model_info: Model configuration from manifest + build_info: Build information from manifest + log_file_path: Path to log file + timeout: Execution timeout in seconds + run_results: Dictionary to store run results + pre_encapsulate_post_scripts: Pre/post script configuration + run_env: Environment variables for the script + + Returns: + Dictionary with run results + """ + import shutil + + self.rich_console.print(f"[dim]{'='*80}[/dim]") + + # Prepare script path + scripts_arg = model_info["scripts"] + + # Get the current working directory (might be temp workspace) + cwd = os.getcwd() + print(f"📂 Current directory: {cwd}") + + if scripts_arg.endswith(".sh") or scripts_arg.endswith(".slurm"): + script_path = scripts_arg + script_name = os.path.basename(scripts_arg) + elif scripts_arg.endswith(".py"): + script_path = scripts_arg + script_name = os.path.basename(scripts_arg) + else: + # Directory specified - look for run.sh + script_path = os.path.join(scripts_arg, "run.sh") + script_name = "run.sh" + + # If script path is relative, make it absolute from cwd + if not os.path.isabs(script_path): + script_path = os.path.join(cwd, script_path) + + # Check script exists + if not os.path.exists(script_path): + print(f"⚠️ Script not found at: {script_path}") + # Try alternative locations + alt_path = os.path.join(cwd, os.path.basename(scripts_arg)) + if os.path.exists(alt_path): + script_path = alt_path + print(f"✓ Found at alternative location: {script_path}") + else: + raise FileNotFoundError(f"Script not found: {script_path}") + + script_dir = os.path.dirname(script_path) or cwd + print(f"📜 Script: {script_path}") + print(f"📁 Working directory: {script_dir}") + + 
# Prepare model arguments + model_args = self.context.ctx.get("model_args", model_info.get("args", "")) + print(f"📝 Arguments: {model_args}") + + # Build command + if script_path.endswith(".py"): + cmd = f"python3 {script_path} {model_args}" + else: + cmd = f"bash {script_path} {model_args}" + + print(f"🔧 Command: {cmd}") + + # Prepare environment + env = os.environ.copy() + env.update(run_env) + + # Add model-specific env vars from model_info + if "env_vars" in model_info and model_info["env_vars"]: + for key, value in model_info["env_vars"].items(): + env[key] = str(value) + print(f" ENV: {key}={value}") + + # Add env vars from additional_context + if self.additional_context and "env_vars" in self.additional_context: + for key, value in self.additional_context["env_vars"].items(): + env[key] = str(value) + + # Run script with logging + test_start_time = time.time() + self.rich_console.print("\n[bold blue]Running script on baremetal...[/bold blue]") + + try: + with open(log_file_path, mode="w", buffering=1) as outlog: + with redirect_stdout( + PythonicTee(outlog, self.live_output) + ), redirect_stderr(PythonicTee(outlog, self.live_output)): + print(f"⏰ Setting timeout to {timeout} seconds.") + print(f"🚀 Executing: {cmd}") + print(f"📂 Working directory: {script_dir}") + print(f"{'='*80}") + + result = subprocess.run( + cmd, + shell=True, + cwd=script_dir, + env=env, + timeout=timeout if timeout > 0 else None, + ) + + run_results["test_duration"] = time.time() - test_start_time + print(f"\n{'='*80}") + print(f"⏱️ Test Duration: {run_results['test_duration']:.2f} seconds") + + if result.returncode == 0: + run_results["status"] = "SUCCESS" + self.rich_console.print("[bold green]✓ Script completed successfully[/bold green]") + else: + run_results["status"] = "FAILURE" + run_results["status_detail"] = f"Exit code {result.returncode}" + self.rich_console.print(f"[bold red]✗ Script failed with exit code {result.returncode}[/bold red]") + raise 
subprocess.CalledProcessError(result.returncode, cmd) + + except subprocess.TimeoutExpired: + run_results["status"] = "FAILURE" + run_results["status_detail"] = f"Timeout after {timeout}s" + run_results["test_duration"] = time.time() - test_start_time + self.rich_console.print(f"[bold red]✗ Script timed out after {timeout}s[/bold red]") + raise + except Exception as e: + run_results["status"] = "FAILURE" + run_results["status_detail"] = str(e) + run_results["test_duration"] = time.time() - test_start_time + raise + + return run_results + def run_pre_post_script( self, model_docker: Docker, model_dir: str, pre_post: typing.List ) -> None: @@ -876,6 +1028,15 @@ def run_container( if merged_count > 0: print(f"ℹ️ Merged {merged_count} environment variables from additional_context") + # Merge env_vars from model_info (models.json) into docker_env_vars + if "env_vars" in model_info and model_info["env_vars"]: + model_env_count = 0 + for key, value in model_info["env_vars"].items(): + self.context.ctx["docker_env_vars"][key] = str(value) + model_env_count += 1 + if model_env_count > 0: + print(f"ℹ️ Merged {model_env_count} environment variables from model_info (models.json)") + if "data" in model_info and model_info["data"] != "" and self.data: mount_datapaths = self.data.get_mountpaths(model_info["data"]) model_dataenv = self.data.get_env(model_info["data"]) @@ -937,6 +1098,44 @@ def run_container( print(f"Docker options: {docker_options}") + # ========== CHECK FOR BAREMETAL LAUNCHERS ========== + # Launchers like sglang-disagg run scripts directly on baremetal, + # not inside Docker. The script itself manages Docker containers via srun. 
+ launcher = "" + + # Debug: Print all sources + print(f"🔍 Baremetal check - looking for launcher...") + print(f" MAD_LAUNCHER_TYPE env: {os.environ.get('MAD_LAUNCHER_TYPE', '')}") + if self.additional_context: + distributed_config = self.additional_context.get("distributed", {}) + launcher = distributed_config.get("launcher", "") + print(f" additional_context.distributed.launcher: {launcher or ''}") + if not launcher and model_info.get("distributed"): + launcher = model_info["distributed"].get("launcher", "") + print(f" model_info.distributed.launcher: {launcher or ''}") + if not launcher: + launcher = os.environ.get("MAD_LAUNCHER_TYPE", "") + print(f" Fallback to MAD_LAUNCHER_TYPE: {launcher or ''}") + + print(f" Final launcher detected: {launcher or ''}") + + # Normalize launcher name (replace underscores with hyphens) + launcher_normalized = launcher.lower().replace("_", "-") if launcher else "" + + if launcher_normalized and launcher_normalized in [l.lower().replace("_", "-") for l in BAREMETAL_LAUNCHERS]: + self.rich_console.print(f"\n[bold cyan]🖥️ Running on BAREMETAL (launcher: {launcher})[/bold cyan]") + self.rich_console.print(f"[dim]Script will manage its own Docker containers via SLURM[/dim]") + return self._run_on_baremetal( + model_info=model_info, + build_info=build_info, + log_file_path=log_file_path, + timeout=timeout, + run_results=run_results, + pre_encapsulate_post_scripts=pre_encapsulate_post_scripts, + run_env=run_env, + ) + # ========== END BAREMETAL CHECK ========== + self.rich_console.print(f"\n[bold blue]🏃 Starting Docker container execution...[/bold blue]") print(f"🏷️ Image: {docker_image}") print(f"📦 Container: {container_name}") @@ -1055,8 +1254,8 @@ def run_container( # Prepare script execution scripts_arg = model_info["scripts"] - if scripts_arg.endswith(".sh"): - # Shell script specified directly + if scripts_arg.endswith(".sh") or scripts_arg.endswith(".slurm"): + # Shell script specified directly (.sh or .slurm for SLURM batch 
scripts) dir_path = os.path.dirname(scripts_arg) script_name = "bash " + os.path.basename(scripts_arg) elif scripts_arg.endswith(".py"): diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index da06f91f..50495d97 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -236,6 +236,8 @@ def execute( clean_cache: bool = False, manifest_output: str = "build_manifest.json", batch_build_metadata: Optional[Dict] = None, + use_image: Optional[str] = None, + build_on_compute: bool = False, ) -> str: """ Execute build workflow. @@ -245,6 +247,8 @@ def execute( clean_cache: Whether to use --no-cache for Docker builds manifest_output: Output file for build manifest batch_build_metadata: Optional batch build metadata + use_image: Pre-built Docker image to use (skip Docker build) + build_on_compute: Build on SLURM compute node instead of login node Returns: Path to generated build_manifest.json @@ -253,6 +257,21 @@ def execute( DiscoveryError: If model discovery fails BuildError: If Docker build fails """ + # Handle pre-built image mode + if use_image: + return self._execute_with_prebuilt_image( + use_image=use_image, + manifest_output=manifest_output, + ) + + # Handle build-on-compute mode + if build_on_compute: + return self._execute_build_on_compute( + registry=registry, + clean_cache=clean_cache, + manifest_output=manifest_output, + batch_build_metadata=batch_build_metadata, + ) self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]") self.rich_console.print("[bold blue]🔨 BUILD PHASE[/bold blue]") self.rich_console.print("[yellow](Build-only mode - no GPU detection)[/yellow]") @@ -465,3 +484,267 @@ def _save_deployment_config(self, manifest_file: str): # Non-fatal - just warn self.rich_console.print(f"[yellow]Warning: Could not save deployment config: {e}[/yellow]") + def _execute_with_prebuilt_image( + self, + use_image: str, + manifest_output: str = 
"build_manifest.json", + ) -> str: + """ + Generate manifest for a pre-built Docker image (skip Docker build). + + This is useful when using external images like: + - lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x + - nvcr.io/nvidia/pytorch:24.01-py3 + + Args: + use_image: Pre-built Docker image name + manifest_output: Output file for build manifest + + Returns: + Path to generated build_manifest.json + """ + self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]") + self.rich_console.print("[bold blue]🔨 BUILD PHASE (Pre-built Image Mode)[/bold blue]") + self.rich_console.print(f"[cyan]Using pre-built image: {use_image}[/cyan]") + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + + try: + # Step 1: Discover models + self.rich_console.print("[bold cyan]🔍 Discovering models...[/bold cyan]") + discover_models = DiscoverModels(args=self.args) + models = discover_models.run() + + if not models: + raise DiscoveryError( + "No models discovered", + context=create_error_context( + operation="discover_models", + component="BuildOrchestrator", + ), + suggestions=[ + "Check if models.json exists", + "Verify --tags parameter is correct", + ], + ) + + self.rich_console.print(f"[green]✓ Found {len(models)} models[/green]\n") + + # Step 2: Generate manifest with pre-built image + self.rich_console.print("[bold cyan]📄 Generating manifest for pre-built image...[/bold cyan]") + + manifest = { + "built_images": { + use_image: { + "image_name": use_image, + "dockerfile": "", + "build_time": 0, + "prebuilt": True, + } + }, + "built_models": {}, + "context": self.context.ctx if hasattr(self.context, 'ctx') else {}, + "credentials_required": [], + "summary": { + "successful_builds": [], + "failed_builds": [], + "total_build_time": 0, + "successful_pushes": [], + "failed_pushes": [], + }, + } + + # Add each discovered model with the pre-built image + for model in models: + model_name = model.get("name", "unknown") + manifest["built_models"][model_name] = { + "name": model_name, + "image": 
use_image, + "dockerfile": model.get("dockerfile", ""), + "scripts": model.get("scripts", ""), + "data": model.get("data", ""), + "n_gpus": model.get("n_gpus", "8"), + "owner": model.get("owner", ""), + "training_precision": model.get("training_precision", ""), + "multiple_results": model.get("multiple_results", ""), + "tags": model.get("tags", []), + "timeout": model.get("timeout", -1), + "args": model.get("args", ""), + "slurm": model.get("slurm", {}), + "distributed": model.get("distributed", {}), + "env_vars": model.get("env_vars", {}), + "prebuilt": True, + } + manifest["summary"]["successful_builds"].append(model_name) + + # Save manifest + with open(manifest_output, "w") as f: + json.dump(manifest, f, indent=2) + + # Save deployment config + self._save_deployment_config(manifest_output) + + self.rich_console.print(f"[green]✓ Generated manifest: {manifest_output}[/green]") + self.rich_console.print(f" Pre-built image: {use_image}") + self.rich_console.print(f" Models: {len(models)}") + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + + return manifest_output + + except (DiscoveryError, BuildError): + raise + except Exception as e: + raise BuildError( + f"Failed to generate manifest for pre-built image: {e}", + context=create_error_context( + operation="prebuilt_manifest", + component="BuildOrchestrator", + ), + ) from e + + def _execute_build_on_compute( + self, + registry: Optional[str] = None, + clean_cache: bool = False, + manifest_output: str = "build_manifest.json", + batch_build_metadata: Optional[Dict] = None, + ) -> str: + """ + Execute Docker build on a SLURM compute node instead of login node. 
+ + This submits a SLURM job that runs the Docker build on a compute node, + which is useful when: + - Login node has limited disk space + - Login node shouldn't run heavy workloads + - Compute nodes have faster storage/network + + Args: + registry: Optional registry to push images to + clean_cache: Whether to use --no-cache for Docker builds + manifest_output: Output file for build manifest + batch_build_metadata: Optional batch build metadata + + Returns: + Path to generated build_manifest.json + """ + import subprocess + import os + + self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]") + self.rich_console.print("[bold blue]🔨 BUILD PHASE (Compute Node Mode)[/bold blue]") + self.rich_console.print("[cyan]Building on SLURM compute node...[/cyan]") + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + + # Check if we're inside an existing allocation + inside_allocation = os.environ.get("SLURM_JOB_ID") is not None + existing_job_id = os.environ.get("SLURM_JOB_ID", "") + + # Get SLURM config from additional_context + slurm_config = self.additional_context.get("slurm", {}) + partition = slurm_config.get("partition", "gpu") + reservation = slurm_config.get("reservation", "") + time_limit = slurm_config.get("time", "02:00:00") + + # Build the madengine build command (without --build-on-compute to avoid recursion) + tags = getattr(self.args, 'tags', []) + tags_str = " ".join([f"-t {tag}" for tag in tags]) if tags else "" + + additional_context_str = "" + if self.additional_context: + # Serialize additional context for the compute node + import json + ctx_json = json.dumps(self.additional_context) + additional_context_str = f"--additional-context '{ctx_json}'" + + build_cmd = f"madengine build {tags_str} {additional_context_str} --manifest-output {manifest_output}" + if registry: + build_cmd += f" --registry {registry}" + if clean_cache: + build_cmd += " --clean-docker-cache" + + if inside_allocation: + # Run build on compute node via srun + 
self.rich_console.print(f"[cyan]Running build via srun (inside allocation {existing_job_id})...[/cyan]") + cmd = ["srun", "-N1", "--ntasks=1", "bash", "-c", build_cmd] + else: + # Generate and submit build script + self.rich_console.print("[cyan]Submitting build job via sbatch...[/cyan]") + + build_script_content = f"""#!/bin/bash +#SBATCH --job-name=madengine-build +#SBATCH --partition={partition} +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +#SBATCH --time={time_limit} +{f'#SBATCH --reservation={reservation}' if reservation else ''} +#SBATCH --output=madengine_build_%j.out +#SBATCH --error=madengine_build_%j.err + +echo "=== Building on compute node: $(hostname) ===" +echo "Job ID: $SLURM_JOB_ID" +echo "Build command: {build_cmd}" +echo "" + +# Activate virtual environment if available +if [ -f "venv/bin/activate" ]; then + source venv/bin/activate +fi + +# Run the build +{build_cmd} + +echo "" +echo "=== Build completed ===" +""" + build_script_path = Path("madengine_build_job.sh") + build_script_path.write_text(build_script_content) + build_script_path.chmod(0o755) + + self.rich_console.print(f" Build script: {build_script_path}") + cmd = ["sbatch", "--wait", str(build_script_path)] + + # Execute the build + self.rich_console.print(f" Command: {' '.join(cmd)}") + self.rich_console.print("") + + try: + result = subprocess.run( + cmd, + capture_output=False, # Let output flow to console + text=True, + ) + + if result.returncode != 0: + raise BuildError( + f"Build on compute node failed with exit code {result.returncode}", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + ), + suggestions=[ + "Check the build log files (madengine_build_*.out/err)", + "Verify SLURM partition and reservation settings", + "Ensure Docker is available on compute nodes", + ], + ) + + self.rich_console.print(f"[green]✓ Build completed on compute node[/green]") + self.rich_console.print(f"[green]✓ Manifest: {manifest_output}[/green]") + return 
manifest_output + + except subprocess.TimeoutExpired: + raise BuildError( + "Build on compute node timed out", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + ), + ) + except Exception as e: + raise BuildError( + f"Failed to build on compute node: {e}", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + ), + ) from e + From 23384e110c2ffd60d9b1a012c2c594ea0c3bd1a1 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Mon, 9 Feb 2026 22:44:52 +0000 Subject: [PATCH 02/12] Fix manifest generation and reservation handling for sglang-disagg - Add docker_image field to built_images and built_models in --use-image mode - Merge model's distributed.launcher into deployment_config for proper detection - Make reservation optional (empty string is ignored) - Pass reservation to SlurmNodeSelector for health checks Co-authored-by: Cursor --- src/madengine/deployment/slurm.py | 36 +++- .../deployment/slurm_node_selector.py | 17 ++ .../orchestration/build_orchestrator.py | 166 +++++++++++++++--- 3 files changed, 194 insertions(+), 25 deletions(-) diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 7e1f2249..ef3e8477 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -300,14 +300,17 @@ def prepare(self) -> bool: model_info = self.manifest["built_models"][model_key] # Check if this is a baremetal launcher (sglang-disagg, vllm-disagg) - launcher_type = self.distributed_config.get("launcher", "torchrun") + # Priority: model_info.distributed.launcher > additional_context.distributed.launcher + model_distributed = model_info.get("distributed", {}) + launcher_type = model_distributed.get("launcher") or self.distributed_config.get("launcher", "torchrun") launcher_normalized = launcher_type.lower().replace("_", "-") if launcher_normalized in ["sglang-disagg", "vllm-disagg"]: # For disagg launchers, generate simple wrapper 
script # that runs the model's .slurm script directly on baremetal self.console.print(f"[cyan]Detected baremetal launcher: {launcher_type}[/cyan]") - return self._prepare_baremetal_script(model_info) + # Pass model_key as docker_image_name (for manifests, the key IS the built image name) + return self._prepare_baremetal_script(model_info, docker_image_name=model_key) # Standard flow: validate madengine availability for complex job template if not self._validate_cli_availability(): @@ -344,12 +347,16 @@ def _normalize_nodelist(nodelist: Optional[str]) -> Optional[str]: return None return ",".join(n.strip() for n in nodelist.split(",") if n.strip()) - def _prepare_baremetal_script(self, model_info: Dict) -> bool: + def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = None) -> bool: """ Generate a simple wrapper script for baremetal launchers (sglang-disagg, vllm-disagg). These launchers run the model's .slurm script directly on baremetal, which then manages Docker containers via srun. No madengine wrapper needed. 
+ + Args: + model_info: Model configuration from manifest + docker_image_name: The built Docker image name from manifest key """ # Get the model's script path model_script = model_info.get("scripts", "") @@ -376,12 +383,30 @@ def _prepare_baremetal_script(self, model_info: Dict) -> bool: if "env_vars" in self.config.additional_context: env_vars.update(self.config.additional_context["env_vars"]) - # From distributed config - sglang_disagg_config = self.distributed_config.get("sglang_disagg", {}) + # From distributed config (model's distributed section) + model_distributed = model_info.get("distributed", {}) + sglang_disagg_config = model_distributed.get("sglang_disagg", {}) or self.distributed_config.get("sglang_disagg", {}) if sglang_disagg_config: env_vars["xP"] = str(sglang_disagg_config.get("prefill_nodes", 1)) env_vars["yD"] = str(sglang_disagg_config.get("decode_nodes", 1)) + # Override DOCKER_IMAGE_NAME with the built image from manifest + # This ensures the run uses the freshly built image, not the base image + # Priority: docker_image_name param > model_info.docker_image > env_vars.DOCKER_IMAGE_NAME + if docker_image_name and docker_image_name.startswith("ci-"): + # The manifest key IS the built image name for madengine-built images + self.console.print(f"[cyan]Using built Docker image: {docker_image_name}[/cyan]") + env_vars["DOCKER_IMAGE_NAME"] = docker_image_name + elif "docker_image" in model_info: + built_image = model_info["docker_image"] + self.console.print(f"[cyan]Using Docker image: {built_image}[/cyan]") + env_vars["DOCKER_IMAGE_NAME"] = built_image + elif "image" in model_info: + # Fallback to 'image' field + built_image = model_info["image"] + self.console.print(f"[cyan]Using Docker image: {built_image}[/cyan]") + env_vars["DOCKER_IMAGE_NAME"] = built_image + # Get model args model_args = model_info.get("args", "") @@ -1000,6 +1025,7 @@ def _submit_new_job(self) -> DeploymentResult: console=self.console, auto_cleanup=auto_cleanup, 
verbose=self.slurm_config.get("verbose_node_check", False), + reservation=self.reservation, ) clean_nodes, updated_exclude = selector.select_nodes( partition=self.partition, diff --git a/src/madengine/deployment/slurm_node_selector.py b/src/madengine/deployment/slurm_node_selector.py index 408e8d3c..88636b65 100644 --- a/src/madengine/deployment/slurm_node_selector.py +++ b/src/madengine/deployment/slurm_node_selector.py @@ -72,6 +72,7 @@ def __init__( auto_cleanup: bool = False, verbose: bool = False, timeout: int = 30, + reservation: Optional[str] = None, ): """ Initialize node selector. @@ -81,11 +82,13 @@ def __init__( auto_cleanup: Automatically clean dirty nodes verbose: Enable verbose logging timeout: Timeout for srun commands (seconds) + reservation: SLURM reservation name for reserved nodes """ self.console = console or Console() self.auto_cleanup = auto_cleanup self.verbose = verbose self.timeout = timeout + self.reservation = reservation # Max candidates to check (avoids excessive checks on large clusters) MAX_CANDIDATES_CAP = 100 @@ -209,6 +212,8 @@ def check_node_health(self, node: str, job_name: Optional[str] = None) -> NodeSt ] if job_name: srun_cmd.append(f"--job-name={job_name}") + if hasattr(self, 'reservation') and self.reservation: + srun_cmd.append(f"--reservation={self.reservation}") srun_cmd.extend(["bash", "-c", check_script]) try: @@ -326,6 +331,18 @@ def cleanup_node(self, node: str, job_name: Optional[str] = None) -> bool: srun_cmd.extend(["bash", "-c", cleanup_script]) try: + srun_cmd = [ + "srun", + f"--nodelist={node}", + "--ntasks=1", + "--time=00:01:00", + "--overlap", + "--quiet", + ] + if self.reservation: + srun_cmd.append(f"--reservation={self.reservation}") + srun_cmd.extend(["bash", "-c", cleanup_script]) + result = subprocess.run( srun_cmd, capture_output=True, diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 50495d97..17d1cd24 100644 --- 
a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -536,6 +536,7 @@ def _execute_with_prebuilt_image( "built_images": { use_image: { "image_name": use_image, + "docker_image": use_image, "dockerfile": "", "build_time": 0, "prebuilt": True, @@ -554,11 +555,16 @@ def _execute_with_prebuilt_image( } # Add each discovered model with the pre-built image + # Use the image name as the key (matches how madengine build does it) for model in models: model_name = model.get("name", "unknown") - manifest["built_models"][model_name] = { + model_distributed = model.get("distributed", {}) + + # Use image name as key so slurm.py can find docker_image + manifest["built_models"][use_image] = { "name": model_name, "image": use_image, + "docker_image": use_image, "dockerfile": model.get("dockerfile", ""), "scripts": model.get("scripts", ""), "data": model.get("data", ""), @@ -570,7 +576,7 @@ def _execute_with_prebuilt_image( "timeout": model.get("timeout", -1), "args": model.get("args", ""), "slurm": model.get("slurm", {}), - "distributed": model.get("distributed", {}), + "distributed": model_distributed, "env_vars": model.get("env_vars", {}), "prebuilt": True, } @@ -582,6 +588,28 @@ def _execute_with_prebuilt_image( # Save deployment config self._save_deployment_config(manifest_output) + + # Merge model's distributed config (especially launcher) into deployment_config + # This ensures sglang-disagg launcher is in deployment_config even if not in additional-context + if models and models[0].get("distributed"): + with open(manifest_output, "r") as f: + saved_manifest = json.load(f) + + model_distributed = models[0].get("distributed", {}) + if "deployment_config" not in saved_manifest: + saved_manifest["deployment_config"] = {} + + # Merge model's distributed into deployment_config.distributed + if "distributed" not in saved_manifest["deployment_config"]: + saved_manifest["deployment_config"]["distributed"] = {} + + # Copy 
launcher and other critical fields from model config + for key in ["launcher", "nnodes", "nproc_per_node", "backend", "port", "sglang_disagg"]: + if key in model_distributed and key not in saved_manifest["deployment_config"]["distributed"]: + saved_manifest["deployment_config"]["distributed"][key] = model_distributed[key] + + with open(manifest_output, "w") as f: + json.dump(saved_manifest, f, indent=2) self.rich_console.print(f"[green]✓ Generated manifest: {manifest_output}[/green]") self.rich_console.print(f" Pre-built image: {use_image}") @@ -643,23 +671,36 @@ def _execute_build_on_compute( partition = slurm_config.get("partition", "gpu") reservation = slurm_config.get("reservation", "") time_limit = slurm_config.get("time", "02:00:00") + # Get number of nodes - build on ALL nodes so image is available everywhere + nodes = slurm_config.get("nodes", 1) # Build the madengine build command (without --build-on-compute to avoid recursion) tags = getattr(self.args, 'tags', []) tags_str = " ".join([f"-t {tag}" for tag in tags]) if tags else "" + # Write additional context to a file to avoid shell quoting issues + context_file_path = None additional_context_str = "" if self.additional_context: - # Serialize additional context for the compute node import json - ctx_json = json.dumps(self.additional_context) - additional_context_str = f"--additional-context '{ctx_json}'" - - build_cmd = f"madengine build {tags_str} {additional_context_str} --manifest-output {manifest_output}" + context_file_path = Path("madengine_build_context.json") + with open(context_file_path, 'w') as f: + json.dump(self.additional_context, f) + self.rich_console.print(f" Context file: {context_file_path}") + + # Base build command + build_cmd_parts = ["madengine", "build"] + if tags_str: + build_cmd_parts.extend(tags_str.split()) + if context_file_path: + build_cmd_parts.extend(["--additional-context-file", str(context_file_path)]) + build_cmd_parts.extend(["--manifest-output", manifest_output]) if 
registry: - build_cmd += f" --registry {registry}" + build_cmd_parts.extend(["--registry", registry]) if clean_cache: - build_cmd += " --clean-docker-cache" + build_cmd_parts.append("--clean-docker-cache") + + build_cmd = " ".join(build_cmd_parts) if inside_allocation: # Run build on compute node via srun @@ -669,31 +710,117 @@ def _execute_build_on_compute( # Generate and submit build script self.rich_console.print("[cyan]Submitting build job via sbatch...[/cyan]") + # Get absolute path for context file + abs_context_file = str(context_file_path.absolute()) if context_file_path else "" + abs_manifest_output = str(Path(manifest_output).absolute()) + + # Rebuild command with absolute paths for sbatch + build_cmd_abs = f"madengine build {tags_str}" + if abs_context_file: + build_cmd_abs += f" --additional-context-file {abs_context_file}" + build_cmd_abs += f" --manifest-output {abs_manifest_output}" + if registry: + build_cmd_abs += f" --registry {registry}" + if clean_cache: + build_cmd_abs += " --clean-docker-cache" + + # Discover models to get Dockerfile path + discover_models = DiscoverModels(args=self.args) + models = discover_models.run() + dockerfile_path = "" + dockerfile_name = "" + if models: + dockerfile = models[0].get("dockerfile", "") + # Find the actual Dockerfile + import glob + dockerfile_patterns = [ + f"{dockerfile}.ubuntu.amd.Dockerfile", + f"{dockerfile}.Dockerfile", + f"{dockerfile}", + ] + for pattern in dockerfile_patterns: + matches = glob.glob(pattern) + if matches: + dockerfile_path = matches[0] + dockerfile_name = Path(dockerfile_path).name + break + + self.rich_console.print(f" Nodes: {nodes} (building on all nodes)") + if dockerfile_path: + self.rich_console.print(f" Dockerfile: {dockerfile_path}") + build_script_content = f"""#!/bin/bash #SBATCH --job-name=madengine-build #SBATCH --partition={partition} -#SBATCH --nodes=1 -#SBATCH --ntasks=1 +#SBATCH --nodes={nodes} +#SBATCH --ntasks={nodes} #SBATCH --time={time_limit} {f'#SBATCH 
--reservation={reservation}' if reservation else ''} #SBATCH --output=madengine_build_%j.out #SBATCH --error=madengine_build_%j.err -echo "=== Building on compute node: $(hostname) ===" +echo "=== Building on compute nodes ===" echo "Job ID: $SLURM_JOB_ID" -echo "Build command: {build_cmd}" +echo "Nodes: $SLURM_NNODES" +echo "Node list: $SLURM_NODELIST" +echo "Working directory: $(pwd)" echo "" +# Change to submission directory +cd {Path.cwd().absolute()} + # Activate virtual environment if available -if [ -f "venv/bin/activate" ]; then - source venv/bin/activate +if [ -f "{Path('/shared_inference/ravgupta/madenginev2_slurm/venv/bin/activate').absolute()}" ]; then + source {Path('/shared_inference/ravgupta/madenginev2_slurm/venv/bin/activate').absolute()} + echo "Activated virtual environment" +fi# Step 1: Build Docker image on ALL nodes in parallel +echo "" +echo "=== Building Docker image on all $SLURM_NNODES nodes ===" +DOCKERFILE="{dockerfile_path}" +if [ -n "$DOCKERFILE" ] && [ -f "$DOCKERFILE" ]; then + # Get the image name - must match exactly what madengine generates + # Format: ci-_ + IMAGE_NAME=$(basename $DOCKERFILE .Dockerfile) + FULL_IMAGE_NAME="ci-{models[0].get('name', 'model') if models else 'model'}_$IMAGE_NAME" + + echo "Dockerfile: $DOCKERFILE" + echo "Image name: $FULL_IMAGE_NAME" + + # Build on all nodes in parallel using srun + srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES bash -c " + echo \\\"[\\$(hostname)] Building Docker image...\\\" + cd {Path.cwd().absolute()} + docker build --network=host -t $FULL_IMAGE_NAME --pull -f $DOCKERFILE ./docker + BUILD_RC=\\$? + if [ \\$BUILD_RC -eq 0 ]; then + echo \\\"[\\$(hostname)] Docker build SUCCESS\\\" + else + echo \\\"[\\$(hostname)] Docker build FAILED with exit code \\$BUILD_RC\\\" + fi + exit \\$BUILD_RC + " + DOCKER_BUILD_EXIT=$? 
+ + if [ $DOCKER_BUILD_EXIT -ne 0 ]; then + echo "Docker build failed on one or more nodes" + exit $DOCKER_BUILD_EXIT + fi + echo "" + echo "=== Docker image built on all nodes ===" fi -# Run the build -{build_cmd} +# Step 2: Run madengine build on rank 0 to generate manifest +echo "" +echo "=== Generating build manifest ===" +echo "Build command: {build_cmd_abs}" +echo "" + +{build_cmd_abs} +BUILD_EXIT=$? echo "" -echo "=== Build completed ===" +echo "=== Build completed with exit code: $BUILD_EXIT ===" +exit $BUILD_EXIT """ build_script_path = Path("madengine_build_job.sh") build_script_path.write_text(build_script_content) @@ -746,5 +873,4 @@ def _execute_build_on_compute( operation="build_on_compute", component="BuildOrchestrator", ), - ) from e - + ) from e \ No newline at end of file From e2c089f63ab3c8d17863051de74d7b3000734c22 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Wed, 11 Feb 2026 04:57:41 +0000 Subject: [PATCH 03/12] Fix syntax error in build-on-compute sbatch script generation Add missing newline after 'fi' statement to prevent syntax error in generated sbatch script when using --build-on-compute option. 
Co-authored-by: Cursor --- src/madengine/orchestration/build_orchestrator.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 17d1cd24..5dc7a981 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -773,7 +773,9 @@ def _execute_build_on_compute( if [ -f "{Path('/shared_inference/ravgupta/madenginev2_slurm/venv/bin/activate').absolute()}" ]; then source {Path('/shared_inference/ravgupta/madenginev2_slurm/venv/bin/activate').absolute()} echo "Activated virtual environment" -fi# Step 1: Build Docker image on ALL nodes in parallel +fi + +# Step 1: Build Docker image on ALL nodes in parallel echo "" echo "=== Building Docker image on all $SLURM_NNODES nodes ===" DOCKERFILE="{dockerfile_path}" From 944cfaf06e8984374b0788ba346b0888be2041f0 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Wed, 4 Mar 2026 07:04:32 +0000 Subject: [PATCH 04/12] Rename sglang-disagg/vllm-disagg launchers to slurm_multi Replace legacy launcher names with unified slurm_multi: - Remove sglang-disagg, vllm-disagg from VALID_LAUNCHERS - Add slurm_multi as the only baremetal multi-node launcher - Update deployment logic in slurm.py and kubernetes.py - Rename example config files from sglang-disagg-* to slurm-multi-* - Update docs/launchers.md references Breaking change: sglang-disagg and vllm-disagg are no longer valid launcher names. Use slurm_multi instead. 
Made-with: Cursor --- docs/launchers.md | 18 +- ...lit.json => slurm-multi-custom-split.json} | 2 +- ...json => slurm-multi-multi-node-basic.json} | 2 +- ...-minimal.json => slurm-multi-minimal.json} | 2 +- ...lit.json => slurm-multi-custom-split.json} | 2 +- ...-node.json => slurm-multi-multi-node.json} | 2 +- ...-minimal.json => slurm-multi-minimal.json} | 2 +- src/madengine/deployment/kubernetes.py | 857 +++++++++++++++++- src/madengine/deployment/slurm.py | 39 +- src/madengine/execution/container_runner.py | 13 +- 10 files changed, 897 insertions(+), 42 deletions(-) rename examples/k8s-configs/basic/{sglang-disagg-custom-split.json => slurm-multi-custom-split.json} (97%) rename examples/k8s-configs/basic/{sglang-disagg-multi-node-basic.json => slurm-multi-multi-node-basic.json} (97%) rename examples/k8s-configs/minimal/{sglang-disagg-minimal.json => slurm-multi-minimal.json} (92%) rename examples/slurm-configs/basic/{sglang-disagg-custom-split.json => slurm-multi-custom-split.json} (98%) rename examples/slurm-configs/basic/{sglang-disagg-multi-node.json => slurm-multi-multi-node.json} (98%) rename examples/slurm-configs/minimal/{sglang-disagg-minimal.json => slurm-multi-minimal.json} (93%) diff --git a/docs/launchers.md b/docs/launchers.md index 836fc2d0..d5c62f9e 100644 --- a/docs/launchers.md +++ b/docs/launchers.md @@ -364,7 +364,7 @@ SGLang Disaggregated separates inference into specialized node pools: ```json { "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 5, "nproc_per_node": 8, "sglang_disagg": { @@ -403,7 +403,7 @@ Override automatic split based on workload characteristics: ```json { "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 7, "nproc_per_node": 8, "sglang_disagg": { @@ -457,12 +457,12 @@ SGLANG_NODE_IPS="10.0.0.1,10.0.0.2,..." 
```

**Examples**:
-- K8s Minimal: `examples/k8s-configs/minimal/sglang-disagg-minimal.json`
-- K8s Basic: `examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json`
-- K8s Custom: `examples/k8s-configs/basic/sglang-disagg-custom-split.json`
-- SLURM Minimal: `examples/slurm-configs/minimal/sglang-disagg-minimal.json`
-- SLURM Basic: `examples/slurm-configs/basic/sglang-disagg-multi-node.json`
-- SLURM Custom: `examples/slurm-configs/basic/sglang-disagg-custom-split.json`
+- K8s Minimal: `examples/k8s-configs/minimal/slurm-multi-minimal.json`
+- K8s Basic: `examples/k8s-configs/basic/slurm-multi-multi-node-basic.json`
+- K8s Custom: `examples/k8s-configs/basic/slurm-multi-custom-split.json`
+- SLURM Minimal: `examples/slurm-configs/minimal/slurm-multi-minimal.json`
+- SLURM Basic: `examples/slurm-configs/basic/slurm-multi-multi-node.json`
+- SLURM Custom: `examples/slurm-configs/basic/slurm-multi-custom-split.json`

**Comparison: SGLang vs SGLang Disaggregated**:

@@ -681,7 +681,7 @@ SGLANG_NODE_RANK=${SLURM_PROCID}
```bash
Error: Unknown launcher type 'xyz'
```
-Solution: Use one of: `torchrun`, `deepspeed`, `megatron`, `torchtitan`, `vllm`, `sglang`, `sglang-disagg`
+Solution: Use one of: `torchrun`, `deepspeed`, `megatron`, `torchtitan`, `vllm`, `sglang`, `slurm_multi`

**2.
Multi-Node Communication Fails** ```bash diff --git a/examples/k8s-configs/basic/sglang-disagg-custom-split.json b/examples/k8s-configs/basic/slurm-multi-custom-split.json similarity index 97% rename from examples/k8s-configs/basic/sglang-disagg-custom-split.json rename to examples/k8s-configs/basic/slurm-multi-custom-split.json index 49aeecb1..12fd2a4a 100644 --- a/examples/k8s-configs/basic/sglang-disagg-custom-split.json +++ b/examples/k8s-configs/basic/slurm-multi-custom-split.json @@ -26,7 +26,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 7, "nproc_per_node": 8, "master_port": 29500, diff --git a/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json b/examples/k8s-configs/basic/slurm-multi-multi-node-basic.json similarity index 97% rename from examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json rename to examples/k8s-configs/basic/slurm-multi-multi-node-basic.json index c16fd342..94f51fc5 100644 --- a/examples/k8s-configs/basic/sglang-disagg-multi-node-basic.json +++ b/examples/k8s-configs/basic/slurm-multi-multi-node-basic.json @@ -25,7 +25,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 5, "nproc_per_node": 8, "master_port": 29500 diff --git a/examples/k8s-configs/minimal/sglang-disagg-minimal.json b/examples/k8s-configs/minimal/slurm-multi-minimal.json similarity index 92% rename from examples/k8s-configs/minimal/sglang-disagg-minimal.json rename to examples/k8s-configs/minimal/slurm-multi-minimal.json index f0f6ad05..1aab4386 100644 --- a/examples/k8s-configs/minimal/sglang-disagg-minimal.json +++ b/examples/k8s-configs/minimal/slurm-multi-minimal.json @@ -11,7 +11,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 3, "nproc_per_node": 1 } diff --git a/examples/slurm-configs/basic/sglang-disagg-custom-split.json b/examples/slurm-configs/basic/slurm-multi-custom-split.json similarity index 98% rename 
from examples/slurm-configs/basic/sglang-disagg-custom-split.json rename to examples/slurm-configs/basic/slurm-multi-custom-split.json index f38bcf64..4437085b 100644 --- a/examples/slurm-configs/basic/sglang-disagg-custom-split.json +++ b/examples/slurm-configs/basic/slurm-multi-custom-split.json @@ -25,7 +25,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 7, "nproc_per_node": 8, "backend": "nccl", diff --git a/examples/slurm-configs/basic/sglang-disagg-multi-node.json b/examples/slurm-configs/basic/slurm-multi-multi-node.json similarity index 98% rename from examples/slurm-configs/basic/sglang-disagg-multi-node.json rename to examples/slurm-configs/basic/slurm-multi-multi-node.json index 7dfbae19..6f985e85 100644 --- a/examples/slurm-configs/basic/sglang-disagg-multi-node.json +++ b/examples/slurm-configs/basic/slurm-multi-multi-node.json @@ -24,7 +24,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 5, "nproc_per_node": 8, "backend": "nccl", diff --git a/examples/slurm-configs/minimal/sglang-disagg-minimal.json b/examples/slurm-configs/minimal/slurm-multi-minimal.json similarity index 93% rename from examples/slurm-configs/minimal/sglang-disagg-minimal.json rename to examples/slurm-configs/minimal/slurm-multi-minimal.json index ee4ad9f2..95fb9761 100644 --- a/examples/slurm-configs/minimal/sglang-disagg-minimal.json +++ b/examples/slurm-configs/minimal/slurm-multi-minimal.json @@ -14,7 +14,7 @@ }, "distributed": { - "launcher": "sglang-disagg", + "launcher": "slurm_multi", "nnodes": 3, "nproc_per_node": 1 } diff --git a/src/madengine/deployment/kubernetes.py b/src/madengine/deployment/kubernetes.py index 71550d5e..959fa0d1 100644 --- a/src/madengine/deployment/kubernetes.py +++ b/src/madengine/deployment/kubernetes.py @@ -73,6 +73,21 @@ from .kubernetes_launcher_mixin import KubernetesLauncherMixin +VALID_LAUNCHERS = [ + "torchrun", + "torchtitan", + "deepspeed", + 
"megatron-lm", + "vllm", + "sglang", + "slurm_multi", +] + +SLURM_MULTI_ALIASES = [ + "slurm_multi", + "slurm-multi", +] + def match_pvc_subdir_to_k8s_pod( pvc_subdir: str, @@ -849,20 +864,20 @@ def _prepare_template_context( model_args=model_info.get("args", ""), ) - elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg": + elif launcher_type.lower().replace("_", "-") in [a.lower().replace("_", "-") for a in SLURM_MULTI_ALIASES]: if nnodes < 3: raise ValueError( - f"SGLang Disaggregated requires minimum 3 nodes " + f"slurm_multi launcher requires minimum 3 nodes " f"(1 proxy + 1 prefill + 1 decode), got {nnodes}" ) # Always create headless service for disaggregated architecture create_headless_service = True - self.console.print(f"[dim]SGLang Disaggregated: Creating headless service for {nnodes} pods[/dim]") + self.console.print(f"[dim]slurm_multi: Creating headless service for {nnodes} pods[/dim]") self.console.print(f"[dim] Architecture: 1 proxy + {max(1, (nnodes-1)*2//5)} prefill + {nnodes-1-max(1, (nnodes-1)*2//5)} decode[/dim]") - # Generate SGLang Disaggregated launcher command - launcher_command = self._generate_sglang_disagg_command( + # Generate slurm_multi launcher command + launcher_command = self._generate_slurm_multi_command( nnodes=nnodes, nproc_per_node=nproc_per_node, master_port=master_port, @@ -1202,6 +1217,838 @@ def _enrich_tools_with_cmd(self, tools: List[Dict]) -> List[Dict]: return enriched_tools + def _generate_torchrun_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate torchrun launcher command for K8s Indexed Jobs. + + For single-node (nnodes=1), generates standalone torchrun command. + For multi-node (nnodes>1), generates distributed torchrun with headless + service DNS for coordination. + + Uses K8s environment variables for distributed coordination: + - JOB_COMPLETION_INDEX: Pod index (0, 1, 2, ...) 
+ - Headless service DNS for MASTER_ADDR + + CRITICAL FIX: For bash scripts that use ${BASH_SOURCE[0]}, we cd into the + script directory first so relative paths resolve correctly. This fixes the + issue where profiling tool wrappers prevent BASH_SOURCE from resolving. + + Args: + nnodes: Number of nodes (pods). Must be >= 1. + nproc_per_node: GPUs per node. Must be >= 1. + master_port: Master communication port. Must be 1-65535. + model_script: Path to model's run script. Cannot be empty. + + Returns: + Complete torchrun command string + + Raises: + ValueError: If any parameter is invalid + """ + from pathlib import Path + + # Validate inputs (defensive programming) + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + if not isinstance(master_port, int) or not (1 <= master_port <= 65535): + raise ValueError(f"master_port must be 1-65535, got {master_port}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string, got {model_script}") + + # Check if model_script is a bash script + # If so, execute it directly as it handles torchrun internally + if model_script.endswith('.sh'): + # For bash scripts, set environment variables and execute script + # The script itself will invoke torchrun with the appropriate Python file + # CRITICAL: cd to script directory first so BASH_SOURCE[0] resolves correctly + script_dir = str(Path(model_script).parent) + script_name = str(Path(model_script).name) + if nnodes == 1: + return f"""export MAD_MULTI_NODE_RUNNER="torchrun --standalone --nproc_per_node={nproc_per_node}" +export MAD_RUNTIME_NGPUS={nproc_per_node} +cd {script_dir} && bash {script_name}""" + else: + return f"""# Multi-node torchrun setup (Kubernetes Indexed Job) +export 
MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export MAD_MULTI_NODE_RUNNER="torchrun --nnodes={nnodes} --nproc_per_node={nproc_per_node} --node_rank=${{JOB_COMPLETION_INDEX}} --master_addr=${{MASTER_ADDR}} --master_port={master_port}" +export MAD_RUNTIME_NGPUS={nproc_per_node} +cd {script_dir} && bash {script_name}""" + + # For Python scripts, invoke torchrun directly + # For single-node, simpler standalone command + if nnodes == 1: + return f"""torchrun \\ + --standalone \\ + --nnodes=1 \\ + --nproc_per_node={nproc_per_node} \\ + {model_script}""" + + # Multi-node: Use headless service DNS and JOB_COMPLETION_INDEX + return f"""# Multi-node torchrun setup (Kubernetes Indexed Job) +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export RANK=${{JOB_COMPLETION_INDEX}} +export WORLD_SIZE={nnodes} +export LOCAL_RANK=0 +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} + +echo "Torchrun Configuration:" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " RANK: $RANK" +echo " WORLD_SIZE: $WORLD_SIZE" +echo " NPROC_PER_NODE: $NPROC_PER_NODE" + +torchrun \\ + --nnodes={nnodes} \\ + --nproc_per_node={nproc_per_node} \\ + --rdzv_backend=c10d \\ + --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \\ + --rdzv_id={self.job_name} \\ + --role=worker \\ + --tee=3 \\ + {model_script}""" + + def _generate_deepspeed_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate DeepSpeed launcher command for K8s Indexed Jobs. + + DeepSpeed has its own launcher that handles: + - ZeRO optimization stages (ZeRO-1, ZeRO-2, ZeRO-3) + - Gradient accumulation + - Mixed precision training + - Pipeline parallelism + - Hostfile management (handled by K8s in our case) + + For single-node (nnodes=1), uses localhost setup. 
+ For multi-node (nnodes>1), uses headless service DNS for coordination. + + Args: + nnodes: Number of nodes (pods). Must be >= 1. + nproc_per_node: GPUs per node. Must be >= 1. + master_port: Master communication port. Must be 1-65535. + model_script: Path to model's run script. Cannot be empty. + + Returns: + Complete DeepSpeed launcher command string + + Raises: + ValueError: If any parameter is invalid + """ + # Validate inputs + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + if not isinstance(master_port, int) or not (1 <= master_port <= 65535): + raise ValueError(f"master_port must be 1-65535, got {master_port}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string, got {model_script}") + + # For single-node + if nnodes == 1: + return f"""# DeepSpeed Single-Node Setup +export MASTER_ADDR=localhost +export MASTER_PORT={master_port} +export RANK=0 +export LOCAL_RANK=0 +export WORLD_SIZE={nproc_per_node} + +echo "DeepSpeed Configuration:" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " WORLD_SIZE: $WORLD_SIZE" +echo " NUM_GPUS: {nproc_per_node}" + +# DeepSpeed launcher (single-node) +deepspeed --num_gpus={nproc_per_node} \\ + --master_port={master_port} \\ + {model_script}""" + + # Multi-node: Use K8s headless service for coordination + return f"""# Multi-node DeepSpeed setup (Kubernetes Indexed Job) +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export RANK=${{JOB_COMPLETION_INDEX}} +export LOCAL_RANK=0 +export WORLD_SIZE={nnodes * nproc_per_node} +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} + +echo "DeepSpeed Multi-Node Configuration:" +echo " MASTER_ADDR: 
$MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " RANK (Node Rank): $RANK" +echo " WORLD_SIZE: $WORLD_SIZE" +echo " NNODES: $NNODES" +echo " NPROC_PER_NODE: $NPROC_PER_NODE" + +# Create hostfile for DeepSpeed (K8s Indexed Job aware) +cat > /tmp/hostfile << EOF +{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local slots={nproc_per_node} +EOF + +# Add all nodes to hostfile +for i in $(seq 1 $((NNODES - 1))); do + echo "{self.job_name}-$i.{self.job_name}.{self.namespace}.svc.cluster.local slots={nproc_per_node}" >> /tmp/hostfile +done + +echo "" +echo "Generated hostfile:" +cat /tmp/hostfile +echo "" + +# DeepSpeed launcher (multi-node with hostfile) +deepspeed --hostfile=/tmp/hostfile \\ + --master_addr=$MASTER_ADDR \\ + --master_port=$MASTER_PORT \\ + --num_nodes={nnodes} \\ + --num_gpus={nproc_per_node} \\ + {model_script}""" + + def _generate_bash_script_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate command to execute a bash script directly. + + This is used when the model script is a .sh file that handles + launcher invocation internally (e.g., using torchrun inside the script). + + Sets up environment variables for distributed training that the bash + script can use. 
+ + Args: + nnodes: Number of nodes (pods) + nproc_per_node: GPUs per node + master_port: Master communication port + model_script: Path to the bash script + + Returns: + Command to execute the bash script with environment setup + """ + # For single-node + if nnodes == 1: + return f"""# Bash Script Execution (Single-Node) +# Setting up environment for script to use +export MASTER_ADDR=localhost +export MASTER_PORT={master_port} +export RANK=0 +export LOCAL_RANK=0 +export WORLD_SIZE={nproc_per_node} +export NNODES=1 +export NPROC_PER_NODE={nproc_per_node} + +echo "Bash Script Configuration:" +echo " Script: {model_script}" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " WORLD_SIZE: $WORLD_SIZE" +echo " NNODES: $NNODES" +echo " NPROC_PER_NODE: $NPROC_PER_NODE" +echo "" + +# Execute the bash script directly +bash {model_script}""" + + # Multi-node: Use K8s headless service for coordination + return f"""# Bash Script Execution (Multi-Node) +# Setting up environment for script to use +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export RANK=${{JOB_COMPLETION_INDEX}} +export LOCAL_RANK=0 +export WORLD_SIZE={nnodes * nproc_per_node} +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} + +echo "Bash Script Multi-Node Configuration:" +echo " Script: {model_script}" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " RANK (Node Rank): $RANK" +echo " WORLD_SIZE: $WORLD_SIZE" +echo " NNODES: $NNODES" +echo " NPROC_PER_NODE: $NPROC_PER_NODE" +echo "" + +# Execute the bash script directly +bash {model_script}""" + + def _generate_torchtitan_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate TorchTitan launcher command for K8s Indexed Jobs. 
+ + TorchTitan is a PyTorch native platform for large-scale LLM pre-training + that supports multi-dimensional parallelism: + - FSDP2 (Fully Sharded Data Parallel v2) + - Tensor Parallel (TP) + - Pipeline Parallel (PP) + - Context Parallel (CP) + + TorchTitan uses torchrun as its underlying distributed launcher but + requires additional configuration for its parallelism strategies. + + For single-node (nnodes=1): Uses standalone torchrun with TP + For multi-node (nnodes>1): Uses distributed torchrun with TP+PP+FSDP2 + + Uses K8s environment variables for distributed coordination: + - JOB_COMPLETION_INDEX: Pod index (0, 1, 2, ...) + - Headless service DNS for MASTER_ADDR + + Args: + nnodes: Number of nodes (pods). Must be >= 1. + nproc_per_node: GPUs per node. Must be >= 1. + master_port: Master communication port. Must be 1-65535. + model_script: Path to model's run script. Cannot be empty. + + Returns: + Complete torchtitan launch command string with environment setup + + Raises: + ValueError: If any parameter is invalid + + Example single-node output: + export TORCHTITAN_TENSOR_PARALLEL_SIZE=8 + export TORCHTITAN_PIPELINE_PARALLEL_SIZE=1 + torchrun --standalone --nproc_per_node=8 train.py --config llama3_8b.toml + + Example multi-node output: + export MASTER_ADDR="job-0.job.namespace.svc.cluster.local" + export TORCHTITAN_TENSOR_PARALLEL_SIZE=8 + export TORCHTITAN_PIPELINE_PARALLEL_SIZE=4 + export TORCHTITAN_FSDP_ENABLED=1 + torchrun --nnodes=4 --nproc_per_node=8 ... 
train.py --config llama3_405b.toml + """ + # Validate inputs + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + if not isinstance(master_port, int) or not (1 <= master_port <= 65535): + raise ValueError(f"master_port must be 1-65535, got {master_port}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string, got {model_script}") + + # For single-node, use standalone mode with Tensor Parallelism only + if nnodes == 1: + return f"""# TorchTitan single-node setup (Tensor Parallelism) +export TORCHTITAN_TENSOR_PARALLEL_SIZE={nproc_per_node} +export TORCHTITAN_PIPELINE_PARALLEL_SIZE=1 +export TORCHTITAN_FSDP_ENABLED=0 +export TORCHTITAN_CONTEXT_PARALLEL_SIZE=1 + +echo "TorchTitan Configuration (Single Node):" +echo " Tensor Parallel Size: {nproc_per_node}" +echo " Pipeline Parallel Size: 1" +echo " Total GPUs: {nproc_per_node}" + +torchrun \\ + --standalone \\ + --nnodes=1 \\ + --nproc_per_node={nproc_per_node} \\ + {model_script}""" + + # Multi-node: Use headless service DNS and enable all parallelism strategies + return f"""# TorchTitan multi-node setup (K8s Indexed Job) +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export RANK=${{JOB_COMPLETION_INDEX}} +export WORLD_SIZE={nnodes} +export LOCAL_RANK=0 +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} + +# TorchTitan multi-dimensional parallelism configuration +# These can be overridden by TOML config file in model script +export TORCHTITAN_TENSOR_PARALLEL_SIZE={nproc_per_node} +export TORCHTITAN_PIPELINE_PARALLEL_SIZE={nnodes} +export TORCHTITAN_FSDP_ENABLED=1 +export TORCHTITAN_CONTEXT_PARALLEL_SIZE=1 + +echo "TorchTitan Configuration (Multi-Node):" +echo " 
MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " RANK: $RANK" +echo " WORLD_SIZE: $WORLD_SIZE" +echo " Tensor Parallel Size: {nproc_per_node}" +echo " Pipeline Parallel Size: {nnodes}" +echo " FSDP: Enabled" +echo " Total GPUs: {nnodes * nproc_per_node}" + +torchrun \\ + --nnodes={nnodes} \\ + --nproc_per_node={nproc_per_node} \\ + --rdzv_backend=c10d \\ + --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \\ + --rdzv_id={self.job_name} \\ + --role=worker \\ + --tee=3 \\ + {model_script}""" + + def _generate_slurm_multi_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate slurm_multi launcher command for K8s Indexed Jobs. + + slurm_multi uses separate node pools for: + - Proxy (index 0): Load balancer and request router + - Prefill (indices 1 to xP): Prompt processing + - Decode (indices xP+1 to end): Token generation + + Communication via Mooncake framework for efficient KV cache transfer. + + Architecture: + - Pod 0: Runs mini_lb (proxy/load balancer) + - Pods 1-xP: Run prefill servers + - Pods xP+1 to N-1: Run decode servers + + Args: + nnodes: Total number of pods (must be >= 3) + nproc_per_node: GPUs per pod + master_port: Port for proxy service + model_script: Path to model launch script + + Returns: + Complete multi-node launch setup + + Raises: + ValueError: If nnodes < 3 or invalid parameters + """ + # Validate + if not isinstance(nnodes, int) or nnodes < 3: + raise ValueError( + f"slurm_multi requires minimum 3 nodes, got {nnodes}" + ) + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be >= 1, got {nproc_per_node}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string") + + # Check if custom split is specified in additional_context + sglang_disagg_config = self.config.additional_context.get("distributed", {}).get("sglang_disagg", {}) + prefill_nodes = 
sglang_disagg_config.get("prefill_nodes") + decode_nodes = sglang_disagg_config.get("decode_nodes") + + if prefill_nodes is not None and decode_nodes is not None: + # User specified custom split - validate + if prefill_nodes < 1 or decode_nodes < 1: + raise ValueError( + f"SGLang Disaggregated requires at least 1 prefill and 1 decode node, " + f"got prefill={prefill_nodes}, decode={decode_nodes}" + ) + if prefill_nodes + decode_nodes + 1 != nnodes: + raise ValueError( + f"Custom split validation failed: " + f"prefill_nodes ({prefill_nodes}) + decode_nodes ({decode_nodes}) + 1 proxy " + f"must equal nnodes ({nnodes}), but got {prefill_nodes + decode_nodes + 1}" + ) + xP = prefill_nodes + yD = decode_nodes + else: + # Default automatic split (can be customized via additional_context) + xP = max(1, (nnodes - 1) * 2 // 5) # ~40% prefill + yD = nnodes - 1 - xP # remaining decode + + # Build prefill and decode server lists + prefill_servers = " ".join([ + f"http://{self.job_name}-{i}.{self.job_name}.{self.namespace}.svc.cluster.local:30000" + for i in range(1, xP + 1) + ]) + + decode_servers = " ".join([ + f"http://{self.job_name}-{i}.{self.job_name}.{self.namespace}.svc.cluster.local:30000" + for i in range(xP + 1, nnodes) + ]) + + return f"""# SGLang Disaggregated K8s Setup +# ============================================ +# Cluster: {nnodes} pods total +# Proxy: Pod 0 +# Prefill: Pods 1-{xP} ({xP} nodes) +# Decode: Pods {xP+1}-{nnodes-1} ({yD} nodes) +# ============================================ + +export POD_INDEX=${{JOB_COMPLETION_INDEX:-0}} +export TOTAL_PODS={nnodes} +export PREFILL_COUNT={xP} +export DECODE_COUNT={yD} +export TP_SIZE={nproc_per_node} + +# Get pod IP +export POD_IP=$(hostname -i | awk '{{print $1}}') + +echo "==========================================" +echo "SGLang Disaggregated Pod Info" +echo "==========================================" +echo "Pod Index: $POD_INDEX" +echo "Pod IP: $POD_IP" +echo "Total Pods: $TOTAL_PODS" +echo "Prefill Pods: 
$PREFILL_COUNT" +echo "Decode Pods: $DECODE_COUNT" +echo "TP Size: $TP_SIZE" +echo "==========================================" + +# Node role assignment based on pod index +if [ "$POD_INDEX" -eq 0 ]; then + # Proxy Node (Load Balancer) + echo "🔀 This pod is PROXY (Load Balancer)" + + python3 -m sglang.srt.disaggregation.mini_lb \\ + --prefill {prefill_servers} \\ + --decode {decode_servers} \\ + --host 0.0.0.0 \\ + --port {master_port} + +elif [ "$POD_INDEX" -le "{xP}" ]; then + # Prefill Nodes + echo "⚡ This pod is PREFILL Node" + + python3 -m sglang.launch_server \\ + --model-path "$MODEL_PATH" \\ + --disaggregation-mode prefill \\ + --tp-size {nproc_per_node} \\ + --host $POD_IP \\ + --port 30000 \\ + --trust-remote-code \\ + --disaggregation-transfer-backend mooncake + +else + # Decode Nodes + echo "🔤 This pod is DECODE Node" + + python3 -m sglang.launch_server \\ + --model-path "$MODEL_PATH" \\ + --disaggregation-mode decode \\ + --tp-size {nproc_per_node} \\ + --host $POD_IP \\ + --port 30000 \\ + --trust-remote-code \\ + --disaggregation-transfer-backend mooncake +fi + +echo "SGLang Disaggregated setup complete" +""" + + def _generate_vllm_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate vLLM launcher command for K8s Indexed Jobs. + + vLLM is an inference engine with its own process management via Ray. + Unlike training frameworks, vLLM doesn't use torchrun. 
+ + Architecture: + - Single-node: Tensor Parallelism (TP) across GPUs, no Ray needed + - Multi-node: Data Parallelism where each node runs independent vLLM replica + * Each replica uses TP across its local GPUs + * Ray coordinates resources on each node independently + * Benefits: Simpler, more robust, better for inference serving + + For K8s multi-node: + - Each pod runs its own independent vLLM instance + - Uses Ray for local GPU coordination + - NO shared Ray cluster across pods (Data Parallelism mode) + + Args: + nnodes: Number of nodes (pods). Must be >= 1. + nproc_per_node: GPUs per node. Must be >= 1. + master_port: Master communication port (for Ray). Must be 1-65535. + model_script: Path to model's run script. Cannot be empty. + + Returns: + Complete vLLM launch setup with environment configuration + + Raises: + ValueError: If any parameter is invalid + """ + # Validate inputs + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + if not isinstance(master_port, int) or not (1 <= master_port <= 65535): + raise ValueError(f"master_port must be 1-65535, got {master_port}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string, got {model_script}") + + # For single-node, simple TP setup (no Ray needed) + if nnodes == 1: + return f"""# vLLM single-node setup (Tensor Parallelism) +export VLLM_TENSOR_PARALLEL_SIZE={nproc_per_node} +export VLLM_PIPELINE_PARALLEL_SIZE=1 +export VLLM_DISTRIBUTED_BACKEND="auto" +export NNODES=1 +export NPROC_PER_NODE={nproc_per_node} +export NODE_RANK=0 + +echo "vLLM Configuration (Single Node):" +echo " Tensor Parallel Size: {nproc_per_node}" +echo " Pipeline Parallel Size: 1" +echo " Distributed Backend: auto (no Ray)" +echo " Total GPUs: {nproc_per_node}" + +# vLLM 
handles process management - just run the script +{model_script}""" + + # Multi-node: Data Parallelism with independent Ray clusters per pod + return f"""# vLLM multi-node setup (K8s Data Parallelism Mode) +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export NODE_RANK=${{JOB_COMPLETION_INDEX}} +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} + +# vLLM Data Parallelism configuration +# Each pod runs INDEPENDENT vLLM replica (no shared Ray cluster) +export VLLM_TENSOR_PARALLEL_SIZE={nproc_per_node} +export VLLM_PIPELINE_PARALLEL_SIZE=1 +export VLLM_DISTRIBUTED_BACKEND="ray" + +# Get current pod IP for Ray +POD_IP=$(hostname -i | awk '{{print $1}}') +export VLLM_HOST_IP="$POD_IP" + +echo "vLLM Configuration (Multi-Node Data Parallelism):" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " NODE_RANK: $NODE_RANK (Pod Index)" +echo " NNODES: $NNODES" +echo " Tensor Parallel Size: {nproc_per_node} (per pod)" +echo " Data Parallel Size: {nnodes} (independent replicas)" +echo " Pod IP: $POD_IP" +echo " Total GPUs: {nnodes * nproc_per_node}" +echo "" +echo "Mode: Each pod runs independent vLLM replica with local Ray" + +# Clean any existing Ray processes +ray stop --force 2>/dev/null || true +pkill -9 -f "ray::" 2>/dev/null || true +sleep 2 + +# Start independent Ray cluster on THIS pod only +echo "Starting Ray cluster on Pod $NODE_RANK..." +ray start --head --port=6379 --node-ip-address="$POD_IP" --num-gpus={nproc_per_node} +sleep 3 + +echo "Ray cluster ready:" +ray status + +# Run vLLM inference script +{model_script} + +# Cleanup Ray on exit +trap "ray stop --force 2>/dev/null || true" EXIT""" + + def _generate_sglang_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate SGLang launcher command for K8s Indexed Jobs. 
+ + SGLang is an inference engine with native launcher (sglang.launch_server). + Similar to vLLM, it manages its own process spawning via Ray. + + Architecture: + - Single-node: Tensor Parallelism (TP) across GPUs + - Multi-node: Uses SGLang's native multi-node launcher with Ray + * TP across GPUs within each node + * Ray for distributed coordination + + For K8s: + - Uses headless service for node discovery (similar to torchrun) + - Each pod knows its rank via JOB_COMPLETION_INDEX + - SGLang native launcher handles Ray cluster setup + + Args: + nnodes: Number of nodes (pods). Must be >= 1. + nproc_per_node: GPUs per node. Must be >= 1. + master_port: Master communication port (for NCCL/Ray). Must be 1-65535. + model_script: Path to model's run script. Cannot be empty. + + Returns: + Complete SGLang launch setup with environment configuration + + Raises: + ValueError: If any parameter is invalid + """ + # Validate inputs + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + if not isinstance(master_port, int) or not (1 <= master_port <= 65535): + raise ValueError(f"master_port must be 1-65535, got {master_port}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string, got {model_script}") + + # For single-node, simple TP setup + if nnodes == 1: + return f"""# SGLang single-node setup (Tensor Parallelism) +export SGLANG_TENSOR_PARALLEL_SIZE={nproc_per_node} +export SGLANG_PIPELINE_PARALLEL_SIZE=1 +export NNODES=1 +export NPROC_PER_NODE={nproc_per_node} +export NODE_RANK=0 + +echo "SGLang Configuration (Single Node):" +echo " Tensor Parallel Size: {nproc_per_node}" +echo " Total GPUs: {nproc_per_node}" + +# SGLang native launcher handles everything +{model_script}""" + + # Multi-node: Use SGLang's 
native multi-node support + return f"""# SGLang multi-node setup (K8s Indexed Job) +export MASTER_ADDR="{self.job_name}-0.{self.job_name}.{self.namespace}.svc.cluster.local" +export MASTER_PORT={master_port} +export NODE_RANK=${{JOB_COMPLETION_INDEX}} +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} + +# SGLang parallelism configuration +export SGLANG_TENSOR_PARALLEL_SIZE={nproc_per_node} +export SGLANG_PIPELINE_PARALLEL_SIZE=1 + +# Get current pod IP +POD_IP=$(hostname -i | awk '{{print $1}}') +export SGLANG_HOST_IP="$POD_IP" + +echo "SGLang Configuration (Multi-Node):" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " NODE_RANK: $NODE_RANK (Pod Index)" +echo " NNODES: $NNODES" +echo " Tensor Parallel Size: {nproc_per_node}" +echo " Pod IP: $POD_IP" +echo " Total GPUs: {nnodes * nproc_per_node}" + +# Clean any existing Ray processes +ray stop --force 2>/dev/null || true +pkill -9 -f "ray::" 2>/dev/null || true +sleep 2 + +# SGLang native launcher will handle Ray cluster coordination +# Pass NCCL init address for multi-node setup +export NCCL_INIT_ADDR="${{MASTER_ADDR}}:${{MASTER_PORT}}" + +echo "Starting SGLang with native multi-node launcher..." +{model_script} + +# Cleanup Ray on exit +trap "ray stop --force 2>/dev/null || true" EXIT""" + + def _generate_megatron_command( + self, nnodes: int, nproc_per_node: int, master_port: int, model_script: str + ) -> str: + """ + Generate Megatron-LM launcher command for K8s Indexed Jobs. + + Megatron-LM is a training framework for large transformers with tensor and pipeline parallelism. + It uses torchrun as the underlying launcher but with Megatron-specific environment variables. 
+ + Architecture: + - Single-node: Tensor Parallelism (TP) across GPUs + - Multi-node: Tensor + Pipeline Parallelism + * TP across GPUs within each node + * PP across nodes + + For K8s: + - Uses headless service for node discovery (like torchrun/deepspeed) + - Each pod knows its rank via JOB_COMPLETION_INDEX + - Sets TENSOR_MODEL_PARALLEL_SIZE and PIPELINE_MODEL_PARALLEL_SIZE (Megatron-Core standard) + + Args: + nnodes: Number of nodes (pods). Must be >= 1. + nproc_per_node: GPUs per node. Must be >= 1. + master_port: Master communication port (for NCCL). Must be 1-65535. + model_script: Path to model's run script. Cannot be empty. + + Returns: + Complete Megatron-LM launch setup with environment configuration + + Raises: + ValueError: If any parameter is invalid + """ + # Validate inputs + if not isinstance(nnodes, int) or nnodes < 1: + raise ValueError(f"nnodes must be integer >= 1, got {nnodes}") + if not isinstance(nproc_per_node, int) or nproc_per_node < 1: + raise ValueError(f"nproc_per_node must be integer >= 1, got {nproc_per_node}") + if not isinstance(master_port, int) or not (1 <= master_port <= 65535): + raise ValueError(f"master_port must be 1-65535, got {master_port}") + if not model_script or not isinstance(model_script, str): + raise ValueError(f"model_script must be non-empty string, got {model_script}") + + # For single-node, use TP only + if nnodes == 1: + return f"""# Megatron-LM single-node setup (Tensor Parallelism) +export TENSOR_MODEL_PARALLEL_SIZE={min(nproc_per_node, 8)} +export PIPELINE_MODEL_PARALLEL_SIZE=1 +export CONTEXT_PARALLEL_SIZE=1 +export NNODES=1 +export NPROC_PER_NODE={nproc_per_node} +export MASTER_ADDR=localhost +export MASTER_PORT={master_port} +export NODE_RANK=0 + +echo "Megatron-LM Configuration (Single-Node):" +echo " Tensor Model Parallel Size: {min(nproc_per_node, 8)}" +echo " Pipeline Model Parallel Size: 1" +echo " Total GPUs: {nproc_per_node}" + +# Launch using torchrun with Megatron configuration +torchrun \\ + 
--standalone \\ + --nproc_per_node={nproc_per_node} \\ + {model_script}""" + + # Multi-node: TP + PP + else: + # Use headless service for node discovery (set by template) + return f"""# Megatron-LM multi-node setup (Tensor + Pipeline Parallelism) +export TENSOR_MODEL_PARALLEL_SIZE={nproc_per_node} +export PIPELINE_MODEL_PARALLEL_SIZE={nnodes} +export CONTEXT_PARALLEL_SIZE=1 +export NNODES={nnodes} +export NPROC_PER_NODE={nproc_per_node} +export NODE_RANK=${{JOB_COMPLETION_INDEX}} +export MASTER_ADDR=${{MASTER_ADDR}} +export MASTER_PORT={master_port} + +echo "Megatron-LM Configuration (Multi-Node):" +echo " MASTER_ADDR: $MASTER_ADDR" +echo " MASTER_PORT: $MASTER_PORT" +echo " NODE_RANK: $NODE_RANK (Pod Index)" +echo " NNODES: $NNODES" +echo " Tensor Model Parallel Size: {nproc_per_node}" +echo " Pipeline Model Parallel Size: {nnodes}" +echo " Total GPUs: {nnodes * nproc_per_node}" + +# Wait for all pods to be ready (K8s Indexed Job coordination) +echo "Waiting for all {nnodes} pods to be ready..." +sleep 5 + +# Launch using torchrun with Megatron multi-node configuration +torchrun \\ + --nnodes={nnodes} \\ + --nproc_per_node={nproc_per_node} \\ + --node_rank=${{NODE_RANK}} \\ + --master_addr=${{MASTER_ADDR}} \\ + --master_port={master_port} \\ + {model_script}""" def _load_k8s_tools(self) -> Dict: """ Load K8s-specific tools configuration. 
diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index ef3e8477..b3d5dc9f 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -24,6 +24,12 @@ from madengine.utils.run_details import get_build_number, get_pipeline from madengine.utils.path_utils import scripts_base_dir_from import json +from typing import Optional + +SLURM_MULTI_ALIASES = [ + "slurm_multi", + "slurm-multi", +] class SlurmDeployment(BaseDeployment): @@ -299,16 +305,18 @@ def prepare(self) -> bool: model_key = model_keys[0] model_info = self.manifest["built_models"][model_key] - # Check if this is a baremetal launcher (sglang-disagg, vllm-disagg) + # Check if this is a slurm_multi launcher (baremetal multi-node) # Priority: model_info.distributed.launcher > additional_context.distributed.launcher model_distributed = model_info.get("distributed", {}) launcher_type = model_distributed.get("launcher") or self.distributed_config.get("launcher", "torchrun") launcher_normalized = launcher_type.lower().replace("_", "-") - if launcher_normalized in ["sglang-disagg", "vllm-disagg"]: - # For disagg launchers, generate simple wrapper script + # Check against slurm_multi aliases (includes legacy sglang-disagg, vllm-disagg) + slurm_multi_aliases_normalized = [a.lower().replace("_", "-") for a in SLURM_MULTI_ALIASES] + if launcher_normalized in slurm_multi_aliases_normalized: + # For slurm_multi launchers, generate simple wrapper script # that runs the model's .slurm script directly on baremetal - self.console.print(f"[cyan]Detected baremetal launcher: {launcher_type}[/cyan]") + self.console.print(f"[cyan]Detected slurm_multi launcher: {launcher_type}[/cyan]") # Pass model_key as docker_image_name (for manifests, the key IS the built image name) return self._prepare_baremetal_script(model_info, docker_image_name=model_key) @@ -349,10 +357,11 @@ def _normalize_nodelist(nodelist: Optional[str]) -> Optional[str]: def 
_prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = None) -> bool: """ - Generate a simple wrapper script for baremetal launchers (sglang-disagg, vllm-disagg). + Generate a simple wrapper script for baremetal/slurm_multi launchers. - These launchers run the model's .slurm script directly on baremetal, - which then manages Docker containers via srun. No madengine wrapper needed. + These launchers (slurm_multi, sglang-disagg, vllm-disagg) run the model's + .slurm script directly on baremetal, which then manages Docker containers + via srun. No madengine wrapper needed. Args: model_info: Model configuration from manifest @@ -432,7 +441,7 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N script_lines.extend([ "", f"# Baremetal launcher script for {model_info['name']}", - f"# Generated by madengine for sglang-disagg", + f"# Generated by madengine for slurm_multi", "", "set -e", "", @@ -594,8 +603,8 @@ def _generate_launcher_command( return self._generate_vllm_command(nnodes, nproc_per_node, master_port) elif launcher_type == "sglang": return self._generate_sglang_command(nnodes, nproc_per_node, master_port) - elif launcher_type == "sglang-disagg" or launcher_type == "sglang_disagg": - return self._generate_sglang_disagg_command(nnodes, nproc_per_node, master_port) + elif launcher_type.lower().replace("_", "-") in [a.lower().replace("_", "-") for a in SLURM_MULTI_ALIASES]: + return self._generate_slurm_multi_command(nnodes, nproc_per_node, master_port) elif launcher_type == "deepspeed": return self._generate_deepspeed_command(nnodes, nproc_per_node, master_port) elif launcher_type == "megatron": @@ -694,13 +703,13 @@ def _generate_sglang_command( export SGLANG_PIPELINE_PARALLEL_SIZE=1 # SGLang handles its own process management - no MAD_MULTI_NODE_RUNNER needed''' - def _generate_sglang_disagg_command( + def _generate_slurm_multi_command( self, nnodes: int, nproc_per_node: int, master_port: int ) -> str: """ - 
Generate SGLang Disaggregated launcher environment for SLURM. + Generate slurm_multi launcher environment for SLURM. - SGLang Disaggregated Architecture: + slurm_multi Architecture (multi-node baremetal): - Node 0: Proxy (load balancer) - Nodes 1 to xP: Prefill nodes - Nodes xP+1 to xP+yD: Decode nodes @@ -716,11 +725,11 @@ def _generate_sglang_disagg_command( Environment setup with node role assignment Raises: - ValueError: If nnodes < 3 (minimum for disagg) + ValueError: If nnodes < 3 """ if nnodes < 3: raise ValueError( - f"SGLang Disaggregated requires minimum 3 nodes " + f"slurm_multi requires minimum 3 nodes " f"(1 proxy + 1 prefill + 1 decode), got {nnodes}" ) diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 2c504cbd..086629c4 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -15,12 +15,11 @@ import typing import warnings -BAREMETAL_LAUNCHERS = [ - "sglang-disagg", - "sglang_disagg", - "vllm-disagg", - "vllm_disagg", +SLURM_MULTI_ALIASES = [ + "slurm_multi", + "slurm-multi", ] +BAREMETAL_LAUNCHERS = SLURM_MULTI_ALIASES from rich.console import Console as RichConsole from contextlib import redirect_stdout, redirect_stderr from madengine.core.console import Console @@ -688,7 +687,7 @@ def _run_on_baremetal( """ Run script directly on baremetal (not inside Docker). - Used for launchers like sglang-disagg that manage their own Docker containers + Used for slurm_multi launchers that manage their own Docker containers via SLURM srun commands. The script is executed directly on the node. Args: @@ -1099,7 +1098,7 @@ def run_container( print(f"Docker options: {docker_options}") # ========== CHECK FOR BAREMETAL LAUNCHERS ========== - # Launchers like sglang-disagg run scripts directly on baremetal, + # slurm_multi launchers run scripts directly on baremetal, # not inside Docker. The script itself manages Docker containers via srun. 
launcher = "" From 2177ab9b86f0383460d31c23729839281baff602 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Wed, 4 Mar 2026 21:46:14 +0000 Subject: [PATCH 05/12] Add documentation for --use-image, --build-on-compute, and SLURM features docs/cli-reference.md: - Add --use-image and --build-on-compute options to build command - Add examples for pre-built image and compute node build workflows - Document when to use each mode docs/usage.md: - Add "Pre-built Image Mode" section with examples - Add "Build on Compute Node" section explaining workflow docs/deployment.md: - Add "SLURM Allocation Detection" section - Add "Baremetal Execution (slurm_multi)" section - Document environment variables and behavior inside salloc docs/launchers.md: - Fix example file paths (slurm_multi -> slurm-multi) Made-with: Cursor --- docs/cli-reference.md | 78 +++++++++++++++++++++++++++++++++++++++++++ docs/deployment.md | 70 ++++++++++++++++++++++++++++++++++++++ docs/launchers.md | 12 +++---- docs/usage.md | 54 ++++++++++++++++++++++++++++++ 4 files changed, 208 insertions(+), 6 deletions(-) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index d23d3b87..04b45f1f 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -98,6 +98,8 @@ madengine build [OPTIONS] | `--target-archs` | `-a` | TEXT | `[]` | Target GPU architectures (e.g., gfx908,gfx90a,gfx942) | | `--registry` | `-r` | TEXT | `None` | Docker registry to push images to | | `--batch-manifest` | | TEXT | `None` | Input batch.json file for batch build mode | +| `--use-image` | | TEXT | `None` | Skip Docker build, use pre-built image instead | +| `--build-on-compute` | | FLAG | `False` | Build Docker images on SLURM compute node instead of login node | | `--additional-context` | `-c` | TEXT | `"{}"` | Additional context as JSON string | | `--additional-context-file` | `-f` | TEXT | `None` | File containing additional context JSON | | `--clean-docker-cache` | | FLAG | `False` | Rebuild images without using cache | 
@@ -142,6 +144,16 @@ madengine build --tags model \ # Real-time output with verbose logging madengine build --tags model --live-output --verbose + +# Use pre-built image (skip Docker build) +madengine build --tags sglang_disagg \ + --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ + --additional-context-file slurm-config.json + +# Build on SLURM compute node instead of login node +madengine build --tags model \ + --build-on-compute \ + --additional-context-file slurm-config.json ``` **Default Values:** @@ -193,6 +205,72 @@ When using `--batch-manifest`, provide a JSON file with selective build configur See [Batch Build Guide](batch-build.md) for details. +**Pre-built Image Mode (`--use-image`):** + +Skip Docker build and use an existing image from a registry or local Docker cache: + +```bash +# Use image from Docker Hub +madengine build --tags sglang_disagg \ + --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ + --additional-context-file config.json + +# Use image from NGC +madengine build --tags model \ + --use-image nvcr.io/nvidia/pytorch:24.01-py3 + +# Use locally cached image +madengine build --tags model \ + --use-image my-local-image:latest +``` + +**When to use `--use-image`:** +- Using official framework images (SGLang, vLLM, etc.) +- Image is pre-cached on compute nodes +- Testing without rebuilding +- CI/CD pipelines with external images + +The generated manifest marks the image as `"prebuilt": true` with `build_time: 0`. 
+ +**Build on Compute Node (`--build-on-compute`):** + +Build Docker images on SLURM compute nodes instead of the login node: + +```bash +# Build on compute node (requires SLURM config) +madengine build --tags model \ + --build-on-compute \ + --additional-context-file slurm-config.json +``` + +**slurm-config.json for build-on-compute:** +```json +{ + "slurm": { + "partition": "gpu", + "nodes": 3, + "time": "02:00:00", + "reservation": "my-reservation" + } +} +``` + +**When to use `--build-on-compute`:** +- Login node has limited disk space or resources +- Build requires GPU access (e.g., AOT compilation) +- Need image available on all compute nodes simultaneously +- Login node policies prohibit heavy workloads + +**How it works:** +1. Generates `madengine_build_job.sh` sbatch script +2. Submits via `sbatch --wait` +3. Runs `docker build` on ALL allocated nodes in parallel via `srun` +4. Generates manifest on completion + +**Inside existing SLURM allocation:** + +If you're already inside an `salloc` allocation, `--build-on-compute` uses `srun` directly instead of submitting a new job. 
+ --- ### `run` - Execute Models diff --git a/docs/deployment.md b/docs/deployment.md index a0a825cf..09125b09 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -272,6 +272,39 @@ SLURM automatically provides: - Network interface configuration - Rank assignment via `$SLURM_PROCID` +### SLURM Allocation Detection + +madengine automatically detects if you're running inside an existing SLURM allocation (via `salloc`): + +```bash +# Allocate nodes interactively +salloc -N 3 -p gpu --gpus-per-node=8 -t 04:00:00 + +# madengine detects the allocation automatically
madengine run --manifest-file build_manifest.json +# Output: ✓ Detected existing SLURM allocation: Job 12345 +# Allocation has 3 nodes available +``` + +**Behavior inside allocation:** +- Uses `srun` directly instead of `sbatch` +- Validates requested nodes ≤ available nodes +- Warns if using fewer nodes than allocated +- Skips job submission (already allocated) + +**Build inside allocation:** + +```bash +# Inside salloc session +madengine build --tags model --build-on-compute +# Uses srun instead of sbatch --wait +``` + +**Environment variables detected:** +- `SLURM_JOB_ID` - Indicates inside allocation +- `SLURM_NNODES` - Number of nodes in the current job step (madengine queries `scontrol show job` for the full allocation's node count) +- `SLURM_NODELIST` - List of allocated nodes + ### Monitoring ```bash @@ -386,6 +419,43 @@ scancel -u $USER } ``` +### Baremetal Execution (slurm_multi) + +For disaggregated inference workloads like SGLang Disaggregated, madengine supports baremetal execution where the model's `.slurm` script manages Docker containers directly: + +```json +{ + "slurm": { + "partition": "gpu", + "nodes": 3, + "gpus_per_node": 8, + "time": "04:00:00" + }, + "distributed": { + "launcher": "slurm_multi", + "nnodes": 3, + "nproc_per_node": 8, + "sglang_disagg": { + "prefill_nodes": 1, + "decode_nodes": 1 + } + } +} +``` + +**How baremetal execution works:** +1. madengine generates a wrapper script (not a Docker container) +2. 
The wrapper runs the model's `.slurm` script directly on baremetal +3. The `.slurm` script manages Docker containers via `srun` +4. Environment variables from `models.json` and `additional-context` are passed through + +**When to use `slurm_multi`:** +- SGLang Disaggregated inference (proxy + prefill + decode nodes) +- Workloads requiring direct SLURM node control +- Custom Docker orchestration via `.slurm` scripts + +See [Launchers Guide](launchers.md#7-sglang-disaggregated-new) for detailed configuration. + ## Troubleshooting ### Kubernetes Issues diff --git a/docs/launchers.md b/docs/launchers.md index d5c62f9e..54205867 100644 --- a/docs/launchers.md +++ b/docs/launchers.md @@ -457,12 +457,12 @@ SGLANG_NODE_IPS="10.0.0.1,10.0.0.2,..." ``` **Examples**: -- K8s Minimal: `examples/k8s-configs/minimal/slurm_multi-minimal.json` -- K8s Basic: `examples/k8s-configs/basic/slurm_multi-multi-node-basic.json` -- K8s Custom: `examples/k8s-configs/basic/slurm_multi-custom-split.json` -- SLURM Minimal: `examples/slurm-configs/minimal/slurm_multi-minimal.json` -- SLURM Basic: `examples/slurm-configs/basic/slurm_multi-multi-node.json` -- SLURM Custom: `examples/slurm-configs/basic/slurm_multi-custom-split.json` +- K8s Minimal: `examples/k8s-configs/minimal/slurm-multi-minimal.json` +- K8s Basic: `examples/k8s-configs/basic/slurm-multi-multi-node-basic.json` +- K8s Custom: `examples/k8s-configs/basic/slurm-multi-custom-split.json` +- SLURM Minimal: `examples/slurm-configs/minimal/slurm-multi-minimal.json` +- SLURM Basic: `examples/slurm-configs/basic/slurm-multi-multi-node.json` +- SLURM Custom: `examples/slurm-configs/basic/slurm-multi-custom-split.json` **Comparison: SGLang vs SGLang Disaggregated**: diff --git a/docs/usage.md b/docs/usage.md index 2fd71bea..f769d243 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -279,6 +279,60 @@ madengine build --batch-manifest batch.json \ ] ``` +### Pre-built Image Mode + +Skip Docker build entirely and use an existing image: + 
+```bash +# Use external image (e.g., from Docker Hub) +madengine build --tags sglang_disagg \ + --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ + --additional-context-file slurm-config.json + +# Then run normally +madengine run --manifest-file build_manifest.json +``` + +**Use cases:** +- Official framework images (SGLang, vLLM, PyTorch NGC) +- Pre-cached images on compute nodes +- Quick testing without rebuild time +- CI/CD with external registries + +The manifest marks the image as `"prebuilt": true` with zero build time. + +### Build on Compute Node + +For SLURM environments where login nodes have limited resources: + +```bash +# Build on compute nodes instead of login node +madengine build --tags model \ + --build-on-compute \ + --additional-context-file slurm-config.json +``` + +**slurm-config.json:** +```json +{ + "slurm": { + "partition": "gpu", + "nodes": 3, + "time": "02:00:00" + } +} +``` + +**How it works:** +1. Submits `madengine_build_job.sh` via `sbatch --wait` +2. Builds Docker image on ALL allocated nodes in parallel +3. 
Generates manifest after completion + +**Benefits:** +- Offloads heavy build to compute resources +- Image available on all nodes simultaneously +- Respects login node resource policies + ## Run Workflow ### Skip model run after build From 328fb7abef283516630ebbf7fed3b34288bd74bc Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Thu, 5 Mar 2026 04:32:44 +0000 Subject: [PATCH 06/12] Enhance --use-image and --build-on-compute options --use-image changes: - Make image name optional (auto-detect from model card's DOCKER_IMAGE_NAME) - Add mutual exclusivity with --registry and --build-on-compute - Handle multiple models with different images (use first, warn) --build-on-compute changes: - Require --registry option (build once, push, pull everywhere) - Build on 1 compute node only, push to registry - Merge SLURM config from model card + additional-context (CLI overrides) - Add registry credential validation before build - Add docker login step in build script - Smart registry image naming (namespace/repo vs namespace formats) - Parallel docker pull on all nodes in run phase - Clear error messages for missing partition, credentials Run phase changes: - Detect built_on_compute flag in manifest - Pull image in parallel on all nodes via srun before model execution Documentation updated for new workflows. 
Made-with: Cursor --- docs/cli-reference.md | 73 ++- docs/usage.md | 52 +- src/madengine/cli/commands/build.py | 29 +- src/madengine/deployment/slurm.py | 43 +- .../orchestration/build_orchestrator.py | 560 +++++++++++++----- 5 files changed, 572 insertions(+), 185 deletions(-) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 04b45f1f..f844ffaf 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -98,7 +98,7 @@ madengine build [OPTIONS] | `--target-archs` | `-a` | TEXT | `[]` | Target GPU architectures (e.g., gfx908,gfx90a,gfx942) | | `--registry` | `-r` | TEXT | `None` | Docker registry to push images to | | `--batch-manifest` | | TEXT | `None` | Input batch.json file for batch build mode | -| `--use-image` | | TEXT | `None` | Skip Docker build, use pre-built image instead | +| `--use-image` | | TEXT | `None` | Skip Docker build, use pre-built image. Omit value to auto-detect from model's `DOCKER_IMAGE_NAME` | | `--build-on-compute` | | FLAG | `False` | Build Docker images on SLURM compute node instead of login node | | `--additional-context` | `-c` | TEXT | `"{}"` | Additional context as JSON string | | `--additional-context-file` | `-f` | TEXT | `None` | File containing additional context JSON | @@ -210,7 +210,12 @@ See [Batch Build Guide](batch-build.md) for details. Skip Docker build and use an existing image from a registry or local Docker cache: ```bash -# Use image from Docker Hub +# Auto-detect image from model card's DOCKER_IMAGE_NAME env var +madengine build --tags sglang_disagg \ + --use-image \ + --additional-context-file config.json + +# Explicitly specify image from Docker Hub madengine build --tags sglang_disagg \ --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ --additional-context-file config.json @@ -224,6 +229,18 @@ madengine build --tags model \ --use-image my-local-image:latest ``` +**Image Resolution Priority:** +1. If `--use-image ` is specified, use that image +2. 
If `--use-image` (no value), auto-detect from model card's `DOCKER_IMAGE_NAME` env var +3. If no image found in model card, error with helpful suggestions + +**Multiple Models Warning:** +When using auto-detection with multiple models that have different `DOCKER_IMAGE_NAME` values, the first model's image is used and a warning is printed. + +**Mutual Exclusivity:** +- `--use-image` cannot be used with `--registry` (push requires local build) +- `--use-image` cannot be used with `--build-on-compute` (skip build vs. build on compute) + **When to use `--use-image`:** - Using official framework images (SGLang, vLLM, etc.) - Image is pre-cached on compute nodes @@ -234,43 +251,61 @@ The generated manifest marks the image as `"prebuilt": true` with `build_time: 0 **Build on Compute Node (`--build-on-compute`):** -Build Docker images on SLURM compute nodes instead of the login node: +Build Docker images on a SLURM compute node, push to registry, and pull in parallel during run phase: ```bash -# Build on compute node (requires SLURM config) +# Build on compute node and push to registry (--registry REQUIRED) madengine build --tags model \ --build-on-compute \ + --registry docker.io/myorg \ --additional-context-file slurm-config.json ``` -**slurm-config.json for build-on-compute:** -```json -{ - "slurm": { - "partition": "gpu", - "nodes": 3, - "time": "02:00:00", - "reservation": "my-reservation" - } -} +**Required:** `--registry` must be specified with `--build-on-compute`. + +**SLURM Config Priority:** +1. Model card's `slurm` section (base configuration) +2. 
`--additional-context` overrides (command line takes precedence) + +If the model card already has `slurm` config, you only need to provide missing or override values: + +```bash +# Model card has partition/time, just override reservation +madengine build --tags model \ + --build-on-compute \ + --registry docker.io/myorg \ + --additional-context '{"slurm": {"reservation": "my-res"}}' ``` **When to use `--build-on-compute`:** - Login node has limited disk space or resources - Build requires GPU access (e.g., AOT compilation) -- Need image available on all compute nodes simultaneously - Login node policies prohibit heavy workloads +- Distributing images to many compute nodes (build once, pull everywhere) **How it works:** -1. Generates `madengine_build_job.sh` sbatch script -2. Submits via `sbatch --wait` -3. Runs `docker build` on ALL allocated nodes in parallel via `srun` -4. Generates manifest on completion + +*Build Phase:* +1. Discovers model and merges SLURM config (model card + additional-context) +2. Submits build job to **1 compute node** via `sbatch --wait` +3. Builds Docker image on that node +4. Pushes image to registry +5. Generates manifest with registry image name + +*Run Phase:* +1. Detects `built_on_compute: true` in manifest +2. Pulls image **in parallel on ALL nodes** via `srun docker pull` +3. Executes model script **Inside existing SLURM allocation:** If you're already inside an `salloc` allocation, `--build-on-compute` uses `srun` directly instead of submitting a new job. 
+**Error Messages:** + +If required SLURM fields are missing, specific errors are shown: +- Missing `partition`: "Add partition to model card's slurm section or via --additional-context" + --- ### `run` - Execute Models diff --git a/docs/usage.md b/docs/usage.md index f769d243..bf325ed1 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -284,7 +284,12 @@ madengine build --batch-manifest batch.json \ Skip Docker build entirely and use an existing image: ```bash -# Use external image (e.g., from Docker Hub) +# Auto-detect image from model card's DOCKER_IMAGE_NAME env var +madengine build --tags sglang_disagg \ + --use-image \ + --additional-context-file slurm-config.json + +# Or explicitly specify the image madengine build --tags sglang_disagg \ --use-image lmsysorg/sglang:v0.5.5.post3-rocm700-mi30x \ --additional-context-file slurm-config.json @@ -293,6 +298,15 @@ madengine build --tags sglang_disagg \ madengine run --manifest-file build_manifest.json ``` +**Image Resolution:** +1. If `--use-image ` provided → use that image +2. If `--use-image` (no value) → auto-detect from model card's `DOCKER_IMAGE_NAME` +3. If no image found → error with helpful message + +**Mutual Exclusivity:** +- Cannot use with `--registry` (push requires local build) +- Cannot use with `--build-on-compute` (skip vs. build) + **Use cases:** - Official framework images (SGLang, vLLM, PyTorch NGC) - Pre-cached images on compute nodes @@ -306,32 +320,36 @@ The manifest marks the image as `"prebuilt": true` with zero build time. 
For SLURM environments where login nodes have limited resources: ```bash -# Build on compute nodes instead of login node +# Build on 1 compute node, push to registry, pull in parallel at runtime madengine build --tags model \ --build-on-compute \ - --additional-context-file slurm-config.json + --registry docker.io/myorg \ + --additional-context '{"slurm": {"reservation": "my-res"}}' ``` -**slurm-config.json:** -```json -{ - "slurm": { - "partition": "gpu", - "nodes": 3, - "time": "02:00:00" - } -} -``` +**Required:** `--registry` must be specified. + +**SLURM Config Merging:** +- Model card's `slurm` section provides base configuration +- `--additional-context` overrides specific fields +- Only specify what's missing or needs override **How it works:** -1. Submits `madengine_build_job.sh` via `sbatch --wait` -2. Builds Docker image on ALL allocated nodes in parallel -3. Generates manifest after completion + +*Build Phase:* +1. Builds Docker image on **1 compute node** +2. Pushes image to registry +3. Stores registry image name in manifest + +*Run Phase:* +1. Pulls image **in parallel on ALL nodes** via `srun docker pull` +2. Executes model script **Benefits:** - Offloads heavy build to compute resources -- Image available on all nodes simultaneously +- Build once, distribute via registry pull - Respects login node resource policies +- Parallel pull scales to many nodes ## Run Workflow diff --git a/src/madengine/cli/commands/build.py b/src/madengine/cli/commands/build.py index f359432d..a3534fd5 100644 --- a/src/madengine/cli/commands/build.py +++ b/src/madengine/cli/commands/build.py @@ -59,7 +59,9 @@ def build( Optional[str], typer.Option( "--use-image", - help="Skip Docker build and use pre-built image (e.g., lmsysorg/sglang:v0.5.2rc1-rocm700-mi30x)" + is_flag=False, + flag_value="auto", + help="Skip Docker build and use pre-built image. 
Optionally specify image name, or omit to auto-detect from model card's DOCKER_IMAGE_NAME" ), ] = None, build_on_compute: Annotated[ @@ -130,6 +132,31 @@ def build( ) raise typer.Exit(ExitCode.INVALID_ARGS) + if use_image and registry: + console.print( + "❌ [bold red]Error: Cannot specify both --use-image and --registry options[/bold red]\n" + "[yellow]Use --use-image for pre-built external images.[/yellow]\n" + "[yellow]Use --registry to push locally built images.[/yellow]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + if use_image and build_on_compute: + console.print( + "❌ [bold red]Error: Cannot specify both --use-image and --build-on-compute options[/bold red]\n" + "[yellow]--use-image skips Docker build entirely.[/yellow]\n" + "[yellow]--build-on-compute builds on SLURM compute nodes.[/yellow]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + + if build_on_compute and not registry: + console.print( + "❌ [bold red]Error: --build-on-compute requires --registry option[/bold red]\n" + "[yellow]Build on compute node pushes image to registry.[/yellow]\n" + "[yellow]Run phase will pull image in parallel on all nodes.[/yellow]\n" + "[dim]Example: --build-on-compute --registry docker.io/myorg[/dim]" + ) + raise typer.Exit(ExitCode.INVALID_ARGS) + # Process batch manifest if provided batch_data = None effective_tags = processed_tags diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index b3d5dc9f..8d4b7553 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -454,7 +454,7 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N script_lines.append("") script_lines.extend([ "echo '=========================================='", - "echo 'Baremetal Launcher - SGLang Disaggregated'", + "echo 'Baremetal Launcher - slurm_multi'", "echo '=========================================='", f"echo 'Model: {model_info['name']}'", f"echo 'Script: {model_script_path}'", @@ -462,6 +462,47 @@ 
def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N "echo 'SLURM_NNODES:' $SLURM_NNODES", "echo 'SLURM_NODELIST:' $SLURM_NODELIST", "echo ''", + ]) + + # Check if image was built on compute and needs parallel pull + built_on_compute = model_info.get("built_on_compute", False) + docker_image = env_vars.get("DOCKER_IMAGE_NAME", "") + + if built_on_compute and docker_image: + # Add parallel docker pull on all nodes + script_lines.extend([ + "", + "# Pull Docker image in parallel on all nodes", + "echo '=========================================='", + "echo 'Pulling Docker image on all nodes in parallel'", + "echo '=========================================='", + f"echo 'Image: {docker_image}'", + "echo ''", + "", + f"srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES bash -c \"", + f" echo \\\"[\\$(hostname)] Pulling {docker_image}...\\\"", + f" docker pull {docker_image}", + " PULL_RC=\\$?", + " if [ \\$PULL_RC -eq 0 ]; then", + " echo \\\"[\\$(hostname)] Pull SUCCESS\\\"", + " else", + " echo \\\"[\\$(hostname)] Pull FAILED with exit code \\$PULL_RC\\\"", + " fi", + " exit \\$PULL_RC", + "\"", + "PULL_EXIT=$?", + "", + "if [ $PULL_EXIT -ne 0 ]; then", + " echo 'Docker pull failed on one or more nodes'", + " exit $PULL_EXIT", + "fi", + "", + "echo ''", + "echo 'Docker image pulled on all nodes'", + "echo ''", + ]) + + script_lines.extend([ "", "# Change to script directory", f"cd {model_script_path.parent}", diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 5dc7a981..1ef3212c 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -259,6 +259,10 @@ def execute( """ # Handle pre-built image mode if use_image: + # If use_image is "auto", resolve from model card + if use_image == "auto": + use_image = self._resolve_image_from_model_card() + return self._execute_with_prebuilt_image( use_image=use_image, 
manifest_output=manifest_output, @@ -629,6 +633,86 @@ def _execute_with_prebuilt_image( ), ) from e + def _resolve_image_from_model_card(self) -> str: + """ + Resolve Docker image name from model card's DOCKER_IMAGE_NAME env var. + + This method discovers models and extracts the DOCKER_IMAGE_NAME from + env_vars. If multiple models have different images, uses the first + and prints a warning. + + Returns: + Docker image name from model card + + Raises: + ConfigurationError: If no DOCKER_IMAGE_NAME found in any model + """ + self.rich_console.print("[bold cyan]🔍 Auto-detecting image from model card...[/bold cyan]") + + # Discover models to get their env_vars + discover_models = DiscoverModels(args=self.args) + models = discover_models.run() + + if not models: + raise ConfigurationError( + "No models discovered for image auto-detection", + context=create_error_context( + operation="resolve_image", + component="BuildOrchestrator", + ), + suggestions=[ + "Specify image name explicitly with --use-image ", + "Check if models.json exists", + "Verify --tags parameter is correct", + ], + ) + + # Collect DOCKER_IMAGE_NAME from all models + images_found = {} + for model in models: + model_name = model.get("name", "unknown") + env_vars = model.get("env_vars", {}) + docker_image = env_vars.get("DOCKER_IMAGE_NAME") + + if docker_image: + images_found[model_name] = docker_image + + if not images_found: + model_names = [m.get("name", "unknown") for m in models] + raise ConfigurationError( + "No DOCKER_IMAGE_NAME found in model card env_vars", + context=create_error_context( + operation="resolve_image", + component="BuildOrchestrator", + model_names=model_names, + ), + suggestions=[ + "Add DOCKER_IMAGE_NAME to model's env_vars in models.json", + "Specify image name explicitly with --use-image ", + 'Example: "env_vars": {"DOCKER_IMAGE_NAME": "myimage:tag"}', + ], + ) + + # Use first model's image + first_model = list(images_found.keys())[0] + resolved_image = 
images_found[first_model] + + # Warn if multiple models have different images + unique_images = set(images_found.values()) + if len(unique_images) > 1: + self.rich_console.print( + f"[yellow]⚠️ Warning: Multiple models have different DOCKER_IMAGE_NAME values:[/yellow]" + ) + for model_name, image in images_found.items(): + self.rich_console.print(f" - {model_name}: {image}") + self.rich_console.print( + f"[yellow] Using image from '{first_model}': {resolved_image}[/yellow]\n" + ) + else: + self.rich_console.print(f"[green]✓ Auto-detected image: {resolved_image}[/green]\n") + + return resolved_image + def _execute_build_on_compute( self, registry: Optional[str] = None, @@ -637,16 +721,16 @@ def _execute_build_on_compute( batch_build_metadata: Optional[Dict] = None, ) -> str: """ - Execute Docker build on a SLURM compute node instead of login node. + Execute Docker build on a SLURM compute node and push to registry. - This submits a SLURM job that runs the Docker build on a compute node, - which is useful when: - - Login node has limited disk space - - Login node shouldn't run heavy workloads - - Compute nodes have faster storage/network + Build workflow: + 1. Build on 1 compute node only + 2. Push image to registry + 3. Store registry image name in manifest + 4. 
Run phase will pull image in parallel on all nodes Args: - registry: Optional registry to push images to + registry: Registry to push images to (REQUIRED) clean_cache: Whether to use --no-cache for Docker builds manifest_output: Output file for build manifest batch_build_metadata: Optional batch build metadata @@ -656,189 +740,316 @@ def _execute_build_on_compute( """ import subprocess import os + import glob self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]") self.rich_console.print("[bold blue]🔨 BUILD PHASE (Compute Node Mode)[/bold blue]") - self.rich_console.print("[cyan]Building on SLURM compute node...[/cyan]") + self.rich_console.print("[cyan]Building on 1 compute node, pushing to registry...[/cyan]") self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + # Discover models first to get SLURM config from model card + self.rich_console.print("[bold cyan]🔍 Discovering models...[/bold cyan]") + discover_models = DiscoverModels(args=self.args) + models = discover_models.run() + + if not models: + raise DiscoveryError( + "No models discovered for build-on-compute", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + ), + suggestions=[ + "Check if models.json exists", + "Verify --tags parameter is correct", + ], + ) + + model = models[0] + model_name = model.get("name", "unknown") + self.rich_console.print(f"[green]✓ Found model: {model_name}[/green]\n") + + # Merge SLURM config: model card (base) + additional-context (override) + model_slurm_config = model.get("slurm", {}) + context_slurm_config = self.additional_context.get("slurm", {}) + + # Start with model card config, then override with command-line context + slurm_config = {**model_slurm_config, **context_slurm_config} + + self.rich_console.print("[bold cyan]📋 SLURM Configuration (merged):[/bold cyan]") + if model_slurm_config: + self.rich_console.print(f" [dim]From model card:[/dim] {list(model_slurm_config.keys())}") + if context_slurm_config: + 
self.rich_console.print(f" [dim]From --additional-context (overrides):[/dim] {list(context_slurm_config.keys())}") + + # Validate required fields + partition = slurm_config.get("partition") + if not partition: + raise ConfigurationError( + "Missing required SLURM field: partition", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + ), + suggestions=[ + 'Add "partition" to model card\'s slurm section', + 'Or specify via --additional-context \'{"slurm": {"partition": "gpu"}}\'', + ], + ) + + reservation = slurm_config.get("reservation", "") + time_limit = slurm_config.get("time", "02:00:00") + + self.rich_console.print(f" Partition: {partition}") + self.rich_console.print(f" Time limit: {time_limit}") + if reservation: + self.rich_console.print(f" Reservation: {reservation}") + self.rich_console.print("") + + # Validate registry credentials + self.rich_console.print("[bold cyan]🔐 Registry Configuration:[/bold cyan]") + self.rich_console.print(f" Registry: {registry}") + + # Check for credentials - either from environment or credential.json + dockerhub_user = os.environ.get("MAD_DOCKERHUB_USER", "") + dockerhub_password = os.environ.get("MAD_DOCKERHUB_PASSWORD", "") + + # Try to load from credential.json if env vars not set + credential_file = Path("credential.json") + if not dockerhub_user and credential_file.exists(): + try: + with open(credential_file) as f: + creds = json.load(f) + dockerhub_creds = creds.get("dockerhub", {}) + dockerhub_user = dockerhub_creds.get("username", "") + dockerhub_password = dockerhub_creds.get("password", "") + if dockerhub_user: + self.rich_console.print(f" Credentials: Found in credential.json") + except (json.JSONDecodeError, IOError) as e: + self.rich_console.print(f" [yellow]Warning: Could not read credential.json: {e}[/yellow]") + elif dockerhub_user: + self.rich_console.print(f" Credentials: Found in environment (MAD_DOCKERHUB_USER)") + + # Determine if registry requires 
authentication + requires_auth = True + public_registries = ["docker.io", "ghcr.io", "gcr.io", "quay.io", "nvcr.io"] + registry_lower = registry.lower() if registry else "" + + # For docker.io pushes, authentication is always required + if any(pub_reg in registry_lower for pub_reg in public_registries): + if not dockerhub_user or not dockerhub_password: + raise ConfigurationError( + f"Registry credentials required for pushing to {registry}", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + registry=registry, + ), + suggestions=[ + "Set environment variables: MAD_DOCKERHUB_USER and MAD_DOCKERHUB_PASSWORD", + 'Or create credential.json: {"dockerhub": {"username": "...", "password": "..."}}', + "For Docker Hub, use a Personal Access Token (PAT) as password", + f"Example: export MAD_DOCKERHUB_USER=myuser", + f"Example: export MAD_DOCKERHUB_PASSWORD=dckr_pat_xxxxx", + ], + ) + self.rich_console.print(f" Auth: Will login to registry before push") + else: + # Private/internal registry - may not need auth + self.rich_console.print(f" Auth: Private registry (auth may not be required)") + requires_auth = dockerhub_user and dockerhub_password + + self.rich_console.print("") + # Check if we're inside an existing allocation inside_allocation = os.environ.get("SLURM_JOB_ID") is not None existing_job_id = os.environ.get("SLURM_JOB_ID", "") - - # Get SLURM config from additional_context - slurm_config = self.additional_context.get("slurm", {}) - partition = slurm_config.get("partition", "gpu") - reservation = slurm_config.get("reservation", "") - time_limit = slurm_config.get("time", "02:00:00") - # Get number of nodes - build on ALL nodes so image is available everywhere - nodes = slurm_config.get("nodes", 1) - - # Build the madengine build command (without --build-on-compute to avoid recursion) - tags = getattr(self.args, 'tags', []) - tags_str = " ".join([f"-t {tag}" for tag in tags]) if tags else "" - # Write additional context 
to a file to avoid shell quoting issues - context_file_path = None - additional_context_str = "" - if self.additional_context: - import json - context_file_path = Path("madengine_build_context.json") - with open(context_file_path, 'w') as f: - json.dump(self.additional_context, f) - self.rich_console.print(f" Context file: {context_file_path}") - - # Base build command - build_cmd_parts = ["madengine", "build"] - if tags_str: - build_cmd_parts.extend(tags_str.split()) - if context_file_path: - build_cmd_parts.extend(["--additional-context-file", str(context_file_path)]) - build_cmd_parts.extend(["--manifest-output", manifest_output]) - if registry: - build_cmd_parts.extend(["--registry", registry]) - if clean_cache: - build_cmd_parts.append("--clean-docker-cache") - - build_cmd = " ".join(build_cmd_parts) - - if inside_allocation: - # Run build on compute node via srun - self.rich_console.print(f"[cyan]Running build via srun (inside allocation {existing_job_id})...[/cyan]") - cmd = ["srun", "-N1", "--ntasks=1", "bash", "-c", build_cmd] + # Find Dockerfile + dockerfile = model.get("dockerfile", "") + dockerfile_path = "" + dockerfile_patterns = [ + f"{dockerfile}.ubuntu.amd.Dockerfile", + f"{dockerfile}.Dockerfile", + f"{dockerfile}", + ] + for pattern in dockerfile_patterns: + matches = glob.glob(pattern) + if matches: + dockerfile_path = matches[0] + break + + if not dockerfile_path: + raise ConfigurationError( + f"Dockerfile not found for model {model_name}", + context=create_error_context( + operation="build_on_compute", + component="BuildOrchestrator", + dockerfile=dockerfile, + ), + suggestions=[ + f"Check if {dockerfile}.ubuntu.amd.Dockerfile exists", + "Verify the dockerfile path in models.json", + ], + ) + + # Generate image name for registry + dockerfile_basename = Path(dockerfile_path).name.replace(".Dockerfile", "").replace(".ubuntu.amd", "") + local_image_name = f"ci-{model_name}_{dockerfile_basename}" + + # Determine registry image name based on 
registry format + # docker.io/namespace/repo -> use model name as tag: docker.io/namespace/repo:model_name + # docker.io/namespace -> use model name as repo: docker.io/namespace/model_name:latest + registry_parts = registry.replace("docker.io/", "").split("/") + if len(registry_parts) >= 2: + # Registry already includes repo name (e.g., rocm/pytorch-private) + # Use model name as tag + registry_image_name = f"{registry}:{model_name}" + self.rich_console.print(f" [dim]Registry format: namespace/repo -> using model name as tag[/dim]") else: - # Generate and submit build script - self.rich_console.print("[cyan]Submitting build job via sbatch...[/cyan]") - - # Get absolute path for context file - abs_context_file = str(context_file_path.absolute()) if context_file_path else "" - abs_manifest_output = str(Path(manifest_output).absolute()) - - # Rebuild command with absolute paths for sbatch - build_cmd_abs = f"madengine build {tags_str}" - if abs_context_file: - build_cmd_abs += f" --additional-context-file {abs_context_file}" - build_cmd_abs += f" --manifest-output {abs_manifest_output}" - if registry: - build_cmd_abs += f" --registry {registry}" - if clean_cache: - build_cmd_abs += " --clean-docker-cache" - - # Discover models to get Dockerfile path - discover_models = DiscoverModels(args=self.args) - models = discover_models.run() - dockerfile_path = "" - dockerfile_name = "" - if models: - dockerfile = models[0].get("dockerfile", "") - # Find the actual Dockerfile - import glob - dockerfile_patterns = [ - f"{dockerfile}.ubuntu.amd.Dockerfile", - f"{dockerfile}.Dockerfile", - f"{dockerfile}", - ] - for pattern in dockerfile_patterns: - matches = glob.glob(pattern) - if matches: - dockerfile_path = matches[0] - dockerfile_name = Path(dockerfile_path).name - break - - self.rich_console.print(f" Nodes: {nodes} (building on all nodes)") - if dockerfile_path: - self.rich_console.print(f" Dockerfile: {dockerfile_path}") - - build_script_content = f"""#!/bin/bash + # 
Registry is just namespace (e.g., myuser) + # Use model name as repo + registry_image_name = f"{registry}/{model_name}:latest" + self.rich_console.print(f" [dim]Registry format: namespace -> using model name as repo[/dim]") + + self.rich_console.print("[bold cyan]🐳 Docker Configuration:[/bold cyan]") + self.rich_console.print(f" Dockerfile: {dockerfile_path}") + self.rich_console.print(f" Local image: {local_image_name}") + self.rich_console.print(f" Registry image: {registry_image_name}") + self.rich_console.print("") + + # Determine registry host for docker login + registry_host = registry.split("/")[0] if "/" in registry else registry + + # Build script content - builds on 1 node, pushes to registry + build_script_content = f"""#!/bin/bash #SBATCH --job-name=madengine-build #SBATCH --partition={partition} -#SBATCH --nodes={nodes} -#SBATCH --ntasks={nodes} +#SBATCH --nodes=1 +#SBATCH --ntasks=1 #SBATCH --time={time_limit} {f'#SBATCH --reservation={reservation}' if reservation else ''} #SBATCH --output=madengine_build_%j.out #SBATCH --error=madengine_build_%j.err -echo "=== Building on compute nodes ===" +echo "============================================================" +echo "=== MADENGINE BUILD ON COMPUTE NODE ===" +echo "============================================================" +echo "" echo "Job ID: $SLURM_JOB_ID" -echo "Nodes: $SLURM_NNODES" -echo "Node list: $SLURM_NODELIST" +echo "Build Node: $(hostname)" echo "Working directory: $(pwd)" +echo "Registry: {registry}" echo "" # Change to submission directory cd {Path.cwd().absolute()} -# Activate virtual environment if available -if [ -f "{Path('/shared_inference/ravgupta/madenginev2_slurm/venv/bin/activate').absolute()}" ]; then - source {Path('/shared_inference/ravgupta/madenginev2_slurm/venv/bin/activate').absolute()} - echo "Activated virtual environment" +# Step 0: Docker login for registry push +echo "=== Step 0: Docker Registry Authentication ===" +DOCKER_USER="${{MAD_DOCKERHUB_USER:-}}" 
+DOCKER_PASS="${{MAD_DOCKERHUB_PASSWORD:-}}" + +# Try credential.json if env vars not set +if [ -z "$DOCKER_USER" ] && [ -f "credential.json" ]; then + echo "Reading credentials from credential.json..." + DOCKER_USER=$(python3 -c "import json; print(json.load(open('credential.json')).get('dockerhub', {{}}).get('username', ''))" 2>/dev/null || echo "") + DOCKER_PASS=$(python3 -c "import json; print(json.load(open('credential.json')).get('dockerhub', {{}}).get('password', ''))" 2>/dev/null || echo "") fi -# Step 1: Build Docker image on ALL nodes in parallel -echo "" -echo "=== Building Docker image on all $SLURM_NNODES nodes ===" -DOCKERFILE="{dockerfile_path}" -if [ -n "$DOCKERFILE" ] && [ -f "$DOCKERFILE" ]; then - # Get the image name - must match exactly what madengine generates - # Format: ci-_ - IMAGE_NAME=$(basename $DOCKERFILE .Dockerfile) - FULL_IMAGE_NAME="ci-{models[0].get('name', 'model') if models else 'model'}_$IMAGE_NAME" - - echo "Dockerfile: $DOCKERFILE" - echo "Image name: $FULL_IMAGE_NAME" - - # Build on all nodes in parallel using srun - srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES bash -c " - echo \\\"[\\$(hostname)] Building Docker image...\\\" - cd {Path.cwd().absolute()} - docker build --network=host -t $FULL_IMAGE_NAME --pull -f $DOCKERFILE ./docker - BUILD_RC=\\$? - if [ \\$BUILD_RC -eq 0 ]; then - echo \\\"[\\$(hostname)] Docker build SUCCESS\\\" - else - echo \\\"[\\$(hostname)] Docker build FAILED with exit code \\$BUILD_RC\\\" - fi - exit \\$BUILD_RC - " - DOCKER_BUILD_EXIT=$? - - if [ $DOCKER_BUILD_EXIT -ne 0 ]; then - echo "Docker build failed on one or more nodes" - exit $DOCKER_BUILD_EXIT +if [ -n "$DOCKER_USER" ] && [ -n "$DOCKER_PASS" ]; then + echo "Logging in to registry as $DOCKER_USER..." + echo "$DOCKER_PASS" | docker login {registry_host} -u "$DOCKER_USER" --password-stdin + LOGIN_RC=$? 
+ if [ $LOGIN_RC -ne 0 ]; then + echo "" + echo "❌ Docker login FAILED with exit code $LOGIN_RC" + echo "" + echo "Troubleshooting:" + echo " - Verify MAD_DOCKERHUB_USER and MAD_DOCKERHUB_PASSWORD are correct" + echo " - For Docker Hub, use a Personal Access Token (PAT) not your password" + echo " - Check if the registry URL is correct: {registry_host}" + exit $LOGIN_RC fi + echo "✅ Docker login SUCCESS" +else + echo "No credentials found - assuming public registry or pre-authenticated" +fi +echo "" + +# Step 1: Build Docker image +echo "" +echo "=== Step 1: Building Docker image ===" +echo "Dockerfile: {dockerfile_path}" +echo "Local image name: {local_image_name}" +echo "" + +docker build --network=host -t {local_image_name} {"--no-cache" if clean_cache else ""} --pull -f {dockerfile_path} ./docker +BUILD_RC=$? + +if [ $BUILD_RC -ne 0 ]; then echo "" - echo "=== Docker image built on all nodes ===" + echo "❌ Docker build FAILED on $(hostname) with exit code $BUILD_RC" + exit $BUILD_RC fi -# Step 2: Run madengine build on rank 0 to generate manifest echo "" -echo "=== Generating build manifest ===" -echo "Build command: {build_cmd_abs}" +echo "✅ Docker build SUCCESS on $(hostname)" echo "" -{build_cmd_abs} -BUILD_EXIT=$? +# Step 2: Tag and push to registry +echo "=== Step 2: Pushing to registry ===" +echo "Tagging: {local_image_name} -> {registry_image_name}" +docker tag {local_image_name} {registry_image_name} +echo "Pushing: {registry_image_name}" +docker push {registry_image_name} +PUSH_RC=$? 
+ +if [ $PUSH_RC -ne 0 ]; then + echo "" + echo "❌ Docker push FAILED with exit code $PUSH_RC" + echo "" + echo "Troubleshooting:" + echo " - Check if you have push access to {registry}" + echo " - Verify credentials are correct (MAD_DOCKERHUB_USER, MAD_DOCKERHUB_PASSWORD)" + echo " - For Docker Hub, ensure the repository exists or you have create permissions" + exit $PUSH_RC +fi + +echo "" +echo "============================================================" +echo "✅ BUILD AND PUSH COMPLETE" +echo "============================================================" +echo "" +echo "Build Node: $(hostname)" +echo "Registry Image: {registry_image_name}" echo "" -echo "=== Build completed with exit code: $BUILD_EXIT ===" -exit $BUILD_EXIT +echo "Run phase will pull this image in parallel on all nodes." +echo "============================================================" + +exit 0 """ - build_script_path = Path("madengine_build_job.sh") - build_script_path.write_text(build_script_content) - build_script_path.chmod(0o755) - - self.rich_console.print(f" Build script: {build_script_path}") + + build_script_path = Path("madengine_build_job.sh") + build_script_path.write_text(build_script_content) + build_script_path.chmod(0o755) + + if inside_allocation: + self.rich_console.print(f"[cyan]Running build via srun (inside allocation {existing_job_id})...[/cyan]") + cmd = ["srun", "-N1", "--ntasks=1", "bash", str(build_script_path)] + else: + self.rich_console.print("[cyan]Submitting build job via sbatch...[/cyan]") cmd = ["sbatch", "--wait", str(build_script_path)] - - # Execute the build + + self.rich_console.print(f" Build script: {build_script_path}") self.rich_console.print(f" Command: {' '.join(cmd)}") self.rich_console.print("") try: result = subprocess.run( cmd, - capture_output=False, # Let output flow to console + capture_output=False, text=True, ) @@ -853,11 +1064,64 @@ def _execute_build_on_compute( "Check the build log files (madengine_build_*.out/err)", "Verify SLURM 
partition and reservation settings", "Ensure Docker is available on compute nodes", + "Verify registry credentials are configured", ], ) + # Generate manifest with registry image name + self.rich_console.print(f"\n[bold cyan]📄 Generating manifest...[/bold cyan]") + + manifest = { + "built_images": { + registry_image_name: { + "image_name": registry_image_name, + "docker_image": registry_image_name, + "local_image": local_image_name, + "dockerfile": dockerfile_path, + "build_time": 0, + "built_on_compute": True, + "registry": registry, + } + }, + "built_models": { + registry_image_name: { + "name": model_name, + "image": registry_image_name, + "docker_image": registry_image_name, + "dockerfile": dockerfile_path, + "scripts": model.get("scripts", ""), + "data": model.get("data", ""), + "n_gpus": model.get("n_gpus", "8"), + "tags": model.get("tags", []), + "slurm": slurm_config, + "distributed": model.get("distributed", {}), + "env_vars": model.get("env_vars", {}), + "built_on_compute": True, + } + }, + "context": self.context.ctx if hasattr(self.context, 'ctx') else {}, + "deployment_config": { + "slurm": slurm_config, + "distributed": model.get("distributed", {}), + }, + "credentials_required": [], + "summary": { + "successful_builds": [model_name], + "failed_builds": [], + "total_build_time": 0, + "successful_pushes": [registry_image_name], + "failed_pushes": [], + }, + } + + with open(manifest_output, "w") as f: + json.dump(manifest, f, indent=2) + self.rich_console.print(f"[green]✓ Build completed on compute node[/green]") + self.rich_console.print(f"[green]✓ Image pushed: {registry_image_name}[/green]") self.rich_console.print(f"[green]✓ Manifest: {manifest_output}[/green]") + self.rich_console.print(f"[dim]{'=' * 60}[/dim]\n") + return manifest_output except subprocess.TimeoutExpired: @@ -868,6 +1132,8 @@ def _execute_build_on_compute( component="BuildOrchestrator", ), ) + except (DiscoveryError, ConfigurationError, BuildError): + raise except Exception as e: 
raise BuildError( f"Failed to build on compute node: {e}", From b3470db1d69db53f03d9acc4dba25acecf964b66 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Thu, 5 Mar 2026 06:57:09 +0000 Subject: [PATCH 07/12] Enforce registry requirement for slurm_multi launcher - slurm_multi launcher now requires --registry or --use-image - Parallel docker pull on all nodes for any registry image (not just built_on_compute) - DOCKER_IMAGE_NAME added to manifest env_vars for all registry builds - Documentation updated for slurm_multi requirements Made-with: Cursor --- docs/cli-reference.md | 52 +++++++++++++++++++ src/madengine/deployment/slurm.py | 8 +-- src/madengine/execution/docker_builder.py | 11 ++++ .../orchestration/build_orchestrator.py | 35 ++++++++++++- 4 files changed, 101 insertions(+), 5 deletions(-) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index f844ffaf..b9d9a943 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -308,6 +308,58 @@ If required SLURM fields are missing, specific errors are shown: --- +**Multi-Node SLURM Launcher (`slurm_multi`):** + +Models using the `slurm_multi` launcher (for multi-node distributed inference) **require** either `--registry` or `--use-image`: + +```bash +# Option 1: Build and push to registry +madengine build --tags sglang_model \ + --registry docker.io/myorg \ + --additional-context '{"gpu_vendor": "AMD", "guest_os": "UBUNTU"}' + +# Option 2: Use pre-built image from registry +madengine build --tags sglang_model \ + --use-image docker.io/myorg/sglang:latest + +# Option 3: Build on compute and push +madengine build --tags sglang_model \ + --build-on-compute \ + --registry docker.io/myorg \ + --additional-context-file config.json +``` + +**Why this requirement?** + +Multi-node SLURM jobs run on multiple compute nodes. 
Each node needs access to the Docker image: +- Local builds only exist on the login/build node +- Compute nodes cannot access locally built images +- Registry images enable parallel `docker pull` on all nodes + +**Parallel Image Pull:** + +During `madengine run`, images from a registry are automatically pulled in parallel on all allocated nodes: + +```bash +srun --nodes=$SLURM_NNODES --ntasks=$SLURM_NNODES docker pull +``` + +This ensures fast, consistent image availability across the cluster. + +**Re-using Images:** + +For subsequent runs with the same image, use `--use-image` to skip building: + +```bash +# First run: build and push +madengine build --tags model --registry docker.io/myorg + +# Subsequent runs: use pre-built image +madengine build --tags model --use-image docker.io/myorg/model:latest +``` + +--- + ### `run` - Execute Models Run models locally or deploy to Kubernetes/SLURM clusters. diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 8d4b7553..9b4ab2e7 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -464,12 +464,14 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N "echo ''", ]) - # Check if image was built on compute and needs parallel pull - built_on_compute = model_info.get("built_on_compute", False) + # Check if image needs parallel pull on all nodes + # Pull if: image is from registry (contains / or .) and not a local ci-* build docker_image = env_vars.get("DOCKER_IMAGE_NAME", "") + is_registry_image = docker_image and not docker_image.startswith("ci-") and ("/" in docker_image or "." 
in docker_image) - if built_on_compute and docker_image: + if is_registry_image: # Add parallel docker pull on all nodes + # This ensures all nodes have the image before running script_lines.extend([ "", "# Pull Docker image in parallel on all nodes", diff --git a/src/madengine/execution/docker_builder.py b/src/madengine/execution/docker_builder.py index 5e0c78d7..0dcb4f25 100644 --- a/src/madengine/execution/docker_builder.py +++ b/src/madengine/execution/docker_builder.py @@ -413,6 +413,17 @@ def export_build_manifest( "registry" ) + # Update built_models with registry image name for parallel pull in slurm_multi + # Map local image to registry image for env_vars + for image_name, build_info in self.built_images.items(): + registry_image = build_info.get("registry_image") + if registry_image and image_name in self.built_models: + model_data = self.built_models[image_name] + if "env_vars" not in model_data: + model_data["env_vars"] = {} + # Set DOCKER_IMAGE_NAME to registry image for parallel pull + model_data["env_vars"]["DOCKER_IMAGE_NAME"] = registry_image + manifest = { "built_images": self.built_images, "built_models": self.built_models, diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 1ef3212c..e1e0e41e 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -276,6 +276,33 @@ def execute( manifest_output=manifest_output, batch_build_metadata=batch_build_metadata, ) + + # For normal build: check if slurm_multi launcher requires registry + # Discover models first to check launcher + discover_models = DiscoverModels(args=self.args) + discovered_models = discover_models.run() + + if discovered_models: + for model in discovered_models: + launcher = model.get("distributed", {}).get("launcher", "") + if launcher in ["slurm_multi", "slurm-multi"] and not registry: + model_name = model.get("name", "unknown") + raise 
ConfigurationError( + f"slurm_multi launcher requires --registry or --use-image", + context=create_error_context( + operation="build", + component="BuildOrchestrator", + model=model_name, + launcher=launcher, + ), + suggestions=[ + "Use --registry docker.io/myorg to push image (nodes will pull in parallel)", + "Use --use-image to use a pre-built image from registry", + "Use --build-on-compute --registry to build on compute and push", + "For subsequent runs with same image, use: --use-image", + ], + ) + self.rich_console.print(f"\n[dim]{'=' * 60}[/dim]") self.rich_console.print("[bold blue]🔨 BUILD PHASE[/bold blue]") self.rich_console.print("[yellow](Build-only mode - no GPU detection)[/yellow]") @@ -564,6 +591,10 @@ def _execute_with_prebuilt_image( model_name = model.get("name", "unknown") model_distributed = model.get("distributed", {}) + # Merge DOCKER_IMAGE_NAME into env_vars for parallel pull in run phase + model_env_vars = model.get("env_vars", {}).copy() + model_env_vars["DOCKER_IMAGE_NAME"] = use_image + # Use image name as key so slurm.py can find docker_image manifest["built_models"][use_image] = { "name": model_name, @@ -581,7 +612,7 @@ def _execute_with_prebuilt_image( "args": model.get("args", ""), "slurm": model.get("slurm", {}), "distributed": model_distributed, - "env_vars": model.get("env_vars", {}), + "env_vars": model_env_vars, "prebuilt": True, } manifest["summary"]["successful_builds"].append(model_name) @@ -1095,7 +1126,7 @@ def _execute_build_on_compute( "tags": model.get("tags", []), "slurm": slurm_config, "distributed": model.get("distributed", {}), - "env_vars": model.get("env_vars", {}), + "env_vars": {**model.get("env_vars", {}), "DOCKER_IMAGE_NAME": registry_image_name}, "built_on_compute": True, } }, From b13e1f154c03c0d6fc2b69a2d78693a8f9f5d743 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Thu, 5 Mar 2026 06:59:36 +0000 Subject: [PATCH 08/12] Add slurm_multi registry requirement to all docs Ensure consistent documentation across: 
- deployment.md - launchers.md - usage.md All now mention that slurm_multi requires --registry or --use-image. Made-with: Cursor --- docs/deployment.md | 17 +++++++++++++++++ docs/launchers.md | 12 ++++++++++++ docs/usage.md | 21 +++++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/docs/deployment.md b/docs/deployment.md index 09125b09..b68d9cc3 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -454,6 +454,23 @@ For disaggregated inference workloads like SGLang Disaggregated, madengine suppo - Workloads requiring direct SLURM node control - Custom Docker orchestration via `.slurm` scripts +**Registry Requirement:** + +Models using `slurm_multi` **require** either `--registry` or `--use-image` during build: + +```bash +# Option 1: Build and push to registry +madengine build --tags model --registry docker.io/myorg + +# Option 2: Use pre-built image +madengine build --tags model --use-image + +# Option 3: Build on compute and push +madengine build --tags model --build-on-compute --registry docker.io/myorg +``` + +This ensures all compute nodes can pull the image in parallel during `madengine run`. + See [Launchers Guide](launchers.md#7-sglang-disaggregated-new) for detailed configuration. ## Troubleshooting diff --git a/docs/launchers.md b/docs/launchers.md index 54205867..b660a2dd 100644 --- a/docs/launchers.md +++ b/docs/launchers.md @@ -436,6 +436,18 @@ Override automatic split based on workload characteristics: - Ray cluster coordination - No torchrun needed (manages own processes) +**Registry Requirement (SLURM)**: + +Models using `slurm_multi` launcher **require** `--registry` or `--use-image` during build: + +```bash +madengine build --tags model --registry docker.io/myorg +# OR +madengine build --tags model --use-image +``` + +This ensures all compute nodes can pull the image in parallel during `madengine run`. 
+ **Environment Variables (K8s)**: ```bash POD_INDEX=${JOB_COMPLETION_INDEX} # Pod index for role assignment diff --git a/docs/usage.md b/docs/usage.md index bf325ed1..ddfbbcec 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -351,6 +351,27 @@ madengine build --tags model \ - Respects login node resource policies - Parallel pull scales to many nodes +### Multi-Node SLURM (slurm_multi) + +Models using the `slurm_multi` launcher **require** either `--registry` or `--use-image`: + +```bash +# Option 1: Build and push +madengine build --tags sglang_model --registry docker.io/myorg + +# Option 2: Use pre-built image +madengine build --tags sglang_model --use-image + +# Option 3: Build on compute +madengine build --tags sglang_model --build-on-compute --registry docker.io/myorg +``` + +**Why?** Multi-node jobs run on multiple compute nodes. Each node needs the Docker image, and local builds only exist on the login node. + +**Parallel Pull:** During `madengine run`, registry images are automatically pulled in parallel on all nodes before execution. + +**Re-using images:** For subsequent runs with the same image, use `--use-image` to skip building. 
+ ## Run Workflow ### Skip model run after build From 6d12462cb47108315bda01c880c4859122255094 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Mon, 9 Mar 2026 04:38:39 +0000 Subject: [PATCH 09/12] Fix build validation, manifest merging, and SLURM monitoring robustness - Make gpu_vendor/guest_os optional when using --use-image (pre-built image) - Remove model name truncation in CLI results table - Merge model card slurm and distributed config into build_manifest.json - Add skip_monitoring flag for synchronous salloc execution - Add completion marker file for robust sbatch job completion detection Made-with: Cursor --- src/madengine/cli/commands/build.py | 3 +- src/madengine/cli/utils.py | 4 +- src/madengine/cli/validators.py | 2 + src/madengine/deployment/base.py | 4 +- src/madengine/deployment/slurm.py | 47 +++++++++++++++++++ .../orchestration/build_orchestrator.py | 35 +++++++++----- 6 files changed, 79 insertions(+), 16 deletions(-) diff --git a/src/madengine/cli/commands/build.py b/src/madengine/cli/commands/build.py index a3534fd5..6246a724 100644 --- a/src/madengine/cli/commands/build.py +++ b/src/madengine/cli/commands/build.py @@ -214,9 +214,8 @@ def build( ) try: - # Validate additional context and merge file + CLI; defaults wired into orchestrator validated_context = validate_additional_context( - additional_context, additional_context_file + additional_context, additional_context_file, use_image ) # Create arguments object diff --git a/src/madengine/cli/utils.py b/src/madengine/cli/utils.py index 214b966c..343bd25d 100644 --- a/src/madengine/cli/utils.py +++ b/src/madengine/cli/utils.py @@ -165,7 +165,7 @@ def extract_model_name(item): else: model_name = docker_image return model_name - return str(item)[:20] + return str(item) # Helper function to format numbers def format_number(value): @@ -247,7 +247,7 @@ def format_number(value): row_index += 1 else: # Fallback for non-dict items - model_name = str(item)[:20] + model_name = str(item) if 
has_node_data: row = [str(row_index), "✅ Success", model_name, "node-0", "-", "-"] else: diff --git a/src/madengine/cli/validators.py b/src/madengine/cli/validators.py index d99e87f7..91219a00 100644 --- a/src/madengine/cli/validators.py +++ b/src/madengine/cli/validators.py @@ -284,6 +284,7 @@ def additional_context_needs_cli_validation( def validate_additional_context( additional_context: str, additional_context_file: Optional[str] = None, + use_image: Optional[str] = None, ) -> Dict[str, Any]: """ Validate and parse additional context. @@ -291,6 +292,7 @@ def validate_additional_context( Args: additional_context: JSON string containing additional context additional_context_file: Optional file containing additional context + use_image: Optional pre-built image to use (skips required field validation) Returns: Dict containing parsed additional context diff --git a/src/madengine/deployment/base.py b/src/madengine/deployment/base.py index 52bbe02f..69878c04 100644 --- a/src/madengine/deployment/base.py +++ b/src/madengine/deployment/base.py @@ -67,6 +67,7 @@ class DeploymentResult: metrics: Optional[Dict[str, Any]] = None logs_path: Optional[str] = None artifacts: Optional[List[str]] = None + skip_monitoring: bool = False # Set True for synchronous runs (e.g., inside salloc) @property def is_success(self) -> bool: @@ -181,7 +182,8 @@ def execute(self) -> DeploymentResult: return result # Step 4: Monitor (optional) - if self.config.monitor: + # Skip monitoring if deploy() already ran synchronously (e.g., inside salloc) + if self.config.monitor and not result.skip_monitoring: result = self._monitor_until_complete(result.deployment_id) # Step 5: Collect Results (always collect, even on failure to record failed runs) diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 9b4ab2e7..9050a5b9 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -504,6 +504,10 @@ def _prepare_baremetal_script(self, 
model_info: Dict, docker_image_name: str = N "echo ''", ]) + # Create completion marker path for robust completion detection + # Use absolute path since script will cd to different directory + completion_marker = (self.output_dir / f"madengine_{model_info['name']}.complete").resolve() + script_lines.extend([ "", "# Change to script directory", @@ -512,11 +516,22 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N "# Run the model script directly on baremetal", f"echo 'Executing: bash {model_script_path.name} {model_args}'", f"bash {model_script_path.name} {model_args}", + "SCRIPT_EXIT_CODE=$?", "", "echo ''", "echo 'Script completed.'", + "", + "# Write completion marker for madengine to detect", + f"echo \"exit_code=$SCRIPT_EXIT_CODE\" > {completion_marker}", + f"echo \"timestamp=$(date -Iseconds)\" >> {completion_marker}", + f"echo 'Completion marker written: {completion_marker}'", + "", + "exit $SCRIPT_EXIT_CODE", ]) + # Store marker path for monitor to check + self._completion_marker = completion_marker + script_content = "\n".join(script_lines) # Save script @@ -1029,6 +1044,7 @@ def _run_inside_existing_allocation(self) -> DeploymentResult: deployment_id=self.existing_job_id, message=f"Completed inside existing allocation {self.existing_job_id}", logs_path=str(self.output_dir), + skip_monitoring=True, # Already ran synchronously, no need to poll ) else: self.console.print( @@ -1039,6 +1055,7 @@ def _run_inside_existing_allocation(self) -> DeploymentResult: deployment_id=self.existing_job_id, message=f"Script failed with exit code {result.returncode}", logs_path=str(self.output_dir), + skip_monitoring=True, # Already ran synchronously ) except subprocess.TimeoutExpired: @@ -1181,6 +1198,36 @@ def monitor(self, deployment_id: str) -> DeploymentResult: message=f"Completed (ran inside existing allocation {deployment_id})", ) + # Check for completion marker (robust detection for interactive/salloc jobs) + if hasattr(self, 
'_completion_marker') and self._completion_marker: + marker_path = Path(self._completion_marker) + if marker_path.exists(): + # Read exit code from marker + try: + content = marker_path.read_text() + exit_code = 0 + for line in content.splitlines(): + if line.startswith("exit_code="): + exit_code = int(line.split("=")[1]) + break + + self.console.print(f"[green]✓ Completion marker found: {marker_path}[/green]") + + if exit_code == 0: + return DeploymentResult( + status=DeploymentStatus.SUCCESS, + deployment_id=deployment_id, + message=f"Script completed successfully (exit code {exit_code})", + ) + else: + return DeploymentResult( + status=DeploymentStatus.FAILED, + deployment_id=deployment_id, + message=f"Script failed with exit code {exit_code}", + ) + except Exception as e: + self.console.print(f"[yellow]Warning: Could not read completion marker: {e}[/yellow]") + try: # Query job status using squeue (runs locally) result = subprocess.run( diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index e1e0e41e..7b36ef1f 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -624,24 +624,37 @@ def _execute_with_prebuilt_image( # Save deployment config self._save_deployment_config(manifest_output) - # Merge model's distributed config (especially launcher) into deployment_config - # This ensures sglang-disagg launcher is in deployment_config even if not in additional-context - if models and models[0].get("distributed"): + # Merge model's distributed and slurm config into deployment_config + # This ensures launcher and slurm settings are in deployment_config even if not in additional-context + if models: with open(manifest_output, "r") as f: saved_manifest = json.load(f) - model_distributed = models[0].get("distributed", {}) if "deployment_config" not in saved_manifest: saved_manifest["deployment_config"] = {} - # Merge model's distributed into 
deployment_config.distributed - if "distributed" not in saved_manifest["deployment_config"]: - saved_manifest["deployment_config"]["distributed"] = {} + # Merge model's distributed config + model_distributed = models[0].get("distributed", {}) + if model_distributed: + if "distributed" not in saved_manifest["deployment_config"]: + saved_manifest["deployment_config"]["distributed"] = {} + + # Copy launcher and other critical fields from model config + for key in ["launcher", "nnodes", "nproc_per_node", "backend", "port", "sglang_disagg", "vllm_disagg"]: + if key in model_distributed and key not in saved_manifest["deployment_config"]["distributed"]: + saved_manifest["deployment_config"]["distributed"][key] = model_distributed[key] - # Copy launcher and other critical fields from model config - for key in ["launcher", "nnodes", "nproc_per_node", "backend", "port", "sglang_disagg"]: - if key in model_distributed and key not in saved_manifest["deployment_config"]["distributed"]: - saved_manifest["deployment_config"]["distributed"][key] = model_distributed[key] + # Merge model's slurm config into deployment_config.slurm + # This enables run phase to auto-detect SLURM deployment without --additional-context + model_slurm = models[0].get("slurm", {}) + if model_slurm: + if "slurm" not in saved_manifest["deployment_config"]: + saved_manifest["deployment_config"]["slurm"] = {} + + # Copy slurm settings from model config + for key in ["partition", "nodes", "gpus_per_node", "time", "exclusive", "reservation", "output_dir"]: + if key in model_slurm and key not in saved_manifest["deployment_config"]["slurm"]: + saved_manifest["deployment_config"]["slurm"][key] = model_slurm[key] with open(manifest_output, "w") as f: json.dump(saved_manifest, f, indent=2) From 0bba330b1c6308f20848f4827a0116297ee9bf81 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Wed, 11 Mar 2026 23:01:00 +0000 Subject: [PATCH 10/12] Add nodelist support to slurm_multi baremetal scripts and rename Launcher 
column - Emit #SBATCH --nodelist in _prepare_baremetal_script() when nodelist is set via model card or --additional-context - Include nodelist in model card slurm merge key list so it propagates to the build manifest - Rename perf table column from "Launcher" to "Workload" for clarity Made-with: Cursor --- src/madengine/cli/utils.py | 4 ++-- src/madengine/deployment/slurm.py | 5 +++++ src/madengine/orchestration/build_orchestrator.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/madengine/cli/utils.py b/src/madengine/cli/utils.py index 343bd25d..0cacaa47 100644 --- a/src/madengine/cli/utils.py +++ b/src/madengine/cli/utils.py @@ -377,7 +377,7 @@ def display_performance_table(perf_csv_path: str = "perf.csv", session_start_row perf_table.add_column("Index", justify="right", style="dim") perf_table.add_column("Model", style="cyan") perf_table.add_column("Topology", justify="center", style="blue") - perf_table.add_column("Launcher", justify="center", style="magenta") + perf_table.add_column("Workload", justify="center", style="magenta") perf_table.add_column("Deployment", justify="center", style="cyan") perf_table.add_column("GPU Arch", style="yellow") perf_table.add_column("Performance", justify="right", style="green") @@ -482,7 +482,7 @@ def format_performance(perf): str(idx), model, topology, - launcher, # Distributed launcher (docker, torchrun, vllm, etc.) + launcher, # Workload type (sglang-disagg, vllm, torchrun, etc.) 
deployment_type, gpu_arch, performance, diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 9050a5b9..253d0cd7 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -438,6 +438,11 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N if self.reservation: script_lines.append(f"#SBATCH --reservation={self.reservation}") + # Add nodelist if specified (from model card or --additional-context) + nodelist = self._normalize_nodelist(self.slurm_config.get("nodelist")) + if nodelist: + script_lines.append(f"#SBATCH --nodelist={nodelist}") + script_lines.extend([ "", f"# Baremetal launcher script for {model_info['name']}", diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 7b36ef1f..86068439 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -652,7 +652,7 @@ def _execute_with_prebuilt_image( saved_manifest["deployment_config"]["slurm"] = {} # Copy slurm settings from model config - for key in ["partition", "nodes", "gpus_per_node", "time", "exclusive", "reservation", "output_dir"]: + for key in ["partition", "nodes", "gpus_per_node", "time", "exclusive", "reservation", "output_dir", "nodelist"]: if key in model_slurm and key not in saved_manifest["deployment_config"]["slurm"]: saved_manifest["deployment_config"]["slurm"][key] = model_slurm[key] From 808bf896b6a6ac3a38c40780ac3019249666cbf9 Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Wed, 11 Mar 2026 23:47:17 +0000 Subject: [PATCH 11/12] Fix deep-merge of slurm config to preserve model card values (nodes, time) When --additional-context provides partial slurm overrides (e.g. only nodelist), ConfigLoader defaults were overwriting model card values like nodes and time. 
Fixed by capturing original user slurm keys before ConfigLoader runs, and deep-merging manifest slurm config with runtime context instead of replacing it wholesale. Made-with: Cursor --- .../orchestration/build_orchestrator.py | 9 +++++-- .../orchestration/run_orchestrator.py | 24 +++++++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 86068439..2b15f192 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -92,6 +92,7 @@ def __init__(self, args, additional_context: Optional[Dict] = None): apply_build_context_defaults(merged_context) self.additional_context = merged_context + self._original_user_slurm_keys = set(merged_context.get("slurm", {}).keys()) # Apply ConfigLoader to infer deploy type, validate, and apply defaults if self.additional_context: @@ -651,9 +652,13 @@ def _execute_with_prebuilt_image( if "slurm" not in saved_manifest["deployment_config"]: saved_manifest["deployment_config"]["slurm"] = {} - # Copy slurm settings from model config + # Copy slurm settings from model config (model card fills in + # values not explicitly set by --additional-context). + # Use _original_user_slurm_keys (captured before ConfigLoader + # applies defaults) so model card values override defaults + # but user's explicit CLI values still win. 
for key in ["partition", "nodes", "gpus_per_node", "time", "exclusive", "reservation", "output_dir", "nodelist"]: - if key in model_slurm and key not in saved_manifest["deployment_config"]["slurm"]: + if key in model_slurm and key not in self._original_user_slurm_keys: saved_manifest["deployment_config"]["slurm"][key] = model_slurm[key] with open(manifest_output, "w") as f: diff --git a/src/madengine/orchestration/run_orchestrator.py b/src/madengine/orchestration/run_orchestrator.py index e7b1918a..578de179 100644 --- a/src/madengine/orchestration/run_orchestrator.py +++ b/src/madengine/orchestration/run_orchestrator.py @@ -230,10 +230,17 @@ def execute( if not self.additional_context: self.additional_context = {} - # Merge deployment_config into additional_context (for deployment layer to use) + # Merge deployment_config into additional_context (for deployment layer to use). + # For dict-valued keys (slurm, k8s, etc.), deep-merge so manifest + # values fill in gaps while runtime --additional-context wins on conflicts. 
for key in ["slurm", "k8s", "kubernetes", "distributed", "vllm", "env_vars", "debug"]: - if key in deployment_config and key not in self.additional_context: - self.additional_context[key] = deployment_config[key] + if key in deployment_config: + if key not in self.additional_context: + self.additional_context[key] = deployment_config[key] + elif isinstance(deployment_config[key], dict) and isinstance(self.additional_context[key], dict): + merged = dict(deployment_config[key]) + merged.update(self.additional_context[key]) + self.additional_context[key] = merged # Display manifest entries: context (from build) and deployment_config (run/deploy) self.rich_console.print("[bold blue]Build manifest breakdown[/bold blue]\n") @@ -505,10 +512,17 @@ def _load_and_merge_manifest(self, manifest_file: str) -> str: # Merge deployment_config if "deployment_config" in manifest: stored_config = manifest["deployment_config"] - # Runtime --additional-context overrides stored config + # Runtime --additional-context overrides stored config. + # For dict-valued keys, deep-merge so manifest values fill + # in gaps (e.g. nodes, time) while runtime values win on conflicts. 
for key in ["deploy", "slurm", "k8s", "kubernetes", "distributed", "vllm", "env_vars", "debug"]: if key in self.additional_context: - stored_config[key] = self.additional_context[key] + if key in stored_config and isinstance(stored_config[key], dict) and isinstance(self.additional_context[key], dict): + merged = dict(stored_config[key]) + merged.update(self.additional_context[key]) + stored_config[key] = merged + else: + stored_config[key] = self.additional_context[key] manifest["deployment_config"] = stored_config # Merge context (tools, pre_scripts, post_scripts, encapsulate_script) From 87f8b7fe7abb3648751804016c9f618fa911b80a Mon Sep 17 00:00:00 2001 From: raviguptaamd Date: Thu, 16 Apr 2026 08:05:20 +0000 Subject: [PATCH 12/12] Design review fixes: naming, results collection, and documentation - Rename BAREMETAL_LAUNCHERS to SELF_MANAGED_LAUNCHERS to avoid confusion with PR #62's baremetal_vm (Docker-less nodes with VMs) - Rename _run_on_baremetal -> _run_self_managed, _prepare_baremetal_script -> _prepare_slurm_multi_script throughout - Add slurm_multi to common.py VALID_LAUNCHERS (canonical list) and add hyphen-variant normalization (slurm-multi -> slurm_multi) - Remove duplicate VALID_LAUNCHERS from kubernetes.py (now inherits from common.py) - Fix SBATCH output filename pattern: _%j -> _%j_%t to match the glob pattern in collect_results (madengine-*_{deployment_id}_*.out) - Add _collect_slurm_multi_results for explicit slurm_multi results collection path with perf.csv retry and NFS propagation handling - Add --use-image / --build-on-compute mutual exclusivity check in build_orchestrator.py - Document --use-image vs MAD_CONTAINER_IMAGE relationship in cli-reference.md Made-with: Cursor --- docs/cli-reference.md | 13 +++ docs/deployment.md | 8 +- src/madengine/deployment/common.py | 5 +- src/madengine/deployment/kubernetes.py | 10 --- src/madengine/deployment/slurm.py | 81 ++++++++++++++++--- src/madengine/execution/container_runner.py | 24 +++--- 
.../orchestration/build_orchestrator.py | 14 ++++ 7 files changed, 115 insertions(+), 40 deletions(-) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index b9d9a943..53024b61 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -249,6 +249,19 @@ When using auto-detection with multiple models that have different `DOCKER_IMAGE The generated manifest marks the image as `"prebuilt": true` with `build_time: 0`. +**Relationship with `MAD_CONTAINER_IMAGE`:** + +`--use-image` and `MAD_CONTAINER_IMAGE` both allow using pre-built images, but operate at different phases: + +| | `--use-image` | `MAD_CONTAINER_IMAGE` | +|---|---|---| +| Phase | Build (`madengine build`) | Run (`madengine run`) | +| Output | Generates a manifest file | Creates a synthetic manifest at runtime | +| Workflow | Two-step: `build` then `run` | Single-step: `run` only | +| Use case | slurm_multi, CI pipelines, reproducible manifests | Quick local testing, ad-hoc runs | + +They are complementary. Use `--use-image` when you want a persistent manifest that can be shared or re-run. Use `MAD_CONTAINER_IMAGE` when you want a quick single-command run without generating a manifest. 
+ **Build on Compute Node (`--build-on-compute`):** Build Docker images on a SLURM compute node, push to registry, and pull in parallel during run phase: diff --git a/docs/deployment.md b/docs/deployment.md index b68d9cc3..8c79bc76 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -419,9 +419,9 @@ scancel -u $USER } ``` -### Baremetal Execution (slurm_multi) +### Self-Managed Execution (slurm_multi) -For disaggregated inference workloads like SGLang Disaggregated, madengine supports baremetal execution where the model's `.slurm` script manages Docker containers directly: +For disaggregated inference workloads like SGLang Disaggregated, madengine supports self-managed execution where the model's `.slurm` script manages Docker containers directly: ```json { @@ -443,9 +443,9 @@ For disaggregated inference workloads like SGLang Disaggregated, madengine suppo } ``` -**How baremetal execution works:** +**How self-managed execution works:** 1. madengine generates a wrapper script (not a Docker container) -2. The wrapper runs the model's `.slurm` script directly on baremetal +2. The wrapper runs the model's `.slurm` script directly on the host 3. The `.slurm` script manages Docker containers via `srun` 4. 
Environment variables from `models.json` and `additional-context` are passed through diff --git a/src/madengine/deployment/common.py b/src/madengine/deployment/common.py index a122818c..1cf44429 100644 --- a/src/madengine/deployment/common.py +++ b/src/madengine/deployment/common.py @@ -19,7 +19,7 @@ "megatron-lm", "vllm", "sglang", - "sglang-disagg" + "slurm_multi", ] @@ -43,6 +43,9 @@ def normalize_launcher(launcher_type: Optional[str], deployment_type: str) -> st """ if launcher_type and launcher_type in VALID_LAUNCHERS: return launcher_type + # Normalize hyphen variant: slurm-multi -> slurm_multi + if launcher_type and launcher_type.replace("-", "_") in VALID_LAUNCHERS: + return launcher_type.replace("-", "_") if deployment_type == "local": return "docker" if deployment_type == "slurm": diff --git a/src/madengine/deployment/kubernetes.py b/src/madengine/deployment/kubernetes.py index 959fa0d1..900c0176 100644 --- a/src/madengine/deployment/kubernetes.py +++ b/src/madengine/deployment/kubernetes.py @@ -73,16 +73,6 @@ from .kubernetes_launcher_mixin import KubernetesLauncherMixin -VALID_LAUNCHERS = [ - "torchrun", - "torchtitan", - "deepspeed", - "megatron-lm", - "vllm", - "sglang", - "slurm_multi", -] - SLURM_MULTI_ALIASES = [ "slurm_multi", "slurm-multi", diff --git a/src/madengine/deployment/slurm.py b/src/madengine/deployment/slurm.py index 253d0cd7..8f45f4ce 100644 --- a/src/madengine/deployment/slurm.py +++ b/src/madengine/deployment/slurm.py @@ -305,7 +305,7 @@ def prepare(self) -> bool: model_key = model_keys[0] model_info = self.manifest["built_models"][model_key] - # Check if this is a slurm_multi launcher (baremetal multi-node) + # Check if this is a slurm_multi launcher (self-managed multi-node) # Priority: model_info.distributed.launcher > additional_context.distributed.launcher model_distributed = model_info.get("distributed", {}) launcher_type = model_distributed.get("launcher") or self.distributed_config.get("launcher", "torchrun") @@ -315,10 
+315,10 @@ def prepare(self) -> bool: slurm_multi_aliases_normalized = [a.lower().replace("_", "-") for a in SLURM_MULTI_ALIASES] if launcher_normalized in slurm_multi_aliases_normalized: # For slurm_multi launchers, generate simple wrapper script - # that runs the model's .slurm script directly on baremetal + # that runs the model's .slurm script directly on the host self.console.print(f"[cyan]Detected slurm_multi launcher: {launcher_type}[/cyan]") # Pass model_key as docker_image_name (for manifests, the key IS the built image name) - return self._prepare_baremetal_script(model_info, docker_image_name=model_key) + return self._prepare_slurm_multi_script(model_info, docker_image_name=model_key) # Standard flow: validate madengine availability for complex job template if not self._validate_cli_availability(): @@ -355,12 +355,12 @@ def _normalize_nodelist(nodelist: Optional[str]) -> Optional[str]: return None return ",".join(n.strip() for n in nodelist.split(",") if n.strip()) - def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = None) -> bool: + def _prepare_slurm_multi_script(self, model_info: Dict, docker_image_name: str = None) -> bool: """ - Generate a simple wrapper script for baremetal/slurm_multi launchers. + Generate a simple wrapper script for slurm_multi (self-managed) launchers. These launchers (slurm_multi, sglang-disagg, vllm-disagg) run the model's - .slurm script directly on baremetal, which then manages Docker containers + .slurm script directly on the host, which then manages Docker containers via srun. No madengine wrapper needed. 
Args: @@ -424,8 +424,8 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N script_lines = [ "#!/bin/bash", f"#SBATCH --job-name=madengine-{model_info['name']}", - f"#SBATCH --output={self.output_dir}/madengine-{model_info['name']}_%j.out", - f"#SBATCH --error={self.output_dir}/madengine-{model_info['name']}_%j.err", + f"#SBATCH --output={self.output_dir}/madengine-{model_info['name']}_%j_%t.out", + f"#SBATCH --error={self.output_dir}/madengine-{model_info['name']}_%j_%t.err", f"#SBATCH --partition={self.partition}", f"#SBATCH --nodes={self.nodes}", f"#SBATCH --ntasks={self.nodes}", @@ -445,7 +445,7 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N script_lines.extend([ "", - f"# Baremetal launcher script for {model_info['name']}", + f"# slurm_multi launcher script for {model_info['name']}", f"# Generated by madengine for slurm_multi", "", "set -e", @@ -459,7 +459,7 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N script_lines.append("") script_lines.extend([ "echo '=========================================='", - "echo 'Baremetal Launcher - slurm_multi'", + "echo 'slurm_multi Launcher'", "echo '=========================================='", f"echo 'Model: {model_info['name']}'", f"echo 'Script: {model_script_path}'", @@ -518,7 +518,7 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N "# Change to script directory", f"cd {model_script_path.parent}", "", - "# Run the model script directly on baremetal", + "# Run the model script directly on the host", f"echo 'Executing: bash {model_script_path.name} {model_args}'", f"bash {model_script_path.name} {model_args}", "SCRIPT_EXIT_CODE=$?", @@ -544,7 +544,7 @@ def _prepare_baremetal_script(self, model_info: Dict, docker_image_name: str = N self.script_path.write_text(script_content) self.script_path.chmod(0o755) - self.console.print(f"[green]✓ Generated baremetal script: 
{self.script_path}[/green]") + self.console.print(f"[green]✓ Generated slurm_multi script: {self.script_path}[/green]") self.console.print(f" Model script: {model_script_path}") self.console.print(f" Environment: {len(env_vars)} variables") @@ -772,7 +772,7 @@ def _generate_slurm_multi_command( """ Generate slurm_multi launcher environment for SLURM. - slurm_multi Architecture (multi-node baremetal): + slurm_multi Architecture (self-managed multi-node): - Node 0: Proxy (load balancer) - Nodes 1 to xP: Prefill nodes - Nodes xP+1 to xP+yD: Decode nodes @@ -1574,6 +1574,14 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: "session_start_row": session_start_row, } + # For slurm_multi launchers, the model script generates its own perf.csv. + # Skip log-based performance parsing and read the CSV directly. + launcher_type = self.distributed_config.get("launcher", "") + launcher_normalized = launcher_type.lower().replace("_", "-") if launcher_type else "" + slurm_multi_normalized = [a.lower().replace("_", "-") for a in SLURM_MULTI_ALIASES] + if launcher_normalized in slurm_multi_normalized: + return self._collect_slurm_multi_results(deployment_id, results, session_start_row) + model_keys = list(self.manifest.get("built_models") or {}) model_key = model_keys[0] if model_keys else None # Use logical model name for job_dir so it matches the task script (which uses model_info["name"]). @@ -1898,6 +1906,53 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]: ) return results + def _collect_slurm_multi_results( + self, deployment_id: str, results: Dict[str, Any], session_start_row: Optional[int] + ) -> Dict[str, Any]: + """ + Collect results for slurm_multi launchers. + + slurm_multi model scripts generate their own perf.csv via their + benchmark scripts (e.g. generate_perf_csv.py). We collect SLURM + logs for diagnostics and read the model-generated perf.csv for metrics. 
+ """ + # Collect SLURM output logs for diagnostics + flat_out_files = sorted(self.output_dir.glob(f"madengine-*_{deployment_id}_*.out")) + results["logs"] = [str(f) for f in flat_out_files] + + # Look for model-generated perf.csv + # Priority: results_dir config > workspace perf.csv > NFS wait + perf_csv_path = None + if self.slurm_config.get("results_dir"): + results_dir = Path(self.slurm_config["results_dir"]) + candidates = list(results_dir.glob("perf*.csv")) + if candidates: + perf_csv_path = candidates[0] + + if not perf_csv_path: + workspace_perf = Path("perf.csv") + # Retry briefly for NFS propagation after SLURM job completion + import time + for _attempt in range(6): + if workspace_perf.exists() and workspace_perf.stat().st_size > 0: + perf_csv_path = workspace_perf + break + time.sleep(5) + + if perf_csv_path: + results["perf_files"] = [str(perf_csv_path)] + self._collect_results_parse_perf_csv(results, session_start_row) + else: + self.console.print("[yellow]No perf.csv found from slurm_multi model script[/yellow]") + + self.console.print( + f"[green]Collected slurm_multi results: {len(results['perf_files'])} perf files, " + f"{len(results['logs'])} log files, " + f"{len(results['successful_runs'])} successful, " + f"{len(results['failed_runs'])} failed[/green]" + ) + return results + def _collect_results_parse_perf_csv( self, results: Dict[str, Any], session_start_row: Optional[int] ) -> None: diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 086629c4..60b3c9f1 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -19,7 +19,7 @@ "slurm_multi", "slurm-multi", ] -BAREMETAL_LAUNCHERS = SLURM_MULTI_ALIASES +SELF_MANAGED_LAUNCHERS = SLURM_MULTI_ALIASES from rich.console import Console as RichConsole from contextlib import redirect_stdout, redirect_stderr from madengine.core.console import Console @@ -674,7 +674,7 @@ def apply_tools( else: 
print(f" Note: Command '{cmd}' already added by another tool, skipping duplicate.") - def _run_on_baremetal( + def _run_self_managed( self, model_info: typing.Dict, build_info: typing.Dict, @@ -685,7 +685,7 @@ def _run_on_baremetal( run_env: typing.Dict, ) -> typing.Dict: """ - Run script directly on baremetal (not inside Docker). + Run script directly on the host (self-managed launcher, not inside madengine Docker). Used for slurm_multi launchers that manage their own Docker containers via SLURM srun commands. The script is executed directly on the node. @@ -772,7 +772,7 @@ def _run_on_baremetal( # Run script with logging test_start_time = time.time() - self.rich_console.print("\n[bold blue]Running script on baremetal...[/bold blue]") + self.rich_console.print("\n[bold blue]Running script (self-managed launcher)...[/bold blue]") try: with open(log_file_path, mode="w", buffering=1) as outlog: @@ -1097,13 +1097,13 @@ def run_container( print(f"Docker options: {docker_options}") - # ========== CHECK FOR BAREMETAL LAUNCHERS ========== - # slurm_multi launchers run scripts directly on baremetal, - # not inside Docker. The script itself manages Docker containers via srun. + # ========== CHECK FOR SELF-MANAGED LAUNCHERS ========== + # slurm_multi launchers run scripts directly on the host, + # not inside a madengine-managed Docker. The script manages its own containers via srun. 
launcher = "" # Debug: Print all sources - print(f"🔍 Baremetal check - looking for launcher...") + print(f"🔍 Self-managed launcher check...") print(f" MAD_LAUNCHER_TYPE env: {os.environ.get('MAD_LAUNCHER_TYPE', '')}") if self.additional_context: distributed_config = self.additional_context.get("distributed", {}) @@ -1121,10 +1121,10 @@ def run_container( # Normalize launcher name (replace underscores with hyphens) launcher_normalized = launcher.lower().replace("_", "-") if launcher else "" - if launcher_normalized and launcher_normalized in [l.lower().replace("_", "-") for l in BAREMETAL_LAUNCHERS]: - self.rich_console.print(f"\n[bold cyan]🖥️ Running on BAREMETAL (launcher: {launcher})[/bold cyan]") + if launcher_normalized and launcher_normalized in [l.lower().replace("_", "-") for l in SELF_MANAGED_LAUNCHERS]: + self.rich_console.print(f"\n[bold cyan]🖥️ Self-managed launcher (launcher: {launcher})[/bold cyan]") self.rich_console.print(f"[dim]Script will manage its own Docker containers via SLURM[/dim]") - return self._run_on_baremetal( + return self._run_self_managed( model_info=model_info, build_info=build_info, log_file_path=log_file_path, @@ -1133,7 +1133,7 @@ def run_container( pre_encapsulate_post_scripts=pre_encapsulate_post_scripts, run_env=run_env, ) - # ========== END BAREMETAL CHECK ========== + # ========== END SELF-MANAGED CHECK ========== self.rich_console.print(f"\n[bold blue]🏃 Starting Docker container execution...[/bold blue]") print(f"🏷️ Image: {docker_image}") diff --git a/src/madengine/orchestration/build_orchestrator.py b/src/madengine/orchestration/build_orchestrator.py index 2b15f192..70906398 100644 --- a/src/madengine/orchestration/build_orchestrator.py +++ b/src/madengine/orchestration/build_orchestrator.py @@ -258,6 +258,20 @@ def execute( DiscoveryError: If model discovery fails BuildError: If Docker build fails """ + # --use-image and --build-on-compute are mutually exclusive + if use_image and build_on_compute: + raise 
ConfigurationError( + "--use-image and --build-on-compute cannot be used together", + context=create_error_context( + operation="build", + component="BuildOrchestrator", + ), + suggestions=[ + "Use --use-image to skip build and use an existing image", + "Use --build-on-compute to build on a compute node and push to registry", + ], + ) + # Handle pre-built image mode if use_image: # If use_image is "auto", resolve from model card