From e53e2121b7ed39709538d6f50a7fdd95368f6eec Mon Sep 17 00:00:00 2001
From: Stephen Shao
Date: Tue, 2 Dec 2025 11:49:22 -0500
Subject: [PATCH 01/11] Add TheRock model for validation (#53)

* Added therock model for testing the TheRock image
* Modified the Dockerfile of TheRock to install only the core runtime and HIP runtime
* Fixed the generate-sys-env-details arg in mad
* Redesigned the rocEnvTool to work with TheRock-based images
* Kept the CSV parser compatible
* Fixed the CSV parser
* Updated the README of rocEnvTool accordingly
---
 src/madengine/mad.py                          |   2 +-
 .../common/pre_scripts/rocEnvTool/README.md   | 415 +++++++++-
 .../pre_scripts/rocEnvTool/rocenv_tool.py     | 768 +++++++++++++-----
 .../docker/therock.ubuntu.amd.Dockerfile      | 100 +++
 tests/fixtures/dummy/models.json              |  12 +
 tests/fixtures/dummy/scripts/therock/run.sh   |   7 +
 6 files changed, 1079 insertions(+), 225 deletions(-)
 create mode 100644 tests/fixtures/dummy/docker/therock.ubuntu.amd.Dockerfile
 create mode 100644 tests/fixtures/dummy/scripts/therock/run.sh

diff --git a/src/madengine/mad.py b/src/madengine/mad.py
index 861571b7..e4df7143 100644
--- a/src/madengine/mad.py
+++ b/src/madengine/mad.py
@@ -217,7 +217,7 @@ def main():
                             " Overrides detected contexts and additional-context-file.")
     parser_run.add_argument('--data-config-file-name', default="data.json", help="custom data configuration file.")
     parser_run.add_argument('--tools-json-file-name', default="./scripts/common/tools.json", help="custom tools json configuration file.")
-    parser_run.add_argument('--generate-sys-env-details', default=True, help='generate system config env details by default')
+    parser_run.add_argument('--generate-sys-env-details', type=lambda x: (str(x).lower() in ['true', '1', 'yes']), default=True, help='generate system config env details by default (accepts: true/false, yes/no, 1/0)')
     parser_run.add_argument('--force-mirror-local', default=None, help="Path to force all relevant dataproviders to mirror data locally on.")
     parser_run.add_argument('--keep-alive', action='store_true', help="keep Docker container alive after run; will keep model directory after run")
     parser_run.add_argument('--keep-model-dir', action='store_true', help="keep model directory after run")
diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/README.md b/src/madengine/scripts/common/pre_scripts/rocEnvTool/README.md
index 1cc71748..45372dd9 100644
--- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/README.md
+++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/README.md
@@ -1,56 +1,387 @@
-# rocEnvTool: System Environment collection tool
+# ROCm Environment Tool - TheRock Compatible

-This tool is responsible for collecting some important details from the machine that we run on.
-Note: This tool needs sudo previlege access to collect some information.
+## Overview

-## How to run this tool
+`rocenv_tool.py` is a comprehensive ROCm environment collection tool that works with **both TheRock and traditional ROCm installations**. The tool automatically detects the installation type and adapts its behavior accordingly, collecting system configuration details that are crucial for debugging and system analysis.

-This tool needs sudo access.
-* To gather full configuration details run the following command:
+**Note:** This tool requires sudo privileges for collecting some system information.

+## Key Features
+
+### 1. 
**Automatic Installation Detection** +- Detects TheRock installations (Python packages, tarballs, local builds) +- Detects traditional ROCm installations (apt/yum packages) +- Falls back to PATH-based detection if neither is found + +### 2. **Dynamic Path Resolution** +- No hardcoded paths to `/opt/rocm` +- Automatically locates `rocminfo`, `rocm-smi`, `hipcc`, etc. +- Works with custom installation directories + +### 3. **Robust Error Handling** +- Commands don't fail if tools are missing +- Graceful fallbacks for unavailable features +- Works in minimal container environments + +### 4. **TheRock-Specific Features** +- Displays TheRock manifest information +- Shows Python package installations +- Reports virtual environment details +- Lists installation contents + +### 5. **Backward Compatibility** +- All original functionality preserved +- Works with existing CSV parser +- Compatible with env_tags.json + +## Differences from Original Version + +| Aspect | Original (v1) | Current | +|--------|--------------|----------| +| Path detection | Hardcoded `/opt/rocm` | Dynamic detection | +| Installation types | Traditional ROCm only | TheRock + Traditional | +| Package listing | `dpkg -l` / `rpm -qa` | Adaptive (pip for TheRock) | +| Error handling | Fails on missing tools | Graceful fallbacks | +| Version detection | `/opt/rocm/.info/version` | Multi-method detection | +| Repo checking | apt/yum repos | Detects TheRock vs traditional | + +## Usage + +### Basic Usage + +```bash +# Run with automatic detection +python3 rocenv_tool.py + +# Verbose mode to see detection details +python3 rocenv_tool.py --verbose + +# Custom output name +python3 rocenv_tool.py --output-name my_system_info + +# Lite mode (uses env_tags.json) +python3 rocenv_tool.py --lite + +# Generate CSV output +python3 rocenv_tool.py --dump-csv + +# Generate and print CSV +python3 rocenv_tool.py --dump-csv --print-csv + +# Run with sudo for full system information +sudo python3 rocenv_tool.py +``` + +### Command-Line Options + +``` +--lite Use lite version from env_tags.json +--dump-csv Generate CSV file with system info +--print-csv Print CSV data to console +--output-name NAME Output directory name (default: sys_config_info) +-v, --verbose Enable verbose detection output +``` + +## How Detection Works + +### Detection Methods (in order) + +1. **Python Package Detection** + - Checks for `rocm-sdk` command in PATH + - Uses `rocm-sdk path --root` to find installation + - Verifies TheRock markers (manifest.json) + +2. **Environment Variable Detection** + - Checks `ROCM_PATH`, `ROCM_HOME`, `HIP_PATH` + - Verifies paths for TheRock markers + +3. **Common Path Detection** + - Searches `/opt/rocm`, `~/rocm`, `~/therock`, etc. + - Checks for `share/therock/therock_manifest.json` + +4. **Traditional ROCm Detection** + - Checks `/opt/rocm/.info/version` + - Uses traditional package manager paths + +5. 
**PATH-based Detection** + - Searches for `rocminfo`, `rocm-smi` in PATH + - Infers installation root from binary location + +### TheRock Installation Markers + +TheRock installations are identified by: +- `share/therock/therock_manifest.json` (primary marker) +- `share/therock/dist_info.json` (secondary marker) +- Unique directory structure (`lib/llvm/`) +- `rocm-sdk` command availability + +## Details Collected + +### Tags Available for Lite Mode: + +* `hardware_information` - System hardware details +* `cpu_information` - CPU specifications and info +* `gpu_information` - GPU hardware details +* `bios_settings` - BIOS configuration +* `os_information` - Operating system details +* `dmsg_gpu_drm_atom_logs` - GPU kernel logs +* `amdgpu_modinfo` - AMD GPU module information +* `memory_information` - System memory details +* `rocm_information` - ROCm installation details +* `rocm_repo_setup` - Repository configuration +* `rocm_packages_installed` - Installed ROCm packages +* `rocm_env_variables` - ROCm environment variables +* `rocm_smi` - ROCm System Management Interface output +* `ifwi_version` - Integrated Firmware Image version +* `rocm_smi_showhw` - Hardware topology +* `rocm_smi_pcie` - PCIe information +* `rocm_smi_pids` - Process information +* `rocm_smi_topology` - System topology +* `rocm_smi_showserial` - Serial numbers +* `rocm_smi_showperflevel` - Performance levels +* `rocm_smi_showrasinfo` - RAS information +* `rocm_smi_showxgmierr` - XGMI errors +* `rocm_smi_clocks` - Clock information +* `rocm_smi_showcompute_partition` - Compute partitions +* `rocm_smi_nodesbwi` - Node bandwidth +* `rocm_info` - ROCm information utility output +* `pip_list` - Python packages installed +* `numa_balancing` - NUMA balancing status + +## Output Structure + +The tool generates a directory (default: `.sys_config_info/`) with subdirectories for each category: + +``` +.sys_config_info/ +├── os_information/ +│ └── os_information.txt +├── cpu_information/ +│ └── cpu_information.txt +├── gpu_information/ +│ └── gpu_information.txt +├── rocm_information/ +│ └── rocm_information.txt +├── rocm_packages_installed/ +│ └── rocm_packages_installed.txt +├── rocm_env_variables/ +│ └── rocm_env_variables.txt +├── rocm_smi/ +│ └── rocm_smi.txt +├── pip_list/ +│ └── pip_list.txt +└── ... (more sections) ``` -sudo python rocenv_tool.py + +## TheRock-Specific Output + +When TheRock is detected, the output includes: + +### rocm_information section +- Installation type: `therock` +- ROCm root path +- TheRock manifest content (commit hash, submodules) +- Version information from `rocm-sdk version` + +### rocm_repo_setup section +- Message indicating TheRock doesn't use traditional repos +- `rocm-sdk` command output +- Virtual environment information (if applicable) +- Python package list + +### rocm_packages_installed section +- Python ROCm packages (`pip list | grep rocm`) +- TheRock installation directory contents +- `dist_info.json` content (GPU targets, etc.) + +## Examples + +### Example 1: TheRock in Docker Container + +```bash +# In a container built from TheRock +$ python3 rocenv_tool.py --verbose + +[DEBUG] Checking for rocm-sdk command... 
+[DEBUG] Found rocm-sdk at /usr/local/bin/rocm-sdk +[DEBUG] Found TheRock manifest at /opt/rocm/share/therock/therock_manifest.json +Installation Type: therock +ROCm Root: /opt/rocm +GPU Device Type: AMD +OK: finished dumping the system env details in .sys_config_info folder ``` -This dumps out a folder called : .sys_config_files inside the current working directory which contains multiple folders with logs available. +### Example 2: Traditional ROCm System -* To run the lite version run the below command. Make sure to update your selected tags via roc_env.json file. By default it dumps out os_information. +```bash +# On a system with apt-installed ROCm +$ python3 rocenv_tool.py +Installation Type: traditional +ROCm Root: /opt/rocm +GPU Device Type: AMD +OK: finished dumping the system env details in .sys_config_info folder ``` -sudo python rocenv_tool.pyy --lite + +### Example 3: TheRock Python Virtual Environment + +```bash +# In a venv with TheRock pip packages +$ source .venv/bin/activate +$ python3 rocenv_tool.py --verbose + +[DEBUG] Checking for rocm-sdk command... +[DEBUG] Found rocm-sdk at /home/user/.venv/bin/rocm-sdk +[DEBUG] Found TheRock at /home/user/.venv/lib/python3.10/site-packages/_rocm_sdk_core +Installation Type: therock +ROCm Root: /home/user/.venv/lib/python3.10/site-packages/_rocm_sdk_core +GPU Device Type: AMD +OK: finished dumping the system env details in .sys_config_info folder +``` + +## Troubleshooting + +### Issue: No ROCm installation detected + +**Solution:** +1. Run with `--verbose` to see detection details +2. Ensure ROCm binaries are in PATH: `export PATH=/path/to/rocm/bin:$PATH` +3. Set environment variable: `export ROCM_PATH=/path/to/rocm` +4. For Python packages: activate your virtual environment first + +### Issue: rocm-smi not found + +**For TheRock:** +- TheRock installations may not include all tools +- Output will show "rocm-smi not available" (not an error) +- Script continues with other available tools + +**For Traditional ROCm:** +- Ensure ROCm is properly installed +- Check PATH includes `/opt/rocm/bin` + +### Issue: Permission denied errors + +**Solution:** +- Some commands require sudo (dmidecode, lshw) +- Run as root for full system information: `sudo python3 rocenv_tool.py` +- Or skip privileged commands (they're non-essential) + +### Issue: Commands timing out + +**Solution:** +- Check if GPU is accessible +- Verify driver installation +- Some commands may hang if hardware isn't responding + +## Integration with Existing Tools + +### CSV Parser Compatibility + +The tool maintains compatibility with the existing `csv_parser.py`: + +```python +# CSV parsing still works +csv_parser = CSVParser(csv_file, out_dir, configs) +csv_parser.dump_csv_output() +csv_parser.print_csv_output() +``` + +**Note:** TheRock installations may produce different CSV formats for: +- Package listings (pip packages vs dpkg/rpm) +- Repository information (Python packages vs apt repos) + +### env_tags.json Support + +Lite mode works with `env_tags.json`: + +```bash +python3 rocenv_tool.py --lite +``` + +Only collects information for tags specified in `env_tags.json`. + +## Best Practices + +1. **Use verbose mode for debugging:** + ```bash + python3 rocenv_tool.py --verbose + ``` + +2. **Set ROCM_PATH for custom installations:** + ```bash + export ROCM_PATH=/custom/path/to/rocm + python3 rocenv_tool.py + ``` + +3. **Activate venv for Python package detection:** + ```bash + source .venv/bin/activate + python3 rocenv_tool.py + ``` + +4. 
**Run as root for complete information:** + ```bash + sudo python3 rocenv_tool.py + ``` + +5. **Use lite mode for quick checks:** + ```bash + python3 rocenv_tool.py --lite + ``` + +## Known Limitations + +1. **Multi-installation detection:** + - Tool detects first valid installation found + - Priority: Python package > env vars > common paths > traditional + +2. **Partial installations:** + - Some TheRock installations may lack certain tools + - Output will note "not available" for missing tools + +3. **Custom build directories:** + - Local builds may not be auto-detected + - Use ROCM_PATH environment variable + +4. **CSV format variations:** + - TheRock package listings differ from traditional + - May affect CSV parser output format + +## Technical Details + +### RocmPathResolver Class + +The core detection logic is in the `RocmPathResolver` class: + +```python +resolver = RocmPathResolver(verbose=True) + +# Access installation info +print(resolver.installation_type) # 'therock', 'traditional', or 'unknown' +print(resolver.rocm_root) # Installation root path +print(resolver.paths['rocminfo']) # Path to rocminfo binary +print(resolver.get_version()) # ROCm version string +``` + +### Command Generation + +All commands are generated dynamically: + +```python +# Dynamic path resolution +cmd = f"{path_resolver.paths.get('rocminfo') or 'rocminfo'} || echo 'rocminfo not available'" ``` -## Details that are collected via this tool: +This ensures: +- Commands work regardless of installation location +- Graceful failure if tools are missing +- Informative error messages -The below tags denote the details that are collected via this tool. -These are the tags that are available for user if they wish to use lite version. +## Support -### Tags: -* hardware_information -* cpu_information -* gpu_information -* bios_settings -* os_information -* dmsg_gpu_drm_atom_logs -* amdgpu_modinfo -* memory_information -* rocm_information -* rocm_repo_setup -* rocm_packages_installed -* rocm_env_variables -* rocm_smi -* ifwi_version -* rocm_smi_showhw -* rocm_smi_pcie -* rocm_smi_pids -* rocm_smi_topology -* rocm_smi_showserial -* rocm_smi_showperflevel -* rocm_smi_showrasinfo -* rocm_smi_showxgmierr -* rocm_smi_clocks -* rocm_smi_showcompute_partition -* rocm_smi_nodesbwi -* rocm_info -* pip_list -* numa_balancing +For issues or questions: +1. Run with `--verbose` to see detection details +2. Check output for specific error messages +3. Verify ROCm installation is functional +4. Review the test script: `test_rocenv.sh` diff --git a/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py b/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py index 8aca62d7..3e92d99a 100644 --- a/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py +++ b/src/madengine/scripts/common/pre_scripts/rocEnvTool/rocenv_tool.py @@ -1,17 +1,24 @@ -"""Tool to collect system environment information. +"""Tool to collect system environment information (TheRock + Traditional ROCm compatible). Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
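
Typical usage (the flags below are the ones defined by this script's own
argument parser; see the bottom of this file):

    python3 rocenv_tool.py --dump-csv --output-name sys_config_info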
""" import os import sys import argparse +import json +import shutil +import subprocess +from pathlib import Path +from typing import Dict, List, Optional, Tuple from console import Console from csv_parser import CSVParser -import json rocm_version = None pkgtype = None env_map = {} +installation_type = None # 'therock' or 'traditional' or 'unknown' +rocm_paths = {} # Dynamic paths for ROCm components + class CommandInfo: ''' @@ -22,7 +29,269 @@ def __init__(self, section_info, cmds): self.section_info = section_info self.cmds = cmds -## utility functions. + +class RocmPathResolver: + """ + Detects and resolves ROCm installation paths for both TheRock and traditional installations. + """ + + def __init__(self, verbose: bool = False): + self.verbose = verbose + self.installation_type = 'unknown' + self.rocm_root = None + self.paths = { + 'rocminfo': None, + 'rocm_smi': None, + 'hipcc': None, + 'amdclang': None, + 'version_file': None, + 'manifest_file': None, + } + self.therock_details = {} + self.detect() + + def log(self, message: str): + """Print verbose log messages.""" + if self.verbose: + print(f"[DEBUG] {message}") + + def detect(self): + """Detect ROCm installation type and locate components.""" + # Method 1: Check for TheRock via rocm-sdk command + if self._detect_therock_python_package(): + return + + # Method 2: Check environment variables for TheRock + if self._detect_therock_from_env(): + return + + # Method 3: Check for TheRock in common paths + if self._detect_therock_tarball(): + return + + # Method 4: Fallback to traditional ROCm + if self._detect_traditional_rocm(): + return + + # Method 5: Try to find binaries in PATH + self._detect_from_path() + + def _is_therock_installation(self, path: Path) -> bool: + """Check if a path contains TheRock installation markers.""" + if not path.exists(): + return False + + # Check for TheRock manifest + manifest_path = path / "share" / "therock" / "therock_manifest.json" + if manifest_path.exists(): + self.log(f"Found TheRock manifest at {manifest_path}") + try: + with open(manifest_path, "r") as f: + manifest = json.load(f) + self.therock_details['manifest'] = manifest + except Exception as e: + self.log(f"Error reading manifest: {e}") + return True + + # Check for dist_info.json + dist_info_path = path / "share" / "therock" / "dist_info.json" + if dist_info_path.exists(): + self.log(f"Found TheRock dist_info at {dist_info_path}") + return True + + return False + + def _detect_therock_python_package(self) -> bool: + """Detect TheRock via Python package installation.""" + self.log("Checking for rocm-sdk command...") + + rocm_sdk_path = shutil.which("rocm-sdk") + if rocm_sdk_path: + self.log(f"Found rocm-sdk at {rocm_sdk_path}") + + try: + # Get root path from rocm-sdk + result = subprocess.run( + ["rocm-sdk", "path", "--root"], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0: + root_path = Path(result.stdout.strip()) + if self._is_therock_installation(root_path): + self.installation_type = 'therock' + self.rocm_root = str(root_path) + self._populate_therock_paths(root_path) + return True + except Exception as e: + self.log(f"Error getting rocm-sdk path: {e}") + + return False + + def _detect_therock_from_env(self) -> bool: + """Detect TheRock from environment variables.""" + self.log("Checking environment variables...") + + for var in ['ROCM_PATH', 'ROCM_HOME', 'HIP_PATH']: + value = os.environ.get(var) + if value: + path = Path(value) + if self._is_therock_installation(path): + self.log(f"Found TheRock 
via ${var}={value}") + self.installation_type = 'therock' + self.rocm_root = str(path) + self._populate_therock_paths(path) + return True + + return False + + def _detect_therock_tarball(self) -> bool: + """Detect TheRock tarball installations in common paths.""" + self.log("Checking common TheRock installation paths...") + + common_paths = [ + Path("/opt/rocm"), + Path.home() / "rocm", + Path.home() / "therock", + Path("/usr/local/rocm"), + Path.home() / ".local" / "rocm", + ] + + for path in common_paths: + if self._is_therock_installation(path): + self.log(f"Found TheRock at {path}") + self.installation_type = 'therock' + self.rocm_root = str(path) + self._populate_therock_paths(path) + return True + + return False + + def _detect_traditional_rocm(self) -> bool: + """Detect traditional ROCm installation.""" + self.log("Checking for traditional ROCm installation...") + + # Check for traditional ROCm marker + version_file = Path("/opt/rocm/.info/version") + if version_file.exists(): + self.log("Found traditional ROCm at /opt/rocm") + self.installation_type = 'traditional' + self.rocm_root = "/opt/rocm" + self._populate_traditional_paths() + return True + + return False + + def _detect_from_path(self): + """Try to find ROCm binaries in PATH.""" + self.log("Searching for ROCm binaries in PATH...") + + # Try to find rocminfo + rocminfo = shutil.which("rocminfo") + if rocminfo: + self.paths['rocminfo'] = rocminfo + # Try to infer root from binary location + rocminfo_path = Path(rocminfo) + if rocminfo_path.exists(): + potential_root = rocminfo_path.parent.parent + if self._is_therock_installation(potential_root): + self.installation_type = 'therock' + self.rocm_root = str(potential_root) + self._populate_therock_paths(potential_root) + else: + self.installation_type = 'unknown' + self.rocm_root = str(potential_root) + + # Try to find other binaries + self.paths['rocm_smi'] = shutil.which("rocm-smi") + self.paths['hipcc'] = shutil.which("hipcc") + self.paths['amdclang'] = shutil.which("amdclang") + + def _populate_therock_paths(self, root: Path): + """Populate paths for TheRock installation.""" + bin_dir = root / "bin" + + self.paths['rocminfo'] = str(bin_dir / "rocminfo") if (bin_dir / "rocminfo").exists() else None + self.paths['rocm_smi'] = str(bin_dir / "rocm-smi") if (bin_dir / "rocm-smi").exists() else None + self.paths['hipcc'] = str(bin_dir / "hipcc") if (bin_dir / "hipcc").exists() else None + self.paths['amdclang'] = str(bin_dir / "amdclang") if (bin_dir / "amdclang").exists() else None + + # Check for manifest + manifest = root / "share" / "therock" / "therock_manifest.json" + if manifest.exists(): + self.paths['manifest_file'] = str(manifest) + + def _populate_traditional_paths(self): + """Populate paths for traditional ROCm installation.""" + self.paths['rocminfo'] = "/opt/rocm/bin/rocminfo" + self.paths['rocm_smi'] = "/opt/rocm/bin/rocm-smi" + self.paths['hipcc'] = "/opt/rocm/bin/hipcc" + self.paths['version_file'] = "/opt/rocm/.info/version" + + def get_version(self) -> str: + """Get ROCm version string.""" + if self.installation_type == 'therock': + return self._get_therock_version() + elif self.installation_type == 'traditional': + return self._get_traditional_version() + else: + return "unknown" + + def _get_therock_version(self) -> str: + """Get TheRock version from manifest or rocm-sdk.""" + # Try rocm-sdk command + if shutil.which("rocm-sdk"): + try: + result = subprocess.run( + ["rocm-sdk", "version"], + capture_output=True, + text=True, + timeout=5, + ) + if 
result.returncode == 0: + return result.stdout.strip() + except Exception: + pass + + # Try manifest file + if self.therock_details.get('manifest'): + commit = self.therock_details['manifest'].get('the_rock_commit', 'unknown') + return f"TheRock (commit: {commit[:8]})" + + return "TheRock (version unknown)" + + def _get_traditional_version(self) -> str: + """Get traditional ROCm version from version file or header.""" + # Try version file + version_file = Path("/opt/rocm/.info/version") + if version_file.exists(): + try: + return version_file.read_text().strip() + except Exception: + pass + + # Try version header + version_header = Path("/opt/rocm/include/rocm-core/rocm_version.h") + if version_header.exists(): + try: + content = version_header.read_text() + major = minor = patch = 0 + for line in content.split('\n'): + if "#define ROCM_VERSION_MAJOR" in line: + major = line.split()[-1] + if "#define ROCM_VERSION_MINOR" in line: + minor = line.split()[-1] + if "#define ROCM_VERSION_PATCH" in line: + patch = line.split()[-1] + return f"rocm-{major}.{minor}.{patch}" + except Exception: + pass + + return "unknown" + + +## Utility functions def parse_env_tags_json(json_file): env_tags = None with open(json_file) as f: @@ -30,258 +299,357 @@ def parse_env_tags_json(json_file): configs = env_tags["env_tags"] return configs -## Hardware information. + +## Hardware information def print_hardware_information(): cmd = None - if os.path.isfile("/usr/bin/lshw"): - cmd = "/usr/bin/lshw" - elif os.path.isfile("/usr/sbin/lshw"): - cmd = "/usr/sbin/lshw" - elif os.path.isfile("/sbin/lshw"): - cmd = "/sbin/lshw" - else: - print ("WARNING: Install lshw to get lshw hardware information") - print (" Ex: sudo apt install lshw") - + possible_paths = ["/usr/bin/lshw", "/usr/sbin/lshw", "/sbin/lshw"] + for path in possible_paths: + if os.path.isfile(path): + cmd = path + break + + if cmd is None: + print("WARNING: Install lshw to get hardware information") + print(" (TheRock images may not include this by default)") + if cmd is not None: cmd_info = CommandInfo("HardwareInformation", [cmd]) return cmd_info else: return None + ## CPU Hardware Information def print_cpu_hardware_information(): - cmd ="/usr/bin/lscpu" + cmd = "/usr/bin/lscpu" + if not os.path.exists(cmd): + cmd = "lscpu" # Try PATH cmd_info = CommandInfo("CPU Information", [cmd]) return cmd_info -## GPU Hardware information. -def print_gpu_hardware_information(gpu_device_type): + +## GPU Hardware information +def print_gpu_hardware_information(gpu_device_type, path_resolver): if gpu_device_type == "AMD": - cmd = "/opt/rocm/bin/rocminfo" + # Use dynamic path from resolver + cmd = path_resolver.paths.get('rocminfo') or "rocminfo" elif gpu_device_type == "NVIDIA": cmd = "nvidia-smi -L" else: - print ("WARNING: Unknown GPU device detected") + print("WARNING: Unknown GPU device detected") + cmd = "echo 'Unknown GPU device'" + cmd_info = CommandInfo("GPU Information", [cmd]) return cmd_info -## BIOS Information. + +## BIOS Information def print_bios_settings(): cmd = "/usr/sbin/dmidecode" + if not os.path.exists(cmd): + cmd = "dmidecode" # Try PATH cmd_info = CommandInfo("dmidecode Information", [cmd]) return cmd_info -## OS information. + +## OS information def print_os_information(): - cmd1 = "/bin/uname -a" - cmd2 = "/bin/cat /etc/os-release" + cmd1 = "uname -a" + cmd2 = "cat /etc/os-release" cmd_info = CommandInfo("OS Distribution", [cmd1, cmd2]) return cmd_info -## Memory Information. 
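# A minimal sketch of how the RocmPathResolver defined earlier in this file
# is meant to be driven; the attributes and methods are the ones defined on
# the class above, and the printed values are illustrative only:
#
#     resolver = RocmPathResolver(verbose=True)
#     print(resolver.installation_type)   # 'therock', 'traditional', or 'unknown'
#     print(resolver.rocm_root)           # e.g. '/opt/rocm'
#     print(resolver.get_version())       # e.g. 'TheRock (commit: abc12345)'
#
# The helper functions below build CommandInfo entries from these resolved paths.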
+ +## Memory Information def print_memory_information(): cmd = "/usr/bin/lsmem" + if not os.path.exists(cmd): + cmd = "lsmem" # Try PATH cmd_info = CommandInfo("Memory Information", [cmd]) return cmd_info + ## ROCm version data -def print_rocm_version_information(): - cmd1 = "/bin/ls -v -d /opt/rocm*" +def print_rocm_version_information(path_resolver): global rocm_version - rocm_major = 0 - rocm_minor = 0 - rocm_patch = 0 - if (not os.environ.get('ROCM_VERSION')): - rocm_version_header = "/opt/rocm/include/rocm-core/rocm_version.h" - if os.path.isfile(rocm_version_header): - fs = open("/opt/rocm/include/rocm-core/rocm_version.h", 'r') - lines = fs.readlines() - fs.close() - for line in lines: - if "#define ROCM_VERSION_MAJOR" in line: - rocm_major = line.split("#define ROCM_VERSION_MAJOR")[1].strip() - if "#define ROCM_VERSION_MINOR" in line: - rocm_minor = line.split("#define ROCM_VERSION_MINOR")[1].strip() - if "#define ROCM_VERSION_PATCH" in line: - rocm_patch = line.split("#define ROCM_VERSION_PATCH")[1].strip() - rocm_version = "rocm-" + str(rocm_major) + "." + str(rocm_minor) + "." + str(rocm_patch) - - cmd2 = "echo '==== Using " + rocm_version + " to collect ROCm information.==== '" - cmd_info = CommandInfo("Available ROCm versions", [cmd1, cmd2]) + + # List all ROCm-like directories + cmd1 = "ls -v -d /opt/rocm* 2>/dev/null || echo 'No /opt/rocm* directories found'" + + # Get version from resolver + rocm_version = path_resolver.get_version() + + cmd2 = f"echo '==== Installation Type: {path_resolver.installation_type} ===='" + rocm_root_display = path_resolver.rocm_root or "Not found" + cmd3 = f"echo '==== ROCm Root: {rocm_root_display} ===='" + cmd4 = f"echo '==== Using {rocm_version} to collect ROCm information ===='" + + cmds = [cmd1, cmd2, cmd3, cmd4] + + # Add TheRock-specific info + if path_resolver.installation_type == 'therock': + manifest_file = path_resolver.paths.get('manifest_file') + if manifest_file: + cmd5 = f"echo '==== TheRock Manifest: {manifest_file} ===='" + cmd6 = f"cat {manifest_file}" + cmds.extend([cmd5, cmd6]) + + cmd_info = CommandInfo("Available ROCm versions", cmds) return cmd_info -def print_rocm_repo_setup(): - #cmd = "/bin/grep -i -E 'rocm|amdgpu' /etc/apt/sources.list.d/* /etc/zypp/repos.d/* /etc/yum.repos.d/*" - cmd = None - if os.path.exists("/etc/zypp/repos.d"): - cmd = "/bin/grep -i -E 'rocm|amdgpu' /etc/zypp/repos.d/*" - elif os.path.exists("/etc/apt/sources.list.d"): - cmd = "/bin/grep -i -E 'rocm|amdgpu' /etc/apt/sources.list.d/*" - elif os.path.exists("/etc/yum.repos.d/"): - cmd = "/bin/grep -i -E 'rocm|amdgpu' /etc/yum.repos.d/*" - - cmd_info = CommandInfo("ROCm Repo Setup", [cmd]) + +def print_rocm_repo_setup(path_resolver): + """Print repo setup - only for traditional ROCm installations.""" + cmds = [] + + if path_resolver.installation_type == 'therock': + cmds.append("echo 'TheRock does not use traditional package repositories'") + cmds.append("echo 'TheRock is installed via Python pip packages or tarballs'") + + # Try to get pip package info + if shutil.which("rocm-sdk"): + cmds.append("echo 'Checking rocm-sdk Python package...'") + cmds.append("rocm-sdk version || true") + cmds.append("rocm-sdk path --root || true") + + # Check if we're in a venv + venv_path = os.environ.get('VIRTUAL_ENV') + if venv_path: + cmds.append(f"echo 'Virtual environment: {venv_path}'") + cmds.append("pip list | grep -i rocm || true") + else: + # Traditional ROCm repo check + cmd = None + if os.path.exists("/etc/zypp/repos.d"): + cmd = "/bin/grep -i -E 
'rocm|amdgpu' /etc/zypp/repos.d/* || echo 'No ROCm repos found'" + elif os.path.exists("/etc/apt/sources.list.d"): + cmd = "/bin/grep -i -E 'rocm|amdgpu' /etc/apt/sources.list.d/* || echo 'No ROCm repos found'" + elif os.path.exists("/etc/yum.repos.d/"): + cmd = "/bin/grep -i -E 'rocm|amdgpu' /etc/yum.repos.d/* || echo 'No ROCm repos found'" + + if cmd: + cmds.append(cmd) + + cmd_info = CommandInfo("ROCm Repo Setup", cmds) return cmd_info -def print_rocm_packages_installed(): - d = {} - with open("/etc/os-release") as fs: - for line in fs: - if "=" in line: - k,v = line.rstrip().split("=") - d[k] = v.strip('"') - pkgtype = d['ID_LIKE'] - cmd1 = "echo ' Pkg type: '" + pkgtype - cmd2 = None - if pkgtype == "debian": - cmd2 = "/usr/bin/dpkg -l | /bin/grep -i -E 'ocl-icd|kfdtest|llvm-amd|miopen|half|^ii hip|hcc|hsa|rocm|atmi|^ii comgr|composa|amd-smi|aomp|amdgpu|rock|mivision|migraph|rocprofiler|roctracer|rocbl|hipify|rocsol|rocthr|rocff|rocalu|rocprim|rocrand|rccl|rocspar|rdc|rocwmma|rpp|openmp|amdfwflash|ocl |opencl' | /usr/bin/sort" + +def print_rocm_packages_installed(path_resolver): + """Print installed ROCm packages - adapted for TheRock.""" + cmds = [] + + if path_resolver.installation_type == 'therock': + # Add Pkg type line for CSV parser compatibility + cmds.append("echo ' Pkg type: therock'") + cmds.append("echo 'Installation Type: TheRock (no system packages)'") + cmds.append("echo ''") + + # Check Python packages + cmds.append("echo '=== Python ROCm Packages ==='") + cmds.append("pip list 2>/dev/null | grep -i -E 'rocm|hip|torch' || echo 'No Python ROCm packages found'") + + # List files in TheRock installation + if path_resolver.rocm_root: + cmds.append("echo ''") + cmds.append(f"echo '=== TheRock Installation Contents ({path_resolver.rocm_root}) ==='") + cmds.append(f"ls -lh {path_resolver.rocm_root}/bin/ 2>/dev/null || true") + cmds.append(f"ls -lh {path_resolver.rocm_root}/lib/ 2>/dev/null | head -20 || true") + + # Check for dist_info + if path_resolver.rocm_root: + dist_info = Path(path_resolver.rocm_root) / "share" / "therock" / "dist_info.json" + if dist_info.exists(): + cmds.append("echo ''") + cmds.append("echo '=== TheRock Distribution Info ==='") + cmds.append(f"cat {dist_info}") else: - cmd2 = "/usr/bin/rpm -qa | /bin/grep -i -E 'ocl-icd|kfdtest|llvm-amd|miopen|half|hip|hcc|hsa|rocm|atmi|comgr|composa|amd-smi|aomp|amdgpu|rock|mivision|migraph|rocprofiler|roctracer|rocblas|hipify|rocsol|rocthr|rocff|rocalu|rocprim|rocrand|rccl|rocspar|rdc|rocwmma|rpp|openmp|amdfwflash|ocl|opencl' | /usr/bin/sort" - cmd_info = CommandInfo("ROCm Packages Installed", [cmd1, cmd2]) + # Traditional package listing + d = {} + try: + with open("/etc/os-release") as fs: + for line in fs: + if "=" in line: + k, v = line.rstrip().split("=", 1) + d[k] = v.strip('"') + except Exception: + d = {'ID_LIKE': 'unknown'} + + pkgtype = d.get('ID_LIKE', d.get('ID', 'unknown')) + # Note: Format must match csv_parser.py expectations (space before "Pkg") + cmd1 = "echo ' Pkg type: '" + pkgtype + cmds.append(cmd1) + + if 'debian' in pkgtype.lower(): + cmd = "/usr/bin/dpkg -l 2>/dev/null | /bin/grep -i -E 'ocl-icd|kfdtest|llvm-amd|miopen|half|^ii hip|hcc|hsa|rocm|atmi|^ii comgr|composa|amd-smi|aomp|amdgpu|rock|mivision|migraph|rocprofiler|roctracer|rocbl|hipify|rocsol|rocthr|rocff|rocalu|rocprim|rocrand|rccl|rocspar|rdc|rocwmma|rpp|openmp|amdfwflash|ocl |opencl' | /usr/bin/sort || echo 'No packages found'" + else: + cmd = "/usr/bin/rpm -qa 2>/dev/null | /bin/grep -i -E 
'ocl-icd|kfdtest|llvm-amd|miopen|half|hip|hcc|hsa|rocm|atmi|comgr|composa|amd-smi|aomp|amdgpu|rock|mivision|migraph|rocprofiler|roctracer|rocblas|hipify|rocsol|rocthr|rocff|rocalu|rocprim|rocrand|rccl|rocspar|rdc|rocwmma|rpp|openmp|amdfwflash|ocl|opencl' | /usr/bin/sort || echo 'No packages found'" + + cmds.append(cmd) + + cmd_info = CommandInfo("ROCm Packages Installed", cmds) return cmd_info + def print_rocm_environment_variables(): - cmd = "env | /bin/grep -i -E 'rocm|hsa|hip|mpi|openmp|ucx|miopen'" + cmd = "env | /bin/grep -i -E 'rocm|hsa|hip|mpi|openmp|ucx|miopen|virtual_env|conda' || echo 'No relevant env vars found'" cmd_info = CommandInfo("ROCm environment variables", [cmd]) return cmd_info -def print_rocm_smi_details(smi_config): + +def print_rocm_smi_details(smi_config, path_resolver): cmd_info = None - cmd = "/opt/rocm/bin/rocm-smi" - if (smi_config == "rocm_smi"): - cmd_info = CommandInfo("ROCm SMI", [cmd]) - elif (smi_config == "ifwi_version"): - ifwi_cmd = cmd + " -v" + + # Use dynamic path + rocm_smi_cmd = path_resolver.paths.get('rocm_smi') or "rocm-smi" + + if smi_config == "rocm_smi": + cmd_info = CommandInfo("ROCm SMI", [f"{rocm_smi_cmd} || echo 'rocm-smi not available'"]) + elif smi_config == "ifwi_version": + ifwi_cmd = f"{rocm_smi_cmd} -v || echo 'IFWI version not available'" cmd_info = CommandInfo("IFWI version", [ifwi_cmd]) - elif (smi_config == "rocm_smi_showhw"): - showhw_cmd = cmd + " --showhw" + elif smi_config == "rocm_smi_showhw": + showhw_cmd = f"{rocm_smi_cmd} --showhw || echo 'rocm-smi --showhw not available'" cmd_info = CommandInfo("ROCm SMI showhw", [showhw_cmd]) - elif (smi_config == "rocm_smi_pcie"): - pcie_cmd = cmd + " -c | /bin/grep -i -E 'pcie'" + elif smi_config == "rocm_smi_pcie": + pcie_cmd = f"{rocm_smi_cmd} -c 2>/dev/null | /bin/grep -i -E 'pcie' || echo 'PCIe info not available'" cmd_info = CommandInfo("ROCm SMI pcieclk clock", [pcie_cmd]) - elif (smi_config == "rocm_smi_pids"): - pids_cmd1 = "ls /sys/class/kfd/kfd/proc/" - pids_cmd2 = cmd + " --showpids" + elif smi_config == "rocm_smi_pids": + pids_cmd1 = "ls /sys/class/kfd/kfd/proc/ 2>/dev/null || echo 'KFD proc not available'" + pids_cmd2 = f"{rocm_smi_cmd} --showpids || echo 'showpids not available'" cmd_info = CommandInfo("KFD PIDs sysfs kfd proc", [pids_cmd1, pids_cmd2]) - elif (smi_config == "rocm_smi_topology"): - showtops_cmd = cmd + " --showtopo" + elif smi_config == "rocm_smi_topology": + showtops_cmd = f"{rocm_smi_cmd} --showtopo || echo 'showtopo not available'" cmd_info = CommandInfo("showtop topology", [showtops_cmd]) - elif (smi_config == "rocm_smi_showserial"): - serial_cmd = cmd + " --showserial" + elif smi_config == "rocm_smi_showserial": + serial_cmd = f"{rocm_smi_cmd} --showserial || echo 'showserial not available'" cmd_info = CommandInfo("showserial", [serial_cmd]) - elif (smi_config == "rocm_smi_showperflevel"): - perf_cmd = cmd + " --showperflevel" + elif smi_config == "rocm_smi_showperflevel": + perf_cmd = f"{rocm_smi_cmd} --showperflevel || echo 'showperflevel not available'" cmd_info = CommandInfo("showperflevel", [perf_cmd]) - elif (smi_config == "rocm_smi_showrasinfo"): - showrasinfo_cmd = cmd + " --showrasinfo all" + elif smi_config == "rocm_smi_showrasinfo": + showrasinfo_cmd = f"{rocm_smi_cmd} --showrasinfo all || echo 'showrasinfo not available'" cmd_info = CommandInfo("ROCm SMI showrasinfo all", [showrasinfo_cmd]) - elif (smi_config == "rocm_smi_showxgmierr"): - showxgmierr_cmd = cmd + " --showxgmierr" + elif smi_config == "rocm_smi_showxgmierr": + 
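            # Every branch in this chain follows the same pattern: invoke the
            # rocm-smi subcommand via the resolved path, and fall back to an
            # informative echo when the flag or the binary is unavailable.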
showxgmierr_cmd = f"{rocm_smi_cmd} --showxgmierr || echo 'showxgmierr not available'" cmd_info = CommandInfo("ROCm SMI showxgmierr", [showxgmierr_cmd]) - elif (smi_config == "rocm_smi_clocks"): - clock_cmd = cmd + " -cga" + elif smi_config == "rocm_smi_clocks": + clock_cmd = f"{rocm_smi_cmd} -cga || echo 'clock info not available'" cmd_info = CommandInfo("ROCm SMI clocks", [clock_cmd]) - elif (smi_config == "rocm_smi_showcompute_partition"): - compute_cmd = cmd + " --showcomputepartition" + elif smi_config == "rocm_smi_showcompute_partition": + compute_cmd = f"{rocm_smi_cmd} --showcomputepartition || echo 'showcomputepartition not available'" cmd_info = CommandInfo("ROCm Show computepartition", [compute_cmd]) - elif (smi_config == "rocm_smi_nodesbw"): - nodesbw_cmd = cmd + " --shownodesbw" + elif smi_config == "rocm_smi_nodesbw": + nodesbw_cmd = f"{rocm_smi_cmd} --shownodesbw || echo 'shownodesbw not available'" cmd_info = CommandInfo("ROCm Show Nodebsion", [nodesbw_cmd]) - elif (smi_config == "rocm_smi_gpudeviceid"): - gpudeviceid_cmd = cmd + " -i -d 0" + elif smi_config == "rocm_smi_gpudeviceid": + gpudeviceid_cmd = f"{rocm_smi_cmd} -i -d 0 || echo 'GPU device ID not available'" cmd_info = CommandInfo("ROCM Show GPU Device ID", [gpudeviceid_cmd]) else: cmd_info = None + return cmd_info -def print_rocm_info_details(): - cmd = "/opt/rocm/bin/rocminfo" + +def print_rocm_info_details(path_resolver): + rocminfo_cmd = path_resolver.paths.get('rocminfo') or "rocminfo" + cmd = f"{rocminfo_cmd} || echo 'rocminfo not available'" cmd_info = CommandInfo("rocminfo", [cmd]) return cmd_info + ## dmesg boot logs - GPU/ATOM/DRM/BIOS def print_dmesg_logs(ignore_prev_boot_logs=True): cmds = [] if os.path.exists("/var/log/journal"): cmds.append("echo 'Persistent logging enabled.'") else: - cmd1_str = "WARNING: Persistent logging possibly disabled.\n" - cmd1_str = cmd1_str + "WARNING: Please run: \n" - cmd1_str = cmd1_str + " sudo mkdir -p /var/log/journal\n" - cmd1_str = cmd1_str + " sudo systemctl restart systemd-journald.service \n" - cmd1_str = cmd1_str + "WARNING: to enable persistent boot logs for collection and analysis.\n" - cmd1_str = "echo " + cmd1_str + cmd1_str = "WARNING: Persistent logging possibly disabled.\\n" + cmd1_str = cmd1_str + "WARNING: Please run: \\n" + cmd1_str = cmd1_str + " sudo mkdir -p /var/log/journal\\n" + cmd1_str = cmd1_str + " sudo systemctl restart systemd-journald.service \\n" + cmd1_str = cmd1_str + "WARNING: to enable persistent boot logs for collection and analysis.\\n" + cmd1_str = "echo '" + cmd1_str + "'" cmds.append(cmd1_str) cmds.append("echo 'Section: dmesg boot logs'") - cmds.append("/bin/dmesg -T | /bin/grep -i -E ' Linux v| Command line|power|pnp|pci|gpu|drm|error|xgmi|panic|watchdog|bug|nmi|dazed|too|mce|edac|oop|fail|fault|atom|bios|kfd|vfio|iommu|ras_mask|ECC|smpboot.*CPU|pcieport.*AER|amdfwflash'") + cmds.append("/bin/dmesg -T 2>/dev/null | /bin/grep -i -E ' Linux v| Command line|power|pnp|pci|gpu|drm|error|xgmi|panic|watchdog|bug|nmi|dazed|too|mce|edac|oop|fail|fault|atom|bios|kfd|vfio|iommu|ras_mask|ECC|smpboot.*CPU|pcieport.*AER|amdfwflash' || echo 'dmesg not available'") + if not ignore_prev_boot_logs: - cmd_exec = None - if os.path.exists("/bin/journalctl"): - cmd_exec = "/bin/journalctl" - elif os.path.exists("/usr/bin/journalctl"): - cmd_exec = "/usr/bin/journalctl" - else: - cmd_exec = None - + cmd_exec = shutil.which("journalctl") + if cmd_exec is not None: cmds.append("echo 'Section: Current boot logs'") boot_exec = "/bin/grep -i -E ' Linux v| 
Command line|power|pnp|pci|gpu|drm|error|xgmi|panic|watchdog|bug|nmi|dazed|too|mce|edac|oop|fail|fault|atom|bios|kfd|vfio|iommu|ras_mask|ECC|smpboot.*CPU|pcieport.*AER|amdfwflash'" - cmds.append(cmd_exec + " -b | " + boot_exec) + cmds.append(f"{cmd_exec} -b 2>/dev/null | {boot_exec} || echo 'journalctl not available'") cmds.append("echo 'Section: Previous boot logs'") - cmds.append(cmd_exec + " -b 1 | " + boot_exec) + cmds.append(f"{cmd_exec} -b 1 2>/dev/null | {boot_exec} || echo 'Previous boot logs not available'") cmds.append("echo 'Section: Second boot logs'") - cmds.append(cmd_exec + " -b 2 | " + boot_exec) + cmds.append(f"{cmd_exec} -b 2 2>/dev/null | {boot_exec} || echo 'Second boot logs not available'") cmd_info = CommandInfo("dmesg GPU/DRM/ATOM/BIOS", cmds) return cmd_info + ## print amdgpu modinfo def print_amdgpu_modinfo(): - cmd = "/sbin/modinfo amdgpu" + cmd = "/sbin/modinfo amdgpu 2>/dev/null || modinfo amdgpu 2>/dev/null || echo 'amdgpu module not loaded/available'" cmd_info = CommandInfo("amdgpu modinfo", [cmd]) return cmd_info + ## print pip list def print_pip_list_details(): - cmd = "pip3 list --disable-pip-version-check" - cmd_info = CommandInfo("Pip3 package list ", [cmd]) + cmd = "pip3 list --disable-pip-version-check 2>/dev/null || pip list --disable-pip-version-check 2>/dev/null || echo 'pip not available'" + cmd_info = CommandInfo("Pip3 package list", [cmd]) return cmd_info + def print_check_numa_balancing(): - cmd = "cat /proc/sys/kernel/numa_balancing" + cmd = "cat /proc/sys/kernel/numa_balancing 2>/dev/null || echo 'NUMA balancing info not available'" cmd_info = CommandInfo("Numa balancing Info", [cmd]) return cmd_info -## print cuda version information. + +## print cuda version information def print_cuda_version_information(): - cmd = "nvcc --version" + cmd = "nvcc --version 2>/dev/null || echo 'CUDA not available'" cmd_info = CommandInfo("CUDA information", [cmd]) return cmd_info + def print_cuda_env_variables(): - cmd = "env | /bin/grep -i -E 'cuda|nvidia|pytorch|mpi|openmp|ucx|cu'" + cmd = "env | /bin/grep -i -E 'cuda|nvidia|pytorch|mpi|openmp|ucx|cu' || echo 'No CUDA env vars found'" cmd_info = CommandInfo("CUDA Env Variables", [cmd]) return cmd_info + def print_cuda_packages_installed(): - d = {} - with open("/etc/os-release") as fs: - for line in fs: - if "=" in line: - k,v = line.rstrip().split("=") - d[k] = v.strip('"') - pkgtype = d['ID_LIKE'] - cmd1 = "echo ' Pkg type: '" + pkgtype - cmd2 = None - if pkgtype == "debian": - cmd2 = "/usr/bin/dpkg -l | /bin/grep -i -E 'cuda|cu|atlas|hdf5|nccl|nvinfer|nvjpeg|onnx'" - else: - cmd2 = "/usr/bin/rpm -qa | /bin/grep -i -E 'cuda|cu|atlas|hdf5|nccl|nvinfer|nvjpeg|onnx'" - cmd_info = CommandInfo("ROCm Packages Installed", [cmd1, cmd2]) + d = {} + try: + with open("/etc/os-release") as fs: + for line in fs: + if "=" in line: + k, v = line.rstrip().split("=", 1) + d[k] = v.strip('"') + + pkgtype = d.get('ID_LIKE', d.get('ID', 'unknown')) + # Note: Format must match csv_parser.py expectations (space before "Pkg") + cmd1 = "echo ' Pkg type: '" + pkgtype + cmd2 = None + + if 'debian' in pkgtype.lower(): + cmd2 = "/usr/bin/dpkg -l 2>/dev/null | /bin/grep -i -E 'cuda|cu|atlas|hdf5|nccl|nvinfer|nvjpeg|onnx' || echo 'No CUDA packages found'" + else: + cmd2 = "/usr/bin/rpm -qa 2>/dev/null | /bin/grep -i -E 'cuda|cu|atlas|hdf5|nccl|nvinfer|nvjpeg|onnx' || echo 'No CUDA packages found'" + + cmd_info = CommandInfo("CUDA Packages Installed", [cmd1, cmd2]) + except Exception as e: + cmd_info = CommandInfo("CUDA Packages 
Installed", [f"echo 'Error checking packages: {e}'"]) + return cmd_info + def dump_system_env_information(configs, output_name): out_dir = "." + output_name if not os.path.exists(out_dir): @@ -307,72 +675,98 @@ def dump_system_env_information(configs, output_name): cmds = cmd_info.cmds for cmd in cmds: - if config in ["rocm_env_variables", "dmsg_gpu_drm_atom_logs", "rocm_smi_pcie"]: - out = console.sh(cmd, canFail=True) - else: - out = console.sh(cmd) + # Changed to canFail=True for robustness with TheRock + out = console.sh(cmd, canFail=True) fs.write(out) fs.write("\n") fs.close() -def determine_gpu_device_type(): + +def determine_gpu_device_type(path_resolver): gpu_device_type = "" - rocm_smi_out = console.sh("/opt/rocm/bin/rocm-smi || true") - nv_smi_out = console.sh("nvidia-smi -L || true") - if not "not found" in rocm_smi_out: + + # Try rocm-smi + rocm_smi_cmd = path_resolver.paths.get('rocm_smi') or "rocm-smi" + rocm_smi_out = console.sh(f"{rocm_smi_cmd} 2>/dev/null || true", canFail=True) + + # Try nvidia-smi + nv_smi_out = console.sh("nvidia-smi -L 2>/dev/null || true", canFail=True) + + if rocm_smi_out and "not found" not in rocm_smi_out and len(rocm_smi_out) > 10: gpu_device_type = "AMD" - if not "not found" in nv_smi_out: + elif nv_smi_out and "not found" not in nv_smi_out and len(nv_smi_out) > 10: gpu_device_type = "NVIDIA" + return gpu_device_type -def generate_env_info(gpu_device_type): + +def generate_env_info(gpu_device_type, path_resolver): global env_map + + print(f"Installation Type: {path_resolver.installation_type}") + print(f"ROCm Root: {path_resolver.rocm_root or 'Not found'}") + print(f"GPU Device Type: {gpu_device_type or 'Unknown'}") + env_map["hardware_information"] = print_hardware_information() env_map["cpu_information"] = print_cpu_hardware_information() - env_map["gpu_information"] = print_gpu_hardware_information(gpu_device_type) + env_map["gpu_information"] = print_gpu_hardware_information(gpu_device_type, path_resolver) env_map["bios_settings"] = print_bios_settings() env_map["os_information"] = print_os_information() env_map["dmsg_gpu_drm_atom_logs"] = print_dmesg_logs(ignore_prev_boot_logs=True) env_map["amdgpu_modinfo"] = print_amdgpu_modinfo() env_map["memory_information"] = print_memory_information() - print ("GPU Device type detected is: {}".format(gpu_device_type)) + if gpu_device_type == "AMD": - env_map["rocm_information"] = print_rocm_version_information() - env_map["rocm_repo_setup"] = print_rocm_repo_setup() - env_map["rocm_packages_installed"] = print_rocm_packages_installed() + env_map["rocm_information"] = print_rocm_version_information(path_resolver) + env_map["rocm_repo_setup"] = print_rocm_repo_setup(path_resolver) + env_map["rocm_packages_installed"] = print_rocm_packages_installed(path_resolver) env_map["rocm_env_variables"] = print_rocm_environment_variables() - env_map["rocm_smi"] = print_rocm_smi_details("rocm_smi") - env_map["ifwi_version"] = print_rocm_smi_details("ifwi_version") - env_map["rocm_smi_showhw"] = print_rocm_smi_details("rocm_smi_showhw") - env_map["rocm_smi_pcie"] = print_rocm_smi_details("rocm_smi_pcie") - env_map["rocm_smi_pids"] = print_rocm_smi_details("rocm_smi_pids") - env_map["rocm_smi_topology"] = print_rocm_smi_details("rocm_smi_topology") - env_map["rocm_smi_showserial"] = print_rocm_smi_details("rocm_smi_showserial") - env_map["rocm_smi_showperflevel"] = print_rocm_smi_details("rocm_smi_showperflevel") - env_map["rocm_smi_showrasinfo"] = print_rocm_smi_details("rocm_smi_showrasinfo") - 
env_map["rocm_smi_showxgmierr"] = print_rocm_smi_details("rocm_smi_showxgmierr") - env_map["rocm_smi_clocks"] = print_rocm_smi_details("rocm_smi_clocks") - env_map["rocm_smi_showcompute_partition"] = print_rocm_smi_details("rocm_smi_showcompute_partition") - env_map["rocm_smi_nodesbwi"] = print_rocm_smi_details("rocm_smi_nodesbwi") - env_map["rocm_smi_gpudeviceid"] = print_rocm_smi_details("rocm_smi_gpudeviceid") - env_map["rocm_info"] = print_rocm_info_details() + env_map["rocm_smi"] = print_rocm_smi_details("rocm_smi", path_resolver) + env_map["ifwi_version"] = print_rocm_smi_details("ifwi_version", path_resolver) + env_map["rocm_smi_showhw"] = print_rocm_smi_details("rocm_smi_showhw", path_resolver) + env_map["rocm_smi_pcie"] = print_rocm_smi_details("rocm_smi_pcie", path_resolver) + env_map["rocm_smi_pids"] = print_rocm_smi_details("rocm_smi_pids", path_resolver) + env_map["rocm_smi_topology"] = print_rocm_smi_details("rocm_smi_topology", path_resolver) + env_map["rocm_smi_showserial"] = print_rocm_smi_details("rocm_smi_showserial", path_resolver) + env_map["rocm_smi_showperflevel"] = print_rocm_smi_details("rocm_smi_showperflevel", path_resolver) + env_map["rocm_smi_showrasinfo"] = print_rocm_smi_details("rocm_smi_showrasinfo", path_resolver) + env_map["rocm_smi_showxgmierr"] = print_rocm_smi_details("rocm_smi_showxgmierr", path_resolver) + env_map["rocm_smi_clocks"] = print_rocm_smi_details("rocm_smi_clocks", path_resolver) + env_map["rocm_smi_showcompute_partition"] = print_rocm_smi_details("rocm_smi_showcompute_partition", path_resolver) + env_map["rocm_smi_nodesbwi"] = print_rocm_smi_details("rocm_smi_nodesbw", path_resolver) + env_map["rocm_smi_gpudeviceid"] = print_rocm_smi_details("rocm_smi_gpudeviceid", path_resolver) + env_map["rocm_info"] = print_rocm_info_details(path_resolver) elif gpu_device_type == "NVIDIA": env_map["cuda_information"] = print_cuda_version_information() env_map["cuda_env_variables"] = print_cuda_env_variables() env_map["cuda_packages_installed"] = print_cuda_packages_installed() + env_map["pip_list"] = print_pip_list_details() if os.path.exists("/proc/sys/kernel/numa_balancing"): env_map["numa_balancing"] = print_check_numa_balancing() + def main(): - gpu_device_type = determine_gpu_device_type() - generate_env_info(gpu_device_type) + # Initialize path resolver + path_resolver = RocmPathResolver(verbose=args.verbose) + + # Detect GPU type with resolver + gpu_device_type = determine_gpu_device_type(path_resolver) + + # Generate environment info + generate_env_info(gpu_device_type, path_resolver) + + # Get configs configs = env_map.keys() if args.lite: configs = parse_env_tags_json("env_tags.json") + + # Dump system environment information dump_system_env_information(configs, args.output_name) - print ("OK: finished dumping the system env details in .{} folder".format(args.output_name)) + print(f"OK: finished dumping the system env details in .{args.output_name} folder") + + # CSV output if args.dump_csv or args.print_csv: csv_file = args.output_name + ".csv" out_dir = "." 
+ args.output_name @@ -381,12 +775,22 @@ def main(): if args.print_csv: csv_parser.print_csv_output() + if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("--lite", action="store_true", help="System environment data lite version taken from env_tags.json") - parser.add_argument("--dump-csv", action="store_true", help="Dump system config info in CSV file") - parser.add_argument("--print-csv", action="store_true", help="Print system config info data") - parser.add_argument("--output-name", required=False, default="sys_config_info", help="Output file or directory name") + parser = argparse.ArgumentParser( + description="System environment data collection tool (TheRock + Traditional ROCm compatible)" + ) + parser.add_argument("--lite", action="store_true", + help="System environment data lite version taken from env_tags.json") + parser.add_argument("--dump-csv", action="store_true", + help="Dump system config info in CSV file") + parser.add_argument("--print-csv", action="store_true", + help="Print system config info data") + parser.add_argument("--output-name", required=False, default="sys_config_info", + help="Output file or directory name") + parser.add_argument("-v", "--verbose", action="store_true", + help="Enable verbose detection output") + args = parser.parse_args() console = Console(shellVerbose=False, live_output=False) diff --git a/tests/fixtures/dummy/docker/therock.ubuntu.amd.Dockerfile b/tests/fixtures/dummy/docker/therock.ubuntu.amd.Dockerfile new file mode 100644 index 00000000..085cc93a --- /dev/null +++ b/tests/fixtures/dummy/docker/therock.ubuntu.amd.Dockerfile @@ -0,0 +1,100 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +ARG BASE_DOCKER=ubuntu:24.04 +FROM ${BASE_DOCKER} + +# Set environment variables +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + gfortran \ + git \ + ninja-build \ + cmake \ + g++ \ + pkg-config \ + xxd \ + patchelf \ + automake \ + libtool \ + python3-venv \ + python3-dev \ + python3-pip \ + libegl1-mesa-dev \ + wget \ + curl \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +# Create working directory +WORKDIR /workspace + +# Clone TheRock repository +ARG THEROCK_BRANCH=main +RUN git clone https://github.com/ROCm/TheRock.git /workspace/TheRock && \ + cd /workspace/TheRock && \ + git checkout ${THEROCK_BRANCH} + +WORKDIR /workspace/TheRock + +# Setup Python virtual environment and install dependencies +RUN python3 -m venv .venv && \ + . .venv/bin/activate && \ + pip install --upgrade pip && \ + pip install -r requirements.txt + +# Download submodules and apply patches +# Note: dvc is optional but recommended for faster builds +RUN apt-get update && apt-get install -y snapd && \ + rm -rf /var/lib/apt/lists/* || true + +# Fetch sources (includes submodules and patches) +RUN . .venv/bin/activate && \ + python3 ./build_tools/fetch_sources.py + +# Configure build with CMake +# Default to gfx942 (MI300 series), can be overridden with build arg +ARG MAD_SYSTEM_GPU_ARCHITECTURE=gfx942 + +# Only enable core runtime and HIP runtime for minimal build +# This is sufficient for checking amd-smi and ROCm version +# Builds much faster than full component build +RUN . .venv/bin/activate && \ + cmake -B build -GNinja . 
\ + -DTHEROCK_AMDGPU_TARGETS=${MAD_SYSTEM_GPU_ARCHITECTURE} \ + -DTHEROCK_ENABLE_ALL=OFF \ + -DTHEROCK_ENABLE_CORE_RUNTIME=ON \ + -DTHEROCK_ENABLE_HIP_RUNTIME=ON \ + -DBUILD_TESTING=ON + +# Build TheRock components +# This will take a significant amount of time depending on enabled components +RUN . .venv/bin/activate && \ + cmake --build build + +# Install built components +RUN . .venv/bin/activate && \ + cmake --install build --prefix /opt/rocm + +# Set up runtime environment +ENV PATH=/opt/rocm/bin:/workspace/TheRock/.venv/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH +ENV ROCM_PATH=/opt/rocm +ENV HIP_PATH=/opt/rocm + +# Create entrypoint script +RUN echo '#!/bin/bash\n\ +source /workspace/TheRock/.venv/bin/activate\n\ +exec "$@"' > /entrypoint.sh && \ + chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] +CMD ["/bin/bash"] + +# Labels +LABEL maintainer="AMD ROCm" +LABEL description="TheRock - The HIP Environment and ROCm Kit (Minimal: Core Runtime + HIP Runtime)" +LABEL version="nightly" +LABEL gpu_architecture="${MAD_SYSTEM_GPU_ARCHITECTURE}" +LABEL components="core_runtime,hip_runtime" diff --git a/tests/fixtures/dummy/models.json b/tests/fixtures/dummy/models.json index 1ff21c23..3dfd5de1 100644 --- a/tests/fixtures/dummy/models.json +++ b/tests/fixtures/dummy/models.json @@ -194,5 +194,17 @@ ], "args": "", "multiple_results": "perf_dummy.csv" + }, + { + "name": "therock", + "dockerfile": "docker/therock", + "scripts": "scripts/therock/run.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "therock" + ], + "args": "" } ] diff --git a/tests/fixtures/dummy/scripts/therock/run.sh b/tests/fixtures/dummy/scripts/therock/run.sh new file mode 100644 index 00000000..e5db9e7b --- /dev/null +++ b/tests/fixtures/dummy/scripts/therock/run.sh @@ -0,0 +1,7 @@ +#!/bin/bash +# +# Copyright (c) Advanced Micro Devices, Inc. +# All rights reserved. +# + +echo "performance: $RANDOM samples_per_second" From 9ba9f98a6b646641ed774e27186b1c270e3577c2 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Mon, 5 Jan 2026 21:03:32 -0500 Subject: [PATCH 02/11] Fix the cleanup (#60) --- src/madengine/tools/run_models.py | 65 ++++++++++- tests/test_cleanup.py | 177 ++++++++++++++++++++++++++++++ 2 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 tests/test_cleanup.py diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index a620d96f..1eb44c33 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -532,6 +532,69 @@ def run_pre_post_script(self, model_docker, model_dir, pre_post): script_args.strip() model_docker.sh("cd " + model_dir + " && bash " + script_name + " " + script_args , timeout=600) + def _cleanup_model_directory(self, model_docker, model_dir: str, max_retries: int = 3, retry_delay: float = 2.0) -> None: + """Robustly cleanup model directory with retry logic. 
+
+        This method handles cleanup failures that can occur due to:
+        - File system sync delays
+        - Background processes holding file handles
+        - Permission issues
+        - Race conditions between test completion and cleanup
+
+        Args:
+            model_docker: The Docker instance for executing commands
+            model_dir: The directory path to remove
+            max_retries: Maximum number of retry attempts (default: 3)
+            retry_delay: Delay in seconds between retries (default: 2.0)
+
+        Note:
+            Never raises; if cleanup still fails after all retry attempts,
+            the error is logged as a warning and the build continues.
+        """
+        import time
+
+        print(f"Cleaning up model directory: {model_dir}")
+
+        for attempt in range(max_retries):
+            try:
+                # On retry attempts, kill any processes that might be holding files open
+                if attempt > 0:
+                    print(f"Cleanup attempt {attempt + 1}/{max_retries}...")
+                    # Kill any processes using files in the directory (ignore errors)
+                    model_docker.sh(f"fuser -k {model_dir} 2>/dev/null || true", timeout=30)
+                    time.sleep(retry_delay)
+
+                # Try to fix permissions before removal
+                try:
+                    model_docker.sh(f"chmod -R 777 {model_dir} 2>/dev/null || true", timeout=60)
+                except RuntimeError:
+                    # Permission change failed, but continue anyway
+                    pass
+
+                # Attempt to remove the directory
+                model_docker.sh(f"rm -rf {model_dir}", timeout=240)
+                print(f"Successfully cleaned up {model_dir}")
+                return
+
+            except RuntimeError as e:
+                error_msg = str(e)
+                print(f"Cleanup attempt {attempt + 1} failed: {error_msg}")
+
+                if attempt < max_retries - 1:
+                    # Not the last attempt, wait and retry
+                    print(f"Waiting {retry_delay} seconds before retry...")
+                    time.sleep(retry_delay)
+                else:
+                    # Last attempt failed, log warning but don't fail the build
+                    # The test itself succeeded, cleanup failure is not critical
+                    print("=" * 60)
+                    print("WARNING: Failed to cleanup model directory after all retries")
+                    print(f"Directory: {model_dir}")
+                    print(f"Error: {error_msg}")
+                    print("This is not a critical failure - the test completed successfully")
+                    print("The directory will be cleaned up when the container is removed")
+                    print("=" * 60)
+                    # Don't raise the exception - cleanup failure shouldn't fail the build
+
     def run_model_impl(
         self, info: typing.Dict, dockerfile: str, run_details: RunDetails
     ) -> None:
@@ -879,7 +942,7 @@ def run_model_impl(
 
         # remove model directory
         if not self.args.keep_alive and not self.args.keep_model_dir:
-            model_docker.sh("rm -rf " + model_dir, timeout=240)
+            self._cleanup_model_directory(model_docker, model_dir)
         else:
             model_docker.sh("chmod -R a+rw " + model_dir)
             print("keep_alive is specified; model_dir(" + model_dir + ") is not removed")
diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py
new file mode 100644
index 00000000..458c4e4a
--- /dev/null
+++ b/tests/test_cleanup.py
@@ -0,0 +1,177 @@
+"""Test cleanup functionality for robust directory removal."""
+
+import unittest
+from unittest.mock import Mock, patch, call, MagicMock
+import time
+from madengine.tools.run_models import RunModels
+
+
+class TestCleanupModelDirectory(unittest.TestCase):
+    """Test cases for the _cleanup_model_directory method."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        # Create a mock args object with all required attributes
+        self.mock_args = Mock()
+        self.mock_args.keep_alive = False
+        self.mock_args.keep_model_dir = False
+        self.mock_args.generate_sys_env_details = False
+        self.mock_args.data_config_file_name = "/tmp/nonexistent_data.json"  # Use non-existent path
+        self.mock_args.additional_context = ""
+        self.mock_args.additional_context_file = None
+        
self.mock_args.force_mirror_local = False + + # Patch the dependencies before creating RunModels instance + with patch('madengine.tools.run_models.Console'), \ + patch('madengine.tools.run_models.Context') as mock_context_cls: + # Setup Context mock + mock_context = MagicMock() + mock_context.ctx = {} + mock_context_cls.return_value = mock_context + + self.run_models = RunModels(self.mock_args) + + # Create mock docker instance + self.mock_docker = Mock() + + def test_cleanup_success_first_attempt(self): + """Test successful cleanup on first attempt.""" + model_dir = "test_model_dir" + + # Mock successful removal + self.mock_docker.sh.return_value = "" + + # Call cleanup method + self.run_models._cleanup_model_directory(self.mock_docker, model_dir) + + # Verify rm command was called + self.mock_docker.sh.assert_called_with(f"rm -rf {model_dir}", timeout=240) + # Should only be called once on success + self.assertEqual(self.mock_docker.sh.call_count, 1) + + def test_cleanup_success_after_retries(self): + """Test successful cleanup after retries.""" + model_dir = "test_model_dir" + + # Mock failure on first 2 attempts, success on 3rd + self.mock_docker.sh.side_effect = [ + RuntimeError("Directory not empty"), # First rm -rf fails + RuntimeError("Directory not empty"), # fuser command + RuntimeError("Directory not empty"), # chmod command + RuntimeError("Directory not empty"), # Second rm -rf fails + RuntimeError("Directory not empty"), # fuser command + RuntimeError("Directory not empty"), # chmod command + "", # Third rm -rf succeeds + ] + + # Call cleanup method with shorter retry delay for testing + with patch('time.sleep'): # Mock sleep to speed up test + self.run_models._cleanup_model_directory( + self.mock_docker, model_dir, max_retries=3, retry_delay=0.1 + ) + + # Verify multiple attempts were made + self.assertGreater(self.mock_docker.sh.call_count, 1) + + def test_cleanup_all_attempts_fail_no_exception(self): + """Test that cleanup failure doesn't raise exception (only logs warning).""" + model_dir = "test_model_dir" + + # Mock all attempts failing + self.mock_docker.sh.side_effect = RuntimeError("Directory not empty") + + # Call cleanup method - should NOT raise exception + with patch('time.sleep'): # Mock sleep to speed up test + try: + self.run_models._cleanup_model_directory( + self.mock_docker, model_dir, max_retries=2, retry_delay=0.1 + ) + # Should complete without raising exception + cleanup_succeeded = True + except Exception as e: + cleanup_succeeded = False + self.fail(f"Cleanup should not raise exception, but raised: {e}") + + self.assertTrue(cleanup_succeeded, "Cleanup should complete even if all attempts fail") + + def test_cleanup_uses_fuser_and_chmod_on_retry(self): + """Test that retry attempts use fuser and chmod.""" + model_dir = "test_model_dir" + + # Track the commands called + commands_called = [] + + def track_commands(cmd, timeout): + commands_called.append(cmd) + if "rm -rf" in cmd and len([c for c in commands_called if "rm -rf" in c]) == 1: + # Fail first rm -rf + raise RuntimeError("Directory not empty") + return "" + + self.mock_docker.sh.side_effect = track_commands + + # Call cleanup method + with patch('time.sleep'): # Mock sleep to speed up test + self.run_models._cleanup_model_directory( + self.mock_docker, model_dir, max_retries=2, retry_delay=0.1 + ) + + # Verify fuser and chmod were called on retry + command_strings = ' '.join(commands_called) + self.assertIn('fuser', command_strings, "fuser should be called on retry") + self.assertIn('chmod', 
command_strings, "chmod should be called on retry")
+
+    def test_cleanup_with_custom_retry_params(self):
+        """Test cleanup with custom retry parameters."""
+        model_dir = "test_model_dir"
+        custom_retries = 5
+        custom_delay = 0.5
+
+        self.mock_docker.sh.return_value = ""
+
+        # Call with custom parameters
+        self.run_models._cleanup_model_directory(
+            self.mock_docker, model_dir,
+            max_retries=custom_retries,
+            retry_delay=custom_delay
+        )
+
+        # Verify it worked
+        self.mock_docker.sh.assert_called()
+
+
+class TestCleanupIntegration(unittest.TestCase):
+    """Integration tests for cleanup in run_model_impl."""
+
+    def setUp(self):
+        """Set up test fixtures."""
+        self.mock_args = Mock()
+        self.mock_args.keep_alive = False
+        self.mock_args.keep_model_dir = False
+        self.mock_args.generate_sys_env_details = False
+        self.mock_args.skip_model_run = True
+        self.mock_args.data_config_file_name = "/tmp/nonexistent_data.json"
+        self.mock_args.additional_context = ""
+        self.mock_args.additional_context_file = None
+        self.mock_args.force_mirror_local = False
+
+        with patch('madengine.tools.run_models.Console'), \
+             patch('madengine.tools.run_models.Context') as mock_context_cls:
+            mock_context = MagicMock()
+            mock_context.ctx = {}
+            mock_context_cls.return_value = mock_context
+            self.run_models = RunModels(self.mock_args)
+
+    @patch('madengine.tools.run_models.RunModels._cleanup_model_directory')
+    def test_cleanup_called_when_not_keep_alive(self, mock_cleanup):
+        """Test that cleanup is called when keep_alive is False."""
+        # This test verifies that our new method is called instead of direct rm -rf
+        # We can't easily test the full run_model_impl, but we've verified the code change
+        self.assertTrue(hasattr(self.run_models, '_cleanup_model_directory'))
+
+        # Verify the method exists and is callable
+        self.assertTrue(callable(self.run_models._cleanup_model_directory))
+
+
+if __name__ == '__main__':
+    unittest.main()

From 6d7a660fbb678fd22e67966cc43aad216413a682 Mon Sep 17 00:00:00 2001
From: Stephen Shao
Date: Wed, 14 Jan 2026 22:45:20 -0500
Subject: [PATCH 03/11] Perf entry superset (#58)

* Implemented a module to parse config inputs and create perf_entry_super.json and upload the dataset to MongoDB

* Implemented the perf superset update

* Fixed unit tests of the superset

* Fixed the perf superset data collection and MongoDB update

---
 src/madengine/mad.py                          |   3 +-
 src/madengine/tools/run_models.py             |  83 ++++-
 src/madengine/tools/update_perf_csv.py        |  31 +-
 src/madengine/tools/update_perf_super.py      | 243 +++++++++++++
 src/madengine/tools/upload_mongodb.py         |  95 +++--
 src/madengine/utils/config_parser.py          | 237 ++++++++++++
 tests/fixtures/dummy/models.json              |  14 +
 .../dummy/scripts/dummy/configs/default.csv   |   5 +
 .../dummy/scripts/dummy/run_perf_super.sh     |  28 ++
 tests/test_misc.py                            | 341 +++++++++++++++++-
 10 files changed, 1047 insertions(+), 33 deletions(-)
 create mode 100644 src/madengine/tools/update_perf_super.py
 create mode 100644 src/madengine/utils/config_parser.py
 create mode 100644 tests/fixtures/dummy/scripts/dummy/configs/default.csv
 create mode 100755 tests/fixtures/dummy/scripts/dummy/run_perf_super.sh

diff --git a/src/madengine/mad.py b/src/madengine/mad.py
index e4df7143..f9970847 100644
--- a/src/madengine/mad.py
+++ b/src/madengine/mad.py
@@ -267,7 +267,8 @@ def main():
     parser_database_update_table.set_defaults(func=update_table)
     # Database subcommand uploading to MongoDB
     parser_database_upload_mongodb = subparsers_database.add_parser('upload-mongodb', description="Update table in DB.", help='Update table in 
DB') - parser_database_upload_mongodb.add_argument('--csv-file-path', type=str, default='perf_entry.csv', help='Path to the csv file') + parser_database_upload_mongodb.add_argument('--csv-file-path', type=str, default='perf_entry.csv', help='Path to the csv file (for legacy perf.csv)') + parser_database_upload_mongodb.add_argument('--json-file-path', type=str, default=None, help='Path to the json file (for perf_entry_super.json)') parser_database_upload_mongodb.add_argument("--database-name", type=str, required=True, help="Name of the MongoDB database") parser_database_upload_mongodb.add_argument("--collection-name", type=str, required=True, help="Name of the MongoDB collection") parser_database_upload_mongodb.set_defaults(func=upload_mongodb) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index 1eb44c33..ec5a3c8d 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -50,8 +50,10 @@ from madengine.core.constants import MODEL_DIR, PUBLIC_GITHUB_ROCM_KEY from madengine.core.timeout import Timeout from madengine.tools.update_perf_csv import update_perf_csv +from madengine.tools.update_perf_super import update_perf_super_json from madengine.tools.csv_to_html import convert_csv_to_html from madengine.tools.discover_models import DiscoverModels +from madengine.utils.config_parser import ConfigParser class RunDetails: @@ -83,6 +85,7 @@ class RunDetails: data_download_duration (str): The duration of data download. build_number (str): The CI build number. additional_docker_run_options (str): The additional options used for docker run. + configs (dict or list or None): The configuration data from config files. """ # Avoiding @property for ease of code, add if needed. @@ -112,6 +115,7 @@ def __init__(self): self.data_download_duration = "" self.build_number = "" self.additional_docker_run_options = "" + self.configs = None def print_perf(self): """Print the performance results of a model. @@ -133,13 +137,37 @@ def generate_json(self, json_name: str, multiple_results: bool = False) -> None: Raises: Exception: An error occurred while generating JSON file for performance results of a model. """ + # Exclude configs from CSV workflow as it can contain list/dict values + # that cause issues with pandas DataFrame creation keys_to_exclude = ( - {"model", "performance", "metric", "status"} if multiple_results else {} + {"model", "performance", "metric", "status", "configs"} if multiple_results + else {"configs"} ) attributes = vars(self) output_dict = {x: attributes[x] for x in attributes if x not in keys_to_exclude} with open(json_name, "w") as outfile: json.dump(output_dict, outfile) + + def generate_super_json(self, json_name: str, multiple_results: bool = False) -> None: + """Generate enhanced JSON file with config data for performance results. + + This method is similar to generate_json but includes the configs field + for perf_entry_super.json generation. + + Args: + json_name (str): The name of the JSON file. + multiple_results (bool): The status of multiple results. Default is False. + + Raises: + Exception: An error occurred while generating JSON file for performance results of a model. 
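For orientation, here is a minimal sketch of the kind of record generate_super_json is meant to emit; the field names follow RunDetails above, while the concrete values are invented for illustration only.

```python
# Hypothetical perf_entry_super.json record; field names mirror RunDetails,
# values are made up for this sketch.
import json

record = {
    "model": "dummy_perf_super_dummy/model-1",
    "performance": "1234.56",
    "metric": "tokens/s",
    "status": "SUCCESS",
    "gpu_architecture": "gfx942",
    # Unlike perf_entry.json, configs is retained: a dict for JSON/YAML
    # configs, a list of row dicts for CSV configs, or None.
    "configs": {"model": "dummy/model-1", "benchmark": "throughput"},
}

# The file accumulates a list of such records.
with open("perf_entry_super.json", "w") as outfile:
    json.dump([record], outfile, indent=2)
```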
+ """ + keys_to_exclude = ( + {"model", "performance", "metric", "status"} if multiple_results else {} + ) + attributes = vars(self) + output_dict = {x: attributes[x] for x in attributes if x not in keys_to_exclude} + with open(json_name, "w") as outfile: + json.dump(output_dict, outfile, indent=2) class RunModels: @@ -978,6 +1006,17 @@ def run_model(self, model_info: typing.Dict) -> bool: # Taking gpu arch from context assumes the host image and container have the same gpu arch. # Environment variable updates for MAD Public CI run_details.gpu_architecture = self.context.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] + + # Parse and load config file if present in args for perf_entry_super.json + try: + config_parser = ConfigParser(scripts_base_dir=os.path.dirname(model_info.get("scripts", ""))) + run_details.configs = config_parser.parse_and_load( + model_info["args"], + model_info.get("scripts", "") + ) + except Exception as e: + print(f"Warning: Could not parse config file: {e}") + run_details.configs = None # Check the setting of shared memory size if "SHM_SIZE" in self.context.ctx: @@ -1018,6 +1057,14 @@ def run_model(self, model_info: typing.Dict) -> bool: # generate exception for testing run_details.generate_json("perf_entry.json") update_perf_csv(exception_result="perf_entry.json", perf_csv=self.args.output) + + # Generate perf_entry_super.json + run_details.generate_super_json("perf_entry_super.json") + update_perf_super_json( + exception_result="perf_entry_super.json", + perf_super_json="perf_entry_super.json", + scripts_base_dir=os.path.dirname(model_info.get("scripts", "")), + ) else: print( f"Running model {run_details.model} on {run_details.gpu_architecture} architecture." @@ -1119,12 +1166,30 @@ def run_model(self, model_info: typing.Dict) -> bool: model_name=run_details.model, common_info="common_info.json", ) + + # Generate perf_entry_super.json + run_details.generate_super_json("common_info_super.json", multiple_results=True) + update_perf_super_json( + multiple_results=model_info['multiple_results'], + perf_super_json="perf_entry_super.json", + model_name=run_details.model, + common_info="common_info_super.json", + scripts_base_dir=os.path.dirname(model_info.get("scripts", "")), + ) else: run_details.generate_json("perf_entry.json") update_perf_csv( single_result="perf_entry.json", perf_csv=self.args.output, ) + + # Generate perf_entry_super.json + run_details.generate_super_json("perf_entry_super.json") + update_perf_super_json( + single_result="perf_entry_super.json", + perf_super_json="perf_entry_super.json", + scripts_base_dir=os.path.dirname(model_info.get("scripts", "")), + ) self.return_status &= (run_details.status == 'SUCCESS') @@ -1141,6 +1206,14 @@ def run_model(self, model_info: typing.Dict) -> bool: exception_result="perf_entry.json", perf_csv=self.args.output, ) + + # Generate perf_entry_super.json + run_details.generate_super_json("perf_entry_super.json") + update_perf_super_json( + exception_result="perf_entry_super.json", + perf_super_json="perf_entry_super.json", + scripts_base_dir=os.path.dirname(model_info.get("scripts", "")), + ) except Exception as e: self.return_status = False @@ -1155,6 +1228,14 @@ def run_model(self, model_info: typing.Dict) -> bool: exception_result="perf_entry.json", perf_csv=self.args.output, ) + + # Generate perf_entry_super.json + run_details.generate_super_json("perf_entry_super.json") + update_perf_super_json( + exception_result="perf_entry_super.json", + perf_super_json="perf_entry_super.json", + 
scripts_base_dir=os.path.dirname(model_info.get("scripts", "")), + ) return self.return_status diff --git a/src/madengine/tools/update_perf_csv.py b/src/madengine/tools/update_perf_csv.py index 5e32e3e2..6f8b84ee 100644 --- a/src/madengine/tools/update_perf_csv.py +++ b/src/madengine/tools/update_perf_csv.py @@ -73,7 +73,18 @@ def perf_entry_dict_to_csv(perf_entry: typing.Dict) -> None: perf_entry: The performance entry dictionary. """ flatten_tags(perf_entry) - js_df = pd.DataFrame(perf_entry, index=[0]) + + # Convert any non-scalar values (list/dict) to JSON strings + # to avoid DataFrame creation errors when values don't match index length + perf_entry_safe = {} + for key, value in perf_entry.items(): + if isinstance(value, (list, dict)): + # Convert lists and dicts to JSON strings + perf_entry_safe[key] = json.dumps(value) if value is not None else None + else: + perf_entry_safe[key] = value + + js_df = pd.DataFrame(perf_entry_safe, index=[0]) perf_entry_df_to_csv(js_df) @@ -116,15 +127,29 @@ def handle_multiple_results( row = common_info_json.copy() model = r.pop("model") row["model"] = model_name + "_" + str(model) - row.update(r) + + # Only extract essential result columns for perf.csv + # The full details with all metrics are preserved in perf_entry_super.json + row["performance"] = r.get("performance") + row["metric"] = r.get("metric") if row["performance"] is not None and pd.notna(row["performance"]): row["status"] = "SUCCESS" else: row["status"] = "FAILURE" + # Convert any non-scalar values (list/dict) to JSON strings + # to avoid DataFrame creation errors when values don't match index length + row_safe = {} + for key, value in row.items(): + if isinstance(value, (list, dict)): + # Convert lists and dicts to JSON strings + row_safe[key] = json.dumps(value) if value is not None else None + else: + row_safe[key] = value + final_multiple_results_df = pd.concat( - [final_multiple_results_df, pd.DataFrame(row, index=[0])], ignore_index=True + [final_multiple_results_df, pd.DataFrame(row_safe, index=[0])], ignore_index=True ) # Reorder columns according to existing perf csv columns = perf_csv_df.columns.tolist() diff --git a/src/madengine/tools/update_perf_super.py b/src/madengine/tools/update_perf_super.py new file mode 100644 index 00000000..23bb1a15 --- /dev/null +++ b/src/madengine/tools/update_perf_super.py @@ -0,0 +1,243 @@ +"""Module to update the perf_entry_super.json file with enhanced performance data. + +This module is used to update the perf_entry_super.json file with performance data +that includes configuration information from config files. + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +# built-in imports +import json +import os +import typing +# third-party imports +import pandas as pd +# MAD Engine imports +from madengine.utils.config_parser import ConfigParser + + +def read_json(js: str) -> dict: + """Read a JSON file. + + Args: + js: The path to the JSON file. + + Returns: + The JSON dictionary. + """ + with open(js, 'r') as f: + return json.load(f) + + +def write_json(data: typing.Union[dict, list], output_path: str) -> None: + """Write data to a JSON file. + + Args: + data: The data to write (dict or list). + output_path: The path to the output JSON file. + """ + with open(output_path, 'w') as f: + json.dump(data, f, indent=2) + + +def load_perf_super_json(perf_super_json: str) -> list: + """Load existing perf_entry_super.json file. + + Args: + perf_super_json: Path to perf_entry_super.json file. 
+ + Returns: + List of performance records, or empty list if file doesn't exist. + """ + if not os.path.exists(perf_super_json): + return [] + + try: + data = read_json(perf_super_json) + # Ensure it's a list + if isinstance(data, list): + return data + else: + return [data] + except Exception as e: + print(f"Warning: Could not load existing perf_entry_super.json: {e}") + return [] + + +def handle_multiple_results_super( + perf_super_list: list, + multiple_results: str, + common_info: str, + model_name: str, + config_parser: ConfigParser + ) -> list: + """Handle multiple results with config matching. + + Args: + perf_super_list: List of existing performance records. + multiple_results: The path to the multiple results CSV file. + common_info: The path to the common info JSON file. + model_name: The model name. + config_parser: ConfigParser instance for loading configs. + + Returns: + Updated list of performance records with configs. + """ + # Load multiple results CSV + multiple_results_df = pd.read_csv(multiple_results) + multiple_results_df.columns = multiple_results_df.columns.str.strip() + + # Check required columns + required_cols = ['model', 'performance', 'metric'] + for col in required_cols: + if col not in multiple_results_df.columns: + raise RuntimeError(f"{multiple_results} file is missing the {col} column") + + # Load common info + common_info_json = read_json(common_info) + + # Parse config file from args if present + configs_data = None + if 'args' in common_info_json and common_info_json['args']: + # Try to extract config path from args + scripts_path = common_info_json.get('pipeline', '') + configs_data = config_parser.parse_and_load( + common_info_json['args'], + scripts_path + ) + + # Process each result row + for result_row in multiple_results_df.to_dict(orient="records"): + record = common_info_json.copy() + + # Update model name + result_model = result_row.pop("model") + record["model"] = f"{model_name}_{result_model}" + + # Update with result data + record.update(result_row) + + # Set status based on performance + if record.get("performance") is not None and pd.notna(record.get("performance")): + record["status"] = "SUCCESS" + else: + record["status"] = "FAILURE" + + # Match config to this specific result + if configs_data: + if isinstance(configs_data, list): + # For CSV configs with multiple rows, try to match + matched_config = config_parser.match_config_to_result( + configs_data, + result_row, + result_model + ) + record["configs"] = matched_config + else: + # For JSON/YAML configs, use as-is + record["configs"] = configs_data + else: + record["configs"] = None + + perf_super_list.append(record) + + return perf_super_list + + +def handle_single_result_super( + perf_super_list: list, + single_result: str + ) -> list: + """Handle a single result. + + Args: + perf_super_list: List of existing performance records. + single_result: The path to the single result JSON file. + + Returns: + Updated list of performance records. + """ + single_result_json = read_json(single_result) + + # Ensure configs field exists (may be None) + if "configs" not in single_result_json: + single_result_json["configs"] = None + + perf_super_list.append(single_result_json) + return perf_super_list + + +def handle_exception_result_super( + perf_super_list: list, + exception_result: str + ) -> list: + """Handle an exception result. + + Args: + perf_super_list: List of existing performance records. + exception_result: The path to the exception result JSON file. 
+ + Returns: + Updated list of performance records. + """ + exception_result_json = read_json(exception_result) + + # Ensure configs field exists (may be None) + if "configs" not in exception_result_json: + exception_result_json["configs"] = None + + perf_super_list.append(exception_result_json) + return perf_super_list + + +def update_perf_super_json( + perf_super_json: str, + multiple_results: typing.Optional[str] = None, + single_result: typing.Optional[str] = None, + exception_result: typing.Optional[str] = None, + common_info: typing.Optional[str] = None, + model_name: typing.Optional[str] = None, + scripts_base_dir: typing.Optional[str] = None, + ) -> None: + """Update the perf_entry_super.json file with the latest performance data. + + Args: + perf_super_json: Path to perf_entry_super.json file. + multiple_results: Path to multiple results CSV file. + single_result: Path to single result JSON file. + exception_result: Path to exception result JSON file. + common_info: Path to common info JSON file. + model_name: The model name. + scripts_base_dir: Base directory for scripts (for config file resolution). + """ + print(f"Updating perf_entry_super.json with enhanced performance data") + + # Load existing perf_entry_super.json + perf_super_list = load_perf_super_json(perf_super_json) + + # Create config parser + config_parser = ConfigParser(scripts_base_dir=scripts_base_dir) + + # Handle different result types + if multiple_results: + perf_super_list = handle_multiple_results_super( + perf_super_list, + multiple_results, + common_info, + model_name, + config_parser, + ) + elif single_result: + perf_super_list = handle_single_result_super(perf_super_list, single_result) + elif exception_result: + perf_super_list = handle_exception_result_super( + perf_super_list, exception_result + ) + else: + print("No results to update in perf_entry_super.json") + return + + # Write updated perf_entry_super.json + write_json(perf_super_list, perf_super_json) + print(f"Successfully updated {perf_super_json}") + diff --git a/src/madengine/tools/upload_mongodb.py b/src/madengine/tools/upload_mongodb.py index 6766e3e2..701174cd 100644 --- a/src/madengine/tools/upload_mongodb.py +++ b/src/madengine/tools/upload_mongodb.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -"""Module to update MongoDB collections with data from a CSV file. +"""Module to update MongoDB collections with data from a CSV or JSON file. This module provides functions to handle MongoDB operations, including checking for collection existence, creating collections, and updating datasets. @@ -8,13 +8,14 @@ """ # built-in modules import os +import json import argparse # third-party modules import pandas as pd import pymongo from pymongo.errors import ConnectionFailure -from typing import Optional +from typing import Optional, Union, List, Dict # MAD Engine modules from madengine.db.logger import setup_logger @@ -40,7 +41,8 @@ def __init__(self, args: argparse.Namespace) -> None: self.uri = mongo_uri self.database_name = args.database_name self.collection_name = args.collection_name - self.csv_file_path = args.csv_file_path + self.csv_file_path = getattr(args, 'csv_file_path', None) + self.json_file_path = getattr(args, 'json_file_path', None) self.client = None self.db = None @@ -62,43 +64,66 @@ def collection_exists(self) -> bool: """ return self.collection_name in self.db.list_collection_names() - def update_collection(self, data: pd.DataFrame) -> None: - """Update a MongoDB collection with data from a DataFrame. 
+ def update_collection(self, data: Union[pd.DataFrame, List[Dict]]) -> None: + """Update a MongoDB collection with data from a DataFrame or list of dicts. Args: - data (pd.DataFrame): DataFrame containing the data to update. + data: DataFrame or list of dicts containing the data to update. """ if not self.collection_exists(): LOGGER.info(f"Collection '{self.collection_name}' does not exist. Creating it.") self.db.create_collection(self.collection_name) collection = self.db[self.collection_name] - records = data.to_dict(orient="records") + + # Convert to list of records if DataFrame + if isinstance(data, pd.DataFrame): + records = data.to_dict(orient="records") + else: + records = data + for record in records: # Use an appropriate unique identifier for upsert (e.g., "_id" or another field) collection.update_one(record, {"$set": record}, upsert=True) LOGGER.info(f"Updated collection '{self.collection_name}' with {len(records)} records.") def run(self) -> None: - """Run the process of updating a MongoDB collection with data from a CSV file. + """Run the process of updating a MongoDB collection with data from a CSV or JSON file. """ self.connect() - data = load_csv_to_dataframe(self.csv_file_path) - - # if the value is NaN, replace it with empty string - data = data.where(pd.notnull(data), "") - # Convert all columns to string type except boolean columns - for col in data.columns: - if data[col].dtype != "bool": - data[col] = data[col].astype(str) - - # Added created_date column and set it to now - data["created_date"] = pd.to_datetime("now").strftime("%Y-%m-%d %H:%M:%S") - - # Remove any leading or trailing whitespace from column names - data.columns = data.columns.str.strip() - - self.update_collection(data) + + if self.json_file_path: + # Load JSON file (perf_entry_super.json) + data = load_json_to_list(self.json_file_path) + + # Add created_date to each record + from datetime import datetime + created_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + for record in data: + if "created_date" not in record: + record["created_date"] = created_date + + self.update_collection(data) + elif self.csv_file_path: + # Load CSV file (legacy perf.csv) + data = load_csv_to_dataframe(self.csv_file_path) + + # if the value is NaN, replace it with empty string + data = data.where(pd.notnull(data), "") + # Convert all columns to string type except boolean columns + for col in data.columns: + if data[col].dtype != "bool": + data[col] = data[col].astype(str) + + # Added created_date column and set it to now + data["created_date"] = pd.to_datetime("now").strftime("%Y-%m-%d %H:%M:%S") + + # Remove any leading or trailing whitespace from column names + data.columns = data.columns.str.strip() + + self.update_collection(data) + else: + raise ValueError("Either csv_file_path or json_file_path must be provided") def load_csv_to_dataframe(csv_path: str) -> pd.DataFrame: @@ -113,3 +138,25 @@ def load_csv_to_dataframe(csv_path: str) -> pd.DataFrame: if not os.path.exists(csv_path): raise FileNotFoundError(f"CSV file '{csv_path}' not found.") return pd.read_csv(csv_path) + + +def load_json_to_list(json_path: str) -> List[Dict]: + """Load a JSON file into a list of dictionaries. + + Args: + json_path (str): Path to the JSON file. + + Returns: + List[Dict]: List of dictionaries containing the JSON data. 
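As a cross-check of the JSON upload path, a minimal end-to-end sketch of what run() does when a JSON file is supplied; the MongoDB URI, database, and collection names below are placeholders, not values from this patch.

```python
# Sketch of the --json-file-path flow; assumes a reachable MongoDB instance.
import json
from datetime import datetime

import pymongo

with open("perf_entry_super.json") as f:
    records = json.load(f)  # perf_entry_super.json holds a list of dicts

created_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
for record in records:
    record.setdefault("created_date", created_date)

client = pymongo.MongoClient("mongodb://localhost:27017/")  # placeholder URI
collection = client["mad_db"]["perf_super"]                 # placeholder names
for record in records:
    # Same upsert shape as update_collection: the whole record is the filter.
    collection.update_one(record, {"$set": record}, upsert=True)
```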
+ """ + if not os.path.exists(json_path): + raise FileNotFoundError(f"JSON file '{json_path}' not found.") + + with open(json_path, 'r') as f: + data = json.load(f) + + # Ensure it's a list + if isinstance(data, list): + return data + else: + return [data] diff --git a/src/madengine/utils/config_parser.py b/src/madengine/utils/config_parser.py new file mode 100644 index 00000000..7d3e31e7 --- /dev/null +++ b/src/madengine/utils/config_parser.py @@ -0,0 +1,237 @@ +"""Config Parser Module for MAD Engine. + +This module provides utilities to parse configuration files from model arguments +and load them in various formats (CSV, JSON, YAML). + +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. +""" + +import os +import re +import json +import logging +import typing +from pathlib import Path + +import pandas as pd + +try: + import yaml + YAML_AVAILABLE = True +except ImportError: + YAML_AVAILABLE = False + +LOGGER = logging.getLogger(__name__) + + +class ConfigParser: + """Parser for model configuration files. + + This class handles parsing configuration files in various formats + (CSV, JSON, YAML) that are referenced in model arguments. + """ + + def __init__(self, scripts_base_dir: typing.Optional[str] = None): + """Initialize ConfigParser. + + Args: + scripts_base_dir: Base directory for scripts (e.g., ~/amd/MAD-private/scripts) + """ + self.scripts_base_dir = scripts_base_dir + + def parse_config_from_args(self, args_string: str, model_scripts_path: str = None) -> typing.Optional[str]: + """Extract config file path from model arguments. + + Args: + args_string: The args field from models.json + model_scripts_path: Path to the model's script directory + + Returns: + Full path to config file, or None if no config found + """ + if not args_string: + return None + + # Look for --config argument + config_match = re.search(r'--config\s+([^\s]+)', args_string) + if not config_match: + return None + + config_path = config_match.group(1) + + # If it's already an absolute path, return it + if os.path.isabs(config_path): + return config_path if os.path.exists(config_path) else None + + # Try to resolve relative path + # First, try relative to model scripts directory + if model_scripts_path: + scripts_dir = os.path.dirname(model_scripts_path) + full_path = os.path.join(scripts_dir, config_path) + if os.path.exists(full_path): + return full_path + + # Try relative to scripts_base_dir + if self.scripts_base_dir: + full_path = os.path.join(self.scripts_base_dir, config_path) + if os.path.exists(full_path): + return full_path + + LOGGER.warning(f"Config file not found: {config_path}") + return None + + def load_config_file(self, config_path: str) -> typing.Optional[typing.Union[typing.List[dict], dict]]: + """Load and parse a configuration file. 
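A short usage sketch for ConfigParser follows; it assumes the dummy fixtures added later in this patch and a working directory at the repository root.

```python
# Usage sketch for ConfigParser; the fixture paths come from this patch and
# the example assumes it runs from the repository root.
from madengine.utils.config_parser import ConfigParser

parser = ConfigParser(scripts_base_dir="tests/fixtures/dummy/scripts/dummy")
configs = parser.parse_and_load(
    "--config configs/default.csv",
    "tests/fixtures/dummy/scripts/dummy/run_perf_super.sh",
)
# For a CSV config this returns a list of row dicts, e.g.
# [{'model': 'dummy/model-1', 'benchmark': 'throughput', ...}, ...]
print(configs)
```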
+ + Args: + config_path: Full path to the config file + + Returns: + For CSV: List of dicts (one per row) + For JSON/YAML: Dict or list as-is from file + None if file cannot be loaded + """ + if not config_path or not os.path.exists(config_path): + return None + + file_ext = Path(config_path).suffix.lower() + + try: + if file_ext == '.csv': + return self._load_csv(config_path) + elif file_ext == '.json': + return self._load_json(config_path) + elif file_ext in ['.yaml', '.yml']: + return self._load_yaml(config_path) + else: + LOGGER.warning(f"Unsupported config file format: {file_ext}") + return None + except Exception as e: + LOGGER.error(f"Error loading config file {config_path}: {e}") + return None + + def _load_csv(self, config_path: str) -> typing.List[dict]: + """Load CSV config file. + + Args: + config_path: Path to CSV file + + Returns: + List of dicts, one per row + """ + df = pd.read_csv(config_path) + # Convert NaN to None for JSON serialization + df = df.where(pd.notnull(df), None) + # Convert to list of dicts + return df.to_dict(orient='records') + + def _load_json(self, config_path: str) -> typing.Union[dict, list]: + """Load JSON config file. + + Args: + config_path: Path to JSON file + + Returns: + Dict or list from JSON file + """ + with open(config_path, 'r') as f: + return json.load(f) + + def _load_yaml(self, config_path: str) -> typing.Union[dict, list]: + """Load YAML config file. + + Args: + config_path: Path to YAML file + + Returns: + Dict or list from YAML file + """ + if not YAML_AVAILABLE: + raise ImportError("PyYAML is not installed. Cannot load YAML config files.") + + with open(config_path, 'r') as f: + return yaml.safe_load(f) + + def match_config_to_result( + self, + configs_list: typing.List[dict], + result_data: dict, + model_name: str + ) -> typing.Optional[dict]: + """Match a specific result to its corresponding config. + + For CSV configs with multiple rows (like vllm), match based on + model name and other identifiable fields. + + Args: + configs_list: List of config dicts (from CSV rows) + result_data: Single result row data + model_name: The model name from result + + Returns: + Matching config dict, or None if no match found + """ + if not configs_list: + return None + + # For single config, return it + if len(configs_list) == 1: + return configs_list[0] + + # For multiple configs, try to match based on common fields + # Extract model identifier from result model name + # e.g., "pyt_vllm_llama-3.1-8b_perf_meta-llama_Llama-3.1-8B-Instruct" + # should match config with model="meta-llama/Llama-3.1-8B-Instruct" + + for config in configs_list: + # Try to match on 'model' field if it exists in both + if 'model' in config and 'model' in result_data: + # Compare normalized versions + config_model = str(config['model']).replace('/', '_').replace('-', '_').lower() + result_model = str(result_data['model']).replace('/', '_').replace('-', '_').lower() + if config_model in result_model or result_model in config_model: + # Additional checks for benchmark type if available + if 'benchmark' in config and 'benchmark' in result_data: + if config['benchmark'] == result_data['benchmark']: + return config + else: + return config + + # If no match found, return first config as fallback + LOGGER.warning(f"Could not match config for result: {model_name}. 
Using first config.") + return configs_list[0] + + def parse_and_load( + self, + args_string: str, + model_scripts_path: str = None + ) -> typing.Optional[typing.Union[typing.List[dict], dict]]: + """Parse config path from args and load the config file. + + Convenience method that combines parse_config_from_args and load_config_file. + + Args: + args_string: The args field from models.json + model_scripts_path: Path to the model's script directory + + Returns: + Config data (list of dicts for CSV, dict for JSON/YAML), or None + """ + config_path = self.parse_config_from_args(args_string, model_scripts_path) + if not config_path: + return None + + return self.load_config_file(config_path) + + +def get_config_parser(scripts_base_dir: typing.Optional[str] = None) -> ConfigParser: + """Factory function to create a ConfigParser instance. + + Args: + scripts_base_dir: Base directory for scripts + + Returns: + ConfigParser instance + """ + return ConfigParser(scripts_base_dir=scripts_base_dir) + diff --git a/tests/fixtures/dummy/models.json b/tests/fixtures/dummy/models.json index 3dfd5de1..9d5762bf 100644 --- a/tests/fixtures/dummy/models.json +++ b/tests/fixtures/dummy/models.json @@ -195,6 +195,20 @@ "args": "", "multiple_results": "perf_dummy.csv" }, + { + "name": "dummy_perf_super", + "dockerfile": "docker/dummy", + "scripts": "scripts/dummy/run_perf_super.sh", + "n_gpus": "1", + "owner": "mad.support@amd.com", + "training_precision": "", + "tags": [ + "dummies", + "perf_super_test" + ], + "args": "--config configs/default.csv", + "multiple_results": "perf_dummy_super.csv" + }, { "name": "therock", "dockerfile": "docker/therock", diff --git a/tests/fixtures/dummy/scripts/dummy/configs/default.csv b/tests/fixtures/dummy/scripts/dummy/configs/default.csv new file mode 100644 index 00000000..ee04bbc5 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy/configs/default.csv @@ -0,0 +1,5 @@ +model,benchmark,config_value,batch_size,datatype,max_tokens +dummy/model-1,throughput,128,8,float16,1024 +dummy/model-2,serving,256,16,float32,2048 +dummy/model-3,latency,512,4,bfloat16,4096 + diff --git a/tests/fixtures/dummy/scripts/dummy/run_perf_super.sh b/tests/fixtures/dummy/scripts/dummy/run_perf_super.sh new file mode 100755 index 00000000..f9be6228 --- /dev/null +++ b/tests/fixtures/dummy/scripts/dummy/run_perf_super.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Copyright (c) Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Script to generate dummy results for perf_entry_super testing + +# Parse config argument +CONFIG_FILE="" +while [[ "$#" -gt 0 ]]; do + case $1 in + --config) CONFIG_FILE="$2"; shift ;; + *) echo "Unknown parameter: $1" ;; + esac + shift +done + +# Generate comprehensive results with best-practice performance metrics +# Includes: latency percentiles, resource utilization, reliability metrics, and throughput +cat > perf_dummy_super.csv << 'EOF' +model,performance,metric,status,throughput,latency_mean_ms,latency_p50_ms,latency_p90_ms,latency_p95_ms,latency_p99_ms,gpu_memory_used_mb,gpu_memory_total_mb,gpu_utilization_percent,cpu_utilization_percent,total_time_seconds,warmup_iterations,measured_iterations,error_count,success_rate_percent,samples_processed +dummy/model-1,1234.56,tokens/s,SUCCESS,1234.56,8.1,7.9,12.3,15.2,22.8,12288,32768,85.3,42.1,120.5,10,100,0,100.0,123456 +dummy/model-2,2345.67,requests/s,SUCCESS,2345.67,4.3,4.1,6.8,8.5,12.3,16384,32768,78.2,38.5,180.3,10,150,2,99.87,352350 +dummy/model-3,345.78,ms,SUCCESS,28.92,345.78,340.5,425.3,512.7,678.9,8192,32768,92.1,55.3,240.8,5,50,0,100.0,1447 +EOF + +cp perf_dummy_super.csv ../ + diff --git a/tests/test_misc.py b/tests/test_misc.py index 11a6fa81..a1aba9b1 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -6,6 +6,9 @@ import os import sys import csv +import json +import tempfile +import shutil import pandas as pd # 3rd party modules import pytest @@ -14,6 +17,11 @@ from .fixtures.utils import global_data from .fixtures.utils import clean_test_temp_files +# Add src to path for module imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) +from madengine.utils.config_parser import ConfigParser +from madengine.tools.update_perf_super import update_perf_super_json + class TestMiscFunctionality: @@ -22,7 +30,7 @@ def test_output_commandline_argument_writes_csv_correctly(self, global_data, cle """ output command-line argument writes csv file to specified output path """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy -o perf_test.csv") + output = global_data['console'].sh("cd " + BASE_DIR + "; " + "PYTHONPATH=src MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy -o perf_test.csv") success = False with open(os.path.join(BASE_DIR, 'perf_test.csv'), 'r') as csv_file: csv_reader = csv.DictReader(csv_file) @@ -41,7 +49,7 @@ def test_commandline_argument_skip_gpu_arch(self, global_data, clean_test_temp_f """ skip_gpu_arch command-line argument skips GPU architecture check """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_skip_gpu_arch") + output = global_data['console'].sh("cd " + BASE_DIR + "; " + "PYTHONPATH=src MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_skip_gpu_arch") if 'Skipping model' not in output: pytest.fail("Enable skipping gpu arch for running model is failed.") @@ -50,7 +58,7 @@ def test_commandline_argument_disable_skip_gpu_arch_fail(self, global_data, clea """ skip_gpu_arch command-line argument fails GPU architecture check """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_skip_gpu_arch --disable-skip-gpu-arch") + output = global_data['console'].sh("cd " + BASE_DIR + "; " + "PYTHONPATH=src MODEL_DIR=" + MODEL_DIR + " " + "python3 
src/madengine/mad.py run --tags dummy_skip_gpu_arch --disable-skip-gpu-arch") # Check if exception with message 'Skipping model' is thrown if 'Skipping model' in output: pytest.fail("Disable skipping gpu arch for running model is failed.") @@ -60,7 +68,7 @@ def test_output_multi_results(self, global_data, clean_test_temp_files): """ test output multiple results """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_multi") + output = global_data['console'].sh("cd " + BASE_DIR + "; " + "PYTHONPATH=src MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_multi") # Check if multiple results are written to perf_dummy.csv success = False # Read the csv file to a dataframe using pandas @@ -84,3 +92,328 @@ def test_output_multi_results(self, global_data, clean_test_temp_files): if not success: pytest.fail("The columns of the generated multi results do not match perf.csv.") + +class TestPerfEntrySuperGeneration: + """Test cases for perf_entry_super.json generation.""" + + @pytest.fixture + def test_dir(self): + """Create temporary directory for tests.""" + temp_dir = tempfile.mkdtemp() + yield temp_dir + if os.path.exists(temp_dir): + shutil.rmtree(temp_dir) + + @pytest.fixture + def fixtures_dir(self): + """Get path to dummy fixtures directory.""" + return os.path.join( + os.path.dirname(__file__), + 'fixtures', + 'dummy', + 'scripts', + 'dummy' + ) + + @pytest.fixture + def config_file(self, fixtures_dir): + """Get path to config file.""" + return os.path.join(fixtures_dir, 'configs', 'default.csv') + + def test_config_file_exists(self, config_file): + """Test that the dummy config file exists.""" + assert os.path.exists(config_file), \ + f"Config file should exist at {config_file}" + + def test_config_parser_loads_csv(self, config_file): + """Test that ConfigParser can load the dummy CSV config.""" + parser = ConfigParser() + configs = parser.load_config_file(config_file) + + assert configs is not None, "Configs should not be None" + assert isinstance(configs, list), "Configs should be a list" + assert len(configs) == 3, "Should have 3 config rows" + + # Check first config has expected fields + first_config = configs[0] + assert 'model' in first_config + assert 'benchmark' in first_config + assert 'config_value' in first_config + assert 'batch_size' in first_config + assert 'datatype' in first_config + assert 'max_tokens' in first_config + + # Verify values + assert first_config['model'] == 'dummy/model-1' + assert first_config['benchmark'] == 'throughput' + assert first_config['datatype'] == 'float16' + + def test_config_parser_from_args(self, fixtures_dir): + """Test parsing config path from args string.""" + parser = ConfigParser(scripts_base_dir=fixtures_dir) + args_string = "--config configs/default.csv" + + config_path = parser.parse_config_from_args( + args_string, + os.path.join(fixtures_dir, 'run_perf_super.sh') + ) + + assert config_path is not None, "Config path should be found" + assert os.path.exists(config_path), \ + f"Config file should exist at {config_path}" + + def test_perf_entry_super_json_structure(self, test_dir, fixtures_dir): + """Test that perf_entry_super.json has the correct structure.""" + # Create mock data + common_info = { + "pipeline": "dummy_test", + "n_gpus": "1", + "training_precision": "", + "args": "--config configs/default.csv", + "tags": "dummies,perf_super_test", + "docker_file": "docker/dummy.Dockerfile", + "git_commit": "test123", + 
"machine_name": "test_machine", + "gpu_architecture": "test_gpu", + "build_duration": "10", + "test_duration": "20" + } + + # Create common_info.json + common_info_path = os.path.join(test_dir, "common_info_super.json") + with open(common_info_path, 'w') as f: + json.dump(common_info, f) + + # Create results CSV + results_csv = os.path.join(test_dir, "perf_dummy_super.csv") + with open(results_csv, 'w') as f: + f.write("model,performance,metric,status\n") + f.write("dummy/model-1,1234.56,tokens/s,SUCCESS\n") + f.write("dummy/model-2,2345.67,requests/s,SUCCESS\n") + f.write("dummy/model-3,345.78,ms,SUCCESS\n") + + # Generate perf_entry_super.json + perf_super_path = os.path.join(test_dir, "perf_entry_super.json") + + update_perf_super_json( + perf_super_json=perf_super_path, + multiple_results=results_csv, + common_info=common_info_path, + model_name="dummy_perf_super", + scripts_base_dir=fixtures_dir + ) + + # Verify file was created + assert os.path.exists(perf_super_path), \ + "perf_entry_super.json should be created" + + # Load and verify structure + with open(perf_super_path, 'r') as f: + data = json.load(f) + + assert isinstance(data, list), "Data should be a list" + assert len(data) == 3, "Should have 3 result records" + + # Check first record structure + first_record = data[0] + + # Verify all common fields are present + required_fields = [ + 'model', 'performance', 'metric', 'status', 'pipeline', + 'n_gpus', 'args', 'tags', 'gpu_architecture' + ] + for field in required_fields: + assert field in first_record, f"Field '{field}' should be present" + + # Verify configs field is present + assert 'configs' in first_record, "configs field should be present" + + # Verify configs is not None (config file was found and loaded) + assert first_record['configs'] is not None, \ + "configs should not be None when config file exists" + + # Verify configs has expected structure + configs = first_record['configs'] + assert isinstance(configs, dict), "configs should be a dict" + assert 'model' in configs + assert 'benchmark' in configs + assert 'config_value' in configs + assert 'batch_size' in configs + assert 'datatype' in configs + assert 'max_tokens' in configs + + def test_perf_entry_super_config_matching(self, test_dir, fixtures_dir): + """Test that configs are present for all results.""" + # Create mock data + common_info = { + "pipeline": "dummy_test", + "n_gpus": "1", + "args": "--config configs/default.csv", + "tags": "dummies" + } + + common_info_path = os.path.join(test_dir, "common_info_super.json") + with open(common_info_path, 'w') as f: + json.dump(common_info, f) + + # Create results CSV + results_csv = os.path.join(test_dir, "perf_dummy_super.csv") + with open(results_csv, 'w') as f: + f.write("model,performance,metric,benchmark\n") + f.write("dummy/model-1,1234.56,tokens/s,throughput\n") + f.write("dummy/model-2,2345.67,requests/s,serving\n") + f.write("dummy/model-3,345.78,ms,latency\n") + + perf_super_path = os.path.join(test_dir, "perf_entry_super.json") + + update_perf_super_json( + perf_super_json=perf_super_path, + multiple_results=results_csv, + common_info=common_info_path, + model_name="dummy_perf_super", + scripts_base_dir=fixtures_dir + ) + + # Load and verify matching + with open(perf_super_path, 'r') as f: + data = json.load(f) + + # Verify each result has configs + assert len(data) == 3, "Should have 3 results" + + for record in data: + configs = record.get('configs') + assert configs is not None, "Each record should have configs" + assert isinstance(configs, dict), 
"Configs should be a dict" + + # Verify configs have expected structure (from default.csv) + assert 'model' in configs + assert 'benchmark' in configs + assert 'config_value' in configs + assert 'batch_size' in configs + assert 'datatype' in configs + assert 'max_tokens' in configs + + # Verify configs values are from our config file + assert configs['benchmark'] in ['throughput', 'serving', 'latency'] + assert configs['datatype'] in ['float16', 'float32', 'bfloat16'] + + def test_perf_entry_super_no_config(self, test_dir, fixtures_dir): + """Test handling when no config file is specified.""" + # Create mock data without config + common_info = { + "pipeline": "dummy_test", + "n_gpus": "1", + "args": "", # No --config argument + "tags": "dummies" + } + + common_info_path = os.path.join(test_dir, "common_info_super.json") + with open(common_info_path, 'w') as f: + json.dump(common_info, f) + + # Create results CSV + results_csv = os.path.join(test_dir, "perf_dummy_super.csv") + with open(results_csv, 'w') as f: + f.write("model,performance,metric\n") + f.write("dummy-no-config,1234.56,tokens/s\n") + + perf_super_path = os.path.join(test_dir, "perf_entry_super.json") + + update_perf_super_json( + perf_super_json=perf_super_path, + multiple_results=results_csv, + common_info=common_info_path, + model_name="dummy_no_config", + scripts_base_dir=fixtures_dir + ) + + # Load and verify + with open(perf_super_path, 'r') as f: + data = json.load(f) + + assert len(data) == 1 + + # Verify configs is None for models without config files + assert 'configs' in data[0] + assert data[0]['configs'] is None, \ + "configs should be None when no config file is specified" + + def test_perf_entry_super_json_format_validation(self, test_dir, fixtures_dir): + """Test that the JSON format matches expected schema.""" + # Create complete mock data + common_info = { + "pipeline": "dummy_test", + "n_gpus": "1", + "training_precision": "fp16", + "args": "--config configs/default.csv", + "tags": "dummies,perf_super_test", + "docker_file": "docker/dummy.Dockerfile", + "base_docker": "rocm/pytorch:latest", + "docker_sha": "sha256:abc123", + "docker_image": "test-image", + "git_commit": "commit123", + "machine_name": "test-machine", + "gpu_architecture": "gfx942", + "build_duration": "120", + "test_duration": "300", + "dataname": "test_data", + "data_provider_type": "local", + "data_size": "1GB", + "data_download_duration": "60", + "build_number": "12345", + "additional_docker_run_options": "--shm-size=16g" + } + + common_info_path = os.path.join(test_dir, "common_info_super.json") + with open(common_info_path, 'w') as f: + json.dump(common_info, f) + + results_csv = os.path.join(test_dir, "perf_dummy_super.csv") + with open(results_csv, 'w') as f: + f.write("model,performance,metric,status\n") + f.write("dummy/model-1,1234.56,tokens/s,SUCCESS\n") + + perf_super_path = os.path.join(test_dir, "perf_entry_super.json") + + update_perf_super_json( + perf_super_json=perf_super_path, + multiple_results=results_csv, + common_info=common_info_path, + model_name="dummy_perf_super", + scripts_base_dir=fixtures_dir + ) + + # Load and validate complete format + with open(perf_super_path, 'r') as f: + data = json.load(f) + + record = data[0] + + # Expected fields from RunDetails + expected_fields = [ + 'model', 'pipeline', 'n_gpus', 'training_precision', 'args', + 'tags', 'docker_file', 'base_docker', 'docker_sha', 'docker_image', + 'git_commit', 'machine_name', 'gpu_architecture', 'performance', + 'metric', 'status', 'build_duration', 
'test_duration', 'dataname', + 'data_provider_type', 'data_size', 'data_download_duration', + 'build_number', 'additional_docker_run_options', 'configs' + ] + + for field in expected_fields: + assert field in record, \ + f"Field '{field}' should be present in perf_entry_super.json" + + # Verify configs structure + configs = record['configs'] + assert isinstance(configs, dict) + + expected_config_fields = [ + 'model', 'benchmark', 'config_value', 'batch_size', + 'datatype', 'max_tokens' + ] + + for field in expected_config_fields: + assert field in configs, \ + f"Config field '{field}' should be present" + From 7ff689ea4a1d49d748462fd6d89519a08325094f Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Fri, 16 Jan 2026 19:19:01 -0500 Subject: [PATCH 04/11] Revert "Perf entry superset (#58)" (#66) This reverts commit 6d7a660fbb678fd22e67966cc43aad216413a682. --- src/madengine/mad.py | 3 +- src/madengine/tools/run_models.py | 83 +---- src/madengine/tools/update_perf_csv.py | 31 +- src/madengine/tools/update_perf_super.py | 243 ------------- src/madengine/tools/upload_mongodb.py | 95 ++--- src/madengine/utils/config_parser.py | 237 ------------ tests/fixtures/dummy/models.json | 14 - .../dummy/scripts/dummy/configs/default.csv | 5 - .../dummy/scripts/dummy/run_perf_super.sh | 28 -- tests/test_misc.py | 341 +----------------- 10 files changed, 33 insertions(+), 1047 deletions(-) delete mode 100644 src/madengine/tools/update_perf_super.py delete mode 100644 src/madengine/utils/config_parser.py delete mode 100644 tests/fixtures/dummy/scripts/dummy/configs/default.csv delete mode 100755 tests/fixtures/dummy/scripts/dummy/run_perf_super.sh diff --git a/src/madengine/mad.py b/src/madengine/mad.py index f9970847..e4df7143 100644 --- a/src/madengine/mad.py +++ b/src/madengine/mad.py @@ -267,8 +267,7 @@ def main(): parser_database_update_table.set_defaults(func=update_table) # Database subcommand uploading to MongoDB parser_database_upload_mongodb = subparsers_database.add_parser('upload-mongodb', description="Update table in DB.", help='Update table in DB') - parser_database_upload_mongodb.add_argument('--csv-file-path', type=str, default='perf_entry.csv', help='Path to the csv file (for legacy perf.csv)') - parser_database_upload_mongodb.add_argument('--json-file-path', type=str, default=None, help='Path to the json file (for perf_entry_super.json)') + parser_database_upload_mongodb.add_argument('--csv-file-path', type=str, default='perf_entry.csv', help='Path to the csv file') parser_database_upload_mongodb.add_argument("--database-name", type=str, required=True, help="Name of the MongoDB database") parser_database_upload_mongodb.add_argument("--collection-name", type=str, required=True, help="Name of the MongoDB collection") parser_database_upload_mongodb.set_defaults(func=upload_mongodb) diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py index ec5a3c8d..1eb44c33 100644 --- a/src/madengine/tools/run_models.py +++ b/src/madengine/tools/run_models.py @@ -50,10 +50,8 @@ from madengine.core.constants import MODEL_DIR, PUBLIC_GITHUB_ROCM_KEY from madengine.core.timeout import Timeout from madengine.tools.update_perf_csv import update_perf_csv -from madengine.tools.update_perf_super import update_perf_super_json from madengine.tools.csv_to_html import convert_csv_to_html from madengine.tools.discover_models import DiscoverModels -from madengine.utils.config_parser import ConfigParser class RunDetails: @@ -85,7 +83,6 @@ class RunDetails: data_download_duration (str): The 
duration of data download. build_number (str): The CI build number. additional_docker_run_options (str): The additional options used for docker run. - configs (dict or list or None): The configuration data from config files. """ # Avoiding @property for ease of code, add if needed. @@ -115,7 +112,6 @@ def __init__(self): self.data_download_duration = "" self.build_number = "" self.additional_docker_run_options = "" - self.configs = None def print_perf(self): """Print the performance results of a model. @@ -130,30 +126,6 @@ def print_perf(self): def generate_json(self, json_name: str, multiple_results: bool = False) -> None: """Generate JSON file for performance results of a model. - Args: - json_name (str): The name of the JSON file. - multiple_results (bool): The status of multiple results. Default is False. - - Raises: - Exception: An error occurred while generating JSON file for performance results of a model. - """ - # Exclude configs from CSV workflow as it can contain list/dict values - # that cause issues with pandas DataFrame creation - keys_to_exclude = ( - {"model", "performance", "metric", "status", "configs"} if multiple_results - else {"configs"} - ) - attributes = vars(self) - output_dict = {x: attributes[x] for x in attributes if x not in keys_to_exclude} - with open(json_name, "w") as outfile: - json.dump(output_dict, outfile) - - def generate_super_json(self, json_name: str, multiple_results: bool = False) -> None: - """Generate enhanced JSON file with config data for performance results. - - This method is similar to generate_json but includes the configs field - for perf_entry_super.json generation. - Args: json_name (str): The name of the JSON file. multiple_results (bool): The status of multiple results. Default is False. @@ -167,7 +139,7 @@ def generate_super_json(self, json_name: str, multiple_results: bool = False) -> attributes = vars(self) output_dict = {x: attributes[x] for x in attributes if x not in keys_to_exclude} with open(json_name, "w") as outfile: - json.dump(output_dict, outfile, indent=2) + json.dump(output_dict, outfile) class RunModels: @@ -1006,17 +978,6 @@ def run_model(self, model_info: typing.Dict) -> bool: # Taking gpu arch from context assumes the host image and container have the same gpu arch. # Environment variable updates for MAD Public CI run_details.gpu_architecture = self.context.ctx["docker_env_vars"]["MAD_SYSTEM_GPU_ARCHITECTURE"] - - # Parse and load config file if present in args for perf_entry_super.json - try: - config_parser = ConfigParser(scripts_base_dir=os.path.dirname(model_info.get("scripts", ""))) - run_details.configs = config_parser.parse_and_load( - model_info["args"], - model_info.get("scripts", "") - ) - except Exception as e: - print(f"Warning: Could not parse config file: {e}") - run_details.configs = None # Check the setting of shared memory size if "SHM_SIZE" in self.context.ctx: @@ -1057,14 +1018,6 @@ def run_model(self, model_info: typing.Dict) -> bool: # generate exception for testing run_details.generate_json("perf_entry.json") update_perf_csv(exception_result="perf_entry.json", perf_csv=self.args.output) - - # Generate perf_entry_super.json - run_details.generate_super_json("perf_entry_super.json") - update_perf_super_json( - exception_result="perf_entry_super.json", - perf_super_json="perf_entry_super.json", - scripts_base_dir=os.path.dirname(model_info.get("scripts", "")), - ) else: print( f"Running model {run_details.model} on {run_details.gpu_architecture} architecture." 
@@ -1166,30 +1119,12 @@ def run_model(self, model_info: typing.Dict) -> bool: model_name=run_details.model, common_info="common_info.json", ) - - # Generate perf_entry_super.json - run_details.generate_super_json("common_info_super.json", multiple_results=True) - update_perf_super_json( - multiple_results=model_info['multiple_results'], - perf_super_json="perf_entry_super.json", - model_name=run_details.model, - common_info="common_info_super.json", - scripts_base_dir=os.path.dirname(model_info.get("scripts", "")), - ) else: run_details.generate_json("perf_entry.json") update_perf_csv( single_result="perf_entry.json", perf_csv=self.args.output, ) - - # Generate perf_entry_super.json - run_details.generate_super_json("perf_entry_super.json") - update_perf_super_json( - single_result="perf_entry_super.json", - perf_super_json="perf_entry_super.json", - scripts_base_dir=os.path.dirname(model_info.get("scripts", "")), - ) self.return_status &= (run_details.status == 'SUCCESS') @@ -1206,14 +1141,6 @@ def run_model(self, model_info: typing.Dict) -> bool: exception_result="perf_entry.json", perf_csv=self.args.output, ) - - # Generate perf_entry_super.json - run_details.generate_super_json("perf_entry_super.json") - update_perf_super_json( - exception_result="perf_entry_super.json", - perf_super_json="perf_entry_super.json", - scripts_base_dir=os.path.dirname(model_info.get("scripts", "")), - ) except Exception as e: self.return_status = False @@ -1228,14 +1155,6 @@ def run_model(self, model_info: typing.Dict) -> bool: exception_result="perf_entry.json", perf_csv=self.args.output, ) - - # Generate perf_entry_super.json - run_details.generate_super_json("perf_entry_super.json") - update_perf_super_json( - exception_result="perf_entry_super.json", - perf_super_json="perf_entry_super.json", - scripts_base_dir=os.path.dirname(model_info.get("scripts", "")), - ) return self.return_status diff --git a/src/madengine/tools/update_perf_csv.py b/src/madengine/tools/update_perf_csv.py index 6f8b84ee..5e32e3e2 100644 --- a/src/madengine/tools/update_perf_csv.py +++ b/src/madengine/tools/update_perf_csv.py @@ -73,18 +73,7 @@ def perf_entry_dict_to_csv(perf_entry: typing.Dict) -> None: perf_entry: The performance entry dictionary. 
""" flatten_tags(perf_entry) - - # Convert any non-scalar values (list/dict) to JSON strings - # to avoid DataFrame creation errors when values don't match index length - perf_entry_safe = {} - for key, value in perf_entry.items(): - if isinstance(value, (list, dict)): - # Convert lists and dicts to JSON strings - perf_entry_safe[key] = json.dumps(value) if value is not None else None - else: - perf_entry_safe[key] = value - - js_df = pd.DataFrame(perf_entry_safe, index=[0]) + js_df = pd.DataFrame(perf_entry, index=[0]) perf_entry_df_to_csv(js_df) @@ -127,29 +116,15 @@ def handle_multiple_results( row = common_info_json.copy() model = r.pop("model") row["model"] = model_name + "_" + str(model) - - # Only extract essential result columns for perf.csv - # The full details with all metrics are preserved in perf_entry_super.json - row["performance"] = r.get("performance") - row["metric"] = r.get("metric") + row.update(r) if row["performance"] is not None and pd.notna(row["performance"]): row["status"] = "SUCCESS" else: row["status"] = "FAILURE" - # Convert any non-scalar values (list/dict) to JSON strings - # to avoid DataFrame creation errors when values don't match index length - row_safe = {} - for key, value in row.items(): - if isinstance(value, (list, dict)): - # Convert lists and dicts to JSON strings - row_safe[key] = json.dumps(value) if value is not None else None - else: - row_safe[key] = value - final_multiple_results_df = pd.concat( - [final_multiple_results_df, pd.DataFrame(row_safe, index=[0])], ignore_index=True + [final_multiple_results_df, pd.DataFrame(row, index=[0])], ignore_index=True ) # Reorder columns according to existing perf csv columns = perf_csv_df.columns.tolist() diff --git a/src/madengine/tools/update_perf_super.py b/src/madengine/tools/update_perf_super.py deleted file mode 100644 index 23bb1a15..00000000 --- a/src/madengine/tools/update_perf_super.py +++ /dev/null @@ -1,243 +0,0 @@ -"""Module to update the perf_entry_super.json file with enhanced performance data. - -This module is used to update the perf_entry_super.json file with performance data -that includes configuration information from config files. - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" - -# built-in imports -import json -import os -import typing -# third-party imports -import pandas as pd -# MAD Engine imports -from madengine.utils.config_parser import ConfigParser - - -def read_json(js: str) -> dict: - """Read a JSON file. - - Args: - js: The path to the JSON file. - - Returns: - The JSON dictionary. - """ - with open(js, 'r') as f: - return json.load(f) - - -def write_json(data: typing.Union[dict, list], output_path: str) -> None: - """Write data to a JSON file. - - Args: - data: The data to write (dict or list). - output_path: The path to the output JSON file. - """ - with open(output_path, 'w') as f: - json.dump(data, f, indent=2) - - -def load_perf_super_json(perf_super_json: str) -> list: - """Load existing perf_entry_super.json file. - - Args: - perf_super_json: Path to perf_entry_super.json file. - - Returns: - List of performance records, or empty list if file doesn't exist. 
- """ - if not os.path.exists(perf_super_json): - return [] - - try: - data = read_json(perf_super_json) - # Ensure it's a list - if isinstance(data, list): - return data - else: - return [data] - except Exception as e: - print(f"Warning: Could not load existing perf_entry_super.json: {e}") - return [] - - -def handle_multiple_results_super( - perf_super_list: list, - multiple_results: str, - common_info: str, - model_name: str, - config_parser: ConfigParser - ) -> list: - """Handle multiple results with config matching. - - Args: - perf_super_list: List of existing performance records. - multiple_results: The path to the multiple results CSV file. - common_info: The path to the common info JSON file. - model_name: The model name. - config_parser: ConfigParser instance for loading configs. - - Returns: - Updated list of performance records with configs. - """ - # Load multiple results CSV - multiple_results_df = pd.read_csv(multiple_results) - multiple_results_df.columns = multiple_results_df.columns.str.strip() - - # Check required columns - required_cols = ['model', 'performance', 'metric'] - for col in required_cols: - if col not in multiple_results_df.columns: - raise RuntimeError(f"{multiple_results} file is missing the {col} column") - - # Load common info - common_info_json = read_json(common_info) - - # Parse config file from args if present - configs_data = None - if 'args' in common_info_json and common_info_json['args']: - # Try to extract config path from args - scripts_path = common_info_json.get('pipeline', '') - configs_data = config_parser.parse_and_load( - common_info_json['args'], - scripts_path - ) - - # Process each result row - for result_row in multiple_results_df.to_dict(orient="records"): - record = common_info_json.copy() - - # Update model name - result_model = result_row.pop("model") - record["model"] = f"{model_name}_{result_model}" - - # Update with result data - record.update(result_row) - - # Set status based on performance - if record.get("performance") is not None and pd.notna(record.get("performance")): - record["status"] = "SUCCESS" - else: - record["status"] = "FAILURE" - - # Match config to this specific result - if configs_data: - if isinstance(configs_data, list): - # For CSV configs with multiple rows, try to match - matched_config = config_parser.match_config_to_result( - configs_data, - result_row, - result_model - ) - record["configs"] = matched_config - else: - # For JSON/YAML configs, use as-is - record["configs"] = configs_data - else: - record["configs"] = None - - perf_super_list.append(record) - - return perf_super_list - - -def handle_single_result_super( - perf_super_list: list, - single_result: str - ) -> list: - """Handle a single result. - - Args: - perf_super_list: List of existing performance records. - single_result: The path to the single result JSON file. - - Returns: - Updated list of performance records. - """ - single_result_json = read_json(single_result) - - # Ensure configs field exists (may be None) - if "configs" not in single_result_json: - single_result_json["configs"] = None - - perf_super_list.append(single_result_json) - return perf_super_list - - -def handle_exception_result_super( - perf_super_list: list, - exception_result: str - ) -> list: - """Handle an exception result. - - Args: - perf_super_list: List of existing performance records. - exception_result: The path to the exception result JSON file. - - Returns: - Updated list of performance records. 
- """ - exception_result_json = read_json(exception_result) - - # Ensure configs field exists (may be None) - if "configs" not in exception_result_json: - exception_result_json["configs"] = None - - perf_super_list.append(exception_result_json) - return perf_super_list - - -def update_perf_super_json( - perf_super_json: str, - multiple_results: typing.Optional[str] = None, - single_result: typing.Optional[str] = None, - exception_result: typing.Optional[str] = None, - common_info: typing.Optional[str] = None, - model_name: typing.Optional[str] = None, - scripts_base_dir: typing.Optional[str] = None, - ) -> None: - """Update the perf_entry_super.json file with the latest performance data. - - Args: - perf_super_json: Path to perf_entry_super.json file. - multiple_results: Path to multiple results CSV file. - single_result: Path to single result JSON file. - exception_result: Path to exception result JSON file. - common_info: Path to common info JSON file. - model_name: The model name. - scripts_base_dir: Base directory for scripts (for config file resolution). - """ - print(f"Updating perf_entry_super.json with enhanced performance data") - - # Load existing perf_entry_super.json - perf_super_list = load_perf_super_json(perf_super_json) - - # Create config parser - config_parser = ConfigParser(scripts_base_dir=scripts_base_dir) - - # Handle different result types - if multiple_results: - perf_super_list = handle_multiple_results_super( - perf_super_list, - multiple_results, - common_info, - model_name, - config_parser, - ) - elif single_result: - perf_super_list = handle_single_result_super(perf_super_list, single_result) - elif exception_result: - perf_super_list = handle_exception_result_super( - perf_super_list, exception_result - ) - else: - print("No results to update in perf_entry_super.json") - return - - # Write updated perf_entry_super.json - write_json(perf_super_list, perf_super_json) - print(f"Successfully updated {perf_super_json}") - diff --git a/src/madengine/tools/upload_mongodb.py b/src/madengine/tools/upload_mongodb.py index 701174cd..6766e3e2 100644 --- a/src/madengine/tools/upload_mongodb.py +++ b/src/madengine/tools/upload_mongodb.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -"""Module to update MongoDB collections with data from a CSV or JSON file. +"""Module to update MongoDB collections with data from a CSV file. This module provides functions to handle MongoDB operations, including checking for collection existence, creating collections, and updating datasets. @@ -8,14 +8,13 @@ """ # built-in modules import os -import json import argparse # third-party modules import pandas as pd import pymongo from pymongo.errors import ConnectionFailure -from typing import Optional, Union, List, Dict +from typing import Optional # MAD Engine modules from madengine.db.logger import setup_logger @@ -41,8 +40,7 @@ def __init__(self, args: argparse.Namespace) -> None: self.uri = mongo_uri self.database_name = args.database_name self.collection_name = args.collection_name - self.csv_file_path = getattr(args, 'csv_file_path', None) - self.json_file_path = getattr(args, 'json_file_path', None) + self.csv_file_path = args.csv_file_path self.client = None self.db = None @@ -64,66 +62,43 @@ def collection_exists(self) -> bool: """ return self.collection_name in self.db.list_collection_names() - def update_collection(self, data: Union[pd.DataFrame, List[Dict]]) -> None: - """Update a MongoDB collection with data from a DataFrame or list of dicts. 
+ def update_collection(self, data: pd.DataFrame) -> None: + """Update a MongoDB collection with data from a DataFrame. Args: - data: DataFrame or list of dicts containing the data to update. + data (pd.DataFrame): DataFrame containing the data to update. """ if not self.collection_exists(): LOGGER.info(f"Collection '{self.collection_name}' does not exist. Creating it.") self.db.create_collection(self.collection_name) collection = self.db[self.collection_name] - - # Convert to list of records if DataFrame - if isinstance(data, pd.DataFrame): - records = data.to_dict(orient="records") - else: - records = data - + records = data.to_dict(orient="records") for record in records: # Use an appropriate unique identifier for upsert (e.g., "_id" or another field) collection.update_one(record, {"$set": record}, upsert=True) LOGGER.info(f"Updated collection '{self.collection_name}' with {len(records)} records.") def run(self) -> None: - """Run the process of updating a MongoDB collection with data from a CSV or JSON file. + """Run the process of updating a MongoDB collection with data from a CSV file. """ self.connect() - - if self.json_file_path: - # Load JSON file (perf_entry_super.json) - data = load_json_to_list(self.json_file_path) - - # Add created_date to each record - from datetime import datetime - created_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - for record in data: - if "created_date" not in record: - record["created_date"] = created_date - - self.update_collection(data) - elif self.csv_file_path: - # Load CSV file (legacy perf.csv) - data = load_csv_to_dataframe(self.csv_file_path) - - # if the value is NaN, replace it with empty string - data = data.where(pd.notnull(data), "") - # Convert all columns to string type except boolean columns - for col in data.columns: - if data[col].dtype != "bool": - data[col] = data[col].astype(str) - - # Added created_date column and set it to now - data["created_date"] = pd.to_datetime("now").strftime("%Y-%m-%d %H:%M:%S") - - # Remove any leading or trailing whitespace from column names - data.columns = data.columns.str.strip() - - self.update_collection(data) - else: - raise ValueError("Either csv_file_path or json_file_path must be provided") + data = load_csv_to_dataframe(self.csv_file_path) + + # if the value is NaN, replace it with empty string + data = data.where(pd.notnull(data), "") + # Convert all columns to string type except boolean columns + for col in data.columns: + if data[col].dtype != "bool": + data[col] = data[col].astype(str) + + # Added created_date column and set it to now + data["created_date"] = pd.to_datetime("now").strftime("%Y-%m-%d %H:%M:%S") + + # Remove any leading or trailing whitespace from column names + data.columns = data.columns.str.strip() + + self.update_collection(data) def load_csv_to_dataframe(csv_path: str) -> pd.DataFrame: @@ -138,25 +113,3 @@ def load_csv_to_dataframe(csv_path: str) -> pd.DataFrame: if not os.path.exists(csv_path): raise FileNotFoundError(f"CSV file '{csv_path}' not found.") return pd.read_csv(csv_path) - - -def load_json_to_list(json_path: str) -> List[Dict]: - """Load a JSON file into a list of dictionaries. - - Args: - json_path (str): Path to the JSON file. - - Returns: - List[Dict]: List of dictionaries containing the JSON data. 
- """ - if not os.path.exists(json_path): - raise FileNotFoundError(f"JSON file '{json_path}' not found.") - - with open(json_path, 'r') as f: - data = json.load(f) - - # Ensure it's a list - if isinstance(data, list): - return data - else: - return [data] diff --git a/src/madengine/utils/config_parser.py b/src/madengine/utils/config_parser.py deleted file mode 100644 index 7d3e31e7..00000000 --- a/src/madengine/utils/config_parser.py +++ /dev/null @@ -1,237 +0,0 @@ -"""Config Parser Module for MAD Engine. - -This module provides utilities to parse configuration files from model arguments -and load them in various formats (CSV, JSON, YAML). - -Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -""" - -import os -import re -import json -import logging -import typing -from pathlib import Path - -import pandas as pd - -try: - import yaml - YAML_AVAILABLE = True -except ImportError: - YAML_AVAILABLE = False - -LOGGER = logging.getLogger(__name__) - - -class ConfigParser: - """Parser for model configuration files. - - This class handles parsing configuration files in various formats - (CSV, JSON, YAML) that are referenced in model arguments. - """ - - def __init__(self, scripts_base_dir: typing.Optional[str] = None): - """Initialize ConfigParser. - - Args: - scripts_base_dir: Base directory for scripts (e.g., ~/amd/MAD-private/scripts) - """ - self.scripts_base_dir = scripts_base_dir - - def parse_config_from_args(self, args_string: str, model_scripts_path: str = None) -> typing.Optional[str]: - """Extract config file path from model arguments. - - Args: - args_string: The args field from models.json - model_scripts_path: Path to the model's script directory - - Returns: - Full path to config file, or None if no config found - """ - if not args_string: - return None - - # Look for --config argument - config_match = re.search(r'--config\s+([^\s]+)', args_string) - if not config_match: - return None - - config_path = config_match.group(1) - - # If it's already an absolute path, return it - if os.path.isabs(config_path): - return config_path if os.path.exists(config_path) else None - - # Try to resolve relative path - # First, try relative to model scripts directory - if model_scripts_path: - scripts_dir = os.path.dirname(model_scripts_path) - full_path = os.path.join(scripts_dir, config_path) - if os.path.exists(full_path): - return full_path - - # Try relative to scripts_base_dir - if self.scripts_base_dir: - full_path = os.path.join(self.scripts_base_dir, config_path) - if os.path.exists(full_path): - return full_path - - LOGGER.warning(f"Config file not found: {config_path}") - return None - - def load_config_file(self, config_path: str) -> typing.Optional[typing.Union[typing.List[dict], dict]]: - """Load and parse a configuration file. 
- - Args: - config_path: Full path to the config file - - Returns: - For CSV: List of dicts (one per row) - For JSON/YAML: Dict or list as-is from file - None if file cannot be loaded - """ - if not config_path or not os.path.exists(config_path): - return None - - file_ext = Path(config_path).suffix.lower() - - try: - if file_ext == '.csv': - return self._load_csv(config_path) - elif file_ext == '.json': - return self._load_json(config_path) - elif file_ext in ['.yaml', '.yml']: - return self._load_yaml(config_path) - else: - LOGGER.warning(f"Unsupported config file format: {file_ext}") - return None - except Exception as e: - LOGGER.error(f"Error loading config file {config_path}: {e}") - return None - - def _load_csv(self, config_path: str) -> typing.List[dict]: - """Load CSV config file. - - Args: - config_path: Path to CSV file - - Returns: - List of dicts, one per row - """ - df = pd.read_csv(config_path) - # Convert NaN to None for JSON serialization - df = df.where(pd.notnull(df), None) - # Convert to list of dicts - return df.to_dict(orient='records') - - def _load_json(self, config_path: str) -> typing.Union[dict, list]: - """Load JSON config file. - - Args: - config_path: Path to JSON file - - Returns: - Dict or list from JSON file - """ - with open(config_path, 'r') as f: - return json.load(f) - - def _load_yaml(self, config_path: str) -> typing.Union[dict, list]: - """Load YAML config file. - - Args: - config_path: Path to YAML file - - Returns: - Dict or list from YAML file - """ - if not YAML_AVAILABLE: - raise ImportError("PyYAML is not installed. Cannot load YAML config files.") - - with open(config_path, 'r') as f: - return yaml.safe_load(f) - - def match_config_to_result( - self, - configs_list: typing.List[dict], - result_data: dict, - model_name: str - ) -> typing.Optional[dict]: - """Match a specific result to its corresponding config. - - For CSV configs with multiple rows (like vllm), match based on - model name and other identifiable fields. - - Args: - configs_list: List of config dicts (from CSV rows) - result_data: Single result row data - model_name: The model name from result - - Returns: - Matching config dict, or None if no match found - """ - if not configs_list: - return None - - # For single config, return it - if len(configs_list) == 1: - return configs_list[0] - - # For multiple configs, try to match based on common fields - # Extract model identifier from result model name - # e.g., "pyt_vllm_llama-3.1-8b_perf_meta-llama_Llama-3.1-8B-Instruct" - # should match config with model="meta-llama/Llama-3.1-8B-Instruct" - - for config in configs_list: - # Try to match on 'model' field if it exists in both - if 'model' in config and 'model' in result_data: - # Compare normalized versions - config_model = str(config['model']).replace('/', '_').replace('-', '_').lower() - result_model = str(result_data['model']).replace('/', '_').replace('-', '_').lower() - if config_model in result_model or result_model in config_model: - # Additional checks for benchmark type if available - if 'benchmark' in config and 'benchmark' in result_data: - if config['benchmark'] == result_data['benchmark']: - return config - else: - return config - - # If no match found, return first config as fallback - LOGGER.warning(f"Could not match config for result: {model_name}. 
Using first config.") - return configs_list[0] - - def parse_and_load( - self, - args_string: str, - model_scripts_path: str = None - ) -> typing.Optional[typing.Union[typing.List[dict], dict]]: - """Parse config path from args and load the config file. - - Convenience method that combines parse_config_from_args and load_config_file. - - Args: - args_string: The args field from models.json - model_scripts_path: Path to the model's script directory - - Returns: - Config data (list of dicts for CSV, dict for JSON/YAML), or None - """ - config_path = self.parse_config_from_args(args_string, model_scripts_path) - if not config_path: - return None - - return self.load_config_file(config_path) - - -def get_config_parser(scripts_base_dir: typing.Optional[str] = None) -> ConfigParser: - """Factory function to create a ConfigParser instance. - - Args: - scripts_base_dir: Base directory for scripts - - Returns: - ConfigParser instance - """ - return ConfigParser(scripts_base_dir=scripts_base_dir) - diff --git a/tests/fixtures/dummy/models.json b/tests/fixtures/dummy/models.json index 9d5762bf..3dfd5de1 100644 --- a/tests/fixtures/dummy/models.json +++ b/tests/fixtures/dummy/models.json @@ -195,20 +195,6 @@ "args": "", "multiple_results": "perf_dummy.csv" }, - { - "name": "dummy_perf_super", - "dockerfile": "docker/dummy", - "scripts": "scripts/dummy/run_perf_super.sh", - "n_gpus": "1", - "owner": "mad.support@amd.com", - "training_precision": "", - "tags": [ - "dummies", - "perf_super_test" - ], - "args": "--config configs/default.csv", - "multiple_results": "perf_dummy_super.csv" - }, { "name": "therock", "dockerfile": "docker/therock", diff --git a/tests/fixtures/dummy/scripts/dummy/configs/default.csv b/tests/fixtures/dummy/scripts/dummy/configs/default.csv deleted file mode 100644 index ee04bbc5..00000000 --- a/tests/fixtures/dummy/scripts/dummy/configs/default.csv +++ /dev/null @@ -1,5 +0,0 @@ -model,benchmark,config_value,batch_size,datatype,max_tokens -dummy/model-1,throughput,128,8,float16,1024 -dummy/model-2,serving,256,16,float32,2048 -dummy/model-3,latency,512,4,bfloat16,4096 - diff --git a/tests/fixtures/dummy/scripts/dummy/run_perf_super.sh b/tests/fixtures/dummy/scripts/dummy/run_perf_super.sh deleted file mode 100755 index f9be6228..00000000 --- a/tests/fixtures/dummy/scripts/dummy/run_perf_super.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash -# -# Copyright (c) Advanced Micro Devices, Inc. -# All rights reserved. 
-# -# Script to generate dummy results for perf_entry_super testing - -# Parse config argument -CONFIG_FILE="" -while [[ "$#" -gt 0 ]]; do - case $1 in - --config) CONFIG_FILE="$2"; shift ;; - *) echo "Unknown parameter: $1" ;; - esac - shift -done - -# Generate comprehensive results with best-practice performance metrics -# Includes: latency percentiles, resource utilization, reliability metrics, and throughput -cat > perf_dummy_super.csv << 'EOF' -model,performance,metric,status,throughput,latency_mean_ms,latency_p50_ms,latency_p90_ms,latency_p95_ms,latency_p99_ms,gpu_memory_used_mb,gpu_memory_total_mb,gpu_utilization_percent,cpu_utilization_percent,total_time_seconds,warmup_iterations,measured_iterations,error_count,success_rate_percent,samples_processed -dummy/model-1,1234.56,tokens/s,SUCCESS,1234.56,8.1,7.9,12.3,15.2,22.8,12288,32768,85.3,42.1,120.5,10,100,0,100.0,123456 -dummy/model-2,2345.67,requests/s,SUCCESS,2345.67,4.3,4.1,6.8,8.5,12.3,16384,32768,78.2,38.5,180.3,10,150,2,99.87,352350 -dummy/model-3,345.78,ms,SUCCESS,28.92,345.78,340.5,425.3,512.7,678.9,8192,32768,92.1,55.3,240.8,5,50,0,100.0,1447 -EOF - -cp perf_dummy_super.csv ../ - diff --git a/tests/test_misc.py b/tests/test_misc.py index a1aba9b1..11a6fa81 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -6,9 +6,6 @@ import os import sys import csv -import json -import tempfile -import shutil import pandas as pd # 3rd party modules import pytest @@ -17,11 +14,6 @@ from .fixtures.utils import global_data from .fixtures.utils import clean_test_temp_files -# Add src to path for module imports -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'src')) -from madengine.utils.config_parser import ConfigParser -from madengine.tools.update_perf_super import update_perf_super_json - class TestMiscFunctionality: @@ -30,7 +22,7 @@ def test_output_commandline_argument_writes_csv_correctly(self, global_data, cle """ output command-line argument writes csv file to specified output path """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "PYTHONPATH=src MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy -o perf_test.csv") + output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy -o perf_test.csv") success = False with open(os.path.join(BASE_DIR, 'perf_test.csv'), 'r') as csv_file: csv_reader = csv.DictReader(csv_file) @@ -49,7 +41,7 @@ def test_commandline_argument_skip_gpu_arch(self, global_data, clean_test_temp_f """ skip_gpu_arch command-line argument skips GPU architecture check """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "PYTHONPATH=src MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_skip_gpu_arch") + output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_skip_gpu_arch") if 'Skipping model' not in output: pytest.fail("Enable skipping gpu arch for running model is failed.") @@ -58,7 +50,7 @@ def test_commandline_argument_disable_skip_gpu_arch_fail(self, global_data, clea """ skip_gpu_arch command-line argument fails GPU architecture check """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "PYTHONPATH=src MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_skip_gpu_arch --disable-skip-gpu-arch") + output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 
src/madengine/mad.py run --tags dummy_skip_gpu_arch --disable-skip-gpu-arch") # Check if exception with message 'Skipping model' is thrown if 'Skipping model' in output: pytest.fail("Disable skipping gpu arch for running model is failed.") @@ -68,7 +60,7 @@ def test_output_multi_results(self, global_data, clean_test_temp_files): """ test output multiple results """ - output = global_data['console'].sh("cd " + BASE_DIR + "; " + "PYTHONPATH=src MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_multi") + output = global_data['console'].sh("cd " + BASE_DIR + "; " + "MODEL_DIR=" + MODEL_DIR + " " + "python3 src/madengine/mad.py run --tags dummy_multi") # Check if multiple results are written to perf_dummy.csv success = False # Read the csv file to a dataframe using pandas @@ -92,328 +84,3 @@ def test_output_multi_results(self, global_data, clean_test_temp_files): if not success: pytest.fail("The columns of the generated multi results do not match perf.csv.") - -class TestPerfEntrySuperGeneration: - """Test cases for perf_entry_super.json generation.""" - - @pytest.fixture - def test_dir(self): - """Create temporary directory for tests.""" - temp_dir = tempfile.mkdtemp() - yield temp_dir - if os.path.exists(temp_dir): - shutil.rmtree(temp_dir) - - @pytest.fixture - def fixtures_dir(self): - """Get path to dummy fixtures directory.""" - return os.path.join( - os.path.dirname(__file__), - 'fixtures', - 'dummy', - 'scripts', - 'dummy' - ) - - @pytest.fixture - def config_file(self, fixtures_dir): - """Get path to config file.""" - return os.path.join(fixtures_dir, 'configs', 'default.csv') - - def test_config_file_exists(self, config_file): - """Test that the dummy config file exists.""" - assert os.path.exists(config_file), \ - f"Config file should exist at {config_file}" - - def test_config_parser_loads_csv(self, config_file): - """Test that ConfigParser can load the dummy CSV config.""" - parser = ConfigParser() - configs = parser.load_config_file(config_file) - - assert configs is not None, "Configs should not be None" - assert isinstance(configs, list), "Configs should be a list" - assert len(configs) == 3, "Should have 3 config rows" - - # Check first config has expected fields - first_config = configs[0] - assert 'model' in first_config - assert 'benchmark' in first_config - assert 'config_value' in first_config - assert 'batch_size' in first_config - assert 'datatype' in first_config - assert 'max_tokens' in first_config - - # Verify values - assert first_config['model'] == 'dummy/model-1' - assert first_config['benchmark'] == 'throughput' - assert first_config['datatype'] == 'float16' - - def test_config_parser_from_args(self, fixtures_dir): - """Test parsing config path from args string.""" - parser = ConfigParser(scripts_base_dir=fixtures_dir) - args_string = "--config configs/default.csv" - - config_path = parser.parse_config_from_args( - args_string, - os.path.join(fixtures_dir, 'run_perf_super.sh') - ) - - assert config_path is not None, "Config path should be found" - assert os.path.exists(config_path), \ - f"Config file should exist at {config_path}" - - def test_perf_entry_super_json_structure(self, test_dir, fixtures_dir): - """Test that perf_entry_super.json has the correct structure.""" - # Create mock data - common_info = { - "pipeline": "dummy_test", - "n_gpus": "1", - "training_precision": "", - "args": "--config configs/default.csv", - "tags": "dummies,perf_super_test", - "docker_file": "docker/dummy.Dockerfile", - "git_commit": "test123", - 
"machine_name": "test_machine", - "gpu_architecture": "test_gpu", - "build_duration": "10", - "test_duration": "20" - } - - # Create common_info.json - common_info_path = os.path.join(test_dir, "common_info_super.json") - with open(common_info_path, 'w') as f: - json.dump(common_info, f) - - # Create results CSV - results_csv = os.path.join(test_dir, "perf_dummy_super.csv") - with open(results_csv, 'w') as f: - f.write("model,performance,metric,status\n") - f.write("dummy/model-1,1234.56,tokens/s,SUCCESS\n") - f.write("dummy/model-2,2345.67,requests/s,SUCCESS\n") - f.write("dummy/model-3,345.78,ms,SUCCESS\n") - - # Generate perf_entry_super.json - perf_super_path = os.path.join(test_dir, "perf_entry_super.json") - - update_perf_super_json( - perf_super_json=perf_super_path, - multiple_results=results_csv, - common_info=common_info_path, - model_name="dummy_perf_super", - scripts_base_dir=fixtures_dir - ) - - # Verify file was created - assert os.path.exists(perf_super_path), \ - "perf_entry_super.json should be created" - - # Load and verify structure - with open(perf_super_path, 'r') as f: - data = json.load(f) - - assert isinstance(data, list), "Data should be a list" - assert len(data) == 3, "Should have 3 result records" - - # Check first record structure - first_record = data[0] - - # Verify all common fields are present - required_fields = [ - 'model', 'performance', 'metric', 'status', 'pipeline', - 'n_gpus', 'args', 'tags', 'gpu_architecture' - ] - for field in required_fields: - assert field in first_record, f"Field '{field}' should be present" - - # Verify configs field is present - assert 'configs' in first_record, "configs field should be present" - - # Verify configs is not None (config file was found and loaded) - assert first_record['configs'] is not None, \ - "configs should not be None when config file exists" - - # Verify configs has expected structure - configs = first_record['configs'] - assert isinstance(configs, dict), "configs should be a dict" - assert 'model' in configs - assert 'benchmark' in configs - assert 'config_value' in configs - assert 'batch_size' in configs - assert 'datatype' in configs - assert 'max_tokens' in configs - - def test_perf_entry_super_config_matching(self, test_dir, fixtures_dir): - """Test that configs are present for all results.""" - # Create mock data - common_info = { - "pipeline": "dummy_test", - "n_gpus": "1", - "args": "--config configs/default.csv", - "tags": "dummies" - } - - common_info_path = os.path.join(test_dir, "common_info_super.json") - with open(common_info_path, 'w') as f: - json.dump(common_info, f) - - # Create results CSV - results_csv = os.path.join(test_dir, "perf_dummy_super.csv") - with open(results_csv, 'w') as f: - f.write("model,performance,metric,benchmark\n") - f.write("dummy/model-1,1234.56,tokens/s,throughput\n") - f.write("dummy/model-2,2345.67,requests/s,serving\n") - f.write("dummy/model-3,345.78,ms,latency\n") - - perf_super_path = os.path.join(test_dir, "perf_entry_super.json") - - update_perf_super_json( - perf_super_json=perf_super_path, - multiple_results=results_csv, - common_info=common_info_path, - model_name="dummy_perf_super", - scripts_base_dir=fixtures_dir - ) - - # Load and verify matching - with open(perf_super_path, 'r') as f: - data = json.load(f) - - # Verify each result has configs - assert len(data) == 3, "Should have 3 results" - - for record in data: - configs = record.get('configs') - assert configs is not None, "Each record should have configs" - assert isinstance(configs, dict), 
"Configs should be a dict" - - # Verify configs have expected structure (from default.csv) - assert 'model' in configs - assert 'benchmark' in configs - assert 'config_value' in configs - assert 'batch_size' in configs - assert 'datatype' in configs - assert 'max_tokens' in configs - - # Verify configs values are from our config file - assert configs['benchmark'] in ['throughput', 'serving', 'latency'] - assert configs['datatype'] in ['float16', 'float32', 'bfloat16'] - - def test_perf_entry_super_no_config(self, test_dir, fixtures_dir): - """Test handling when no config file is specified.""" - # Create mock data without config - common_info = { - "pipeline": "dummy_test", - "n_gpus": "1", - "args": "", # No --config argument - "tags": "dummies" - } - - common_info_path = os.path.join(test_dir, "common_info_super.json") - with open(common_info_path, 'w') as f: - json.dump(common_info, f) - - # Create results CSV - results_csv = os.path.join(test_dir, "perf_dummy_super.csv") - with open(results_csv, 'w') as f: - f.write("model,performance,metric\n") - f.write("dummy-no-config,1234.56,tokens/s\n") - - perf_super_path = os.path.join(test_dir, "perf_entry_super.json") - - update_perf_super_json( - perf_super_json=perf_super_path, - multiple_results=results_csv, - common_info=common_info_path, - model_name="dummy_no_config", - scripts_base_dir=fixtures_dir - ) - - # Load and verify - with open(perf_super_path, 'r') as f: - data = json.load(f) - - assert len(data) == 1 - - # Verify configs is None for models without config files - assert 'configs' in data[0] - assert data[0]['configs'] is None, \ - "configs should be None when no config file is specified" - - def test_perf_entry_super_json_format_validation(self, test_dir, fixtures_dir): - """Test that the JSON format matches expected schema.""" - # Create complete mock data - common_info = { - "pipeline": "dummy_test", - "n_gpus": "1", - "training_precision": "fp16", - "args": "--config configs/default.csv", - "tags": "dummies,perf_super_test", - "docker_file": "docker/dummy.Dockerfile", - "base_docker": "rocm/pytorch:latest", - "docker_sha": "sha256:abc123", - "docker_image": "test-image", - "git_commit": "commit123", - "machine_name": "test-machine", - "gpu_architecture": "gfx942", - "build_duration": "120", - "test_duration": "300", - "dataname": "test_data", - "data_provider_type": "local", - "data_size": "1GB", - "data_download_duration": "60", - "build_number": "12345", - "additional_docker_run_options": "--shm-size=16g" - } - - common_info_path = os.path.join(test_dir, "common_info_super.json") - with open(common_info_path, 'w') as f: - json.dump(common_info, f) - - results_csv = os.path.join(test_dir, "perf_dummy_super.csv") - with open(results_csv, 'w') as f: - f.write("model,performance,metric,status\n") - f.write("dummy/model-1,1234.56,tokens/s,SUCCESS\n") - - perf_super_path = os.path.join(test_dir, "perf_entry_super.json") - - update_perf_super_json( - perf_super_json=perf_super_path, - multiple_results=results_csv, - common_info=common_info_path, - model_name="dummy_perf_super", - scripts_base_dir=fixtures_dir - ) - - # Load and validate complete format - with open(perf_super_path, 'r') as f: - data = json.load(f) - - record = data[0] - - # Expected fields from RunDetails - expected_fields = [ - 'model', 'pipeline', 'n_gpus', 'training_precision', 'args', - 'tags', 'docker_file', 'base_docker', 'docker_sha', 'docker_image', - 'git_commit', 'machine_name', 'gpu_architecture', 'performance', - 'metric', 'status', 'build_duration', 
'test_duration', 'dataname',
-            'data_provider_type', 'data_size', 'data_download_duration',
-            'build_number', 'additional_docker_run_options', 'configs'
-        ]
-
-        for field in expected_fields:
-            assert field in record, \
-                f"Field '{field}' should be present in perf_entry_super.json"
-
-        # Verify configs structure
-        configs = record['configs']
-        assert isinstance(configs, dict)
-
-        expected_config_fields = [
-            'model', 'benchmark', 'config_value', 'batch_size',
-            'datatype', 'max_tokens'
-        ]
-
-        for field in expected_config_fields:
-            assert field in configs, \
-                f"Config field '{field}' should be present"
-

From 9bf6ae626e2b4ffa6214adc01ab49a0d07eeddc9 Mon Sep 17 00:00:00 2001
From: Shashank Parsi <156772515+shashank-parsi@users.noreply.github.com>
Date: Thu, 5 Feb 2026 13:33:08 +0530
Subject: [PATCH 05/11] Fail Check condition update for RPM distro (#64)

---
 src/madengine/tools/run_models.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/madengine/tools/run_models.py b/src/madengine/tools/run_models.py
index 1eb44c33..614eec1a 100644
--- a/src/madengine/tools/run_models.py
+++ b/src/madengine/tools/run_models.py
@@ -1176,11 +1176,11 @@ def run(self) -> bool:
             if host_os.find("HOST_UBUNTU") != -1:
                 print(self.console.sh("apt show rocm-libs -a", canFail=True))
             elif host_os.find("HOST_CENTOS") != -1:
-                print(self.console.sh("yum info rocm-libs"))
+                print(self.console.sh("yum info rocm-libs", canFail=True))
             elif host_os.find("HOST_SLES") != -1:
-                print(self.console.sh("zypper info rocm-libs"))
+                print(self.console.sh("zypper info rocm-libs", canFail=True))
             elif host_os.find("HOST_AZURE") != -1:
-                print(self.console.sh("tdnf info rocm-libs"))
+                print(self.console.sh("tdnf info rocm-libs", canFail=True))
             else:
                 print("ERROR: Unable to detect host OS.")
                 self.return_status = False

From a197d7cbea95c8fe4e9024e40de29fbfb091c101 Mon Sep 17 00:00:00 2001
From: Stephen Shao
Date: Thu, 5 Mar 2026 11:08:19 -0600
Subject: [PATCH 06/11] Fixed launcher type issue on k8s

---
 src/madengine/deployment/kubernetes.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/madengine/deployment/kubernetes.py b/src/madengine/deployment/kubernetes.py
index f3ed2223..29c9874e 100644
--- a/src/madengine/deployment/kubernetes.py
+++ b/src/madengine/deployment/kubernetes.py
@@ -3469,6 +3469,8 @@ def _create_failure_record(self, model_info: Dict, build_info: Dict, pod_name: s
         nproc_per_node = distributed_config.get("nproc_per_node")
         if nproc_per_node is None:
             nproc_per_node = int(model_info.get("n_gpus", 1))
+        # Launcher: use distributed.launcher when set, otherwise "native" for k8s
+        launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes")

         # Create a record with the same structure as successful runs
         # but with performance=0, metric="", and status="FAILED"
@@ -3495,6 +3497,7 @@ def _create_failure_record(self, model_info: Dict, build_info: Dict, pod_name: s
             "git_commit": "",
             "machine_name": pod_name,
             "deployment_type": "kubernetes",
+            "launcher": launcher,
             "gpu_architecture": "",

             # Performance metrics - FAILED
@@ -3561,6 +3564,8 @@ def _build_common_info_dict(
         total_gpus = nnodes * nproc_per_node
         gpus_per_node = str(nproc_per_node)
         nnodes_str = str(nnodes)
+        # Launcher: use distributed.launcher when set, otherwise "native" for k8s
+        launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes")
         result = {
             "n_gpus": str(total_gpus),
             "nnodes": nnodes_str,
@@ -3576,7 +3581,7 @@
             "git_commit": "",
"machine_name": deployment_id, "deployment_type": "kubernetes", - "launcher": "native", + "launcher": launcher, "gpu_architecture": gpu_architecture, "relative_change": "", "build_duration": build_info.get("build_duration", ""), @@ -3612,6 +3617,8 @@ def _create_multiple_result_row_record( if nproc_per_node is None: nproc_per_node = int(model_info.get("n_gpus", 1)) + # Launcher: use distributed.launcher when set, otherwise "native" for k8s + launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes") result = { "model": item.get("model", model_info.get("name", "")), "n_gpus": str(nnodes * nproc_per_node), @@ -3628,7 +3635,7 @@ def _create_multiple_result_row_record( "git_commit": "", "machine_name": deployment_id, "deployment_type": "kubernetes", - "launcher": "native", + "launcher": launcher, "gpu_architecture": item.get("gpu_architecture", ""), "performance": str(item.get("performance", "")), "metric": item.get("metric", ""), From 5d9ba6f0e8250d15649c7f20cfb78ada04872632 Mon Sep 17 00:00:00 2001 From: mkuznet1 Date: Mon, 16 Mar 2026 21:45:31 +0000 Subject: [PATCH 07/11] remove environment-specific manifests from public branch; .gitignore file has been restored --- .gitignore | 3 +- manifests/mad.env | 30 ---- ...manifest_primus_2node_qwen_localimage.json | 114 --------------- ...ag_llama-3.1-8b_3node_rdma_localimage.json | 132 ------------------ 4 files changed, 2 insertions(+), 277 deletions(-) delete mode 100644 manifests/mad.env delete mode 100644 manifests/run_manifest_primus_2node_qwen_localimage.json delete mode 100644 manifests/run_manifest_pyt_vllm_dissag_llama-3.1-8b_3node_rdma_localimage.json diff --git a/.gitignore b/.gitignore index 05c86b03..f3fd9c54 100644 --- a/.gitignore +++ b/.gitignore @@ -130,6 +130,7 @@ venv/ # model relatives docker/ scripts/ +*.json .*_env/ .vscode/ @@ -141,4 +142,4 @@ rocprof_output/ rpd_output/ slurm_output/ MagicMock/ -.madengine_session_start \ No newline at end of file +.madengine_session_start diff --git a/manifests/mad.env b/manifests/mad.env deleted file mode 100644 index f6318923..00000000 --- a/manifests/mad.env +++ /dev/null @@ -1,30 +0,0 @@ -# MAD/MadEngine runtime environment -export MAD_SECRETS_HFTOKEN=$(cat ~/.huggingface/token) -export MAD_SYSTEM_GPU_ARCHITECTURE=gfx942 -export MAD_VERBOSE_CONFIG=true - -# Keep model and package source in shared MAD repo -export MAD_SETUP_MODEL_DIR=false -export MODEL_DIR=/shared_inference/$USER/MAD-internal/ - -# Cache/data paths: keep large artifacts off /home -export MAD_DATAHOME=/mnt/m2m_nobackup/data/models -export HF_HOME=/mnt/m2m_nobackup/data/cache/huggingface -export TORCH_HOME=/mnt/m2m_nobackup/data/cache/torch -export XDG_CACHE_HOME=/mnt/m2m_nobackup/data/cache/xdg -export PIP_CACHE_DIR=/mnt/m2m_nobackup/data/cache/pip - -# Optional helper paths for common frameworks -export TRANSFORMERS_CACHE=$HF_HOME -export HUGGINGFACE_HUB_CACHE=$HF_HOME/hub -export TRITON_CACHE_DIR=/mnt/m2m_nobackup/data/cache/triton - -# MAD metadata -export MAD_DEPLOYMENT_TYPE=slurm -export BUILD_NUMBER=${BUILD_NUMBER:-0} - -# Default RDMA-friendly communication settings (can be overridden per run config) -export NCCL_IB_DISABLE=0 -export NCCL_SOCKET_IFNAME=ib0 -export GLOO_SOCKET_IFNAME=ib0 -export NCCL_IB_GID_INDEX=3 diff --git a/manifests/run_manifest_primus_2node_qwen_localimage.json b/manifests/run_manifest_primus_2node_qwen_localimage.json deleted file mode 100644 index 865d112f..00000000 --- a/manifests/run_manifest_primus_2node_qwen_localimage.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - 
"built_images": { - "rocm-primus-qwen25-7b": { - "model": "primus_pyt_megatron_lm_train_qwen2.5-7b", - "docker_image": "rocm/primus:v26.1", - "dockerfile": "docker/primus_megatron_train.ubuntu.amd.Dockerfile", - "base_docker": "rocm/primus:v26.1", - "build_duration": 0, - "local_image": true, - "registry_image": null, - "registry": null, - "gpu_vendor": "AMD" - } - }, - "built_models": { - "rocm-primus-qwen25-7b": { - "name": "primus_pyt_megatron_lm_train_qwen2.5-7b", - "url": "", - "dockerfile": "docker/primus_megatron_train", - "scripts": "scripts/primus/megatron-lm/run.sh", - "n_gpus": "-1", - "owner": "mad.support@amd.com", - "training_precision": "", - "multiple_results": "perf_primus-megatron-Qwen2.5-7B.csv", - "tags": [ - "pyt", - "pretrain", - "qwen2.5-7b", - "training" - ], - "timeout": -1, - "args": "--model_repo primus_pyt_megatron_lm_train_qwen2.5-7b", - "additional_docker_run_options": "--privileged --group-add render --shm-size 64G --device=/dev/infiniband --cap-add IPC_LOCK --ulimit memlock=-1 -v /sys:/sys:ro -v /run/udev:/run/udev:ro" - } - }, - "context": { - "docker_env_vars": { - "MAD_SECRETS_HFTOKEN": "${MAD_SECRETS_HFTOKEN}", - "NCCL_DEBUG": "INFO", - "NCCL_DEBUG_SUBSYS": "INIT,NET", - "NCCL_IB_DISABLE": "0", - "NCCL_NET": "IB", - "NCCL_IB_HCA": "mlx5_0:1,mlx5_1:1", - "NCCL_IB_GID_INDEX": "3", - "NCCL_SOCKET_IFNAME": "eth0", - "GLOO_SOCKET_IFNAME": "eth0", - "LIBIBVERBS_DRIVER_PATH": "/usr/lib/x86_64-linux-gnu/libibverbs", - "RDMAV_DRIVERS": "mlx5", - "IBV_DRIVERS": "mlx5", - "LD_LIBRARY_PATH": "/usr/lib/x86_64-linux-gnu:/usr/local/lib", - "IBV_SHOW_WARNINGS": "1" - }, - "docker_mounts": { - "/dev/infiniband": "/dev/infiniband" - }, - "docker_build_arg": {}, - "gpu_vendor": "AMD", - "guest_os": "UBUNTU", - "docker_gpus": "0,1,2,3,4,5,6,7" - }, - "credentials_required": [], - "summary": { - "successful_builds": [], - "failed_builds": [], - "total_build_time": 0, - "successful_pushes": [], - "failed_pushes": [] - }, - "deployment_config": { - "target": "slurm", - "slurm": { - "partition": "amd-rccl", - "account": "amd-rccl", - "qos": "normal", - "exclude": "useocpm2m-097-089,useocpm2m-097-094", - "nodes": 2, - "gpus_per_node": 8, - "time": "12:00:00", - "output_dir": "./slurm_output", - "exclusive": true, - "network_interface": "eth0" - }, - "distributed": { - "launcher": "torchrun", - "backend": "nccl", - "port": 29500, - "nnodes": 2, - "nproc_per_node": 8 - }, - "env_vars": { - "HF_HOME": "/mnt/m2m_nobackup/data/cache/huggingface", - "TORCH_HOME": "/mnt/m2m_nobackup/data/cache/torch", - "XDG_CACHE_HOME": "/mnt/m2m_nobackup/data/cache/xdg", - "PIP_CACHE_DIR": "/mnt/m2m_nobackup/data/cache/pip", - "MAD_DATAHOME": "/mnt/m2m_nobackup/data/models", - "NCCL_DEBUG": "INFO", - "NCCL_DEBUG_SUBSYS": "INIT,NET", - "NCCL_IB_DISABLE": "0", - "NCCL_NET": "IB", - "NCCL_SOCKET_IFNAME": "eth0", - "GLOO_SOCKET_IFNAME": "eth0", - "NCCL_IB_GID_INDEX": "3", - "NCCL_IB_HCA": "mlx5_0:1,mlx5_1:1", - "NCCL_TIMEOUT": "900", - "TORCH_NCCL_ASYNC_ERROR_HANDLING": "1", - "TORCH_NCCL_HIGH_PRIORITY": "1", - "OMP_NUM_THREADS": "8", - "MIOPEN_FIND_MODE": "1", - "MIOPEN_USER_DB_PATH": "/mnt/m2m_nobackup/data/cache/miopen" - }, - "debug": false, - "docker_gpus": "0,1,2,3,4,5,6,7" - } -} diff --git a/manifests/run_manifest_pyt_vllm_dissag_llama-3.1-8b_3node_rdma_localimage.json b/manifests/run_manifest_pyt_vllm_dissag_llama-3.1-8b_3node_rdma_localimage.json deleted file mode 100644 index d7680886..00000000 --- a/manifests/run_manifest_pyt_vllm_dissag_llama-3.1-8b_3node_rdma_localimage.json +++ /dev/null 
@@ -1,132 +0,0 @@ -{ - "built_images": { - "rocm-pyt-vllm-dissag-llama31-8b": { - "model": "pyt_vllm_dissag_llama-3.1-8b", - "docker_image": "rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210-disagg-rdmafix", - "dockerfile": "docker/vllm_disagg_inference.ubuntu.amd.Dockerfile", - "base_docker": "rocm/vllm:rocm7.0.0_vllm_0.11.2_20251210-disagg-rdmafix", - "build_duration": 0, - "local_image": true, - "registry_image": null, - "registry": null, - "gpu_vendor": "AMD" - } - }, - "built_models": { - "rocm-pyt-vllm-dissag-llama31-8b": { - "name": "pyt_vllm_dissag_llama-3.1-8b", - "url": "", - "dockerfile": "docker/vllm_disagg_inference", - "scripts": "scripts/vllm_dissag/run.sh", - "data": "huggingface", - "n_gpus": "-1", - "owner": "mad.support@amd.com", - "training_precision": "", - "multiple_results": "perf-vllm-disagg-Llama-3.1-8B-Instruct.csv", - "tags": [ - "pyt", - "vllm", - "disagg", - "inference" - ], - "timeout": -1, - "args": "--model_repo /shared_inference/models_blog/Llama-3.1-8B-Instruct", - "additional_docker_run_options": "--privileged --group-add render --shm-size 64G --device=/dev/infiniband --cap-add IPC_LOCK --ulimit memlock=-1 -v /sys:/sys:ro -v /sys/class/infiniband:/sys/class/infiniband:ro -v /run/udev:/run/udev:ro -v /etc/libibverbs.d:/etc/libibverbs.d:ro -v /usr/lib/x86_64-linux-gnu:/usr/lib/x86_64-linux-gnu:ro" - } - }, - "context": { - "docker_env_vars": { - "MAD_SECRETS_HFTOKEN": "${MAD_SECRETS_HFTOKEN}", - "NCCL_DEBUG": "INFO", - "NCCL_DEBUG_SUBSYS": "INIT,NET", - "NCCL_IB_DISABLE": "0", - "NCCL_NET": "IB", - "NCCL_IB_HCA": "mlx5_0:1,mlx5_1:1", - "NCCL_IB_GID_INDEX": "3", - "NCCL_SOCKET_IFNAME": "eth0", - "GLOO_SOCKET_IFNAME": "eth0", - "UCX_NET_DEVICES": "mlx5_0:1", - "UCX_TLS": "rc,sm,self,rocm_copy,rocm_ipc,tcp", - "UCX_SOCKADDR_TLS_PRIORITY": "rdmacm,tcp", - "UCX_SOCKADDR_CM_ENABLE": "y", - "UCX_RDMA_CM_ENABLED": "y", - "RDMAV_DRIVERS": "mlx5", - "IBV_DRIVERS": "mlx5", - "LIBIBVERBS_DRIVER_PATH": "/usr/lib/x86_64-linux-gnu/libibverbs", - "LD_LIBRARY_PATH": "/usr/lib/x86_64-linux-gnu:/usr/local/lib:/opt/rocm/lib", - "IBV_SHOW_WARNINGS": "1", - "MODEL_NAME": "Llama-3.1-8B-Instruct", - "xP": "1", - "yD": "1", - "PROXY_TYPE": "vllm_router", - "ROUTER_PORT": "2584", - "BENCHMARK_PORT": "2584", - "MODEL_DIR": "/shared_inference//data/models_blog", - "PD_SYNC_ROOT": "/shared_inference//data/vllm_sync", - "OUTPUT_DIR": "/myworkspace/run_directory/workdir" - }, - "docker_mounts": { - "/dev/infiniband": "/dev/infiniband", - "/sys/class/infiniband": "/sys/class/infiniband", - "/shared_inference": "/shared_inference", - "/mnt/m2m_nobackup": "/mnt/m2m_nobackup" - }, - "docker_build_arg": {}, - "gpu_vendor": "AMD", - "guest_os": "UBUNTU", - "docker_gpus": "0,1,2,3,4,5,6,7" - }, - "credentials_required": [], - "summary": { - "successful_builds": [], - "failed_builds": [], - "total_build_time": 0, - "successful_pushes": [], - "failed_pushes": [] - }, - "deployment_config": { - "target": "slurm", - "slurm": { - "partition": "amd-rccl", - "account": "amd-rccl", - "qos": "normal", - "exclude": "useocpm2m-097-089,useocpm2m-097-094,useocpm2m-097-021,useocpm2m-097-008", - "nodes": 3, - "gpus_per_node": 8, - "time": "12:00:00", - "output_dir": "./slurm_output", - "exclusive": true, - "network_interface": "eth0" - }, - "distributed": { - "launcher": "vllm", - "backend": "nccl", - "port": 29500, - "nnodes": 3, - "nproc_per_node": 1 - }, - "env_vars": { - "NCCL_DEBUG": "INFO", - "NCCL_DEBUG_SUBSYS": "INIT,NET", - "NCCL_IB_DISABLE": "0", - "NCCL_NET": "IB", - "NCCL_SOCKET_IFNAME": "eth0", - 
"GLOO_SOCKET_IFNAME": "eth0", - "NCCL_IB_GID_INDEX": "3", - "NCCL_IB_HCA": "mlx5_0:1,mlx5_1:1", - "NCCL_TIMEOUT": "900", - "OMP_NUM_THREADS": "8", - "MODEL_NAME": "Llama-3.1-8B-Instruct", - "xP": "1", - "yD": "1", - "PD_SYNC_ROOT": "/shared_inference//data/vllm_sync", - "PROXY_TYPE": "vllm_router", - "ROUTER_PORT": "2584", - "BENCHMARK_PORT": "2584", - "OUTPUT_DIR": "/myworkspace/run_directory/workdir" - }, - "debug": false, - "docker_gpus": "0,1,2,3,4,5,6,7", - "gpus_per_node": 8 - } -} From e648df9cabb8e394a72d13692e0089cdd89a7768 Mon Sep 17 00:00:00 2001 From: Mikhail Kuznetsov Date: Wed, 18 Mar 2026 17:32:06 +0000 Subject: [PATCH 08/11] Enhance SLURM job script for improved result staging and aggregation. - Introduced per-node artifact staging to a dedicated results directory. - Implemented a mechanism to wait for all nodes to complete staging before merging results. - Added logic to merge performance CSV files from multiple nodes, selecting the best file based on content. - Updated the master node's result collection process to reflect these changes, ensuring comprehensive data aggregation. This update aims to improve the reliability and accuracy of performance reporting in distributed SLURM runs. --- .../deployment/templates/slurm/job.sh.j2 | 76 ++++++++++++++----- src/madengine/execution/container_runner.py | 28 ++++++- 2 files changed, 81 insertions(+), 23 deletions(-) diff --git a/src/madengine/deployment/templates/slurm/job.sh.j2 b/src/madengine/deployment/templates/slurm/job.sh.j2 index 05456a60..611b7db9 100644 --- a/src/madengine/deployment/templates/slurm/job.sh.j2 +++ b/src/madengine/deployment/templates/slurm/job.sh.j2 @@ -655,40 +655,74 @@ echo "Task completed with exit code: $TASK_EXIT" # ============================================================================= if [ $TASK_EXIT -eq 0 ]; then + RESULTS_DIR={{ manifest_file | dirname }} + NODE_STAGE_DIR="${RESULTS_DIR}/.madengine_job_${SLURM_JOB_ID}_node_${SLURM_PROCID}" + STAGE_MARKER="${NODE_STAGE_DIR}/.stage_complete" + mkdir -p "${NODE_STAGE_DIR}" 2>/dev/null || true + + # Stage per-node artifacts into shared results directory. + for f in "$WORKSPACE"/perf.csv "$WORKSPACE"/perf_*.csv "$WORKSPACE"/perf-*.csv "$WORKSPACE"/benchmark_*_CONCURRENCY.log "$WORKSPACE"/*.log; do + if [ -f "$f" ]; then + cp "$f" "${NODE_STAGE_DIR}/" 2>/dev/null || true + fi + done + touch "${STAGE_MARKER}" 2>/dev/null || true + if [ "${SLURM_PROCID}" = "0" ]; then # Master node: Collect and report results - RESULTS_DIR={{ manifest_file | dirname }} echo "" echo "========================================================================" echo "Master Node (SLURM_PROCID=0): Collecting results" echo "========================================================================" echo "Copying results back to: $RESULTS_DIR" - - # Copy performance results (main metric file) - if [ -f "$WORKSPACE/perf.csv" ]; then - cp "$WORKSPACE/perf.csv" "$RESULTS_DIR/perf.csv" 2>/dev/null || true - echo " ✓ Copied: perf.csv (global metrics)" - fi - # Copy workload-level CSV artifacts (supports both perf_*.csv and perf-*.csv naming) - for csv in "$WORKSPACE"/perf_*.csv "$WORKSPACE"/perf-*.csv; do - if [ -f "$csv" ]; then - csv_basename=$(basename "$csv") - cp "$csv" "$RESULTS_DIR/${csv_basename}" 2>/dev/null || true - echo " ✓ Copied: ${csv_basename}" + # Wait for worker staging to avoid racing on partially written artifacts. 
+ EXPECTED_NODES={{ nodes }} + WAITED_SECONDS=0 + while [ $WAITED_SECONDS -lt 180 ]; do + READY_NODES=$(ls -1d "${RESULTS_DIR}"/.madengine_job_"${SLURM_JOB_ID}"_node_*/.stage_complete 2>/dev/null | wc -l) + if [ "${READY_NODES}" -ge "${EXPECTED_NODES}" ]; then + break fi + sleep 2 + WAITED_SECONDS=$((WAITED_SECONDS + 2)) done + echo " Node staging markers detected: ${READY_NODES}/${EXPECTED_NODES}" - # Copy log files - for log in "$WORKSPACE"/*.log; do - if [ -f "$log" ]; then - log_basename=$(basename "$log") - cp "$log" "$RESULTS_DIR/${log_basename}" 2>/dev/null || true - echo " ✓ Copied: ${log_basename}" + # Merge perf.csv and workload-level perf files across all nodes. + CSV_NAME_LIST=$( + { + for c in "${RESULTS_DIR}"/.madengine_job_"${SLURM_JOB_ID}"_node_*/perf.csv; do [ -f "$c" ] && basename "$c"; done + for c in "${RESULTS_DIR}"/.madengine_job_"${SLURM_JOB_ID}"_node_*/perf_*.csv; do [ -f "$c" ] && basename "$c"; done + for c in "${RESULTS_DIR}"/.madengine_job_"${SLURM_JOB_ID}"_node_*/perf-*.csv; do [ -f "$c" ] && basename "$c"; done + } | sort -u + ) + + for csv_basename in ${CSV_NAME_LIST}; do + BEST_FILE="" + BEST_SCORE=-1 + BEST_NODE="unknown" + + for candidate in "${RESULTS_DIR}"/.madengine_job_"${SLURM_JOB_ID}"_node_*/"${csv_basename}"; do + if [ -f "$candidate" ]; then + NON_EMPTY_PERF=$(awk -F, 'NR>1 && $2 != "" {c++} END{print c+0}' "$candidate" 2>/dev/null || echo 0) + TOTAL_ROWS=$(awk 'END{print NR+0}' "$candidate" 2>/dev/null || echo 0) + SCORE=$((NON_EMPTY_PERF * 100000 + TOTAL_ROWS)) + if [ "$SCORE" -gt "$BEST_SCORE" ]; then + BEST_SCORE="$SCORE" + BEST_FILE="$candidate" + BEST_NODE=$(basename "$(dirname "$candidate")") + fi + fi + done + + if [ -n "$BEST_FILE" ]; then + cp "$BEST_FILE" "$RESULTS_DIR/${csv_basename}" 2>/dev/null || true + echo " ✓ Merged: ${csv_basename} (selected from ${BEST_NODE})" fi done - - # Copy any workload results files + + # Copy any workload results files from master node workspace if [ -f "$WORKSPACE/results.txt" ]; then cp "$WORKSPACE/results.txt" "$RESULTS_DIR/" 2>/dev/null || true echo " ✓ Copied: results.txt" diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 64cb6bca..8e289aae 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -1455,6 +1455,14 @@ def run_container( ]: probe_cmd = f"if [ -f {candidate} ]; then echo EXISTS; else echo MISSING; fi" container_checks[candidate] = (model_docker.sh(probe_cmd) or "").strip() + csv_inventory = ( + model_docker.sh( + f"sh -c 'ls -lah {model_dir}/*.csv 2>/dev/null; " + f"ls -lah {model_dir}/workdir/*.csv 2>/dev/null; " + f"ls -lah {model_dir}/benchmark_*_CONCURRENCY.log 2>/dev/null'" + ) + or "" + ) except Exception as probe_err: pass @@ -1500,8 +1508,24 @@ def run_container( pass if not has_valid_perf: - run_results["performance"] = None - print("Error: Performance metric is empty in all rows of multiple results file.") + nnodes_env = os.environ.get("NNODES", "1") + try: + nnodes = int(nnodes_env) + except (TypeError, ValueError): + nnodes = 1 + + if nnodes > 1: + # In multi-node runs perf CSV may be populated by another node + # moments later (shared workspace race). Keep the path so + # downstream aggregation can consume finalized file content. + print( + "Warning: Performance metric is currently empty in " + "multiple results file during multi-node run; " + "deferring final decision to aggregation step." 
+ ) + else: + run_results["performance"] = None + print("Error: Performance metric is empty in all rows of multiple results file.") except Exception as e: self.rich_console.print( f"[yellow]Warning: Could not validate multiple results file: {e}[/yellow]" From aa28ae74355ce484680990de6c7ced6b05db2098 Mon Sep 17 00:00:00 2001 From: Mikhail Kuznetsov Date: Mon, 30 Mar 2026 16:55:51 +0000 Subject: [PATCH 09/11] Refactor container synchronization logic in ContainerRunner. - Removed redundant file-based synchronization mechanism for node readiness. - Simplified the barrier waiting process by directly utilizing TCP for image readiness. - Adjusted timeout handling to ensure consistent behavior across node synchronization. This change enhances the efficiency of multi-node operations by streamlining the readiness check process. --- src/madengine/execution/container_runner.py | 36 +-------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 8e289aae..6a9ce6b9 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -146,44 +146,10 @@ def _sync_after_local_image_ready(self, run_image: str, timeout_s: int = 1800) - if nnodes <= 1: return - sync_root = os.environ.get( - "PD_SYNC_ROOT", - f"/home/{os.environ.get('USER', 'user')}/.madengine_vllm_disagg_sync", - ) - job_id = os.environ.get("SLURM_JOB_ID", "0") - image_key = re.sub(r"[^a-zA-Z0-9_.-]+", "_", run_image) - barrier_dir = os.path.join(sync_root, f"{job_id}_image_ready_{image_key}") - os.makedirs(barrier_dir, exist_ok=True) - - if node_rank == "0": - for name in os.listdir(barrier_dir): - if name.startswith("ready_"): - try: - os.remove(os.path.join(barrier_dir, name)) - except OSError: - pass - - ready_file = os.path.join(barrier_dir, f"ready_{node_rank}.txt") - with open(ready_file, "w", encoding="utf-8") as f: - f.write(str(time.time())) - - - start = time.time() - ready_count = 0 - fs_barrier_timeout_s = min(timeout_s, 20) - while time.time() - start < fs_barrier_timeout_s: - try: - ready_count = len([n for n in os.listdir(barrier_dir) if n.startswith("ready_")]) - except FileNotFoundError: - ready_count = 0 - if ready_count >= nnodes: - return - time.sleep(2) - self._tcp_image_ready_barrier( nnodes=nnodes, node_rank=node_rank, - timeout_s=max(1, int(timeout_s - (time.time() - start))), + timeout_s=timeout_s, ) return From 0f9a40d57af725cd518a2823be3483521dde9627 Mon Sep 17 00:00:00 2001 From: Mikhail Kuznetsov Date: Tue, 31 Mar 2026 11:06:07 +0000 Subject: [PATCH 10/11] Enhance SLURM job script to improve performance value extraction. - Added logic to prefer files with the most non-empty performance values during result aggregation. - Implemented dynamic column index retrieval for the "performance" column in CSV files, ensuring accurate counting of non-empty performance entries. - Maintained backward compatibility by falling back to the previous method if the performance column is not found. This update aims to enhance the accuracy of performance metrics in multi-node training scenarios. 
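Note for reviewers: the file-selection rule that the awk snippets in the diff below implement can be sketched in Python as follows. This is a minimal illustrative sketch, not shipped code; the 100000 weighting factor, the dynamic "performance" header lookup, and the column-2 fallback all mirror the template logic, while score_perf_csv and pick_best are hypothetical names introduced only for this example.

    import csv
    import typing

    def score_perf_csv(path: str) -> int:
        """Score a candidate CSV: non-empty 'performance' cells dominate,
        total line count (header included, like awk's NR) breaks ties."""
        with open(path, newline="") as f:
            rows = list(csv.reader(f))
        if not rows:
            return 0
        header = [cell.strip('"') for cell in rows[0]]
        # Dynamic column lookup, falling back to the second column as before.
        perf_col = header.index("performance") if "performance" in header else 1
        non_empty = sum(
            1 for row in rows[1:]
            if len(row) > perf_col and row[perf_col].strip('"') != ""
        )
        return non_empty * 100000 + len(rows)

    def pick_best(candidates: typing.List[str]) -> typing.Optional[str]:
        """Return the highest-scoring candidate, as the merge step does."""
        return max(candidates, key=score_perf_csv, default=None)
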
--- .../deployment/templates/slurm/job.sh.j2 | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/madengine/deployment/templates/slurm/job.sh.j2 b/src/madengine/deployment/templates/slurm/job.sh.j2 index 611b7db9..816c90f1 100644 --- a/src/madengine/deployment/templates/slurm/job.sh.j2 +++ b/src/madengine/deployment/templates/slurm/job.sh.j2 @@ -705,7 +705,38 @@ if [ $TASK_EXIT -eq 0 ]; then for candidate in "${RESULTS_DIR}"/.madengine_job_"${SLURM_JOB_ID}"_node_*/"${csv_basename}"; do if [ -f "$candidate" ]; then - NON_EMPTY_PERF=$(awk -F, 'NR>1 && $2 != "" {c++} END{print c+0}' "$candidate" 2>/dev/null || echo 0) + # Prefer files that contain the most non-empty performance values. + # This matters for multi-node training where only one node may see + # the final throughput lines and therefore generate valid metrics. + PERF_COL_INDEX=$( + awk -F, ' + NR == 1 { + for (i = 1; i <= NF; i++) { + gsub(/^"|"$/, "", $i) + if ($i == "performance") { + print i + exit + } + } + } + ' "$candidate" 2>/dev/null + ) + if [ -n "$PERF_COL_INDEX" ]; then + NON_EMPTY_PERF=$( + awk -F, -v perf_col="$PERF_COL_INDEX" ' + NR > 1 { + value = $perf_col + gsub(/^"|"$/, "", value) + if (value != "") { + c++ + } + } + END { print c + 0 } + ' "$candidate" 2>/dev/null || echo 0 + ) + else + NON_EMPTY_PERF=$(awk -F, 'NR>1 && $2 != "" {c++} END{print c+0}' "$candidate" 2>/dev/null || echo 0) + fi TOTAL_ROWS=$(awk 'END{print NR+0}' "$candidate" 2>/dev/null || echo 0) SCORE=$((NON_EMPTY_PERF * 100000 + TOTAL_ROWS)) if [ "$SCORE" -gt "$BEST_SCORE" ]; then From 6f6a2762711db3b6c2d3661bd0880b9e44840cb5 Mon Sep 17 00:00:00 2001 From: Mikhail Kuznetsov Date: Tue, 21 Apr 2026 16:15:31 +0000 Subject: [PATCH 11/11] Implement local image management in ContainerRunner. - Added methods to handle local Docker images, including checking for existence, loading from tar, and saving to tar. - Enhanced the _ensure_local_image_available method to manage image availability across distributed nodes, ensuring primary nodes build and save images while worker nodes load from shared tar caches. - Introduced tests to validate the behavior of local image handling, including scenarios for saving and loading images in a multi-node environment. This update improves the efficiency and reliability of local image management in distributed runs. 
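Note for reviewers: the node-coordination protocol described above can be summarized with this minimal sketch. It is illustrative only; build_image and barrier are hypothetical stand-ins for the real _build_or_pull_local_image and _sync_after_local_image_ready methods, and error handling is omitted for brevity.

    import os
    import subprocess
    import typing

    def ensure_image(
        run_image: str,
        tar_path: str,
        node_rank: int,
        build_image: typing.Callable[[str], None],
        barrier: typing.Callable[[], None],
    ) -> None:
        """Node 0 builds and publishes the tar; workers wait, then docker-load it."""

        def image_exists() -> bool:
            # Non-zero exit code means the image is absent on this node.
            return subprocess.run(
                ["docker", "image", "inspect", run_image],
                capture_output=True,
            ).returncode == 0

        if node_rank == 0:
            if not image_exists():
                build_image(run_image)
            if not os.path.exists(tar_path):
                # Publish the image to the shared cache for worker nodes.
                subprocess.run(
                    ["docker", "save", "-o", tar_path, run_image], check=True
                )
            barrier()  # release workers once the tar is on shared storage
        else:
            barrier()  # wait until node 0 has published the tar
            if not image_exists():
                subprocess.run(["docker", "load", "-i", tar_path], check=True)
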
--- src/madengine/execution/container_runner.py | 171 +++++++++++++++--- tests/integration/test_container_execution.py | 100 ++++++++++ 2 files changed, 244 insertions(+), 27 deletions(-) diff --git a/src/madengine/execution/container_runner.py b/src/madengine/execution/container_runner.py index 6a9ce6b9..70fb6662 100644 --- a/src/madengine/execution/container_runner.py +++ b/src/madengine/execution/container_runner.py @@ -94,6 +94,144 @@ def _get_build_args(self) -> str: build_args += f"--build-arg {key}='{value}' " return build_args + def _get_node_rank(self) -> int: + """Return the current node rank for distributed runs.""" + node_rank_raw = os.environ.get("NODE_RANK") or os.environ.get("RANK") or "0" + try: + return int(node_rank_raw) + except Exception: + return 0 + + def _local_image_exists(self, run_image: str) -> bool: + """Check whether a Docker image already exists locally.""" + try: + self.console.sh( + f"docker image inspect {shlex.quote(run_image)} > /dev/null 2>&1" + ) + return True + except (subprocess.CalledProcessError, RuntimeError): + return False + + def _get_local_image_tar_path(self, run_image: str) -> typing.Optional[str]: + """Resolve the shared tar path for a local image, if configured.""" + builds_dir = (os.environ.get("MAD_DOCKER_BUILDS") or "").strip() + if not builds_dir: + return None + + safe_image_name = re.sub(r"[^A-Za-z0-9_.-]+", "_", run_image).strip("._") + if not safe_image_name: + safe_image_name = "docker_image" + return os.path.join(builds_dir, f"{safe_image_name}.tar") + + def _load_local_image_from_tar(self, run_image: str, tar_path: str) -> None: + """Load a Docker image from a previously saved tar archive.""" + if not os.path.exists(tar_path): + raise RuntimeError(f"Image tar not found for {run_image}: {tar_path}") + + self.rich_console.print( + f"[yellow]📦 Loading local image tar:[/yellow] {tar_path}" + ) + self.console.sh(f"docker load -i {shlex.quote(tar_path)}", timeout=None) + self.console.sh( + f"docker image inspect {shlex.quote(run_image)} > /dev/null 2>&1" + ) + self.rich_console.print( + f"[green]✅ Loaded local image from tar:[/green] {run_image}" + ) + + def _save_local_image_to_tar(self, run_image: str, tar_path: str) -> None: + """Persist a local Docker image into the shared tar cache.""" + tar_dir = os.path.dirname(tar_path) + if tar_dir: + os.makedirs(tar_dir, exist_ok=True) + + self.rich_console.print( + f"[yellow]💾 Saving local image tar:[/yellow] {tar_path}" + ) + self.console.sh( + f"docker save -o {shlex.quote(tar_path)} {shlex.quote(run_image)}", + timeout=None, + ) + self.rich_console.print( + f"[green]✅ Saved local image tar:[/green] {tar_path}" + ) + + def _build_or_pull_local_image( + self, run_image: str, build_info: typing.Dict, model_info: typing.Dict + ) -> None: + """Ensure the local image exists by building it first and pulling as fallback.""" + self.rich_console.print( + f"[yellow]⚠️ Image {run_image} not found on this node.[/yellow]" + ) + try: + self._build_local_image_from_manifest( + run_image=run_image, + build_info=build_info, + model_info=model_info, + ) + except Exception as build_error: + self.rich_console.print( + "[yellow]⚠️ Local build failed, attempting pull as fallback...[/yellow]" + ) + try: + self.pull_image(run_image) + except Exception as pull_error: + raise RuntimeError( + f"Failed to build or pull local image {run_image}: " + f"build_error={build_error}; pull_error={pull_error}" + ) + + def _ensure_local_image_available( + self, run_image: str, build_info: typing.Dict, model_info: typing.Dict + 
) -> None: + """Prepare a local image with optional shared tar cache support.""" + tar_path = self._get_local_image_tar_path(run_image) + node_rank = self._get_node_rank() + is_primary_node = node_rank == 0 + image_exists = self._local_image_exists(run_image) + tar_exists = bool(tar_path) and os.path.exists(tar_path) + tar_missing_at_start = bool(tar_path) and not tar_exists + + # When shared cache is configured and no tar exists yet, only node 0 + # may produce the tar artifact. Other nodes wait and then load it. + if tar_missing_at_start: + if is_primary_node: + if not image_exists: + self._build_or_pull_local_image( + run_image=run_image, + build_info=build_info, + model_info=model_info, + ) + image_exists = True + if not tar_exists: + self._save_local_image_to_tar(run_image, tar_path) + tar_exists = True + + self._sync_after_local_image_ready(run_image=run_image) + + if not image_exists: + if not tar_exists and not os.path.exists(tar_path): + raise RuntimeError( + f"Node 0 did not produce image tar for {run_image}: {tar_path}" + ) + self._load_local_image_from_tar(run_image, tar_path) + image_exists = True + + elif not image_exists: + if tar_exists: + self._load_local_image_from_tar(run_image, tar_path) + image_exists = True + else: + self._build_or_pull_local_image( + run_image=run_image, + build_info=build_info, + model_info=model_info, + ) + image_exists = True + + if tar_path and image_exists and is_primary_node and not tar_exists: + self._save_local_image_to_tar(run_image, tar_path) + def _build_local_image_from_manifest( self, run_image: str, build_info: typing.Dict, model_info: typing.Dict ) -> None: @@ -1975,33 +2113,12 @@ def run_models_from_manifest( # Local image mode (MAD_CONTAINER_IMAGE): Use the provided image directly run_image = build_info.get("docker_image") self.rich_console.print(f"[yellow]🏠 Using local image: {run_image}[/yellow]") - - # Verify image exists - try: - inspect_t0 = time.time() - self.console.sh(f"docker image inspect {run_image} > /dev/null 2>&1") - except (subprocess.CalledProcessError, RuntimeError) as e: - self.rich_console.print( - f"[yellow]⚠️ Image {run_image} not found on this node.[/yellow]" - ) - # Build from manifest dockerfile on current compute node first. - try: - self._build_local_image_from_manifest( - run_image=run_image, - build_info=build_info, - model_info=model_info, - ) - except Exception as build_error: - self.rich_console.print( - "[yellow]⚠️ Local build failed, attempting pull as fallback...[/yellow]" - ) - try: - self.pull_image(run_image) - except Exception as pull_error: - raise RuntimeError( - f"Failed to build or pull local image {run_image}: " - f"build_error={build_error}; pull_error={pull_error}" - ) + + self._ensure_local_image_available( + run_image=run_image, + build_info=build_info, + model_info=model_info, + ) # Ensure all nodes reach this point before entering container run. 
self._sync_after_local_image_ready(run_image=run_image) diff --git a/tests/integration/test_container_execution.py b/tests/integration/test_container_execution.py index 77cfb291..ca3477c5 100644 --- a/tests/integration/test_container_execution.py +++ b/tests/integration/test_container_execution.py @@ -76,6 +76,106 @@ def test_load_build_manifest(self): assert "images" in result assert "model1" in result["images"] + @patch.dict(os.environ, {"MAD_DOCKER_BUILDS": "/shared/builds", "NODE_RANK": "0"}, clear=False) + @patch.object(ContainerRunner, "_sync_after_local_image_ready") + @patch.object(ContainerRunner, "_save_local_image_to_tar") + @patch.object(ContainerRunner, "_build_or_pull_local_image") + @patch.object(ContainerRunner, "_local_image_exists", return_value=True) + @patch("os.path.exists", return_value=False) + def test_ensure_local_image_available_saves_tar_on_primary_node( + self, + mock_exists, + mock_local_image_exists, + mock_build_or_pull, + mock_save_to_tar, + mock_sync, + ): + """Primary node should save a tar when image exists but cache file is missing.""" + runner = ContainerRunner() + + runner._ensure_local_image_available( + run_image="rocm/pyt_mlperf_training:full-tefix", + build_info={}, + model_info={}, + ) + + mock_build_or_pull.assert_not_called() + mock_save_to_tar.assert_called_once_with( + "rocm/pyt_mlperf_training:full-tefix", + "/shared/builds/rocm_pyt_mlperf_training_full-tefix.tar", + ) + assert mock_sync.call_count == 1 + + @patch.dict(os.environ, {"MAD_DOCKER_BUILDS": "/shared/builds", "NODE_RANK": "0"}, clear=False) + @patch.object(ContainerRunner, "_save_local_image_to_tar") + @patch.object(ContainerRunner, "_build_or_pull_local_image") + @patch.object(ContainerRunner, "_load_local_image_from_tar") + @patch.object(ContainerRunner, "_local_image_exists", return_value=False) + @patch("os.path.exists", return_value=True) + def test_ensure_local_image_available_loads_existing_tar( + self, + mock_exists, + mock_local_image_exists, + mock_load_from_tar, + mock_build_or_pull, + mock_save_to_tar, + ): + """Existing tar cache should be loaded instead of rebuilding.""" + runner = ContainerRunner() + + runner._ensure_local_image_available( + run_image="rocm/pyt_mlperf_training:full-tefix", + build_info={}, + model_info={}, + ) + + mock_load_from_tar.assert_called_once_with( + "rocm/pyt_mlperf_training:full-tefix", + "/shared/builds/rocm_pyt_mlperf_training_full-tefix.tar", + ) + mock_build_or_pull.assert_not_called() + mock_save_to_tar.assert_not_called() + + @patch.dict(os.environ, {"MAD_DOCKER_BUILDS": "/shared/builds", "NODE_RANK": "1"}, clear=False) + @patch.object(ContainerRunner, "_save_local_image_to_tar") + @patch.object(ContainerRunner, "_build_or_pull_local_image") + @patch.object(ContainerRunner, "_load_local_image_from_tar") + @patch.object(ContainerRunner, "_sync_after_local_image_ready") + @patch.object(ContainerRunner, "_local_image_exists", return_value=False) + @patch("os.path.exists", return_value=False) + def test_ensure_local_image_available_waits_for_primary_tar_on_worker( + self, + mock_exists, + mock_local_image_exists, + mock_sync, + mock_load_from_tar, + mock_build_or_pull, + mock_save_to_tar, + ): + """Worker nodes should wait for node 0 and then load the shared tar.""" + runner = ContainerRunner() + + def exists_side_effect(path): + if path == "/shared/builds/rocm_pyt_mlperf_training_full-tefix.tar": + return mock_sync.call_count > 0 + return False + + mock_exists.side_effect = exists_side_effect + + runner._ensure_local_image_available( + 
run_image="rocm/pyt_mlperf_training:full-tefix", + build_info={}, + model_info={}, + ) + + mock_sync.assert_called_once_with(run_image="rocm/pyt_mlperf_training:full-tefix") + mock_load_from_tar.assert_called_once_with( + "rocm/pyt_mlperf_training:full-tefix", + "/shared/builds/rocm_pyt_mlperf_training_full-tefix.tar", + ) + mock_build_or_pull.assert_not_called() + mock_save_to_tar.assert_not_called() + @patch.object(Console, "sh") def test_pull_image(self, mock_sh): """Test pulling image from registry."""