Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions src/madengine/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,28 @@
import os
import json
import logging
import sys


# Utility function for optional verbose logging of configuration
def _log_config_info(message: str, force_print: bool = False):
"""Log configuration information either to logger or print if specified."""
# Keep --version/--help output clean even if MAD_VERBOSE_CONFIG=true.
if any(arg in {"--version", "-V", "--help", "-h"} for arg in sys.argv[1:]):
logging.debug(message)
return
if force_print or os.environ.get("MAD_VERBOSE_CONFIG", "").lower() == "true":
print(message)
else:
logging.debug(message)


def _is_lightweight_cli_invocation() -> bool:
"""Return True for metadata/help invocations that should avoid side effects."""
lightweight_flags = {"--version", "-V", "--help", "-h"}
return any(arg in lightweight_flags for arg in sys.argv[1:])


# third-party modules
from madengine.core.console import Console

Expand Down Expand Up @@ -65,9 +76,12 @@ def _setup_model_dir():
_log_config_info(f"Model dir: {MODEL_DIR} copied to current dir: {cwd_abs}")


# Only setup model directory if explicitly requested (when not just importing for constants)
# Only setup model directory if explicitly requested and invocation is not metadata-only.
if os.environ.get("MAD_SETUP_MODEL_DIR", "").lower() == "true":
_setup_model_dir()
if _is_lightweight_cli_invocation():
_log_config_info("Skipping MODEL_DIR setup for lightweight CLI invocation (--version/--help).")
else:
_setup_model_dir()

# madengine credentials configuration
CRED_FILE = "credential.json"
Expand Down
85 changes: 77 additions & 8 deletions src/madengine/deployment/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1297,18 +1297,24 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]:
model_key, {}
) if model_key else {}

# Multiple results path: resolve CSV from job_dir/node_*, then cwd/run_directory
# Multiple results path: resolve CSV from job_dir/node_*, then cwd/run_directory.
# In multi-node runs, different nodes may produce the CSV with different levels
# of completeness (e.g. only one node observes the final throughput numbers and
# populates the "performance" column). Prefer the candidate with the most
# non-empty "performance" rows so aggregation does not silently pick an empty one.
mult_res = model_info_for_entry.get("multiple_results")
if mult_res:
resolved_csv: Optional[Path] = None
candidates: List[Path] = []
if (job_dir / mult_res).is_file():
resolved_csv = job_dir / mult_res
else:
for i in range(self.nodes):
candidate = job_dir / f"node_{i}" / mult_res
if candidate.is_file():
resolved_csv = candidate
break
candidates.append(job_dir / mult_res)
for i in range(self.nodes):
per_node_candidate = job_dir / f"node_{i}" / mult_res
if per_node_candidate.is_file():
candidates.append(per_node_candidate)

if candidates:
resolved_csv = self._select_best_multiple_results_csv(candidates)
if not resolved_csv and Path(mult_res).is_file():
resolved_csv = Path(mult_res)
if not resolved_csv and Path("run_directory", mult_res).is_file():
Expand Down Expand Up @@ -1519,6 +1525,69 @@ def collect_results(self, deployment_id: str) -> Dict[str, Any]:
)
return results

def _select_best_multiple_results_csv(
self, candidates: List[Path]
) -> Optional[Path]:
"""Pick the CSV with the most non-empty ``performance`` entries.

In multi-node SLURM runs, every node copies its local copy of the
workload's multi-results CSV into ``job_dir/node_<rank>/``. Only
some nodes will observe the final throughput numbers and therefore
populate the ``performance`` column; others may have the file but
with empty values. Ranking candidates by the number of non-empty
``performance`` rows lets downstream aggregation use the richest
available data without depending on node-0 winning every race.

Falls back to the first candidate when none has a ``performance``
column or when counting fails, preserving previous behavior.
"""
if not candidates:
return None
if len(candidates) == 1:
return candidates[0]

import csv as _csv

best_candidate: Optional[Path] = None
best_score = -1
best_rows = -1
for candidate in candidates:
non_empty_perf = 0
total_rows = 0
has_perf_column = False
try:
with open(candidate, "r", encoding="utf-8", errors="ignore") as f:
reader = _csv.DictReader(f)
fieldnames = reader.fieldnames or []
stripped_fields = [fn.strip() for fn in fieldnames]
has_perf_column = "performance" in stripped_fields
for row in reader:
total_rows += 1
if has_perf_column:
value = (row.get("performance") or "").strip()
Copy link

Copilot AI Apr 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_select_best_multiple_results_csv() strips whitespace from fieldnames to detect the presence of a "performance" column, but then reads values with row.get("performance"). If the actual CSV header is " performance" (or otherwise differs), this will incorrectly treat all rows as empty. Consider normalizing row keys (similar to the later {k.strip(): v for k,v in row.items()} logic in collect_results) or tracking the exact header name that matched after stripping.

Suggested change
value = (row.get("performance") or "").strip()
normalized_row = {
(k.strip() if isinstance(k, str) else k): v
for k, v in row.items()
}
value = (normalized_row.get("performance") or "").strip()

Copilot uses AI. Check for mistakes.
Comment on lines +1562 to +1567
if value:
non_empty_perf += 1
except Exception:
continue

score = non_empty_perf if has_perf_column else 0
if (
score > best_score
or (score == best_score and total_rows > best_rows)
):
best_score = score
best_rows = total_rows
best_candidate = candidate

if best_candidate is None:
return candidates[0]
if best_score > 0:
self.console.print(
f"[dim] Selected multiple_results CSV with {best_score} "
f"non-empty performance rows: {best_candidate}[/dim]"
)
return best_candidate

def _collect_results_parse_perf_csv(
self, results: Dict[str, Any], session_start_row: Optional[int]
) -> None:
Expand Down
18 changes: 14 additions & 4 deletions src/madengine/deployment/templates/slurm/job.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -213,15 +213,16 @@ fi
echo ""
echo "Verifying madengine availability..."
if command -v madengine >/dev/null 2>&1; then
MAD_CLI_VERSION=$(madengine --version 2>&1 | head -n1 || echo "unknown")
# MODEL_DIR can trigger side effects in madengine startup; unset it for preflight probes only.
MAD_CLI_VERSION=$(env -u MODEL_DIR madengine --version 2>&1 | head -n1 || echo "unknown")
MAD_CLI_PATH=$(which madengine 2>/dev/null || echo "unknown")

echo " ✓ madengine available"
echo " Version: $MAD_CLI_VERSION"
echo " Path: $MAD_CLI_PATH"

# Verify it's executable
if madengine --help >/dev/null 2>&1; then
if env -u MODEL_DIR madengine --help >/dev/null 2>&1; then
export MAD_CLI_COMMAND="madengine"
else
echo " ❌ ERROR: madengine found but not functional!"
Expand Down Expand Up @@ -488,15 +489,24 @@ trap 'ec=$?; echo "[DEBUG] $(date -Iseconds) Node ${SLURM_PROCID} ($(hostname)):
echo "Verifying madengine availability..."

if command -v madengine >/dev/null 2>&1; then
MAD_CLI_VERSION=$(madengine --version 2>&1 | head -n1 || echo "unknown")
# MODEL_DIR can trigger side effects in madengine startup; isolate it for preflight probes.
set +e
MAD_VERSION_RAW_SANITIZED=$(env -u MODEL_DIR madengine --version 2>&1)
set -e
MAD_CLI_VERSION=$(printf "%s" "$MAD_VERSION_RAW_SANITIZED" | head -n1 || echo "unknown")
MAD_CLI_PATH=$(which madengine 2>/dev/null || echo "unknown")

echo "✓ madengine available"
echo " Version: $MAD_CLI_VERSION"
echo " Path: $MAD_CLI_PATH"

# Verify it's executable
if madengine --help >/dev/null 2>&1; then
set +e
MAD_HELP_RAW_SANITIZED=$(env -u MODEL_DIR madengine --help 2>&1)
MAD_HELP_EXIT_SANITIZED=$?
set -e

if [ "${MAD_HELP_EXIT_SANITIZED}" -eq 0 ]; then
echo " ✓ Verified: madengine is functional"
MAD_CLI_COMMAND="madengine"
else
Expand Down
Loading