Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ venv/
# model relatives
docker/
scripts/
*.json
.*_env/
.vscode/

Expand All @@ -141,4 +142,4 @@ rocprof_output/
rpd_output/
slurm_output/
MagicMock/
.madengine_session_start
.madengine_session_start
30 changes: 0 additions & 30 deletions manifests/mad.env

This file was deleted.

114 changes: 0 additions & 114 deletions manifests/run_manifest_primus_2node_qwen_localimage.json

This file was deleted.

This file was deleted.

11 changes: 9 additions & 2 deletions src/madengine/deployment/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3469,6 +3469,8 @@ def _create_failure_record(self, model_info: Dict, build_info: Dict, pod_name: s
nproc_per_node = distributed_config.get("nproc_per_node")
if nproc_per_node is None:
nproc_per_node = int(model_info.get("n_gpus", 1))
# Launcher: use distributed.launcher when set, otherwise "native" for k8s
launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes")

# Create a record with the same structure as successful runs
# but with performance=0, metric="", and status="FAILED"
Expand All @@ -3495,6 +3497,7 @@ def _create_failure_record(self, model_info: Dict, build_info: Dict, pod_name: s
"git_commit": "",
"machine_name": pod_name,
"deployment_type": "kubernetes",
"launcher": launcher,
"gpu_architecture": "",

# Performance metrics - FAILED
Expand Down Expand Up @@ -3561,6 +3564,8 @@ def _build_common_info_dict(
total_gpus = nnodes * nproc_per_node
gpus_per_node = str(nproc_per_node)
nnodes_str = str(nnodes)
# Launcher: use distributed.launcher when set, otherwise "native" for k8s
launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes")
result = {
"n_gpus": str(total_gpus),
"nnodes": nnodes_str,
Expand All @@ -3576,7 +3581,7 @@ def _build_common_info_dict(
"git_commit": "",
"machine_name": deployment_id,
"deployment_type": "kubernetes",
"launcher": "native",
"launcher": launcher,
"gpu_architecture": gpu_architecture,
"relative_change": "",
"build_duration": build_info.get("build_duration", ""),
Expand Down Expand Up @@ -3612,6 +3617,8 @@ def _create_multiple_result_row_record(
if nproc_per_node is None:
nproc_per_node = int(model_info.get("n_gpus", 1))

# Launcher: use distributed.launcher when set, otherwise "native" for k8s
launcher = normalize_launcher(distributed_config.get("launcher"), "kubernetes")
result = {
"model": item.get("model", model_info.get("name", "")),
"n_gpus": str(nnodes * nproc_per_node),
Expand All @@ -3628,7 +3635,7 @@ def _create_multiple_result_row_record(
"git_commit": "",
"machine_name": deployment_id,
"deployment_type": "kubernetes",
"launcher": "native",
"launcher": launcher,
"gpu_architecture": item.get("gpu_architecture", ""),
"performance": str(item.get("performance", "")),
"metric": item.get("metric", ""),
Expand Down
Loading