From d768f93c911330a9313201b7dedb5f074b760d8e Mon Sep 17 00:00:00 2001
From: Seonghee Lee <seongheel@nvidia.com>
Date: Tue, 28 Apr 2026 10:51:35 -0700
Subject: [PATCH 1/5] make .agents/ the canonical agent-skills location

Signed-off-by: Seonghee Lee <seongheel@nvidia.com>
---
 .agents/README.md                             | 70 +++++++++++++++++++
 .agents/clusters.yaml.example                 | 19 +++++
 .../scripts/sync-upstream-skills.sh           | 13 ++--
 .../skills/accessing-mlflow/SKILL.md          |  0
 .../skills/common/credentials.md              |  0
 .../skills/common/environment-setup.md        |  0
 .../skills/common/remote-execution.md         |  0
 .../skills/common/remote_exec.sh              | 11 ++-
 .../skills/common/slurm-setup.md              |  0
 .../skills/common/workspace-management.md     |  0
 {.claude => .agents}/skills/debug/SKILL.md    |  0
 .../skills/deployment/SKILL.md                |  0
 .../skills/deployment/references/setup.md     |  0
 .../skills/deployment/references/sglang.md    |  0
 .../deployment/references/support-matrix.md   |  0
 .../skills/deployment/references/trtllm.md    |  0
 .../references/unsupported-models.md          |  0
 .../skills/deployment/references/vllm.md      |  0
 .../skills/deployment/scripts/deploy.sh       |  0
 .../skills/deployment/tests/evals.json        |  0
 .../skills/evaluation/SKILL.md                |  0
 .../references/model-card-research.md         |  0
 .../evaluation/references/multi-node.md       |  0
 .../references/quantization-benchmarks.md     |  0
 .../skills/evaluation/tests/evals.json        |  0
 .../skills/launching-evals/SKILL.md           |  0
 .../references/analyze-results.md             |  0
 .../benchmarks/swebench-general-info.md       |  0
 .../benchmarks/terminal-bench-general-info.md |  0
 .../terminal-bench-trace-analysis.md          |  0
 .../references/check-progress.md              |  0
 .../references/debug-failed-runs.md           |  0
 .../references/run-evaluation.md              |  0
 .../skills/launching-evals/tests.json         |  0
 {.claude => .agents}/skills/monitor/SKILL.md  |  0
 {.claude => .agents}/skills/ptq/SKILL.md      |  0
 .../ptq/references/checkpoint-validation.md   |  0
 .../skills/ptq/references/launcher-guide.md   |  0
 .../skills/ptq/references/slurm-setup-ptq.md  |  0
 .../ptq/references/unsupported-models.md      |  0
 {.claude => .agents}/skills/ptq/tests.json    |  0
 .../skills/release-cherry-pick/SKILL.md       |  0
 .claude/clusters.yaml.example                 | 19 +----
 .claude/scripts                               |  1 +
 .claude/skills                                |  1 +
 .markdownlint-cli2.yaml                       |  6 +-
 CLAUDE.md                                     |  5 ++
 47 files changed, 116 insertions(+), 29 deletions(-)
 create mode 100644 .agents/README.md
 create mode 100644 .agents/clusters.yaml.example
 rename {.claude => .agents}/scripts/sync-upstream-skills.sh (93%)
 rename {.claude => .agents}/skills/accessing-mlflow/SKILL.md (100%)
 rename {.claude => .agents}/skills/common/credentials.md (100%)
 rename {.claude => .agents}/skills/common/environment-setup.md (100%)
 rename {.claude => .agents}/skills/common/remote-execution.md (100%)
 rename {.claude => .agents}/skills/common/remote_exec.sh (98%)
 rename {.claude => .agents}/skills/common/slurm-setup.md (100%)
 rename {.claude => .agents}/skills/common/workspace-management.md (100%)
 rename {.claude => .agents}/skills/debug/SKILL.md (100%)
 rename {.claude => .agents}/skills/deployment/SKILL.md (100%)
 rename {.claude => .agents}/skills/deployment/references/setup.md (100%)
 rename {.claude => .agents}/skills/deployment/references/sglang.md (100%)
 rename {.claude => .agents}/skills/deployment/references/support-matrix.md (100%)
 rename {.claude => .agents}/skills/deployment/references/trtllm.md (100%)
 rename {.claude => .agents}/skills/deployment/references/unsupported-models.md (100%)
 rename {.claude => .agents}/skills/deployment/references/vllm.md (100%)
 rename {.claude => .agents}/skills/deployment/scripts/deploy.sh (100%)
 rename {.claude => .agents}/skills/deployment/tests/evals.json (100%)
 rename {.claude => .agents}/skills/evaluation/SKILL.md (100%)
 rename {.claude => .agents}/skills/evaluation/references/model-card-research.md (100%)
 rename {.claude => .agents}/skills/evaluation/references/multi-node.md (100%)
 rename {.claude => .agents}/skills/evaluation/references/quantization-benchmarks.md (100%)
 rename {.claude => .agents}/skills/evaluation/tests/evals.json (100%)
 rename {.claude => .agents}/skills/launching-evals/SKILL.md (100%)
 rename {.claude => .agents}/skills/launching-evals/references/analyze-results.md (100%)
 rename {.claude => .agents}/skills/launching-evals/references/benchmarks/swebench-general-info.md (100%)
 rename {.claude => .agents}/skills/launching-evals/references/benchmarks/terminal-bench-general-info.md (100%)
 rename {.claude => .agents}/skills/launching-evals/references/benchmarks/terminal-bench-trace-analysis.md (100%)
 rename {.claude => .agents}/skills/launching-evals/references/check-progress.md (100%)
 rename {.claude => .agents}/skills/launching-evals/references/debug-failed-runs.md (100%)
 rename {.claude => .agents}/skills/launching-evals/references/run-evaluation.md (100%)
 rename {.claude => .agents}/skills/launching-evals/tests.json (100%)
 rename {.claude => .agents}/skills/monitor/SKILL.md (100%)
 rename {.claude => .agents}/skills/ptq/SKILL.md (100%)
 rename {.claude => .agents}/skills/ptq/references/checkpoint-validation.md (100%)
 rename {.claude => .agents}/skills/ptq/references/launcher-guide.md (100%)
 rename {.claude => .agents}/skills/ptq/references/slurm-setup-ptq.md (100%)
 rename {.claude => .agents}/skills/ptq/references/unsupported-models.md (100%)
 rename {.claude => .agents}/skills/ptq/tests.json (100%)
 rename {.claude => .agents}/skills/release-cherry-pick/SKILL.md (100%)
 mode change 100644 => 120000 .claude/clusters.yaml.example
 create mode 120000 .claude/scripts
 create mode 120000 .claude/skills

diff --git a/.agents/README.md b/.agents/README.md
new file mode 100644
index 00000000000..d5798b5a853
--- /dev/null
+++ b/.agents/README.md
@@ -0,0 +1,70 @@
+# `.agents/` — agent-agnostic source of truth
+
+This directory is the canonical location for assets shared by AI coding agents
+working in this repository (Claude Code, Codex, Cursor, …).
+
+## Layout
+
+```text
+.agents/
+├── skills/                 # SKILL.md files (canonical)
+│   └── <skill-name>/SKILL.md
+├── scripts/                # shared helper scripts (sync-upstream-skills.sh, …)
+└── clusters.yaml.example   # remote-cluster config template
+```
+
+## Why this exists
+
+Different agents look for skills/config in vendor-specific directories:
+
+| Agent       | Default location              |
+|-------------|-------------------------------|
+| Claude Code | `.claude/skills/`             |
+| Codex       | `.codex/skills/`              |
+| Cursor      | `.cursor/skills/`             |
+
+Maintaining N copies of the same skill is a non-starter. Instead, **`.agents/`
+is the single source of truth**, and each vendor directory holds symlinks to it:
+
+```text
+.claude/skills              -> ../.agents/skills
+.claude/scripts             -> ../.agents/scripts
+.claude/clusters.yaml.example -> ../.agents/clusters.yaml.example
+```
+
+To add support for a new agent, create a directory with the symlinks that
+agent expects, e.g.:
+
+```bash
+mkdir -p .codex
+ln -s ../.agents/skills .codex/skills
+git add .codex/skills
+```
+
+## Editing rules
+
+- **Always edit files under `.agents/`**, never under the vendor symlink paths.
+  Edits via the symlink work, but the diff will look like changes to
+  `.agents/...` either way; editing the canonical path makes that explicit.
+- Vendored-verbatim skills (`launching-evals`, `accessing-mlflow`) are managed
+  by `.agents/scripts/sync-upstream-skills.sh` — do not modify by hand.
+- New skills go in `.agents/skills/<skill-name>/SKILL.md` following the
+  conventions documented in `.cursor/skills-cursor/create-skill/SKILL.md` (or your agent's equivalent).
+
+## Project-level cluster config
+
+The remote-execution skills look for a `clusters.yaml` at, in order:
+
+1. `~/.config/modelopt/clusters.yaml` (user-level, recommended)
+2. `<repo-root>/.agents/clusters.yaml` (project-level, canonical)
+3. `<repo-root>/.claude/clusters.yaml` (project-level, back-compat)
+
+See `clusters.yaml.example` for the schema.
+
+## A note on Windows
+
+Git stores symlinks portably, but on Windows they only materialise correctly
+when `core.symlinks` is enabled (`git config --global core.symlinks true`) and
+the user may create symlinks (Developer Mode, or an elevated/admin shell). If
+you're on Windows and skills aren't being picked up under `.claude/skills/`,
+that's the most likely cause — `.agents/skills/` will still work directly.
diff --git a/.agents/clusters.yaml.example b/.agents/clusters.yaml.example
new file mode 100644
index 00000000000..c58a60993a4
--- /dev/null
+++ b/.agents/clusters.yaml.example
@@ -0,0 +1,19 @@
+# ModelOpt Remote Cluster Configuration
+# Copy to ~/.config/modelopt/clusters.yaml (user-level, recommended)
+# or .agents/clusters.yaml (project-level, can be committed).
+# .claude/clusters.yaml is also accepted for back-compat.
+
+clusters:
+  # GPU workstation or SLURM login node
+  my-cluster:
+    login_node: cluster-login.example.com
+    user: myusername
+    ssh_key: ~/.ssh/id_rsa
+    # ssh_proxy: "socat - PROXY:localhost:%h:%p,proxyport=3128"  # optional
+    workspace: /path/to/remote/workdir
+    gpu_type: H100   # used for quantization format recommendation
+    # slurm:
+    #   default_account: my_account
+    #   default_partition: batch_short
+
+default_cluster: my-cluster
diff --git a/.claude/scripts/sync-upstream-skills.sh b/.agents/scripts/sync-upstream-skills.sh
similarity index 93%
rename from .claude/scripts/sync-upstream-skills.sh
rename to .agents/scripts/sync-upstream-skills.sh
index c8717ac917e..616643d322c 100755
--- a/.claude/scripts/sync-upstream-skills.sh
+++ b/.agents/scripts/sync-upstream-skills.sh
@@ -21,15 +21,18 @@
 # NOT managed by this script — update it manually when pulling upstream changes.
 #
 # Usage:
-#   .claude/scripts/sync-upstream-skills.sh            # re-vendor at the pinned SHA
-#   UPSTREAM_SHA=<sha> .claude/scripts/sync-upstream-skills.sh   # bump to a new SHA
+#   .agents/scripts/sync-upstream-skills.sh            # re-vendor at the pinned SHA
+#   UPSTREAM_SHA=<sha> .agents/scripts/sync-upstream-skills.sh   # bump to a new SHA
 #
 # Requires: gh, base64, awk. Run from the repo root.
 #
-# The script overwrites .claude/skills/<skill>/ with upstream contents and
+# The script overwrites .agents/skills/<skill>/ with upstream contents and
 # re-applies our provenance lines into each SKILL.md frontmatter. If you have
 # local changes to a vendored skill, they will be lost — that is expected,
 # since vendored-verbatim skills should not be modified locally.
+#
+# Note: .claude/skills/ (and other agent-specific skill dirs) are symlinks to
+# .agents/skills/ — see .agents/README.md.
 
 set -euo pipefail
 
@@ -40,7 +43,7 @@ SHORT_SHA="${SHA:0:7}"
 
 UPSTREAM_REPO="NVIDIA-NeMo/Evaluator"
 UPSTREAM_BASE="packages/nemo-evaluator-launcher/.claude/skills"
-DEST_BASE=".claude/skills"
+DEST_BASE=".agents/skills"
 
 if [[ ! -d "$DEST_BASE" ]]; then
     echo "error: run from the repo root (expected $DEST_BASE/ to exist)" >&2
@@ -116,7 +119,7 @@ inject_provenance() {
                 print "license: Apache-2.0"
                 print "# Vendored verbatim from NVIDIA NeMo Evaluator (commit " short ")"
                 print "# https://github.com/NVIDIA-NeMo/Evaluator/tree/" sha "/packages/nemo-evaluator-launcher/.claude/skills/" skill
-                print "# To re-sync: .claude/scripts/sync-upstream-skills.sh"
+                print "# To re-sync: .agents/scripts/sync-upstream-skills.sh"
                 if (extra != "") {
                     n = split(extra, lines, "\\|")
                     for (i = 1; i <= n; i++) print "# " lines[i]
diff --git a/.claude/skills/accessing-mlflow/SKILL.md b/.agents/skills/accessing-mlflow/SKILL.md
similarity index 100%
rename from .claude/skills/accessing-mlflow/SKILL.md
rename to .agents/skills/accessing-mlflow/SKILL.md
diff --git a/.claude/skills/common/credentials.md b/.agents/skills/common/credentials.md
similarity index 100%
rename from .claude/skills/common/credentials.md
rename to .agents/skills/common/credentials.md
diff --git a/.claude/skills/common/environment-setup.md b/.agents/skills/common/environment-setup.md
similarity index 100%
rename from .claude/skills/common/environment-setup.md
rename to .agents/skills/common/environment-setup.md
diff --git a/.claude/skills/common/remote-execution.md b/.agents/skills/common/remote-execution.md
similarity index 100%
rename from .claude/skills/common/remote-execution.md
rename to .agents/skills/common/remote-execution.md
diff --git a/.claude/skills/common/remote_exec.sh b/.agents/skills/common/remote_exec.sh
similarity index 98%
rename from .claude/skills/common/remote_exec.sh
rename to .agents/skills/common/remote_exec.sh
index 1cc070e17e7..b1d3e0c01b6 100644
--- a/.claude/skills/common/remote_exec.sh
+++ b/.agents/skills/common/remote_exec.sh
@@ -41,12 +41,17 @@
 # ── Helpers ──────────────────────────────────────────────────────────────────
 
 _remote_config_file() {
-    # Find clusters.yaml: user-level > project-level
+    # Find clusters.yaml: user-level > project-level.
+    # Project-level is checked at .agents/clusters.yaml (canonical) and then
+    # .claude/clusters.yaml (back-compat).
     local user_config="${HOME}/.config/modelopt/clusters.yaml"
     local project_config
-    # Walk up from pwd looking for .claude/clusters.yaml
     local dir="$PWD"
     while [[ "$dir" != "/" ]]; do
+        if [[ -f "$dir/.agents/clusters.yaml" ]]; then
+            project_config="$dir/.agents/clusters.yaml"
+            break
+        fi
         if [[ -f "$dir/.claude/clusters.yaml" ]]; then
             project_config="$dir/.claude/clusters.yaml"
             break
@@ -196,7 +201,7 @@ remote_load_cluster() {
     if [[ -z "$config_file" ]]; then
         echo "ERROR: No clusters.yaml found. Provide cluster info interactively or create one." >&2
         echo "  User config:    ~/.config/modelopt/clusters.yaml" >&2
-        echo "  Project config: .claude/clusters.yaml" >&2
+        echo "  Project config: .agents/clusters.yaml (or .claude/clusters.yaml)" >&2
         return 1
     fi
 
diff --git a/.claude/skills/common/slurm-setup.md b/.agents/skills/common/slurm-setup.md
similarity index 100%
rename from .claude/skills/common/slurm-setup.md
rename to .agents/skills/common/slurm-setup.md
diff --git a/.claude/skills/common/workspace-management.md b/.agents/skills/common/workspace-management.md
similarity index 100%
rename from .claude/skills/common/workspace-management.md
rename to .agents/skills/common/workspace-management.md
diff --git a/.claude/skills/debug/SKILL.md b/.agents/skills/debug/SKILL.md
similarity index 100%
rename from .claude/skills/debug/SKILL.md
rename to .agents/skills/debug/SKILL.md
diff --git a/.claude/skills/deployment/SKILL.md b/.agents/skills/deployment/SKILL.md
similarity index 100%
rename from .claude/skills/deployment/SKILL.md
rename to .agents/skills/deployment/SKILL.md
diff --git a/.claude/skills/deployment/references/setup.md b/.agents/skills/deployment/references/setup.md
similarity index 100%
rename from .claude/skills/deployment/references/setup.md
rename to .agents/skills/deployment/references/setup.md
diff --git a/.claude/skills/deployment/references/sglang.md b/.agents/skills/deployment/references/sglang.md
similarity index 100%
rename from .claude/skills/deployment/references/sglang.md
rename to .agents/skills/deployment/references/sglang.md
diff --git a/.claude/skills/deployment/references/support-matrix.md b/.agents/skills/deployment/references/support-matrix.md
similarity index 100%
rename from .claude/skills/deployment/references/support-matrix.md
rename to .agents/skills/deployment/references/support-matrix.md
diff --git a/.claude/skills/deployment/references/trtllm.md b/.agents/skills/deployment/references/trtllm.md
similarity index 100%
rename from .claude/skills/deployment/references/trtllm.md
rename to .agents/skills/deployment/references/trtllm.md
diff --git a/.claude/skills/deployment/references/unsupported-models.md b/.agents/skills/deployment/references/unsupported-models.md
similarity index 100%
rename from .claude/skills/deployment/references/unsupported-models.md
rename to .agents/skills/deployment/references/unsupported-models.md
diff --git a/.claude/skills/deployment/references/vllm.md b/.agents/skills/deployment/references/vllm.md
similarity index 100%
rename from .claude/skills/deployment/references/vllm.md
rename to .agents/skills/deployment/references/vllm.md
diff --git a/.claude/skills/deployment/scripts/deploy.sh b/.agents/skills/deployment/scripts/deploy.sh
similarity index 100%
rename from .claude/skills/deployment/scripts/deploy.sh
rename to .agents/skills/deployment/scripts/deploy.sh
diff --git a/.claude/skills/deployment/tests/evals.json b/.agents/skills/deployment/tests/evals.json
similarity index 100%
rename from .claude/skills/deployment/tests/evals.json
rename to .agents/skills/deployment/tests/evals.json
diff --git a/.claude/skills/evaluation/SKILL.md b/.agents/skills/evaluation/SKILL.md
similarity index 100%
rename from .claude/skills/evaluation/SKILL.md
rename to .agents/skills/evaluation/SKILL.md
diff --git a/.claude/skills/evaluation/references/model-card-research.md b/.agents/skills/evaluation/references/model-card-research.md
similarity index 100%
rename from .claude/skills/evaluation/references/model-card-research.md
rename to .agents/skills/evaluation/references/model-card-research.md
diff --git a/.claude/skills/evaluation/references/multi-node.md b/.agents/skills/evaluation/references/multi-node.md
similarity index 100%
rename from .claude/skills/evaluation/references/multi-node.md
rename to .agents/skills/evaluation/references/multi-node.md
diff --git a/.claude/skills/evaluation/references/quantization-benchmarks.md b/.agents/skills/evaluation/references/quantization-benchmarks.md
similarity index 100%
rename from .claude/skills/evaluation/references/quantization-benchmarks.md
rename to .agents/skills/evaluation/references/quantization-benchmarks.md
diff --git a/.claude/skills/evaluation/tests/evals.json b/.agents/skills/evaluation/tests/evals.json
similarity index 100%
rename from .claude/skills/evaluation/tests/evals.json
rename to .agents/skills/evaluation/tests/evals.json
diff --git a/.claude/skills/launching-evals/SKILL.md b/.agents/skills/launching-evals/SKILL.md
similarity index 100%
rename from .claude/skills/launching-evals/SKILL.md
rename to .agents/skills/launching-evals/SKILL.md
diff --git a/.claude/skills/launching-evals/references/analyze-results.md b/.agents/skills/launching-evals/references/analyze-results.md
similarity index 100%
rename from .claude/skills/launching-evals/references/analyze-results.md
rename to .agents/skills/launching-evals/references/analyze-results.md
diff --git a/.claude/skills/launching-evals/references/benchmarks/swebench-general-info.md b/.agents/skills/launching-evals/references/benchmarks/swebench-general-info.md
similarity index 100%
rename from .claude/skills/launching-evals/references/benchmarks/swebench-general-info.md
rename to .agents/skills/launching-evals/references/benchmarks/swebench-general-info.md
diff --git a/.claude/skills/launching-evals/references/benchmarks/terminal-bench-general-info.md b/.agents/skills/launching-evals/references/benchmarks/terminal-bench-general-info.md
similarity index 100%
rename from .claude/skills/launching-evals/references/benchmarks/terminal-bench-general-info.md
rename to .agents/skills/launching-evals/references/benchmarks/terminal-bench-general-info.md
diff --git a/.claude/skills/launching-evals/references/benchmarks/terminal-bench-trace-analysis.md b/.agents/skills/launching-evals/references/benchmarks/terminal-bench-trace-analysis.md
similarity index 100%
rename from .claude/skills/launching-evals/references/benchmarks/terminal-bench-trace-analysis.md
rename to .agents/skills/launching-evals/references/benchmarks/terminal-bench-trace-analysis.md
diff --git a/.claude/skills/launching-evals/references/check-progress.md b/.agents/skills/launching-evals/references/check-progress.md
similarity index 100%
rename from .claude/skills/launching-evals/references/check-progress.md
rename to .agents/skills/launching-evals/references/check-progress.md
diff --git a/.claude/skills/launching-evals/references/debug-failed-runs.md b/.agents/skills/launching-evals/references/debug-failed-runs.md
similarity index 100%
rename from .claude/skills/launching-evals/references/debug-failed-runs.md
rename to .agents/skills/launching-evals/references/debug-failed-runs.md
diff --git a/.claude/skills/launching-evals/references/run-evaluation.md b/.agents/skills/launching-evals/references/run-evaluation.md
similarity index 100%
rename from .claude/skills/launching-evals/references/run-evaluation.md
rename to .agents/skills/launching-evals/references/run-evaluation.md
diff --git a/.claude/skills/launching-evals/tests.json b/.agents/skills/launching-evals/tests.json
similarity index 100%
rename from .claude/skills/launching-evals/tests.json
rename to .agents/skills/launching-evals/tests.json
diff --git a/.claude/skills/monitor/SKILL.md b/.agents/skills/monitor/SKILL.md
similarity index 100%
rename from .claude/skills/monitor/SKILL.md
rename to .agents/skills/monitor/SKILL.md
diff --git a/.claude/skills/ptq/SKILL.md b/.agents/skills/ptq/SKILL.md
similarity index 100%
rename from .claude/skills/ptq/SKILL.md
rename to .agents/skills/ptq/SKILL.md
diff --git a/.claude/skills/ptq/references/checkpoint-validation.md b/.agents/skills/ptq/references/checkpoint-validation.md
similarity index 100%
rename from .claude/skills/ptq/references/checkpoint-validation.md
rename to .agents/skills/ptq/references/checkpoint-validation.md
diff --git a/.claude/skills/ptq/references/launcher-guide.md b/.agents/skills/ptq/references/launcher-guide.md
similarity index 100%
rename from .claude/skills/ptq/references/launcher-guide.md
rename to .agents/skills/ptq/references/launcher-guide.md
diff --git a/.claude/skills/ptq/references/slurm-setup-ptq.md b/.agents/skills/ptq/references/slurm-setup-ptq.md
similarity index 100%
rename from .claude/skills/ptq/references/slurm-setup-ptq.md
rename to .agents/skills/ptq/references/slurm-setup-ptq.md
diff --git a/.claude/skills/ptq/references/unsupported-models.md b/.agents/skills/ptq/references/unsupported-models.md
similarity index 100%
rename from .claude/skills/ptq/references/unsupported-models.md
rename to .agents/skills/ptq/references/unsupported-models.md
diff --git a/.claude/skills/ptq/tests.json b/.agents/skills/ptq/tests.json
similarity index 100%
rename from .claude/skills/ptq/tests.json
rename to .agents/skills/ptq/tests.json
diff --git a/.claude/skills/release-cherry-pick/SKILL.md b/.agents/skills/release-cherry-pick/SKILL.md
similarity index 100%
rename from .claude/skills/release-cherry-pick/SKILL.md
rename to .agents/skills/release-cherry-pick/SKILL.md
diff --git a/.claude/clusters.yaml.example b/.claude/clusters.yaml.example
deleted file mode 100644
index 5bf4182e5c2..00000000000
--- a/.claude/clusters.yaml.example
+++ /dev/null
@@ -1,18 +0,0 @@
-# ModelOpt Remote Cluster Configuration
-# Copy to ~/.config/modelopt/clusters.yaml (user-level, recommended)
-# or .claude/clusters.yaml (project-level, can be committed).
-
-clusters:
-  # GPU workstation or SLURM login node
-  my-cluster:
-    login_node: cluster-login.example.com
-    user: myusername
-    ssh_key: ~/.ssh/id_rsa
-    # ssh_proxy: "socat - PROXY:localhost:%h:%p,proxyport=3128"  # optional
-    workspace: /path/to/remote/workdir
-    gpu_type: H100   # used for quantization format recommendation
-    # slurm:
-    #   default_account: my_account
-    #   default_partition: batch_short
-
-default_cluster: my-cluster
diff --git a/.claude/clusters.yaml.example b/.claude/clusters.yaml.example
new file mode 120000
index 00000000000..8cf2f9dd0ca
--- /dev/null
+++ b/.claude/clusters.yaml.example
@@ -0,0 +1 @@
+../.agents/clusters.yaml.example
\ No newline at end of file
diff --git a/.claude/scripts b/.claude/scripts
new file mode 120000
index 00000000000..026c3b766d4
--- /dev/null
+++ b/.claude/scripts
@@ -0,0 +1 @@
+../.agents/scripts
\ No newline at end of file
diff --git a/.claude/skills b/.claude/skills
new file mode 120000
index 00000000000..2b7a412b8fa
--- /dev/null
+++ b/.claude/skills
@@ -0,0 +1 @@
+../.agents/skills
\ No newline at end of file
diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml
index de3bbba7b3e..86a0387160b 100644
--- a/.markdownlint-cli2.yaml
+++ b/.markdownlint-cli2.yaml
@@ -12,7 +12,7 @@ config:
   MD059: false # no-hard-tabs
 
 # Vendored upstream skills — kept byte-identical to upstream via
-# .claude/scripts/sync-upstream-skills.sh; do not reformat.
+# .agents/scripts/sync-upstream-skills.sh; do not reformat.
 ignores:
-  - ".claude/skills/launching-evals/**"
-  - ".claude/skills/accessing-mlflow/**"
+  - ".agents/skills/launching-evals/**"
+  - ".agents/skills/accessing-mlflow/**"
diff --git a/CLAUDE.md b/CLAUDE.md
index d0b47148c38..a810a40b59b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -9,6 +9,11 @@ Primarily Python codebase with optional C++/CUDA extensions supporting PyTorch,
 > If a `CLAUDE.local.md` file exists alongside this file, read and respect it — it contains
 > developer-specific overrides that supplement this shared guidance.
 
+> **Skills live in `.agents/skills/`** — `.claude/skills/` is a symlink to
+> `.agents/skills/` for back-compat. See `.agents/README.md` for the convention
+> (used to share skills/scripts/cluster-config across Claude Code, Codex, Cursor,
+> etc.). Always edit files under `.agents/`, not the symlink path.
+
 ## Rules (Read First)
 
 **CRITICAL (YOU MUST):**

From 28bc8fe676e1d117a86b2ec2dae33722b73ce2a7 Mon Sep 17 00:00:00 2001
From: Seonghee Lee <seongheel@nvidia.com>
Date: Tue, 28 Apr 2026 11:35:58 -0700
Subject: [PATCH 2/5] chore(skills): redact NVIDIA-internal references in
 vendored skills

Surfaced by an internal-keyword scan over .agents/skills/. All four
findings replaced with vendor-neutral wording:

- launching-evals/SKILL.md: replace concrete Slurm account names
  (coreai_dlalgo_compeval / coreai_dlalgo_llm) used as the "PPP -> X"
  rename example with placeholders <old_account> / <new_account>.
- launching-evals/SKILL.md: generalise the HF cache path from
  /lustre/fsw/portfolios/coreai/users/<username>/cache/huggingface to
  HF_HOME=<your_hf_cache_path>, with a parenthetical note that lustre-
  style HPC clusters typically organise this under
  /lustre/.../<group>/users/<username>/...
- launching-evals/references/debug-failed-runs.md: rephrase the
  "Drop ':5005' from GitLab container registry URLs" advice (port 5005
  is the standard port for an on-prem GitLab container registry; the
  raw advice only made sense in that context) to a vendor-neutral
  "If the image is on an on-prem GitLab registry, drop the registry
  port suffix (e.g. ':5005') from the URL." Applied at both occurrences.
- common/slurm-setup.md: change the enroot/pyxis "Typical clusters"
  cell from "NVIDIA internal (DGX Cloud, EOS, Selene, GCP-NRT)" to
  "HPC clusters with container runtime (e.g. DGX Cloud and similar
  Slurm + container setups)" -- removes internal cluster codenames
  (EOS, Selene, GCP-NRT) and the "NVIDIA internal" label.

Caveat: the two launching-evals/* files are vendored verbatim from
NVIDIA-NeMo/Evaluator (per the provenance header injected by
.agents/scripts/sync-upstream-skills.sh). The next sync will overwrite
them. Follow-ups: (1) upstream MR against NVIDIA-NeMo/Evaluator, and/or
(2) add a redaction post-process to sync-upstream-skills.sh so the
scrub survives re-syncs.

Signed-off-by: Seonghee Lee <seongheel@nvidia.com>
Made-with: Cursor
Signed-off-by: Seonghee Lee <seongheel@nvidia.com>
---
 .agents/skills/common/slurm-setup.md                          | 2 +-
 .agents/skills/launching-evals/SKILL.md                       | 4 ++--
 .../skills/launching-evals/references/debug-failed-runs.md    | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.agents/skills/common/slurm-setup.md b/.agents/skills/common/slurm-setup.md
index f7d99c7543e..901a1ab9b2a 100644
--- a/.agents/skills/common/slurm-setup.md
+++ b/.agents/skills/common/slurm-setup.md
@@ -215,7 +215,7 @@ which docker 2>/dev/null && echo "RUNTIME=docker"
 
 | Runtime | Typical clusters | SLURM integration |
 | --- | --- | --- |
-| **enroot/pyxis** | NVIDIA internal (DGX Cloud, EOS, Selene, GCP-NRT) | `srun --container-image` |
+| **enroot/pyxis** | HPC clusters with container runtime (e.g. DGX Cloud and similar Slurm + container setups) | `srun --container-image` |
 | **Docker** | Bare-metal / on-prem with GPU | `docker run` inside job script |
 
 ### Step 2: Check credentials for the image's registry
diff --git a/.agents/skills/launching-evals/SKILL.md b/.agents/skills/launching-evals/SKILL.md
index adb40834899..d5a7250d865 100644
--- a/.agents/skills/launching-evals/SKILL.md
+++ b/.agents/skills/launching-evals/SKILL.md
@@ -62,9 +62,9 @@ The complete evaluation workflow is divided into the following steps you should
 # Key Facts
 
 - Benchmark-specific info learned during launching/analyzing evals should be added to `references/benchmarks/`
-- **PPP** = Slurm account (the `account` field in cluster_config.yaml). When the user says "change PPP to X", update the account value (e.g., `coreai_dlalgo_compeval` → `coreai_dlalgo_llm`).
+- **PPP** = Slurm account / project portfolio code (the `account` field in cluster_config.yaml). When the user says "change PPP to X", update the account value (e.g., `<old_account>` → `<new_account>`).
 - **Slurm job pairs**: NEL (nemo-evaluator-launcher) submits paired Slurm jobs — a RUNNING job + a PENDING restart job (for when the 4h walltime expires). Never cancel the pending restart jobs — they are expected and necessary.
-- **HF cache requirement**: For configs with `HF_HUB_OFFLINE=1`, models must be pre-downloaded to the HF cache on each cluster before launching. **Before running a model on a new cluster, always ask the user if the model is already cached there.** If not, on the cluster login node: `python3 -m venv hf_cli && source hf_cli/bin/activate && pip install huggingface_hub` then `HF_HOME=/lustre/fsw/portfolios/coreai/users/<username>/cache/huggingface hf download <model>`. Without this, vLLM will fail with `LocalEntryNotFoundError`.
+- **HF cache requirement**: For configs with `HF_HUB_OFFLINE=1`, models must be pre-downloaded to the HF cache on each cluster before launching. **Before running a model on a new cluster, always ask the user if the model is already cached there.** If not, on the cluster login node: `python3 -m venv hf_cli && source hf_cli/bin/activate && pip install huggingface_hub` then `HF_HOME=<your_hf_cache_path> hf download <model>` (on lustre-style HPC clusters this is typically under `/lustre/.../<group>/users/<username>/cache/huggingface`). Without this, vLLM will fail with `LocalEntryNotFoundError`.
 - **`data_parallel_size` is per node**: `dp_size=1` with `num_nodes=8` means 8 model instances total (one per node), load-balanced by haproxy. Do NOT interpret `dp_size` as the global replica count.
 - **`payload_modifier` interceptor**: The `params_to_remove` list (e.g. `[max_tokens, max_completion_tokens]`) strips those fields from the outgoing payload, intentionally lifting output length limits so reasoning models can think as long as they need.
 - **Auto-export git workaround**: The export container (`python:3.12-slim`) lacks `git`. When installing the launcher from a git URL, set `auto_export.launcher_install_cmd` to install git first (e.g., `apt-get update -qq && apt-get install -qq -y git && pip install "nemo-evaluator-launcher[all] @ git+...#subdirectory=packages/nemo-evaluator-launcher"`).
diff --git a/.agents/skills/launching-evals/references/debug-failed-runs.md b/.agents/skills/launching-evals/references/debug-failed-runs.md
index e94d3bb89f8..5783dc7ac86 100644
--- a/.agents/skills/launching-evals/references/debug-failed-runs.md
+++ b/.agents/skills/launching-evals/references/debug-failed-runs.md
@@ -70,7 +70,7 @@ tail -200 $LOGS/client-*.log
 - **CUDA OOM**: Increase `deployment.tensor_parallel_size` to shard across more GPUs. For multi-node: increase `execution.num_nodes` and set `deployment.pipeline_parallel_size`. As last resort: add `--max-model-len <lower_value>` to `deployment.extra_args`. Do NOT quantize as a first fix — scale compute instead.
 - **Missing model/checkpoint**: `FileNotFoundError` or `RepositoryNotFoundError` or `GatedRepoError: 403` — verify `deployment.checkpoint_path` or `deployment.hf_model_handle`. For gated models, set `HF_TOKEN` via `deployment.env_vars`.
 - **Bad `extra_args`**: `unrecognized arguments` or `unexpected keyword argument` — check flags against deployment engine version. Some flags change between versions (e.g., `--rope-scaling` removed in vLLM > 0.11.0).
-- **Image pull failure**: `manifest not found` or `pyxis: child 1 failed` — verify image tag exists. Drop `:5005` from GitLab container registry URLs.
+- **Image pull failure**: `manifest not found` or `pyxis: child 1 failed` — verify image tag exists. If the image is on an on-prem GitLab registry, drop the registry port suffix (e.g. `:5005`) from the URL.
 - **GPU driver mismatch**: `CUDA driver version is insufficient` — use an older container image matching the host CUDA driver.
 - **Health check timeout / connection refused**: Server didn't start — check server logs first. Increase `execution.endpoint_readiness_timeout` (seconds). SLURM default: `null` (falls back to walltime).
 - **Server crashed mid-eval**: `Connection reset by peer` — check server logs for OOM. Reduce `parallelism` (concurrent requests). Check SLURM logs for preemption or walltime exceeded.
@@ -80,7 +80,7 @@ tail -200 $LOGS/client-*.log
 - **Config validation**: `MissingMandatoryValue` (unfilled `???`), `ValidationError` (type mismatch), `ScannerError` (invalid YAML) — run `--dry-run` to catch these upfront.
 - **Walltime exceeded**: `CANCELLED DUE TO TIME LIMIT` — NEL submits paired restart jobs that automatically resume when walltime expires, so this is often expected behavior, not a failure. Only increase `execution.walltime` if the evaluation isn't making progress across restarts.
 - **Preemption**: `CANCELLED DUE TO PREEMPTION` — the paired restart job should automatically resume. If it doesn't, use non-preemptible partition, or re-run.
-- **Container not found**: Applies to both `deployment.image` and task-level eval container. Drop `:5005` from GitLab registry URLs.
+- **Container not found**: Applies to both `deployment.image` and task-level eval container. For on-prem GitLab registries, drop the registry port suffix (e.g. `:5005`) from the URL.
 - Troubleshooting docs: list files with WebFetch `https://api.github.com/repos/NVIDIA-NeMo/Evaluator/contents/docs/troubleshooting`, then fetch relevant ones from `https://raw.githubusercontent.com/NVIDIA-NeMo/Evaluator/main/docs/troubleshooting/<file>`
 
 **Fix Slurm invalid account/partition:**

From b9a5fb2ea26d2207242a60803e5e3c62114b3287 Mon Sep 17 00:00:00 2001
From: Seonghee Lee <seongheel@nvidia.com>
Date: Wed, 29 Apr 2026 13:22:23 -0700
Subject: [PATCH 3/5] Update remote-execution.md to match the new lookup order.

Signed-off-by: Seonghee Lee <seongheel@nvidia.com>
---
 .agents/skills/common/remote-execution.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/.agents/skills/common/remote-execution.md b/.agents/skills/common/remote-execution.md
index be770aef936..b98462ef737 100644
--- a/.agents/skills/common/remote-execution.md
+++ b/.agents/skills/common/remote-execution.md
@@ -9,8 +9,9 @@ Read this when Claude Code runs on a different machine than the target GPU clust
 Config locations (checked in order, first found wins):
 
 1. `~/.config/modelopt/clusters.yaml` — user-level (not committed, recommended)
-2. `.claude/clusters.yaml` — project-level (can be committed for shared defaults)
-3. Interactive input — if neither file exists, ask the user (see SKILL.md Step 0) and write `~/.config/modelopt/clusters.yaml` before proceeding
+2. `.agents/clusters.yaml` — project-level, canonical (can be committed for shared defaults)
+3. `.claude/clusters.yaml` — project-level, back-compat
+4. Interactive input — if none of the files above exists, ask the user (see SKILL.md Step 0) and write `~/.config/modelopt/clusters.yaml` before proceeding
 
 ```yaml
 clusters:
@@ -38,7 +39,7 @@ rsync -av /path/to/local/checkpoint <cluster-login>:<cluster-workspace>/checkpoi
 
 Use the `workspace` path from your cluster config as the destination. Compute nodes on a given cluster share the same storage as its login node, so once staged, the path works everywhere on that cluster.
 
-See `.claude/clusters.yaml.example` for a fully annotated example with multiple cluster types.
+See `.agents/clusters.yaml.example` for a fully annotated example with multiple cluster types.
 
 ---
 
@@ -153,5 +154,5 @@ remote_sync_from <remote_output_subdir> /local/output/
 ## Reference Files
 
 - **`skills/common/remote_exec.sh`** — Full utility library (session, run, sync, SLURM, Docker helpers)
-- **`.claude/clusters.yaml`** — Active cluster configuration
-- **`.claude/clusters.yaml.example`** — Annotated example config
+- **`.agents/clusters.yaml`** — Active cluster configuration (canonical; `.claude/clusters.yaml` also accepted for back-compat)
+- **`.agents/clusters.yaml.example`** — Annotated example config

From 1e5664d2d15b8f4f9c678a0bad61c893d64bd72a Mon Sep 17 00:00:00 2001
From: Seonghee Lee <seongheel@nvidia.com>
Date: Thu, 30 Apr 2026 11:06:10 -0700
Subject: [PATCH 4/5] fix wrong path link in documentation

Signed-off-by: Seonghee Lee <seongheel@nvidia.com>
---
 .agents/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.agents/README.md b/.agents/README.md
index d5798b5a853..f4a8cebf4e7 100644
--- a/.agents/README.md
+++ b/.agents/README.md
@@ -49,7 +49,7 @@ git add .codex/skills
 - Vendored-verbatim skills (`launching-evals`, `accessing-mlflow`) are managed
   by `.agents/scripts/sync-upstream-skills.sh` — do not modify by hand.
 - New skills go in `.agents/skills/<skill-name>/SKILL.md` following the
-  conventions documented in [`.cursor/skills-cursor/create-skill/SKILL.md`](https://docs.anthropic.com/) (or your agent's equivalent).
+  conventions of existing skills (e.g. `.agents/skills/monitor/SKILL.md`).
 
 ## Project-level cluster config
 

From 510f36da4921e641322201d1c045338fca565f75 Mon Sep 17 00:00:00 2001
From: Seonghee Lee <seongheel@nvidia.com>
Date: Thu, 30 Apr 2026 11:15:46 -0700
Subject: [PATCH 5/5] removing symlinks

Signed-off-by: Seonghee Lee <seongheel@nvidia.com>
---
 .agents/README.md             | 41 +++++------------------------------
 .claude/clusters.yaml.example |  1 -
 .claude/scripts               |  1 -
 .claude/skills                |  1 -
 4 files changed, 5 insertions(+), 39 deletions(-)
 delete mode 120000 .claude/clusters.yaml.example
 delete mode 120000 .claude/scripts
 delete mode 120000 .claude/skills

diff --git a/.agents/README.md b/.agents/README.md
index f4a8cebf4e7..bd522ead05c 100644
--- a/.agents/README.md
+++ b/.agents/README.md
@@ -15,37 +15,14 @@ working in this repository (Claude Code, Codex, Cursor, …).
 
 ## Why this exists
 
-Different agents look for skills/config in vendor-specific directories:
-
-| Agent       | Default location              |
-|-------------|-------------------------------|
-| Claude Code | `.claude/skills/`             |
-| Codex       | `.codex/skills/`              |
-| Cursor      | `.cursor/skills/`             |
-
-Maintaining N copies of the same skill is a non-starter. Instead, **`.agents/`
-is the single source of truth**, and each vendor directory is a symlink:
-
-```text
-.claude/skills              -> ../.agents/skills
-.claude/scripts             -> ../.agents/scripts
-.claude/clusters.yaml.example -> ../.agents/clusters.yaml.example
-```
-
-To add support for a new agent, create a directory with the symlinks that
-agent expects, e.g.:
-
-```bash
-mkdir -p .codex
-ln -s ../.agents/skills .codex/skills
-git add .codex/skills
-```
+Different agents look for skills/config in vendor-specific directories. Rather
+than maintaining N copies that drift out of sync, this repo keeps **`.agents/`
+as the single source of truth** — each agent's guidance or install mechanism
+points directly at this directory.
 
 ## Editing rules
 
-- **Always edit files under `.agents/`**, never under the vendor symlink paths.
-  Edits via the symlink work, but the diff will look like changes to
-  `.agents/...` either way; editing the canonical path makes that explicit.
+- **Always edit files under `.agents/`**.
 - Vendored-verbatim skills (`launching-evals`, `accessing-mlflow`) are managed
   by `.agents/scripts/sync-upstream-skills.sh` — do not modify by hand.
 - New skills go in `.agents/skills/<skill-name>/SKILL.md` following the
@@ -60,11 +37,3 @@ The remote-execution skills look for a `clusters.yaml` at, in order:
 3. `<repo-root>/.claude/clusters.yaml` (project-level, back-compat)
 
 See `clusters.yaml.example` for the schema.
-
-## A note on Windows
-
-Git stores symlinks portably, but Windows requires either Developer Mode or
-`git config --global core.symlinks true` plus admin rights for them to
-materialise correctly. If you're on Windows and skills aren't being picked
-up under `.claude/skills/`, that's the most likely cause — `.agents/skills/`
-will still work directly.
diff --git a/.claude/clusters.yaml.example b/.claude/clusters.yaml.example
deleted file mode 120000
index 8cf2f9dd0ca..00000000000
--- a/.claude/clusters.yaml.example
+++ /dev/null
@@ -1 +0,0 @@
-../.agents/clusters.yaml.example
\ No newline at end of file
diff --git a/.claude/scripts b/.claude/scripts
deleted file mode 120000
index 026c3b766d4..00000000000
--- a/.claude/scripts
+++ /dev/null
@@ -1 +0,0 @@
-../.agents/scripts
\ No newline at end of file
diff --git a/.claude/skills b/.claude/skills
deleted file mode 120000
index 2b7a412b8fa..00000000000
--- a/.claude/skills
+++ /dev/null
@@ -1 +0,0 @@
-../.agents/skills
\ No newline at end of file