From d768f93c911330a9313201b7dedb5f074b760d8e Mon Sep 17 00:00:00 2001 From: Seonghee Lee Date: Tue, 28 Apr 2026 10:51:35 -0700 Subject: [PATCH 1/5] make .agents/ the canonical agent-skills location Signed-off-by: Seonghee Lee --- .agents/README.md | 70 +++++++++++++++++++ .agents/clusters.yaml.example | 19 +++++ .../scripts/sync-upstream-skills.sh | 13 ++-- .../skills/accessing-mlflow/SKILL.md | 0 .../skills/common/credentials.md | 0 .../skills/common/environment-setup.md | 0 .../skills/common/remote-execution.md | 0 .../skills/common/remote_exec.sh | 11 ++- .../skills/common/slurm-setup.md | 0 .../skills/common/workspace-management.md | 0 {.claude => .agents}/skills/debug/SKILL.md | 0 .../skills/deployment/SKILL.md | 0 .../skills/deployment/references/setup.md | 0 .../skills/deployment/references/sglang.md | 0 .../deployment/references/support-matrix.md | 0 .../skills/deployment/references/trtllm.md | 0 .../references/unsupported-models.md | 0 .../skills/deployment/references/vllm.md | 0 .../skills/deployment/scripts/deploy.sh | 0 .../skills/deployment/tests/evals.json | 0 .../skills/evaluation/SKILL.md | 0 .../references/model-card-research.md | 0 .../evaluation/references/multi-node.md | 0 .../references/quantization-benchmarks.md | 0 .../skills/evaluation/tests/evals.json | 0 .../skills/launching-evals/SKILL.md | 0 .../references/analyze-results.md | 0 .../benchmarks/swebench-general-info.md | 0 .../benchmarks/terminal-bench-general-info.md | 0 .../terminal-bench-trace-analysis.md | 0 .../references/check-progress.md | 0 .../references/debug-failed-runs.md | 0 .../references/run-evaluation.md | 0 .../skills/launching-evals/tests.json | 0 {.claude => .agents}/skills/monitor/SKILL.md | 0 {.claude => .agents}/skills/ptq/SKILL.md | 0 .../ptq/references/checkpoint-validation.md | 0 .../skills/ptq/references/launcher-guide.md | 0 .../skills/ptq/references/slurm-setup-ptq.md | 0 .../ptq/references/unsupported-models.md | 0 {.claude => .agents}/skills/ptq/tests.json 
| 0 .../skills/release-cherry-pick/SKILL.md | 0 .claude/clusters.yaml.example | 19 +---- .claude/scripts | 1 + .claude/skills | 1 + .markdownlint-cli2.yaml | 6 +- CLAUDE.md | 5 ++ 47 files changed, 116 insertions(+), 29 deletions(-) create mode 100644 .agents/README.md create mode 100644 .agents/clusters.yaml.example rename {.claude => .agents}/scripts/sync-upstream-skills.sh (93%) rename {.claude => .agents}/skills/accessing-mlflow/SKILL.md (100%) rename {.claude => .agents}/skills/common/credentials.md (100%) rename {.claude => .agents}/skills/common/environment-setup.md (100%) rename {.claude => .agents}/skills/common/remote-execution.md (100%) rename {.claude => .agents}/skills/common/remote_exec.sh (98%) rename {.claude => .agents}/skills/common/slurm-setup.md (100%) rename {.claude => .agents}/skills/common/workspace-management.md (100%) rename {.claude => .agents}/skills/debug/SKILL.md (100%) rename {.claude => .agents}/skills/deployment/SKILL.md (100%) rename {.claude => .agents}/skills/deployment/references/setup.md (100%) rename {.claude => .agents}/skills/deployment/references/sglang.md (100%) rename {.claude => .agents}/skills/deployment/references/support-matrix.md (100%) rename {.claude => .agents}/skills/deployment/references/trtllm.md (100%) rename {.claude => .agents}/skills/deployment/references/unsupported-models.md (100%) rename {.claude => .agents}/skills/deployment/references/vllm.md (100%) rename {.claude => .agents}/skills/deployment/scripts/deploy.sh (100%) rename {.claude => .agents}/skills/deployment/tests/evals.json (100%) rename {.claude => .agents}/skills/evaluation/SKILL.md (100%) rename {.claude => .agents}/skills/evaluation/references/model-card-research.md (100%) rename {.claude => .agents}/skills/evaluation/references/multi-node.md (100%) rename {.claude => .agents}/skills/evaluation/references/quantization-benchmarks.md (100%) rename {.claude => .agents}/skills/evaluation/tests/evals.json (100%) rename {.claude => 
.agents}/skills/launching-evals/SKILL.md (100%) rename {.claude => .agents}/skills/launching-evals/references/analyze-results.md (100%) rename {.claude => .agents}/skills/launching-evals/references/benchmarks/swebench-general-info.md (100%) rename {.claude => .agents}/skills/launching-evals/references/benchmarks/terminal-bench-general-info.md (100%) rename {.claude => .agents}/skills/launching-evals/references/benchmarks/terminal-bench-trace-analysis.md (100%) rename {.claude => .agents}/skills/launching-evals/references/check-progress.md (100%) rename {.claude => .agents}/skills/launching-evals/references/debug-failed-runs.md (100%) rename {.claude => .agents}/skills/launching-evals/references/run-evaluation.md (100%) rename {.claude => .agents}/skills/launching-evals/tests.json (100%) rename {.claude => .agents}/skills/monitor/SKILL.md (100%) rename {.claude => .agents}/skills/ptq/SKILL.md (100%) rename {.claude => .agents}/skills/ptq/references/checkpoint-validation.md (100%) rename {.claude => .agents}/skills/ptq/references/launcher-guide.md (100%) rename {.claude => .agents}/skills/ptq/references/slurm-setup-ptq.md (100%) rename {.claude => .agents}/skills/ptq/references/unsupported-models.md (100%) rename {.claude => .agents}/skills/ptq/tests.json (100%) rename {.claude => .agents}/skills/release-cherry-pick/SKILL.md (100%) mode change 100644 => 120000 .claude/clusters.yaml.example create mode 120000 .claude/scripts create mode 120000 .claude/skills diff --git a/.agents/README.md b/.agents/README.md new file mode 100644 index 00000000000..d5798b5a853 --- /dev/null +++ b/.agents/README.md @@ -0,0 +1,70 @@ +# `.agents/` — agent-agnostic source of truth + +This directory is the canonical location for assets shared by AI coding agents +working in this repository (Claude Code, Codex, Cursor, …). 
+ +## Layout + +```text +.agents/ +├── skills/ # SKILL.md files (canonical) +│ └── <skill-name>/SKILL.md +├── scripts/ # shared helper scripts (sync-upstream-skills.sh, …) +└── clusters.yaml.example # remote-cluster config template +``` + +## Why this exists + +Different agents look for skills/config in vendor-specific directories: + +| Agent | Default location | +|-------------|-------------------------------| +| Claude Code | `.claude/skills/` | +| Codex | `.codex/skills/` | +| Cursor | `.cursor/skills/` | + +Maintaining N copies of the same skill is a non-starter. Instead, **`.agents/` +is the single source of truth**, and each vendor directory is a symlink: + +```text +.claude/skills -> ../.agents/skills +.claude/scripts -> ../.agents/scripts +.claude/clusters.yaml.example -> ../.agents/clusters.yaml.example +``` + +To add support for a new agent, create a directory with the symlinks that +agent expects, e.g.: + +```bash +mkdir -p .codex +ln -s ../.agents/skills .codex/skills +git add .codex/skills +``` + +## Editing rules + +- **Always edit files under `.agents/`**, never under the vendor symlink paths. + Edits via the symlink work, but the diff will look like changes to + `.agents/...` either way; editing the canonical path makes that explicit. +- Vendored-verbatim skills (`launching-evals`, `accessing-mlflow`) are managed + by `.agents/scripts/sync-upstream-skills.sh` — do not modify by hand. +- New skills go in `.agents/skills/<skill-name>/SKILL.md` following the + conventions documented in [`.cursor/skills-cursor/create-skill/SKILL.md`](https://docs.anthropic.com/) (or your agent's equivalent). + +## Project-level cluster config + +The remote-execution skills look for a `clusters.yaml` at, in order: + +1. `~/.config/modelopt/clusters.yaml` (user-level, recommended) +2. `<repo-root>/.agents/clusters.yaml` (project-level, canonical) +3. `<repo-root>/.claude/clusters.yaml` (project-level, back-compat) + +See `clusters.yaml.example` for the schema. 
+ +## A note on Windows + +Git stores symlinks portably, but on Windows they only materialise when +`core.symlinks` is enabled (`git config --global core.symlinks true`) and the +user may create symlinks (Developer Mode or an elevated shell). If skills aren't +picked up under `.claude/skills/` on Windows, that's the most likely cause — +`.agents/skills/` will still work directly. diff --git a/.agents/clusters.yaml.example b/.agents/clusters.yaml.example new file mode 100644 index 00000000000..c58a60993a4 --- /dev/null +++ b/.agents/clusters.yaml.example @@ -0,0 +1,19 @@ +# ModelOpt Remote Cluster Configuration +# Copy to ~/.config/modelopt/clusters.yaml (user-level, recommended) +# or .agents/clusters.yaml (project-level, can be committed). +# .claude/clusters.yaml is also accepted for back-compat. + +clusters: + # GPU workstation or SLURM login node + my-cluster: + login_node: cluster-login.example.com + user: myusername + ssh_key: ~/.ssh/id_rsa + # ssh_proxy: "socat - PROXY:localhost:%h:%p,proxyport=3128" # optional + workspace: /path/to/remote/workdir + gpu_type: H100 # used for quantization format recommendation + # slurm: + # default_account: my_account + # default_partition: batch_short + +default_cluster: my-cluster diff --git a/.claude/scripts/sync-upstream-skills.sh b/.agents/scripts/sync-upstream-skills.sh similarity index 93% rename from .claude/scripts/sync-upstream-skills.sh rename to .agents/scripts/sync-upstream-skills.sh index c8717ac917e..616643d322c 100755 --- a/.claude/scripts/sync-upstream-skills.sh +++ b/.agents/scripts/sync-upstream-skills.sh @@ -21,15 +21,18 @@ # NOT managed by this script — update it manually when pulling upstream changes. 
# # Usage: -# .claude/scripts/sync-upstream-skills.sh # re-vendor at the pinned SHA -# UPSTREAM_SHA= .claude/scripts/sync-upstream-skills.sh # bump to a new SHA +# .agents/scripts/sync-upstream-skills.sh # re-vendor at the pinned SHA +# UPSTREAM_SHA= .agents/scripts/sync-upstream-skills.sh # bump to a new SHA # # Requires: gh, base64, awk. Run from the repo root. # -# The script overwrites .claude/skills// with upstream contents and +# The script overwrites .agents/skills// with upstream contents and # re-applies our provenance lines into each SKILL.md frontmatter. If you have # local changes to a vendored skill, they will be lost — that is expected, # since vendored-verbatim skills should not be modified locally. +# +# Note: .claude/skills/ (and other agent-specific skill dirs) are symlinks to +# .agents/skills/ — see .agents/README.md. set -euo pipefail @@ -40,7 +43,7 @@ SHORT_SHA="${SHA:0:7}" UPSTREAM_REPO="NVIDIA-NeMo/Evaluator" UPSTREAM_BASE="packages/nemo-evaluator-launcher/.claude/skills" -DEST_BASE=".claude/skills" +DEST_BASE=".agents/skills" if [[ ! 
-d "$DEST_BASE" ]]; then echo "error: run from the repo root (expected $DEST_BASE/ to exist)" >&2 @@ -116,7 +119,7 @@ inject_provenance() { print "license: Apache-2.0" print "# Vendored verbatim from NVIDIA NeMo Evaluator (commit " short ")" print "# https://github.com/NVIDIA-NeMo/Evaluator/tree/" sha "/packages/nemo-evaluator-launcher/.claude/skills/" skill - print "# To re-sync: .claude/scripts/sync-upstream-skills.sh" + print "# To re-sync: .agents/scripts/sync-upstream-skills.sh" if (extra != "") { n = split(extra, lines, "\\|") for (i = 1; i <= n; i++) print "# " lines[i] diff --git a/.claude/skills/accessing-mlflow/SKILL.md b/.agents/skills/accessing-mlflow/SKILL.md similarity index 100% rename from .claude/skills/accessing-mlflow/SKILL.md rename to .agents/skills/accessing-mlflow/SKILL.md diff --git a/.claude/skills/common/credentials.md b/.agents/skills/common/credentials.md similarity index 100% rename from .claude/skills/common/credentials.md rename to .agents/skills/common/credentials.md diff --git a/.claude/skills/common/environment-setup.md b/.agents/skills/common/environment-setup.md similarity index 100% rename from .claude/skills/common/environment-setup.md rename to .agents/skills/common/environment-setup.md diff --git a/.claude/skills/common/remote-execution.md b/.agents/skills/common/remote-execution.md similarity index 100% rename from .claude/skills/common/remote-execution.md rename to .agents/skills/common/remote-execution.md diff --git a/.claude/skills/common/remote_exec.sh b/.agents/skills/common/remote_exec.sh similarity index 98% rename from .claude/skills/common/remote_exec.sh rename to .agents/skills/common/remote_exec.sh index 1cc070e17e7..b1d3e0c01b6 100644 --- a/.claude/skills/common/remote_exec.sh +++ b/.agents/skills/common/remote_exec.sh @@ -41,12 +41,17 @@ # ── Helpers ────────────────────────────────────────────────────────────────── _remote_config_file() { - # Find clusters.yaml: user-level > project-level + # Find 
clusters.yaml: user-level > project-level. + # Project-level is checked at .agents/clusters.yaml (canonical) and then + # .claude/clusters.yaml (back-compat). local user_config="${HOME}/.config/modelopt/clusters.yaml" local project_config - # Walk up from pwd looking for .claude/clusters.yaml local dir="$PWD" while [[ "$dir" != "/" ]]; do + if [[ -f "$dir/.agents/clusters.yaml" ]]; then + project_config="$dir/.agents/clusters.yaml" + break + fi if [[ -f "$dir/.claude/clusters.yaml" ]]; then project_config="$dir/.claude/clusters.yaml" break @@ -196,7 +201,7 @@ remote_load_cluster() { if [[ -z "$config_file" ]]; then echo "ERROR: No clusters.yaml found. Provide cluster info interactively or create one." >&2 echo " User config: ~/.config/modelopt/clusters.yaml" >&2 - echo " Project config: .claude/clusters.yaml" >&2 + echo " Project config: .agents/clusters.yaml (or .claude/clusters.yaml)" >&2 return 1 fi diff --git a/.claude/skills/common/slurm-setup.md b/.agents/skills/common/slurm-setup.md similarity index 100% rename from .claude/skills/common/slurm-setup.md rename to .agents/skills/common/slurm-setup.md diff --git a/.claude/skills/common/workspace-management.md b/.agents/skills/common/workspace-management.md similarity index 100% rename from .claude/skills/common/workspace-management.md rename to .agents/skills/common/workspace-management.md diff --git a/.claude/skills/debug/SKILL.md b/.agents/skills/debug/SKILL.md similarity index 100% rename from .claude/skills/debug/SKILL.md rename to .agents/skills/debug/SKILL.md diff --git a/.claude/skills/deployment/SKILL.md b/.agents/skills/deployment/SKILL.md similarity index 100% rename from .claude/skills/deployment/SKILL.md rename to .agents/skills/deployment/SKILL.md diff --git a/.claude/skills/deployment/references/setup.md b/.agents/skills/deployment/references/setup.md similarity index 100% rename from .claude/skills/deployment/references/setup.md rename to .agents/skills/deployment/references/setup.md diff --git 
a/.claude/skills/deployment/references/sglang.md b/.agents/skills/deployment/references/sglang.md similarity index 100% rename from .claude/skills/deployment/references/sglang.md rename to .agents/skills/deployment/references/sglang.md diff --git a/.claude/skills/deployment/references/support-matrix.md b/.agents/skills/deployment/references/support-matrix.md similarity index 100% rename from .claude/skills/deployment/references/support-matrix.md rename to .agents/skills/deployment/references/support-matrix.md diff --git a/.claude/skills/deployment/references/trtllm.md b/.agents/skills/deployment/references/trtllm.md similarity index 100% rename from .claude/skills/deployment/references/trtllm.md rename to .agents/skills/deployment/references/trtllm.md diff --git a/.claude/skills/deployment/references/unsupported-models.md b/.agents/skills/deployment/references/unsupported-models.md similarity index 100% rename from .claude/skills/deployment/references/unsupported-models.md rename to .agents/skills/deployment/references/unsupported-models.md diff --git a/.claude/skills/deployment/references/vllm.md b/.agents/skills/deployment/references/vllm.md similarity index 100% rename from .claude/skills/deployment/references/vllm.md rename to .agents/skills/deployment/references/vllm.md diff --git a/.claude/skills/deployment/scripts/deploy.sh b/.agents/skills/deployment/scripts/deploy.sh similarity index 100% rename from .claude/skills/deployment/scripts/deploy.sh rename to .agents/skills/deployment/scripts/deploy.sh diff --git a/.claude/skills/deployment/tests/evals.json b/.agents/skills/deployment/tests/evals.json similarity index 100% rename from .claude/skills/deployment/tests/evals.json rename to .agents/skills/deployment/tests/evals.json diff --git a/.claude/skills/evaluation/SKILL.md b/.agents/skills/evaluation/SKILL.md similarity index 100% rename from .claude/skills/evaluation/SKILL.md rename to .agents/skills/evaluation/SKILL.md diff --git 
a/.claude/skills/evaluation/references/model-card-research.md b/.agents/skills/evaluation/references/model-card-research.md similarity index 100% rename from .claude/skills/evaluation/references/model-card-research.md rename to .agents/skills/evaluation/references/model-card-research.md diff --git a/.claude/skills/evaluation/references/multi-node.md b/.agents/skills/evaluation/references/multi-node.md similarity index 100% rename from .claude/skills/evaluation/references/multi-node.md rename to .agents/skills/evaluation/references/multi-node.md diff --git a/.claude/skills/evaluation/references/quantization-benchmarks.md b/.agents/skills/evaluation/references/quantization-benchmarks.md similarity index 100% rename from .claude/skills/evaluation/references/quantization-benchmarks.md rename to .agents/skills/evaluation/references/quantization-benchmarks.md diff --git a/.claude/skills/evaluation/tests/evals.json b/.agents/skills/evaluation/tests/evals.json similarity index 100% rename from .claude/skills/evaluation/tests/evals.json rename to .agents/skills/evaluation/tests/evals.json diff --git a/.claude/skills/launching-evals/SKILL.md b/.agents/skills/launching-evals/SKILL.md similarity index 100% rename from .claude/skills/launching-evals/SKILL.md rename to .agents/skills/launching-evals/SKILL.md diff --git a/.claude/skills/launching-evals/references/analyze-results.md b/.agents/skills/launching-evals/references/analyze-results.md similarity index 100% rename from .claude/skills/launching-evals/references/analyze-results.md rename to .agents/skills/launching-evals/references/analyze-results.md diff --git a/.claude/skills/launching-evals/references/benchmarks/swebench-general-info.md b/.agents/skills/launching-evals/references/benchmarks/swebench-general-info.md similarity index 100% rename from .claude/skills/launching-evals/references/benchmarks/swebench-general-info.md rename to .agents/skills/launching-evals/references/benchmarks/swebench-general-info.md diff 
--git a/.claude/skills/launching-evals/references/benchmarks/terminal-bench-general-info.md b/.agents/skills/launching-evals/references/benchmarks/terminal-bench-general-info.md similarity index 100% rename from .claude/skills/launching-evals/references/benchmarks/terminal-bench-general-info.md rename to .agents/skills/launching-evals/references/benchmarks/terminal-bench-general-info.md diff --git a/.claude/skills/launching-evals/references/benchmarks/terminal-bench-trace-analysis.md b/.agents/skills/launching-evals/references/benchmarks/terminal-bench-trace-analysis.md similarity index 100% rename from .claude/skills/launching-evals/references/benchmarks/terminal-bench-trace-analysis.md rename to .agents/skills/launching-evals/references/benchmarks/terminal-bench-trace-analysis.md diff --git a/.claude/skills/launching-evals/references/check-progress.md b/.agents/skills/launching-evals/references/check-progress.md similarity index 100% rename from .claude/skills/launching-evals/references/check-progress.md rename to .agents/skills/launching-evals/references/check-progress.md diff --git a/.claude/skills/launching-evals/references/debug-failed-runs.md b/.agents/skills/launching-evals/references/debug-failed-runs.md similarity index 100% rename from .claude/skills/launching-evals/references/debug-failed-runs.md rename to .agents/skills/launching-evals/references/debug-failed-runs.md diff --git a/.claude/skills/launching-evals/references/run-evaluation.md b/.agents/skills/launching-evals/references/run-evaluation.md similarity index 100% rename from .claude/skills/launching-evals/references/run-evaluation.md rename to .agents/skills/launching-evals/references/run-evaluation.md diff --git a/.claude/skills/launching-evals/tests.json b/.agents/skills/launching-evals/tests.json similarity index 100% rename from .claude/skills/launching-evals/tests.json rename to .agents/skills/launching-evals/tests.json diff --git a/.claude/skills/monitor/SKILL.md 
b/.agents/skills/monitor/SKILL.md similarity index 100% rename from .claude/skills/monitor/SKILL.md rename to .agents/skills/monitor/SKILL.md diff --git a/.claude/skills/ptq/SKILL.md b/.agents/skills/ptq/SKILL.md similarity index 100% rename from .claude/skills/ptq/SKILL.md rename to .agents/skills/ptq/SKILL.md diff --git a/.claude/skills/ptq/references/checkpoint-validation.md b/.agents/skills/ptq/references/checkpoint-validation.md similarity index 100% rename from .claude/skills/ptq/references/checkpoint-validation.md rename to .agents/skills/ptq/references/checkpoint-validation.md diff --git a/.claude/skills/ptq/references/launcher-guide.md b/.agents/skills/ptq/references/launcher-guide.md similarity index 100% rename from .claude/skills/ptq/references/launcher-guide.md rename to .agents/skills/ptq/references/launcher-guide.md diff --git a/.claude/skills/ptq/references/slurm-setup-ptq.md b/.agents/skills/ptq/references/slurm-setup-ptq.md similarity index 100% rename from .claude/skills/ptq/references/slurm-setup-ptq.md rename to .agents/skills/ptq/references/slurm-setup-ptq.md diff --git a/.claude/skills/ptq/references/unsupported-models.md b/.agents/skills/ptq/references/unsupported-models.md similarity index 100% rename from .claude/skills/ptq/references/unsupported-models.md rename to .agents/skills/ptq/references/unsupported-models.md diff --git a/.claude/skills/ptq/tests.json b/.agents/skills/ptq/tests.json similarity index 100% rename from .claude/skills/ptq/tests.json rename to .agents/skills/ptq/tests.json diff --git a/.claude/skills/release-cherry-pick/SKILL.md b/.agents/skills/release-cherry-pick/SKILL.md similarity index 100% rename from .claude/skills/release-cherry-pick/SKILL.md rename to .agents/skills/release-cherry-pick/SKILL.md diff --git a/.claude/clusters.yaml.example b/.claude/clusters.yaml.example deleted file mode 100644 index 5bf4182e5c2..00000000000 --- a/.claude/clusters.yaml.example +++ /dev/null @@ -1,18 +0,0 @@ -# ModelOpt Remote 
Cluster Configuration -# Copy to ~/.config/modelopt/clusters.yaml (user-level, recommended) -# or .claude/clusters.yaml (project-level, can be committed). - -clusters: - # GPU workstation or SLURM login node - my-cluster: - login_node: cluster-login.example.com - user: myusername - ssh_key: ~/.ssh/id_rsa - # ssh_proxy: "socat - PROXY:localhost:%h:%p,proxyport=3128" # optional - workspace: /path/to/remote/workdir - gpu_type: H100 # used for quantization format recommendation - # slurm: - # default_account: my_account - # default_partition: batch_short - -default_cluster: my-cluster diff --git a/.claude/clusters.yaml.example b/.claude/clusters.yaml.example new file mode 120000 index 00000000000..8cf2f9dd0ca --- /dev/null +++ b/.claude/clusters.yaml.example @@ -0,0 +1 @@ +../.agents/clusters.yaml.example \ No newline at end of file diff --git a/.claude/scripts b/.claude/scripts new file mode 120000 index 00000000000..026c3b766d4 --- /dev/null +++ b/.claude/scripts @@ -0,0 +1 @@ +../.agents/scripts \ No newline at end of file diff --git a/.claude/skills b/.claude/skills new file mode 120000 index 00000000000..2b7a412b8fa --- /dev/null +++ b/.claude/skills @@ -0,0 +1 @@ +../.agents/skills \ No newline at end of file diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml index de3bbba7b3e..86a0387160b 100644 --- a/.markdownlint-cli2.yaml +++ b/.markdownlint-cli2.yaml @@ -12,7 +12,7 @@ config: MD059: false # no-hard-tabs # Vendored upstream skills — kept byte-identical to upstream via -# .claude/scripts/sync-upstream-skills.sh; do not reformat. +# .agents/scripts/sync-upstream-skills.sh; do not reformat. 
ignores: - - ".claude/skills/launching-evals/**" - - ".claude/skills/accessing-mlflow/**" + - ".agents/skills/launching-evals/**" + - ".agents/skills/accessing-mlflow/**" diff --git a/CLAUDE.md b/CLAUDE.md index d0b47148c38..a810a40b59b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -9,6 +9,11 @@ Primarily Python codebase with optional C++/CUDA extensions supporting PyTorch, > If a `CLAUDE.local.md` file exists alongside this file, read and respect it — it contains > developer-specific overrides that supplement this shared guidance. +> **Skills live in `.agents/skills/`** — `.claude/skills/` is a symlink to +> `.agents/skills/` for back-compat. See `.agents/README.md` for the convention +> (used to share skills/scripts/cluster-config across Claude Code, Codex, Cursor, +> etc.). Always edit files under `.agents/`, not the symlink path. + ## Rules (Read First) **CRITICAL (YOU MUST):** From 28bc8fe676e1d117a86b2ec2dae33722b73ce2a7 Mon Sep 17 00:00:00 2001 From: Seonghee Lee Date: Tue, 28 Apr 2026 11:35:58 -0700 Subject: [PATCH 2/5] chore(skills): redact NVIDIA-internal references in vendored skills Surfaced by an internal-keyword scan over .agents/skills/. All four findings replaced with vendor-neutral wording: - launching-evals/SKILL.md: replace concrete Slurm account names (coreai_dlalgo_compeval / coreai_dlalgo_llm) used as the "PPP -> X" rename example with placeholders / . - launching-evals/SKILL.md: generalise the HF cache path from /lustre/fsw/portfolios/coreai/users//cache/huggingface to HF_HOME=, with a parenthetical note that lustre- style HPC clusters typically organise this under /lustre/...//users//... - launching-evals/references/debug-failed-runs.md: rephrase the "Drop ':5005' from GitLab container registry URLs" advice (port 5005 is the standard port for an on-prem GitLab container registry; the raw advice only made sense in that context) to a vendor-neutral "If the image is on an on-prem GitLab registry, drop the registry port suffix (e.g. 
':5005') from the URL." Applied at both occurrences. - common/slurm-setup.md: change the enroot/pyxis "Typical clusters" cell from "NVIDIA internal (DGX Cloud, EOS, Selene, GCP-NRT)" to "HPC clusters with container runtime (e.g. DGX Cloud and similar Slurm + container setups)" -- removes internal cluster codenames (EOS, Selene, GCP-NRT) and the "NVIDIA internal" label. Caveat: the three launching-evals/* files are vendored verbatim from NVIDIA-NeMo/Evaluator (per the provenance header injected by .agents/scripts/sync-upstream-skills.sh). The next sync will overwrite them. Follow-ups: (1) upstream MR against NVIDIA-NeMo/Evaluator, and/or (2) add a redaction post-process to sync-upstream-skills.sh so the scrub survives re-syncs. Signed-off-by: Seonghee Lee Made-with: Cursor Signed-off-by: Seonghee Lee --- .agents/skills/common/slurm-setup.md | 2 +- .agents/skills/launching-evals/SKILL.md | 4 ++-- .../skills/launching-evals/references/debug-failed-runs.md | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.agents/skills/common/slurm-setup.md b/.agents/skills/common/slurm-setup.md index f7d99c7543e..901a1ab9b2a 100644 --- a/.agents/skills/common/slurm-setup.md +++ b/.agents/skills/common/slurm-setup.md @@ -215,7 +215,7 @@ which docker 2>/dev/null && echo "RUNTIME=docker" | Runtime | Typical clusters | SLURM integration | | --- | --- | --- | -| **enroot/pyxis** | NVIDIA internal (DGX Cloud, EOS, Selene, GCP-NRT) | `srun --container-image` | +| **enroot/pyxis** | HPC clusters with container runtime (e.g. 
DGX Cloud and similar Slurm + container setups) | `srun --container-image` | | **Docker** | Bare-metal / on-prem with GPU | `docker run` inside job script | ### Step 2: Check credentials for the image's registry diff --git a/.agents/skills/launching-evals/SKILL.md b/.agents/skills/launching-evals/SKILL.md index adb40834899..d5a7250d865 100644 --- a/.agents/skills/launching-evals/SKILL.md +++ b/.agents/skills/launching-evals/SKILL.md @@ -62,9 +62,9 @@ The complete evaluation workflow is divided into the following steps you should # Key Facts - Benchmark-specific info learned during launching/analyzing evals should be added to `references/benchmarks/` -- **PPP** = Slurm account (the `account` field in cluster_config.yaml). When the user says "change PPP to X", update the account value (e.g., `coreai_dlalgo_compeval` → `coreai_dlalgo_llm`). +- **PPP** = Slurm account / project portfolio code (the `account` field in cluster_config.yaml). When the user says "change PPP to X", update the account value (e.g., `` → ``). - **Slurm job pairs**: NEL (nemo-evaluator-launcher) submits paired Slurm jobs — a RUNNING job + a PENDING restart job (for when the 4h walltime expires). Never cancel the pending restart jobs — they are expected and necessary. -- **HF cache requirement**: For configs with `HF_HUB_OFFLINE=1`, models must be pre-downloaded to the HF cache on each cluster before launching. **Before running a model on a new cluster, always ask the user if the model is already cached there.** If not, on the cluster login node: `python3 -m venv hf_cli && source hf_cli/bin/activate && pip install huggingface_hub` then `HF_HOME=/lustre/fsw/portfolios/coreai/users//cache/huggingface hf download `. Without this, vLLM will fail with `LocalEntryNotFoundError`. +- **HF cache requirement**: For configs with `HF_HUB_OFFLINE=1`, models must be pre-downloaded to the HF cache on each cluster before launching. 
**Before running a model on a new cluster, always ask the user if the model is already cached there.** If not, on the cluster login node: `python3 -m venv hf_cli && source hf_cli/bin/activate && pip install huggingface_hub` then `HF_HOME= hf download ` (on lustre-style HPC clusters this is typically under `/lustre/...//users//cache/huggingface`). Without this, vLLM will fail with `LocalEntryNotFoundError`. - **`data_parallel_size` is per node**: `dp_size=1` with `num_nodes=8` means 8 model instances total (one per node), load-balanced by haproxy. Do NOT interpret `dp_size` as the global replica count. - **`payload_modifier` interceptor**: The `params_to_remove` list (e.g. `[max_tokens, max_completion_tokens]`) strips those fields from the outgoing payload, intentionally lifting output length limits so reasoning models can think as long as they need. - **Auto-export git workaround**: The export container (`python:3.12-slim`) lacks `git`. When installing the launcher from a git URL, set `auto_export.launcher_install_cmd` to install git first (e.g., `apt-get update -qq && apt-get install -qq -y git && pip install "nemo-evaluator-launcher[all] @ git+...#subdirectory=packages/nemo-evaluator-launcher"`). diff --git a/.agents/skills/launching-evals/references/debug-failed-runs.md b/.agents/skills/launching-evals/references/debug-failed-runs.md index e94d3bb89f8..5783dc7ac86 100644 --- a/.agents/skills/launching-evals/references/debug-failed-runs.md +++ b/.agents/skills/launching-evals/references/debug-failed-runs.md @@ -70,7 +70,7 @@ tail -200 $LOGS/client-*.log - **CUDA OOM**: Increase `deployment.tensor_parallel_size` to shard across more GPUs. For multi-node: increase `execution.num_nodes` and set `deployment.pipeline_parallel_size`. As last resort: add `--max-model-len ` to `deployment.extra_args`. Do NOT quantize as a first fix — scale compute instead. 
- **Missing model/checkpoint**: `FileNotFoundError` or `RepositoryNotFoundError` or `GatedRepoError: 403` — verify `deployment.checkpoint_path` or `deployment.hf_model_handle`. For gated models, set `HF_TOKEN` via `deployment.env_vars`. - **Bad `extra_args`**: `unrecognized arguments` or `unexpected keyword argument` — check flags against deployment engine version. Some flags change between versions (e.g., `--rope-scaling` removed in vLLM > 0.11.0). -- **Image pull failure**: `manifest not found` or `pyxis: child 1 failed` — verify image tag exists. Drop `:5005` from GitLab container registry URLs. +- **Image pull failure**: `manifest not found` or `pyxis: child 1 failed` — verify image tag exists. If the image is on an on-prem GitLab registry, drop the registry port suffix (e.g. `:5005`) from the URL. - **GPU driver mismatch**: `CUDA driver version is insufficient` — use an older container image matching the host CUDA driver. - **Health check timeout / connection refused**: Server didn't start — check server logs first. Increase `execution.endpoint_readiness_timeout` (seconds). SLURM default: `null` (falls back to walltime). - **Server crashed mid-eval**: `Connection reset by peer` — check server logs for OOM. Reduce `parallelism` (concurrent requests). Check SLURM logs for preemption or walltime exceeded. @@ -80,7 +80,7 @@ tail -200 $LOGS/client-*.log - **Config validation**: `MissingMandatoryValue` (unfilled `???`), `ValidationError` (type mismatch), `ScannerError` (invalid YAML) — run `--dry-run` to catch these upfront. - **Walltime exceeded**: `CANCELLED DUE TO TIME LIMIT` — NEL submits paired restart jobs that automatically resume when walltime expires, so this is often expected behavior, not a failure. Only increase `execution.walltime` if the evaluation isn't making progress across restarts. - **Preemption**: `CANCELLED DUE TO PREEMPTION` — the paired restart job should automatically resume. If it doesn't, use non-preemptible partition, or re-run. 
-- **Container not found**: Applies to both `deployment.image` and task-level eval container. Drop `:5005` from GitLab registry URLs. +- **Container not found**: Applies to both `deployment.image` and task-level eval container. For on-prem GitLab registries, drop the registry port suffix (e.g. `:5005`) from the URL. - Troubleshooting docs: list files with WebFetch `https://api.github.com/repos/NVIDIA-NeMo/Evaluator/contents/docs/troubleshooting`, then fetch relevant ones from `https://raw.githubusercontent.com/NVIDIA-NeMo/Evaluator/main/docs/troubleshooting/` **Fix Slurm invalid account/partition:** From b9a5fb2ea26d2207242a60803e5e3c62114b3287 Mon Sep 17 00:00:00 2001 From: Seonghee Lee Date: Wed, 29 Apr 2026 13:22:23 -0700 Subject: [PATCH 3/5] Update remote-execution.md to match the new lookup order. Signed-off-by: Seonghee Lee --- .agents/skills/common/remote-execution.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.agents/skills/common/remote-execution.md b/.agents/skills/common/remote-execution.md index be770aef936..b98462ef737 100644 --- a/.agents/skills/common/remote-execution.md +++ b/.agents/skills/common/remote-execution.md @@ -9,8 +9,9 @@ Read this when Claude Code runs on a different machine than the target GPU clust Config locations (checked in order, first found wins): 1. `~/.config/modelopt/clusters.yaml` — user-level (not committed, recommended) -2. `.claude/clusters.yaml` — project-level (can be committed for shared defaults) -3. Interactive input — if neither file exists, ask the user (see SKILL.md Step 0) and write `~/.config/modelopt/clusters.yaml` before proceeding +2. `.agents/clusters.yaml` — project-level, canonical (can be committed for shared defaults) +3. `.claude/clusters.yaml` — project-level, back-compat +4. 
Interactive input — if no file exists, ask the user (see SKILL.md Step 0) and write `~/.config/modelopt/clusters.yaml` before proceeding ```yaml clusters: @@ -38,7 +39,7 @@ rsync -av /path/to/local/checkpoint :/checkpoi Use the `workspace` path from your cluster config as the destination. Compute nodes on a given cluster share the same storage as its login node, so once staged, the path works everywhere on that cluster. -See `.claude/clusters.yaml.example` for a fully annotated example with multiple cluster types. +See `.agents/clusters.yaml.example` for a fully annotated example with multiple cluster types. --- @@ -153,5 +154,5 @@ remote_sync_from /local/output/ ## Reference Files - **`skills/common/remote_exec.sh`** — Full utility library (session, run, sync, SLURM, Docker helpers) -- **`.claude/clusters.yaml`** — Active cluster configuration -- **`.claude/clusters.yaml.example`** — Annotated example config +- **`.agents/clusters.yaml`** — Active cluster configuration (canonical; `.claude/clusters.yaml` also accepted for back-compat) +- **`.agents/clusters.yaml.example`** — Annotated example config From 1e5664d2d15b8f4f9c678a0bad61c893d64bd72a Mon Sep 17 00:00:00 2001 From: Seonghee Lee Date: Thu, 30 Apr 2026 11:06:10 -0700 Subject: [PATCH 4/5] fix wrong path link in documentation Signed-off-by: Seonghee Lee --- .agents/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.agents/README.md b/.agents/README.md index d5798b5a853..f4a8cebf4e7 100644 --- a/.agents/README.md +++ b/.agents/README.md @@ -49,7 +49,7 @@ git add .codex/skills - Vendored-verbatim skills (`launching-evals`, `accessing-mlflow`) are managed by `.agents/scripts/sync-upstream-skills.sh` — do not modify by hand. - New skills go in `.agents/skills/<name>/SKILL.md` following the - conventions documented in [`.cursor/skills-cursor/create-skill/SKILL.md`](https://docs.anthropic.com/) (or your agent's equivalent). + conventions of existing skills (e.g. 
`.agents/skills/monitor/SKILL.md`). ## Project-level cluster config From 510f36da4921e641322201d1c045338fca565f75 Mon Sep 17 00:00:00 2001 From: Seonghee Lee Date: Thu, 30 Apr 2026 11:15:46 -0700 Subject: [PATCH 5/5] removing symlinks Signed-off-by: Seonghee Lee --- .agents/README.md | 41 +++++------------------------------ .claude/clusters.yaml.example | 1 - .claude/scripts | 1 - .claude/skills | 1 - 4 files changed, 5 insertions(+), 39 deletions(-) delete mode 120000 .claude/clusters.yaml.example delete mode 120000 .claude/scripts delete mode 120000 .claude/skills diff --git a/.agents/README.md b/.agents/README.md index f4a8cebf4e7..bd522ead05c 100644 --- a/.agents/README.md +++ b/.agents/README.md @@ -15,37 +15,14 @@ working in this repository (Claude Code, Codex, Cursor, …). ## Why this exists -Different agents look for skills/config in vendor-specific directories: - -| Agent | Default location | -|-------------|-------------------------------| -| Claude Code | `.claude/skills/` | -| Codex | `.codex/skills/` | -| Cursor | `.cursor/skills/` | - -Maintaining N copies of the same skill is a non-starter. Instead, **`.agents/` -is the single source of truth**, and each vendor directory is a symlink: - -```text -.claude/skills -> ../.agents/skills -.claude/scripts -> ../.agents/scripts -.claude/clusters.yaml.example -> ../.agents/clusters.yaml.example -``` - -To add support for a new agent, create a directory with the symlinks that -agent expects, e.g.: - -```bash -mkdir -p .codex -ln -s ../.agents/skills .codex/skills -git add .codex/skills -``` +Different agents look for skills/config in vendor-specific directories. Rather +than maintaining N copies that drift out of sync, **`.agents/` is the single +source of truth** — each agent's guidance or install mechanism points here +directly. ## Editing rules -- **Always edit files under `.agents/`**, never under the vendor symlink paths. 
Edits via the symlink work, but the diff will look like changes to - `.agents/...` either way; editing the canonical path makes that explicit. +- **Always edit files under `.agents/`**. - Vendored-verbatim skills (`launching-evals`, `accessing-mlflow`) are managed by `.agents/scripts/sync-upstream-skills.sh` — do not modify by hand. - New skills go in `.agents/skills/<name>/SKILL.md` following the @@ -60,11 +37,3 @@ The remote-execution skills look for a `clusters.yaml` at, in order: 3. `<repo>/.claude/clusters.yaml` (project-level, back-compat) See `clusters.yaml.example` for the schema. - -## A note on Windows - -Git stores symlinks portably, but Windows requires either Developer Mode or -`git config --global core.symlinks true` plus admin rights for them to -materialise correctly. If you're on Windows and skills aren't being picked -up under `.claude/skills/`, that's the most likely cause — `.agents/skills/` -will still work directly. diff --git a/.claude/clusters.yaml.example b/.claude/clusters.yaml.example deleted file mode 120000 index 8cf2f9dd0ca..00000000000 --- a/.claude/clusters.yaml.example +++ /dev/null @@ -1 +0,0 @@ -../.agents/clusters.yaml.example \ No newline at end of file diff --git a/.claude/scripts b/.claude/scripts deleted file mode 120000 index 026c3b766d4..00000000000 --- a/.claude/scripts +++ /dev/null @@ -1 +0,0 @@ -../.agents/scripts \ No newline at end of file diff --git a/.claude/skills b/.claude/skills deleted file mode 120000 index 2b7a412b8fa..00000000000 --- a/.claude/skills +++ /dev/null @@ -1 +0,0 @@ -../.agents/skills \ No newline at end of file