From b42544dbe9db94309cde476d40f61029abab2737 Mon Sep 17 00:00:00 2001 From: Kai Xu Date: Mon, 27 Apr 2026 14:53:46 -0700 Subject: [PATCH 1/4] Add pre-built evaluation recipes for common benchmarks Signed-off-by: Kai Xu --- .claude/skills/evaluation/SKILL.md | 2 + .../recipes/examples/example_eval.yaml | 108 ++++++++++++++++++ .../evaluation/recipes/tasks/aime2025.yaml | 61 ++++++++++ .../skills/evaluation/recipes/tasks/gpqa.yaml | 62 ++++++++++ .../evaluation/recipes/tasks/ifbench.yaml | 61 ++++++++++ .../recipes/tasks/livecodebench.yaml | 62 ++++++++++ .../skills/evaluation/recipes/tasks/mmlu.yaml | 53 +++++++++ .../evaluation/recipes/tasks/mmlu_pro.yaml | 46 ++++++++ .../evaluation/recipes/tasks/scicode.yaml | 61 ++++++++++ 9 files changed, 516 insertions(+) create mode 100644 .claude/skills/evaluation/recipes/examples/example_eval.yaml create mode 100644 .claude/skills/evaluation/recipes/tasks/aime2025.yaml create mode 100644 .claude/skills/evaluation/recipes/tasks/gpqa.yaml create mode 100644 .claude/skills/evaluation/recipes/tasks/ifbench.yaml create mode 100644 .claude/skills/evaluation/recipes/tasks/livecodebench.yaml create mode 100644 .claude/skills/evaluation/recipes/tasks/mmlu.yaml create mode 100644 .claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml create mode 100644 .claude/skills/evaluation/recipes/tasks/scicode.yaml diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md index 1dd8aa27067..1ceff205e8e 100644 --- a/.claude/skills/evaluation/SKILL.md +++ b/.claude/skills/evaluation/SKILL.md @@ -40,6 +40,8 @@ Test that `nel` is installed with `nel --version`. If not, instruct the user to If the user already has a config file (e.g., "run this config", "evaluate with my-config.yaml"), skip to Step 8. Optionally review it for common issues (missing `???` values, quantization flags) before running. 
+**Shortcut: use a pre-built recipe.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching recipe. Available: mmlu, mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. If found, skip Steps 2-5 — go directly to the recipe, fill in deployment overrides, and proceed to Step 7.5/8. + **Step 2: Build the base config file** Prompt the user with "I'll ask you 5 questions to build the base config we'll adjust in the next steps". Guide the user through the 5 questions using AskUserQuestion: diff --git a/.claude/skills/evaluation/recipes/examples/example_eval.yaml b/.claude/skills/evaluation/recipes/examples/example_eval.yaml new file mode 100644 index 00000000000..b7f68bd7f1c --- /dev/null +++ b/.claude/skills/evaluation/recipes/examples/example_eval.yaml @@ -0,0 +1,108 @@ +# Example: Quantization Validation Suite +# +# A balanced set of benchmarks for validating quantized model quality. +# Copy this file and customize for your needs. +# +# Includes: +# - MMLU-Pro (knowledge, completions) +# - GPQA Diamond (reasoning, chat, 5 repeats) +# - LiveCodeBench v6 (code, chat, 3 repeats) +# - IFBench (instruction following, chat, 8 repeats) +# +# Usage: +# nel run --config recipes/examples/example_eval.yaml \ +# -o deployment.checkpoint_path=/path/to/quantized/checkpoint \ +# -o deployment.served_model_name=my-model-nvfp4 \ +# -o execution.hostname= \ +# -o execution.account= \ +# -o execution.output_dir=/path/to/output +# +# For quantized checkpoints, also add the quantization flag: +# -o 'deployment.extra_args=--max-model-len 32768 --trust-remote-code --quantization modelopt_fp4' +# +# Run a single task: +# nel run --config ... -t ns_gpqa +# +# Smoke test (2 samples): +# nel run --config ... -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=2 +defaults: + - execution: slurm/default + - deployment: vllm + - _self_ +execution: + hostname: ??? 
+ username: ${oc.env:USER} + account: ??? + output_dir: ??? + walltime: "04:00:00" + mounts: + mount_home: false +deployment: + env_vars: + HF_TOKEN: host:HF_TOKEN + checkpoint_path: ??? + hf_model_handle: + served_model_name: ??? + tensor_parallel_size: 1 + data_parallel_size: 1 + # For models with custom code, add: --trust-remote-code + extra_args: --max-model-len 32768 +evaluation: + env_vars: + HF_TOKEN: host:HF_TOKEN + nemo_evaluator_config: + config: + params: + request_timeout: 3600 + max_retries: 10 + parallelism: 16 + target: + api_endpoint: + api_key_name: DUMMY_API_KEY + tasks: + # Knowledge (completions endpoint, short) + - name: adlr_mmlu_pro_5_shot_base + + # Reasoning (chat endpoint, 5 repeats, short) + - name: ns_gpqa + nemo_evaluator_config: + config: + params: + extra: + args: ++prompt_config=eval/aai/mcq-4choices + num_repeats: 5 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens + + # Code (chat endpoint, 3 repeats, medium) + - name: ns_livecodebench + nemo_evaluator_config: + config: + params: + extra: + dataset_split: test_v6_2408_2505 + num_repeats: 3 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens + + # Instruction following (chat endpoint, 8 repeats, super short) + - name: ns_ifbench + nemo_evaluator_config: + config: + params: + extra: + num_repeats: 8 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml new file mode 100644 index 00000000000..07c242079a0 --- /dev/null +++ b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml @@ -0,0 +1,61 @@ +# AIME 2025 (NeMo Skills, chat) +# +# Math competition benchmark. Uses the chat endpoint. 
+# Primary metric: pass@1[avg-of-16] symbolic_correct +# Run time: Long (reasoning models generate lengthy thinking traces) +# Repeats: 16 +# +# Usage: +# nel run --config recipes/tasks/aime2025.yaml \ +# -o deployment.checkpoint_path=/path/to/checkpoint \ +# -o execution.hostname= \ +# -o execution.account= \ +# -o execution.output_dir=/path/to/output \ +# -o deployment.served_model_name= +defaults: + - execution: slurm/default + - deployment: vllm + - _self_ +execution: + hostname: ??? + username: ${oc.env:USER} + account: ??? + output_dir: ??? + walltime: "04:00:00" + mounts: + mount_home: false +deployment: + env_vars: + HF_TOKEN: host:HF_TOKEN + checkpoint_path: ??? + hf_model_handle: + served_model_name: ??? + tensor_parallel_size: 1 + data_parallel_size: 1 + # For models with custom code, add: --trust-remote-code + extra_args: --max-model-len 32768 +evaluation: + env_vars: + HF_TOKEN: host:HF_TOKEN + nemo_evaluator_config: + config: + params: + request_timeout: 100000 + max_retries: 10 + parallelism: 16 + target: + api_endpoint: + api_key_name: DUMMY_API_KEY + tasks: + - name: ns_aime2025 + nemo_evaluator_config: + config: + params: + extra: + num_repeats: 16 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.yaml b/.claude/skills/evaluation/recipes/tasks/gpqa.yaml new file mode 100644 index 00000000000..ba8cf2a720a --- /dev/null +++ b/.claude/skills/evaluation/recipes/tasks/gpqa.yaml @@ -0,0 +1,62 @@ +# GPQA Diamond (NeMo Skills, chat) +# +# Graduate-level reasoning benchmark. Uses the chat endpoint. 
+# Primary metric: pass@1[avg-of-5] symbolic_correct +# Run time: Short +# Repeats: 5 +# +# Usage: +# nel run --config recipes/tasks/gpqa.yaml \ +# -o deployment.checkpoint_path=/path/to/checkpoint \ +# -o execution.hostname= \ +# -o execution.account= \ +# -o execution.output_dir=/path/to/output \ +# -o deployment.served_model_name= +defaults: + - execution: slurm/default + - deployment: vllm + - _self_ +execution: + hostname: ??? + username: ${oc.env:USER} + account: ??? + output_dir: ??? + walltime: "02:00:00" + mounts: + mount_home: false +deployment: + env_vars: + HF_TOKEN: host:HF_TOKEN + checkpoint_path: ??? + hf_model_handle: + served_model_name: ??? + tensor_parallel_size: 1 + data_parallel_size: 1 + # For models with custom code, add: --trust-remote-code + extra_args: --max-model-len 32768 +evaluation: + env_vars: + HF_TOKEN: host:HF_TOKEN + nemo_evaluator_config: + config: + params: + request_timeout: 3600 + max_retries: 5 + parallelism: 16 + target: + api_endpoint: + api_key_name: DUMMY_API_KEY + tasks: + - name: ns_gpqa + nemo_evaluator_config: + config: + params: + extra: + args: ++prompt_config=eval/aai/mcq-4choices + num_repeats: 5 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.yaml b/.claude/skills/evaluation/recipes/tasks/ifbench.yaml new file mode 100644 index 00000000000..0876c332e7a --- /dev/null +++ b/.claude/skills/evaluation/recipes/tasks/ifbench.yaml @@ -0,0 +1,61 @@ +# IFBench (NeMo Skills, chat) +# +# Instruction following benchmark. Uses the chat endpoint. 
+# Primary metric: pass@1[avg-of-8] prompt_strict_accuracy +# Run time: Super Short +# Repeats: 8 +# +# Usage: +# nel run --config recipes/tasks/ifbench.yaml \ +# -o deployment.checkpoint_path=/path/to/checkpoint \ +# -o execution.hostname= \ +# -o execution.account= \ +# -o execution.output_dir=/path/to/output \ +# -o deployment.served_model_name= +defaults: + - execution: slurm/default + - deployment: vllm + - _self_ +execution: + hostname: ??? + username: ${oc.env:USER} + account: ??? + output_dir: ??? + walltime: "02:00:00" + mounts: + mount_home: false +deployment: + env_vars: + HF_TOKEN: host:HF_TOKEN + checkpoint_path: ??? + hf_model_handle: + served_model_name: ??? + tensor_parallel_size: 1 + data_parallel_size: 1 + # For models with custom code, add: --trust-remote-code + extra_args: --max-model-len 32768 +evaluation: + env_vars: + HF_TOKEN: host:HF_TOKEN + nemo_evaluator_config: + config: + params: + request_timeout: 3600 + max_retries: 5 + parallelism: 16 + target: + api_endpoint: + api_key_name: DUMMY_API_KEY + tasks: + - name: ns_ifbench + nemo_evaluator_config: + config: + params: + extra: + num_repeats: 8 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens diff --git a/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml b/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml new file mode 100644 index 00000000000..b56b500df83 --- /dev/null +++ b/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml @@ -0,0 +1,62 @@ +# LiveCodeBench v6 (NeMo Skills, chat) +# +# Code generation benchmark with recent problems. Uses the chat endpoint. 
+# Primary metric: pass@1[avg-of-3] accuracy +# Run time: Medium +# Repeats: 3 +# +# Usage: +# nel run --config recipes/tasks/livecodebench.yaml \ +# -o deployment.checkpoint_path=/path/to/checkpoint \ +# -o execution.hostname= \ +# -o execution.account= \ +# -o execution.output_dir=/path/to/output \ +# -o deployment.served_model_name= +defaults: + - execution: slurm/default + - deployment: vllm + - _self_ +execution: + hostname: ??? + username: ${oc.env:USER} + account: ??? + output_dir: ??? + walltime: "04:00:00" + mounts: + mount_home: false +deployment: + env_vars: + HF_TOKEN: host:HF_TOKEN + checkpoint_path: ??? + hf_model_handle: + served_model_name: ??? + tensor_parallel_size: 1 + data_parallel_size: 1 + # For models with custom code, add: --trust-remote-code + extra_args: --max-model-len 32768 +evaluation: + env_vars: + HF_TOKEN: host:HF_TOKEN + nemo_evaluator_config: + config: + params: + request_timeout: 3600 + max_retries: 10 + parallelism: 16 + target: + api_endpoint: + api_key_name: DUMMY_API_KEY + tasks: + - name: ns_livecodebench + nemo_evaluator_config: + config: + params: + extra: + dataset_split: test_v6_2408_2505 + num_repeats: 3 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu.yaml b/.claude/skills/evaluation/recipes/tasks/mmlu.yaml new file mode 100644 index 00000000000..cccbe5ff78c --- /dev/null +++ b/.claude/skills/evaluation/recipes/tasks/mmlu.yaml @@ -0,0 +1,53 @@ +# MMLU (ADLR, completions) +# +# Massive Multitask Language Understanding. Uses the completions endpoint. 
+# Primary metric: exact_match +# Run time: Short +# +# Usage: +# nel run --config recipes/tasks/mmlu.yaml \ +# -o deployment.checkpoint_path=/path/to/checkpoint \ +# -o execution.hostname= \ +# -o execution.account= \ +# -o execution.output_dir=/path/to/output \ +# -o deployment.served_model_name= +defaults: + - execution: slurm/default + - deployment: vllm + - _self_ +execution: + hostname: ??? + username: ${oc.env:USER} + account: ??? + output_dir: ??? + walltime: "02:00:00" + mounts: + mount_home: false +deployment: + env_vars: + HF_TOKEN: host:HF_TOKEN + checkpoint_path: ??? + hf_model_handle: + served_model_name: ??? + tensor_parallel_size: 1 + data_parallel_size: 1 + # For models with custom code, add: --trust-remote-code + extra_args: --max-model-len 32768 +evaluation: + env_vars: + HF_TOKEN: host:HF_TOKEN + nemo_evaluator_config: + config: + params: + request_timeout: 3600 + max_retries: 5 + parallelism: 16 + tasks: + - name: adlr_mmlu + nemo_evaluator_config: + config: + params: + max_new_tokens: 2 + target: + api_endpoint: + type: completions diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml new file mode 100644 index 00000000000..749cba572f6 --- /dev/null +++ b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml @@ -0,0 +1,46 @@ +# MMLU-Pro (5-shot, base/completions) +# +# Knowledge benchmark. Uses the completions endpoint (not chat). +# Primary metric: exact_match +# Run time: Short +# +# Usage: +# nel run --config recipes/tasks/mmlu_pro.yaml \ +# -o deployment.checkpoint_path=/path/to/checkpoint \ +# -o execution.hostname= \ +# -o execution.account= \ +# -o execution.output_dir=/path/to/output \ +# -o deployment.served_model_name= +defaults: + - execution: slurm/default + - deployment: vllm + - _self_ +execution: + hostname: ??? + username: ${oc.env:USER} + account: ??? + output_dir: ??? 
+ walltime: "02:00:00" + mounts: + mount_home: false +deployment: + env_vars: + HF_TOKEN: host:HF_TOKEN + checkpoint_path: ??? + hf_model_handle: + served_model_name: ??? + tensor_parallel_size: 1 + data_parallel_size: 1 + # For models with custom code, add: --trust-remote-code + extra_args: --max-model-len 32768 +evaluation: + env_vars: + HF_TOKEN: host:HF_TOKEN + nemo_evaluator_config: + config: + params: + request_timeout: 3600 + max_retries: 5 + parallelism: 16 + tasks: + - name: adlr_mmlu_pro_5_shot_base diff --git a/.claude/skills/evaluation/recipes/tasks/scicode.yaml b/.claude/skills/evaluation/recipes/tasks/scicode.yaml new file mode 100644 index 00000000000..27bdd39eb53 --- /dev/null +++ b/.claude/skills/evaluation/recipes/tasks/scicode.yaml @@ -0,0 +1,61 @@ +# SciCode (NeMo Skills, chat) +# +# Science + code benchmark. Uses the chat endpoint. +# Primary metric: pass@1[avg-of-3] subtask_accuracy +# Run time: Long +# Repeats: 3 +# +# Usage: +# nel run --config recipes/tasks/scicode.yaml \ +# -o deployment.checkpoint_path=/path/to/checkpoint \ +# -o execution.hostname= \ +# -o execution.account= \ +# -o execution.output_dir=/path/to/output \ +# -o deployment.served_model_name= +defaults: + - execution: slurm/default + - deployment: vllm + - _self_ +execution: + hostname: ??? + username: ${oc.env:USER} + account: ??? + output_dir: ??? + walltime: "04:00:00" + mounts: + mount_home: false +deployment: + env_vars: + HF_TOKEN: host:HF_TOKEN + checkpoint_path: ??? + hf_model_handle: + served_model_name: ??? 
+ tensor_parallel_size: 1 + data_parallel_size: 1 + # For models with custom code, add: --trust-remote-code + extra_args: --max-model-len 32768 +evaluation: + env_vars: + HF_TOKEN: host:HF_TOKEN + nemo_evaluator_config: + config: + params: + request_timeout: 3600 + max_retries: 10 + parallelism: 16 + target: + api_endpoint: + api_key_name: DUMMY_API_KEY + tasks: + - name: ns_scicode + nemo_evaluator_config: + config: + params: + extra: + num_repeats: 3 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens From 585a7a154b22dc2e08f10e67e5cec378132a99ac Mon Sep 17 00:00:00 2001 From: Kai Xu Date: Wed, 29 Apr 2026 14:42:52 -0700 Subject: [PATCH 2/4] Auto-detect deployment settings from checkpoint Signed-off-by: Kai Xu --- .claude/skills/evaluation/SKILL.md | 25 ++++++++++++++++++- .../evaluation/recipes/tasks/aime2025.yaml | 4 +++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md index 1ceff205e8e..f458afdc8ea 100644 --- a/.claude/skills/evaluation/SKILL.md +++ b/.claude/skills/evaluation/SKILL.md @@ -40,7 +40,7 @@ Test that `nel` is installed with `nel --version`. If not, instruct the user to If the user already has a config file (e.g., "run this config", "evaluate with my-config.yaml"), skip to Step 8. Optionally review it for common issues (missing `???` values, quantization flags) before running. -**Shortcut: use a pre-built recipe.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching recipe. Available: mmlu, mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. If found, skip Steps 2-5 — go directly to the recipe, fill in deployment overrides, and proceed to Step 7.5/8. 
+**Shortcut: use a pre-built recipe.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching recipe. Available: mmlu, mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. If found, skip Steps 2 and 5 (config generation and task confirmation), but still do Step 3 (auto-detect model settings from checkpoint) and Step 4 (fill in required `???` values from user input), then proceed to Step 7.5/8. **Step 2: Build the base config file** @@ -125,6 +125,29 @@ If no `hf_quant_config.json`, also check `config.json` for a `quantization_confi > **Note:** Some models require additional env vars for deployment (e.g., `VLLM_NVFP4_GEMM_BACKEND=marlin` for Nemotron Super). These are not in `hf_quant_config.json` — they are discovered during model card research below. +**Auto-detect deployment settings from checkpoint:** + +Read `config.json` from the checkpoint (or HF model card) and build `deployment.extra_args` dynamically: + +```bash +cat <checkpoint_path>/config.json 2>/dev/null +``` + +| Field in `config.json` | What to set | Example | +| --- | --- | --- | +| `max_position_embeddings` | `--max-model-len <value>` | `131072` → `--max-model-len 131072` | +| `auto_map` exists | `--trust-remote-code` | Only add if model has custom code | + +Then use WebSearch to check the model card (HuggingFace page) for deployment-specific settings: + +| Model card signal | What to set | +| --- | --- | +| Reasoning model (thinking/CoT) | `--reasoning-parser` and `--reasoning-parser-plugin` if a custom parser is provided | +| Tool-calling support | `--enable-auto-tool-choice --tool-call-parser <parser>` | +| Custom vLLM flags documented | Add as specified (e.g., `--mamba_ssm_cache_dtype float32`) | + +Combine all detected flags into a single `deployment.extra_args` override. The recipe's default `--max-model-len 32768` is a fallback — always prefer the value from `config.json`.
+ **Quantization-aware benchmark defaults:** When a quantized checkpoint is detected, read `references/quantization-benchmarks.md` for benchmark sensitivity rankings and recommended sets. Present recommendations to the user and ask which to include. diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml index 07c242079a0..a34bef675c3 100644 --- a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml +++ b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml @@ -5,6 +5,10 @@ # Run time: Long (reasoning models generate lengthy thinking traces) # Repeats: 16 # +# Note: The AA variant (simple_evals.AIME_2025) requires JUDGE_API_KEY +# for LLM-based scoring. This NeMo Skills variant uses symbolic scoring +# and does not require external API keys. +# # Usage: # nel run --config recipes/tasks/aime2025.yaml \ # -o deployment.checkpoint_path=/path/to/checkpoint \ From 0f608a895cf16044cddcbbc6271bed4f68958361 Mon Sep 17 00:00:00 2001 From: Kai Xu Date: Wed, 29 Apr 2026 14:52:35 -0700 Subject: [PATCH 3/4] Add env.example with all possible API keys Signed-off-by: Kai Xu --- .claude/skills/evaluation/SKILL.md | 8 ++++- .claude/skills/evaluation/recipes/env.example | 29 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 .claude/skills/evaluation/recipes/env.example diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md index f458afdc8ea..871396d3dd6 100644 --- a/.claude/skills/evaluation/SKILL.md +++ b/.claude/skills/evaluation/SKILL.md @@ -243,7 +243,13 @@ ssh "grep -E '^\s*machine\s+' ~/.config/enroot/.credentials 2>/dev/null" Print the following commands to the user. Propose to execute them in order to confirm the config works as expected before the full run. -**Important**: Export required environment variables based on your config. If any tokens or keys are missing (e.g. 
`HF_TOKEN`, `NGC_API_KEY`, `api_key_name` from the config), ask the user to put them in a `.env` file in the project root so you can run `set -a && source .env && set +a` (or equivalent) before executing `nel run` commands. +**Important**: Export required environment variables based on your config. If any tokens or keys are missing, point the user to `recipes/env.example` — it lists all possible keys with notes on which tasks need them. Ask the user to copy it, fill in their keys, and source it: + +```bash +cp recipes/env.example .env +# Edit .env with your keys +set -a && source .env && set +a +``` ```bash # If using pre_cmd or post_cmd (review pre_cmd content before enabling — it runs arbitrary commands): diff --git a/.claude/skills/evaluation/recipes/env.example b/.claude/skills/evaluation/recipes/env.example new file mode 100644 index 00000000000..8d9b9bfa6d9 --- /dev/null +++ b/.claude/skills/evaluation/recipes/env.example @@ -0,0 +1,29 @@ +# Evaluation API Keys +# +# Copy this file and fill in the keys you need: +# cp recipes/env.example .env +# # Edit .env with your keys +# set -a && source .env && set +a +# +# Not all keys are required — only fill in what your tasks need. + +# Required for all tasks (model/dataset downloads) +HF_TOKEN=hf_... 
+ +# Required for nemo_skills.* tasks (dummy value, not a real key) +DUMMY_API_KEY=dummy + +# Only needed when the config uses pre_cmd/post_cmd; these run arbitrary commands, so review them before enabling +NEMO_EVALUATOR_TRUST_PRE_CMD=1 + +# --- Optional: task-specific keys --- + +# AIME 2025 (simple_evals variant only, not ns_aime2025) +# JUDGE_API_KEY= + +# tau2_bench_telecom (LLM judge) +# JUDGE_API_KEY_NVDEV_QWEN235B= + +# terminal-bench-hard (AWS sandbox) +# AWS_ACCESS_KEY_ID= +# AWS_SECRET_ACCESS_KEY= From 4e5db923d1e43bf608e7a53c641a077d87811092 Mon Sep 17 00:00:00 2001 From: Kai Xu Date: Wed, 29 Apr 2026 16:53:33 -0700 Subject: [PATCH 4/4] Strip each task file to just the task config, and create one shared base config Signed-off-by: Kai Xu --- .claude/skills/evaluation/SKILL.md | 8 +- .../recipes/examples/example_eval.yaml | 18 ++++- .../evaluation/recipes/tasks/aime2025.yaml | 80 ++++--------------- .../skills/evaluation/recipes/tasks/gpqa.yaml | 74 ++++------------- .../evaluation/recipes/tasks/ifbench.yaml | 72 +++-------------- .../recipes/tasks/livecodebench.yaml | 75 ++++------------- .../skills/evaluation/recipes/tasks/mmlu.yaml | 53 ------------ .../evaluation/recipes/tasks/mmlu_pro.yaml | 62 ++++---------- .../evaluation/recipes/tasks/scicode.yaml | 73 ++++------------- 9 files changed, 112 insertions(+), 403 deletions(-) delete mode 100644 .claude/skills/evaluation/recipes/tasks/mmlu.yaml diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md index 871396d3dd6..69920814828 100644 --- a/.claude/skills/evaluation/SKILL.md +++ b/.claude/skills/evaluation/SKILL.md @@ -40,7 +40,13 @@ Test that `nel` is installed with `nel --version`. If not, instruct the user to If the user already has a config file (e.g., "run this config", "evaluate with my-config.yaml"), skip to Step 8. Optionally review it for common issues (missing `???` values, quantization flags) before running.
-**Shortcut: use a pre-built recipe.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching recipe. Available: mmlu, mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. If found, skip Steps 2 and 5 (config generation and task confirmation), but still do Step 3 (auto-detect model settings from checkpoint) and Step 4 (fill in required `???` values from user input), then proceed to Step 7.5/8. +**Shortcut: use pre-built task snippets.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching task snippet. Available: mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. Task snippets contain only the task-specific config (name, params, repeats) — not the full NEL config. To use them: + +1. Read the task snippet(s) the user wants +2. Use `recipes/examples/example_eval.yaml` as the base config template +3. Replace the `tasks:` section with the selected snippet(s) +4. Do Step 3 (auto-detect model settings from checkpoint) and Step 4 (fill in `???` values) +5. Proceed to Step 7.5/8 **Step 2: Build the base config file** diff --git a/.claude/skills/evaluation/recipes/examples/example_eval.yaml b/.claude/skills/evaluation/recipes/examples/example_eval.yaml index b7f68bd7f1c..77887b3f8c3 100644 --- a/.claude/skills/evaluation/recipes/examples/example_eval.yaml +++ b/.claude/skills/evaluation/recipes/examples/example_eval.yaml @@ -2,6 +2,8 @@ # # A balanced set of benchmarks for validating quantized model quality. # Copy this file and customize for your needs. +# Task snippets in recipes/tasks/ define per-task configs — the agent +# composes them into a runnable config like this one. 
# # Includes: # - MMLU-Pro (knowledge, completions) @@ -60,8 +62,20 @@ evaluation: api_endpoint: api_key_name: DUMMY_API_KEY tasks: - # Knowledge (completions endpoint, short) - - name: adlr_mmlu_pro_5_shot_base + # Knowledge (chat endpoint, short) + - name: ns_mmlu_pro + nemo_evaluator_config: + config: + params: + extra: + num_repeats: 1 + args: ++prompt_config=eval/aai/mcq-10choices-boxed ++inference.tokens_to_generate=null + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens # Reasoning (chat endpoint, 5 repeats, short) - name: ns_gpqa diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml index a34bef675c3..1cf5643f481 100644 --- a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml +++ b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml @@ -1,65 +1,19 @@ # AIME 2025 (NeMo Skills, chat) -# -# Math competition benchmark. Uses the chat endpoint. # Primary metric: pass@1[avg-of-16] symbolic_correct -# Run time: Long (reasoning models generate lengthy thinking traces) -# Repeats: 16 -# -# Note: The AA variant (simple_evals.AIME_2025) requires JUDGE_API_KEY -# for LLM-based scoring. This NeMo Skills variant uses symbolic scoring -# and does not require external API keys. -# -# Usage: -# nel run --config recipes/tasks/aime2025.yaml \ -# -o deployment.checkpoint_path=/path/to/checkpoint \ -# -o execution.hostname= \ -# -o execution.account= \ -# -o execution.output_dir=/path/to/output \ -# -o deployment.served_model_name= -defaults: - - execution: slurm/default - - deployment: vllm - - _self_ -execution: - hostname: ??? - username: ${oc.env:USER} - account: ??? - output_dir: ??? - walltime: "04:00:00" - mounts: - mount_home: false -deployment: - env_vars: - HF_TOKEN: host:HF_TOKEN - checkpoint_path: ??? - hf_model_handle: - served_model_name: ??? 
- tensor_parallel_size: 1 - data_parallel_size: 1 - # For models with custom code, add: --trust-remote-code - extra_args: --max-model-len 32768 -evaluation: - env_vars: - HF_TOKEN: host:HF_TOKEN - nemo_evaluator_config: - config: - params: - request_timeout: 100000 - max_retries: 10 - parallelism: 16 - target: - api_endpoint: - api_key_name: DUMMY_API_KEY - tasks: - - name: ns_aime2025 - nemo_evaluator_config: - config: - params: - extra: - num_repeats: 16 - target: - api_endpoint: - adapter_config: - params_to_remove: - - max_new_tokens - - max_completion_tokens +# Run time: Long (reasoning models generate lengthy thinking traces) | Repeats: 16 +# Note: The AA variant (simple_evals.AIME_2025) requires JUDGE_API_KEY. +# This NeMo Skills variant uses symbolic scoring — no external API keys needed. + - name: ns_aime2025 + nemo_evaluator_config: + config: + params: + request_timeout: 100000 + max_retries: 10 + extra: + num_repeats: 16 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.yaml b/.claude/skills/evaluation/recipes/tasks/gpqa.yaml index ba8cf2a720a..3692175d987 100644 --- a/.claude/skills/evaluation/recipes/tasks/gpqa.yaml +++ b/.claude/skills/evaluation/recipes/tasks/gpqa.yaml @@ -1,62 +1,16 @@ # GPQA Diamond (NeMo Skills, chat) -# -# Graduate-level reasoning benchmark. Uses the chat endpoint. # Primary metric: pass@1[avg-of-5] symbolic_correct -# Run time: Short -# Repeats: 5 -# -# Usage: -# nel run --config recipes/tasks/gpqa.yaml \ -# -o deployment.checkpoint_path=/path/to/checkpoint \ -# -o execution.hostname= \ -# -o execution.account= \ -# -o execution.output_dir=/path/to/output \ -# -o deployment.served_model_name= -defaults: - - execution: slurm/default - - deployment: vllm - - _self_ -execution: - hostname: ??? - username: ${oc.env:USER} - account: ??? - output_dir: ??? 
- walltime: "02:00:00" - mounts: - mount_home: false -deployment: - env_vars: - HF_TOKEN: host:HF_TOKEN - checkpoint_path: ??? - hf_model_handle: - served_model_name: ??? - tensor_parallel_size: 1 - data_parallel_size: 1 - # For models with custom code, add: --trust-remote-code - extra_args: --max-model-len 32768 -evaluation: - env_vars: - HF_TOKEN: host:HF_TOKEN - nemo_evaluator_config: - config: - params: - request_timeout: 3600 - max_retries: 5 - parallelism: 16 - target: - api_endpoint: - api_key_name: DUMMY_API_KEY - tasks: - - name: ns_gpqa - nemo_evaluator_config: - config: - params: - extra: - args: ++prompt_config=eval/aai/mcq-4choices - num_repeats: 5 - target: - api_endpoint: - adapter_config: - params_to_remove: - - max_new_tokens - - max_completion_tokens +# Run time: Short | Repeats: 5 + - name: ns_gpqa + nemo_evaluator_config: + config: + params: + extra: + args: ++prompt_config=eval/aai/mcq-4choices + num_repeats: 5 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.yaml b/.claude/skills/evaluation/recipes/tasks/ifbench.yaml index 0876c332e7a..46cbc2db085 100644 --- a/.claude/skills/evaluation/recipes/tasks/ifbench.yaml +++ b/.claude/skills/evaluation/recipes/tasks/ifbench.yaml @@ -1,61 +1,15 @@ # IFBench (NeMo Skills, chat) -# -# Instruction following benchmark. Uses the chat endpoint. # Primary metric: pass@1[avg-of-8] prompt_strict_accuracy -# Run time: Super Short -# Repeats: 8 -# -# Usage: -# nel run --config recipes/tasks/ifbench.yaml \ -# -o deployment.checkpoint_path=/path/to/checkpoint \ -# -o execution.hostname= \ -# -o execution.account= \ -# -o execution.output_dir=/path/to/output \ -# -o deployment.served_model_name= -defaults: - - execution: slurm/default - - deployment: vllm - - _self_ -execution: - hostname: ??? - username: ${oc.env:USER} - account: ??? - output_dir: ??? 
- walltime: "02:00:00" - mounts: - mount_home: false -deployment: - env_vars: - HF_TOKEN: host:HF_TOKEN - checkpoint_path: ??? - hf_model_handle: - served_model_name: ??? - tensor_parallel_size: 1 - data_parallel_size: 1 - # For models with custom code, add: --trust-remote-code - extra_args: --max-model-len 32768 -evaluation: - env_vars: - HF_TOKEN: host:HF_TOKEN - nemo_evaluator_config: - config: - params: - request_timeout: 3600 - max_retries: 5 - parallelism: 16 - target: - api_endpoint: - api_key_name: DUMMY_API_KEY - tasks: - - name: ns_ifbench - nemo_evaluator_config: - config: - params: - extra: - num_repeats: 8 - target: - api_endpoint: - adapter_config: - params_to_remove: - - max_new_tokens - - max_completion_tokens +# Run time: Super Short | Repeats: 8 + - name: ns_ifbench + nemo_evaluator_config: + config: + params: + extra: + num_repeats: 8 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens diff --git a/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml b/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml index b56b500df83..202387a1eb6 100644 --- a/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml +++ b/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml @@ -1,62 +1,17 @@ # LiveCodeBench v6 (NeMo Skills, chat) -# -# Code generation benchmark with recent problems. Uses the chat endpoint. # Primary metric: pass@1[avg-of-3] accuracy -# Run time: Medium -# Repeats: 3 -# -# Usage: -# nel run --config recipes/tasks/livecodebench.yaml \ -# -o deployment.checkpoint_path=/path/to/checkpoint \ -# -o execution.hostname= \ -# -o execution.account= \ -# -o execution.output_dir=/path/to/output \ -# -o deployment.served_model_name= -defaults: - - execution: slurm/default - - deployment: vllm - - _self_ -execution: - hostname: ??? - username: ${oc.env:USER} - account: ??? - output_dir: ??? 
- walltime: "04:00:00" - mounts: - mount_home: false -deployment: - env_vars: - HF_TOKEN: host:HF_TOKEN - checkpoint_path: ??? - hf_model_handle: - served_model_name: ??? - tensor_parallel_size: 1 - data_parallel_size: 1 - # For models with custom code, add: --trust-remote-code - extra_args: --max-model-len 32768 -evaluation: - env_vars: - HF_TOKEN: host:HF_TOKEN - nemo_evaluator_config: - config: - params: - request_timeout: 3600 - max_retries: 10 - parallelism: 16 - target: - api_endpoint: - api_key_name: DUMMY_API_KEY - tasks: - - name: ns_livecodebench - nemo_evaluator_config: - config: - params: - extra: - dataset_split: test_v6_2408_2505 - num_repeats: 3 - target: - api_endpoint: - adapter_config: - params_to_remove: - - max_new_tokens - - max_completion_tokens +# Run time: Medium | Repeats: 3 + - name: ns_livecodebench + nemo_evaluator_config: + config: + params: + max_retries: 10 + extra: + dataset_split: test_v6_2408_2505 + num_repeats: 3 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu.yaml b/.claude/skills/evaluation/recipes/tasks/mmlu.yaml deleted file mode 100644 index cccbe5ff78c..00000000000 --- a/.claude/skills/evaluation/recipes/tasks/mmlu.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# MMLU (ADLR, completions) -# -# Massive Multitask Language Understanding. Uses the completions endpoint. -# Primary metric: exact_match -# Run time: Short -# -# Usage: -# nel run --config recipes/tasks/mmlu.yaml \ -# -o deployment.checkpoint_path=/path/to/checkpoint \ -# -o execution.hostname= \ -# -o execution.account= \ -# -o execution.output_dir=/path/to/output \ -# -o deployment.served_model_name= -defaults: - - execution: slurm/default - - deployment: vllm - - _self_ -execution: - hostname: ??? - username: ${oc.env:USER} - account: ??? - output_dir: ??? 
- walltime: "02:00:00" - mounts: - mount_home: false -deployment: - env_vars: - HF_TOKEN: host:HF_TOKEN - checkpoint_path: ??? - hf_model_handle: - served_model_name: ??? - tensor_parallel_size: 1 - data_parallel_size: 1 - # For models with custom code, add: --trust-remote-code - extra_args: --max-model-len 32768 -evaluation: - env_vars: - HF_TOKEN: host:HF_TOKEN - nemo_evaluator_config: - config: - params: - request_timeout: 3600 - max_retries: 5 - parallelism: 16 - tasks: - - name: adlr_mmlu - nemo_evaluator_config: - config: - params: - max_new_tokens: 2 - target: - api_endpoint: - type: completions diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml index 749cba572f6..be16a546a39 100644 --- a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml +++ b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml @@ -1,46 +1,16 @@ -# MMLU-Pro (5-shot, base/completions) -# -# Knowledge benchmark. Uses the completions endpoint (not chat). -# Primary metric: exact_match -# Run time: Short -# -# Usage: -# nel run --config recipes/tasks/mmlu_pro.yaml \ -# -o deployment.checkpoint_path=/path/to/checkpoint \ -# -o execution.hostname= \ -# -o execution.account= \ -# -o execution.output_dir=/path/to/output \ -# -o deployment.served_model_name= -defaults: - - execution: slurm/default - - deployment: vllm - - _self_ -execution: - hostname: ??? - username: ${oc.env:USER} - account: ??? - output_dir: ??? - walltime: "02:00:00" - mounts: - mount_home: false -deployment: - env_vars: - HF_TOKEN: host:HF_TOKEN - checkpoint_path: ??? - hf_model_handle: - served_model_name: ??? 
- tensor_parallel_size: 1 - data_parallel_size: 1 - # For models with custom code, add: --trust-remote-code - extra_args: --max-model-len 32768 -evaluation: - env_vars: - HF_TOKEN: host:HF_TOKEN - nemo_evaluator_config: - config: - params: - request_timeout: 3600 - max_retries: 5 - parallelism: 16 - tasks: - - name: adlr_mmlu_pro_5_shot_base +# MMLU-Pro (NeMo Skills, chat) +# Primary metric: symbolic_correct +# Run time: Short | Repeats: 1 + - name: ns_mmlu_pro + nemo_evaluator_config: + config: + params: + extra: + num_repeats: 1 + args: ++prompt_config=eval/aai/mcq-10choices-boxed ++inference.tokens_to_generate=null + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens diff --git a/.claude/skills/evaluation/recipes/tasks/scicode.yaml b/.claude/skills/evaluation/recipes/tasks/scicode.yaml index 27bdd39eb53..724b6935759 100644 --- a/.claude/skills/evaluation/recipes/tasks/scicode.yaml +++ b/.claude/skills/evaluation/recipes/tasks/scicode.yaml @@ -1,61 +1,16 @@ # SciCode (NeMo Skills, chat) -# -# Science + code benchmark. Uses the chat endpoint. # Primary metric: pass@1[avg-of-3] subtask_accuracy -# Run time: Long -# Repeats: 3 -# -# Usage: -# nel run --config recipes/tasks/scicode.yaml \ -# -o deployment.checkpoint_path=/path/to/checkpoint \ -# -o execution.hostname= \ -# -o execution.account= \ -# -o execution.output_dir=/path/to/output \ -# -o deployment.served_model_name= -defaults: - - execution: slurm/default - - deployment: vllm - - _self_ -execution: - hostname: ??? - username: ${oc.env:USER} - account: ??? - output_dir: ??? - walltime: "04:00:00" - mounts: - mount_home: false -deployment: - env_vars: - HF_TOKEN: host:HF_TOKEN - checkpoint_path: ??? - hf_model_handle: - served_model_name: ??? 
- tensor_parallel_size: 1 - data_parallel_size: 1 - # For models with custom code, add: --trust-remote-code - extra_args: --max-model-len 32768 -evaluation: - env_vars: - HF_TOKEN: host:HF_TOKEN - nemo_evaluator_config: - config: - params: - request_timeout: 3600 - max_retries: 10 - parallelism: 16 - target: - api_endpoint: - api_key_name: DUMMY_API_KEY - tasks: - - name: ns_scicode - nemo_evaluator_config: - config: - params: - extra: - num_repeats: 3 - target: - api_endpoint: - adapter_config: - params_to_remove: - - max_new_tokens - - max_completion_tokens +# Run time: Long | Repeats: 3 + - name: ns_scicode + nemo_evaluator_config: + config: + params: + max_retries: 10 + extra: + num_repeats: 3 + target: + api_endpoint: + adapter_config: + params_to_remove: + - max_new_tokens + - max_completion_tokens