Skip to content

Commit b805c9a

Browse files
committed
Add pre-built evaluation recipes for common benchmarks
Signed-off-by: Kai Xu <kaix@nvidia.com>
1 parent 6e08b13 commit b805c9a

9 files changed

Lines changed: 516 additions & 0 deletions

File tree

.claude/skills/evaluation/SKILL.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ Test that `nel` is installed with `nel --version`. If not, instruct the user to
4040

4141
If the user already has a config file (e.g., "run this config", "evaluate with my-config.yaml"), skip to Step 8. Optionally review it for common issues (missing `???` values, quantization flags) before running.
4242

43+
**Shortcut: use a pre-built recipe.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching recipe. Available: mmlu, mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. If found, skip Steps 2-5 — go directly to the recipe, fill in deployment overrides, and proceed to Step 7.5/8.
44+
4345
**Step 2: Build the base config file**
4446

4547
Prompt the user with "I'll ask you 5 questions to build the base config we'll adjust in the next steps". Guide the user through the 5 questions using AskUserQuestion:
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# Example: Quantization Validation Suite
2+
#
3+
# A balanced set of benchmarks for validating quantized model quality.
4+
# Copy this file and customize for your needs.
5+
#
6+
# Includes:
7+
# - MMLU-Pro (knowledge, completions)
8+
# - GPQA Diamond (reasoning, chat, 5 repeats)
9+
# - LiveCodeBench v6 (code, chat, 3 repeats)
10+
# - IFBench (instruction following, chat, 8 repeats)
11+
#
12+
# Usage:
13+
# nel run --config recipes/examples/example_eval.yaml \
14+
# -o deployment.checkpoint_path=/path/to/quantized/checkpoint \
15+
# -o deployment.served_model_name=my-model-nvfp4 \
16+
# -o execution.hostname=<slurm_host> \
17+
# -o execution.account=<slurm_account> \
18+
# -o execution.output_dir=/path/to/output
19+
#
20+
# For quantized checkpoints, also add the quantization flag:
21+
# -o 'deployment.extra_args=--max-model-len 32768 --trust-remote-code --quantization modelopt_fp4'
22+
#
23+
# Run a single task:
24+
# nel run --config ... -t ns_gpqa
25+
#
26+
# Smoke test (2 samples):
27+
# nel run --config ... -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=2
28+
defaults:
29+
- execution: slurm/default
30+
- deployment: vllm
31+
- _self_
32+
execution:
33+
hostname: ???
34+
username: ${oc.env:USER}
35+
account: ???
36+
output_dir: ???
37+
walltime: "04:00:00"
38+
mounts:
39+
mount_home: false
40+
deployment:
41+
env_vars:
42+
HF_TOKEN: host:HF_TOKEN
43+
checkpoint_path: ???
44+
hf_model_handle:
45+
served_model_name: ???
46+
tensor_parallel_size: 1
47+
data_parallel_size: 1
48+
# For models with custom code, add: --trust-remote-code
49+
extra_args: --max-model-len 32768
50+
evaluation:
51+
env_vars:
52+
HF_TOKEN: host:HF_TOKEN
53+
nemo_evaluator_config:
54+
config:
55+
params:
56+
request_timeout: 3600
57+
max_retries: 10
58+
parallelism: 16
59+
target:
60+
api_endpoint:
61+
api_key_name: DUMMY_API_KEY
62+
tasks:
63+
# Knowledge (completions endpoint, short)
64+
- name: adlr_mmlu_pro_5_shot_base
65+
66+
# Reasoning (chat endpoint, 5 repeats, short)
67+
- name: ns_gpqa
68+
nemo_evaluator_config:
69+
config:
70+
params:
71+
extra:
72+
args: ++prompt_config=eval/aai/mcq-4choices
73+
num_repeats: 5
74+
target:
75+
api_endpoint:
76+
adapter_config:
77+
params_to_remove:
78+
- max_new_tokens
79+
- max_completion_tokens
80+
81+
# Code (chat endpoint, 3 repeats, medium)
82+
- name: ns_livecodebench
83+
nemo_evaluator_config:
84+
config:
85+
params:
86+
extra:
87+
dataset_split: test_v6_2408_2505
88+
num_repeats: 3
89+
target:
90+
api_endpoint:
91+
adapter_config:
92+
params_to_remove:
93+
- max_new_tokens
94+
- max_completion_tokens
95+
96+
# Instruction following (chat endpoint, 8 repeats, super short)
97+
- name: ns_ifbench
98+
nemo_evaluator_config:
99+
config:
100+
params:
101+
extra:
102+
num_repeats: 8
103+
target:
104+
api_endpoint:
105+
adapter_config:
106+
params_to_remove:
107+
- max_new_tokens
108+
- max_completion_tokens
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# AIME 2025 (NeMo Skills, chat)
2+
#
3+
# Math competition benchmark. Uses the chat endpoint.
4+
# Primary metric: pass@1[avg-of-16] symbolic_correct
5+
# Run time: Long (reasoning models generate lengthy thinking traces)
6+
# Repeats: 16
7+
#
8+
# Usage:
9+
# nel run --config recipes/tasks/aime2025.yaml \
10+
# -o deployment.checkpoint_path=/path/to/checkpoint \
11+
# -o execution.hostname=<slurm_host> \
12+
# -o execution.account=<slurm_account> \
13+
# -o execution.output_dir=/path/to/output \
14+
# -o deployment.served_model_name=<model_name>
15+
defaults:
16+
- execution: slurm/default
17+
- deployment: vllm
18+
- _self_
19+
execution:
20+
hostname: ???
21+
username: ${oc.env:USER}
22+
account: ???
23+
output_dir: ???
24+
walltime: "04:00:00"
25+
mounts:
26+
mount_home: false
27+
deployment:
28+
env_vars:
29+
HF_TOKEN: host:HF_TOKEN
30+
checkpoint_path: ???
31+
hf_model_handle:
32+
served_model_name: ???
33+
tensor_parallel_size: 1
34+
data_parallel_size: 1
35+
# For models with custom code, add: --trust-remote-code
36+
extra_args: --max-model-len 32768
37+
evaluation:
38+
env_vars:
39+
HF_TOKEN: host:HF_TOKEN
40+
nemo_evaluator_config:
41+
config:
42+
params:
43+
request_timeout: 100000
44+
max_retries: 10
45+
parallelism: 16
46+
target:
47+
api_endpoint:
48+
api_key_name: DUMMY_API_KEY
49+
tasks:
50+
- name: ns_aime2025
51+
nemo_evaluator_config:
52+
config:
53+
params:
54+
extra:
55+
num_repeats: 16
56+
target:
57+
api_endpoint:
58+
adapter_config:
59+
params_to_remove:
60+
- max_new_tokens
61+
- max_completion_tokens
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# GPQA Diamond (NeMo Skills, chat)
2+
#
3+
# Graduate-level reasoning benchmark. Uses the chat endpoint.
4+
# Primary metric: pass@1[avg-of-5] symbolic_correct
5+
# Run time: Short
6+
# Repeats: 5
7+
#
8+
# Usage:
9+
# nel run --config recipes/tasks/gpqa.yaml \
10+
# -o deployment.checkpoint_path=/path/to/checkpoint \
11+
# -o execution.hostname=<slurm_host> \
12+
# -o execution.account=<slurm_account> \
13+
# -o execution.output_dir=/path/to/output \
14+
# -o deployment.served_model_name=<model_name>
15+
defaults:
16+
- execution: slurm/default
17+
- deployment: vllm
18+
- _self_
19+
execution:
20+
hostname: ???
21+
username: ${oc.env:USER}
22+
account: ???
23+
output_dir: ???
24+
walltime: "02:00:00"
25+
mounts:
26+
mount_home: false
27+
deployment:
28+
env_vars:
29+
HF_TOKEN: host:HF_TOKEN
30+
checkpoint_path: ???
31+
hf_model_handle:
32+
served_model_name: ???
33+
tensor_parallel_size: 1
34+
data_parallel_size: 1
35+
# For models with custom code, add: --trust-remote-code
36+
extra_args: --max-model-len 32768
37+
evaluation:
38+
env_vars:
39+
HF_TOKEN: host:HF_TOKEN
40+
nemo_evaluator_config:
41+
config:
42+
params:
43+
request_timeout: 3600
44+
max_retries: 5
45+
parallelism: 16
46+
target:
47+
api_endpoint:
48+
api_key_name: DUMMY_API_KEY
49+
tasks:
50+
- name: ns_gpqa
51+
nemo_evaluator_config:
52+
config:
53+
params:
54+
extra:
55+
args: ++prompt_config=eval/aai/mcq-4choices
56+
num_repeats: 5
57+
target:
58+
api_endpoint:
59+
adapter_config:
60+
params_to_remove:
61+
- max_new_tokens
62+
- max_completion_tokens
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# IFBench (NeMo Skills, chat)
2+
#
3+
# Instruction following benchmark. Uses the chat endpoint.
4+
# Primary metric: pass@1[avg-of-8] prompt_strict_accuracy
5+
# Run time: Super Short
6+
# Repeats: 8
7+
#
8+
# Usage:
9+
# nel run --config recipes/tasks/ifbench.yaml \
10+
# -o deployment.checkpoint_path=/path/to/checkpoint \
11+
# -o execution.hostname=<slurm_host> \
12+
# -o execution.account=<slurm_account> \
13+
# -o execution.output_dir=/path/to/output \
14+
# -o deployment.served_model_name=<model_name>
15+
defaults:
16+
- execution: slurm/default
17+
- deployment: vllm
18+
- _self_
19+
execution:
20+
hostname: ???
21+
username: ${oc.env:USER}
22+
account: ???
23+
output_dir: ???
24+
walltime: "02:00:00"
25+
mounts:
26+
mount_home: false
27+
deployment:
28+
env_vars:
29+
HF_TOKEN: host:HF_TOKEN
30+
checkpoint_path: ???
31+
hf_model_handle:
32+
served_model_name: ???
33+
tensor_parallel_size: 1
34+
data_parallel_size: 1
35+
# For models with custom code, add: --trust-remote-code
36+
extra_args: --max-model-len 32768
37+
evaluation:
38+
env_vars:
39+
HF_TOKEN: host:HF_TOKEN
40+
nemo_evaluator_config:
41+
config:
42+
params:
43+
request_timeout: 3600
44+
max_retries: 5
45+
parallelism: 16
46+
target:
47+
api_endpoint:
48+
api_key_name: DUMMY_API_KEY
49+
tasks:
50+
- name: ns_ifbench
51+
nemo_evaluator_config:
52+
config:
53+
params:
54+
extra:
55+
num_repeats: 8
56+
target:
57+
api_endpoint:
58+
adapter_config:
59+
params_to_remove:
60+
- max_new_tokens
61+
- max_completion_tokens
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# LiveCodeBench v6 (NeMo Skills, chat)
2+
#
3+
# Code generation benchmark with recent problems. Uses the chat endpoint.
4+
# Primary metric: pass@1[avg-of-3] accuracy
5+
# Run time: Medium
6+
# Repeats: 3
7+
#
8+
# Usage:
9+
# nel run --config recipes/tasks/livecodebench.yaml \
10+
# -o deployment.checkpoint_path=/path/to/checkpoint \
11+
# -o execution.hostname=<slurm_host> \
12+
# -o execution.account=<slurm_account> \
13+
# -o execution.output_dir=/path/to/output \
14+
# -o deployment.served_model_name=<model_name>
15+
defaults:
16+
- execution: slurm/default
17+
- deployment: vllm
18+
- _self_
19+
execution:
20+
hostname: ???
21+
username: ${oc.env:USER}
22+
account: ???
23+
output_dir: ???
24+
walltime: "04:00:00"
25+
mounts:
26+
mount_home: false
27+
deployment:
28+
env_vars:
29+
HF_TOKEN: host:HF_TOKEN
30+
checkpoint_path: ???
31+
hf_model_handle:
32+
served_model_name: ???
33+
tensor_parallel_size: 1
34+
data_parallel_size: 1
35+
# For models with custom code, add: --trust-remote-code
36+
extra_args: --max-model-len 32768
37+
evaluation:
38+
env_vars:
39+
HF_TOKEN: host:HF_TOKEN
40+
nemo_evaluator_config:
41+
config:
42+
params:
43+
request_timeout: 3600
44+
max_retries: 10
45+
parallelism: 16
46+
target:
47+
api_endpoint:
48+
api_key_name: DUMMY_API_KEY
49+
tasks:
50+
- name: ns_livecodebench
51+
nemo_evaluator_config:
52+
config:
53+
params:
54+
extra:
55+
dataset_split: test_v6_2408_2505
56+
num_repeats: 3
57+
target:
58+
api_endpoint:
59+
adapter_config:
60+
params_to_remove:
61+
- max_new_tokens
62+
- max_completion_tokens

0 commit comments

Comments (0)