From b42544dbe9db94309cde476d40f61029abab2737 Mon Sep 17 00:00:00 2001
From: Kai Xu <kaix@nvidia.com>
Date: Mon, 27 Apr 2026 14:53:46 -0700
Subject: [PATCH 1/4] Add pre-built evaluation recipes for common benchmarks

Signed-off-by: Kai Xu <kaix@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md            |   2 +
 .../recipes/examples/example_eval.yaml        | 108 ++++++++++++++++++
 .../evaluation/recipes/tasks/aime2025.yaml    |  61 ++++++++++
 .../skills/evaluation/recipes/tasks/gpqa.yaml |  62 ++++++++++
 .../evaluation/recipes/tasks/ifbench.yaml     |  61 ++++++++++
 .../recipes/tasks/livecodebench.yaml          |  62 ++++++++++
 .../skills/evaluation/recipes/tasks/mmlu.yaml |  53 +++++++++
 .../evaluation/recipes/tasks/mmlu_pro.yaml    |  46 ++++++++
 .../evaluation/recipes/tasks/scicode.yaml     |  61 ++++++++++
 9 files changed, 516 insertions(+)
 create mode 100644 .claude/skills/evaluation/recipes/examples/example_eval.yaml
 create mode 100644 .claude/skills/evaluation/recipes/tasks/aime2025.yaml
 create mode 100644 .claude/skills/evaluation/recipes/tasks/gpqa.yaml
 create mode 100644 .claude/skills/evaluation/recipes/tasks/ifbench.yaml
 create mode 100644 .claude/skills/evaluation/recipes/tasks/livecodebench.yaml
 create mode 100644 .claude/skills/evaluation/recipes/tasks/mmlu.yaml
 create mode 100644 .claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml
 create mode 100644 .claude/skills/evaluation/recipes/tasks/scicode.yaml

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index 1dd8aa27067..1ceff205e8e 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -40,6 +40,8 @@ Test that `nel` is installed with `nel --version`. If not, instruct the user to
 
 If the user already has a config file (e.g., "run this config", "evaluate with my-config.yaml"), skip to Step 8. Optionally review it for common issues (missing `???` values, quantization flags) before running.
 
+**Shortcut: use a pre-built recipe.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching recipe. Available: mmlu, mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. If found, skip Steps 2-5 — go directly to the recipe, fill in deployment overrides, and proceed to Step 7.5/8.
+
 **Step 2: Build the base config file**
 
 Prompt the user with "I'll ask you 5 questions to build the base config we'll adjust in the next steps". Guide the user through the 5 questions using AskUserQuestion:
diff --git a/.claude/skills/evaluation/recipes/examples/example_eval.yaml b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
new file mode 100644
index 00000000000..b7f68bd7f1c
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
@@ -0,0 +1,108 @@
+# Example: Quantization Validation Suite
+#
+# A balanced set of benchmarks for validating quantized model quality.
+# Copy this file and customize for your needs.
+#
+# Includes:
+#   - MMLU-Pro (knowledge, completions)
+#   - GPQA Diamond (reasoning, chat, 5 repeats)
+#   - LiveCodeBench v6 (code, chat, 3 repeats)
+#   - IFBench (instruction following, chat, 8 repeats)
+#
+# Usage:
+#   nel run --config recipes/examples/example_eval.yaml \
+#     -o deployment.checkpoint_path=/path/to/quantized/checkpoint \
+#     -o deployment.served_model_name=my-model-nvfp4 \
+#     -o execution.hostname=<slurm_host> \
+#     -o execution.account=<slurm_account> \
+#     -o execution.output_dir=/path/to/output
+#
+# For quantized checkpoints, also add the quantization flag:
+#   -o 'deployment.extra_args=--max-model-len 32768 --trust-remote-code --quantization modelopt_fp4'
+#
+# Run a single task:
+#   nel run --config ... -t ns_gpqa
+#
+# Smoke test (2 samples):
+#   nel run --config ... -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=2
+defaults:
+  - execution: slurm/default
+  - deployment: vllm
+  - _self_
+execution:
+  hostname: ???
+  username: ${oc.env:USER}
+  account: ???
+  output_dir: ???
+  walltime: "04:00:00"
+  mounts:
+    mount_home: false
+deployment:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  checkpoint_path: ???
+  hf_model_handle:
+  served_model_name: ???
+  tensor_parallel_size: 1
+  data_parallel_size: 1
+  # For models with custom code, add: --trust-remote-code
+  extra_args: --max-model-len 32768
+evaluation:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  nemo_evaluator_config:
+    config:
+      params:
+        request_timeout: 3600
+        max_retries: 10
+        parallelism: 16
+    target:
+      api_endpoint:
+        api_key_name: DUMMY_API_KEY
+  tasks:
+  # Knowledge (completions endpoint, short)
+    - name: adlr_mmlu_pro_5_shot_base
+
+  # Reasoning (chat endpoint, 5 repeats, short)
+    - name: ns_gpqa
+      nemo_evaluator_config:
+        config:
+          params:
+            extra:
+              args: ++prompt_config=eval/aai/mcq-4choices
+              num_repeats: 5
+        target:
+          api_endpoint:
+            adapter_config:
+              params_to_remove:
+                - max_new_tokens
+                - max_completion_tokens
+
+  # Code (chat endpoint, 3 repeats, medium)
+    - name: ns_livecodebench
+      nemo_evaluator_config:
+        config:
+          params:
+            extra:
+              dataset_split: test_v6_2408_2505
+              num_repeats: 3
+        target:
+          api_endpoint:
+            adapter_config:
+              params_to_remove:
+                - max_new_tokens
+                - max_completion_tokens
+
+  # Instruction following (chat endpoint, 8 repeats, super short)
+    - name: ns_ifbench
+      nemo_evaluator_config:
+        config:
+          params:
+            extra:
+              num_repeats: 8
+        target:
+          api_endpoint:
+            adapter_config:
+              params_to_remove:
+                - max_new_tokens
+                - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml
new file mode 100644
index 00000000000..07c242079a0
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml
@@ -0,0 +1,61 @@
+# AIME 2025 (NeMo Skills, chat)
+#
+# Math competition benchmark. Uses the chat endpoint.
+# Primary metric: pass@1[avg-of-16] symbolic_correct
+# Run time: Long (reasoning models generate lengthy thinking traces)
+# Repeats: 16
+#
+# Usage:
+#   nel run --config recipes/tasks/aime2025.yaml \
+#     -o deployment.checkpoint_path=/path/to/checkpoint \
+#     -o execution.hostname=<slurm_host> \
+#     -o execution.account=<slurm_account> \
+#     -o execution.output_dir=/path/to/output \
+#     -o deployment.served_model_name=<model_name>
+defaults:
+  - execution: slurm/default
+  - deployment: vllm
+  - _self_
+execution:
+  hostname: ???
+  username: ${oc.env:USER}
+  account: ???
+  output_dir: ???
+  walltime: "04:00:00"
+  mounts:
+    mount_home: false
+deployment:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  checkpoint_path: ???
+  hf_model_handle:
+  served_model_name: ???
+  tensor_parallel_size: 1
+  data_parallel_size: 1
+  # For models with custom code, add: --trust-remote-code
+  extra_args: --max-model-len 32768
+evaluation:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  nemo_evaluator_config:
+    config:
+      params:
+        request_timeout: 100000
+        max_retries: 10
+        parallelism: 16
+    target:
+      api_endpoint:
+        api_key_name: DUMMY_API_KEY
+  tasks:
+    - name: ns_aime2025
+      nemo_evaluator_config:
+        config:
+          params:
+            extra:
+              num_repeats: 16
+        target:
+          api_endpoint:
+            adapter_config:
+              params_to_remove:
+                - max_new_tokens
+                - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.yaml b/.claude/skills/evaluation/recipes/tasks/gpqa.yaml
new file mode 100644
index 00000000000..ba8cf2a720a
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/gpqa.yaml
@@ -0,0 +1,62 @@
+# GPQA Diamond (NeMo Skills, chat)
+#
+# Graduate-level reasoning benchmark. Uses the chat endpoint.
+# Primary metric: pass@1[avg-of-5] symbolic_correct
+# Run time: Short
+# Repeats: 5
+#
+# Usage:
+#   nel run --config recipes/tasks/gpqa.yaml \
+#     -o deployment.checkpoint_path=/path/to/checkpoint \
+#     -o execution.hostname=<slurm_host> \
+#     -o execution.account=<slurm_account> \
+#     -o execution.output_dir=/path/to/output \
+#     -o deployment.served_model_name=<model_name>
+defaults:
+  - execution: slurm/default
+  - deployment: vllm
+  - _self_
+execution:
+  hostname: ???
+  username: ${oc.env:USER}
+  account: ???
+  output_dir: ???
+  walltime: "02:00:00"
+  mounts:
+    mount_home: false
+deployment:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  checkpoint_path: ???
+  hf_model_handle:
+  served_model_name: ???
+  tensor_parallel_size: 1
+  data_parallel_size: 1
+  # For models with custom code, add: --trust-remote-code
+  extra_args: --max-model-len 32768
+evaluation:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  nemo_evaluator_config:
+    config:
+      params:
+        request_timeout: 3600
+        max_retries: 5
+        parallelism: 16
+    target:
+      api_endpoint:
+        api_key_name: DUMMY_API_KEY
+  tasks:
+    - name: ns_gpqa
+      nemo_evaluator_config:
+        config:
+          params:
+            extra:
+              args: ++prompt_config=eval/aai/mcq-4choices
+              num_repeats: 5
+        target:
+          api_endpoint:
+            adapter_config:
+              params_to_remove:
+                - max_new_tokens
+                - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.yaml b/.claude/skills/evaluation/recipes/tasks/ifbench.yaml
new file mode 100644
index 00000000000..0876c332e7a
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/ifbench.yaml
@@ -0,0 +1,61 @@
+# IFBench (NeMo Skills, chat)
+#
+# Instruction following benchmark. Uses the chat endpoint.
+# Primary metric: pass@1[avg-of-8] prompt_strict_accuracy
+# Run time: Super Short
+# Repeats: 8
+#
+# Usage:
+#   nel run --config recipes/tasks/ifbench.yaml \
+#     -o deployment.checkpoint_path=/path/to/checkpoint \
+#     -o execution.hostname=<slurm_host> \
+#     -o execution.account=<slurm_account> \
+#     -o execution.output_dir=/path/to/output \
+#     -o deployment.served_model_name=<model_name>
+defaults:
+  - execution: slurm/default
+  - deployment: vllm
+  - _self_
+execution:
+  hostname: ???
+  username: ${oc.env:USER}
+  account: ???
+  output_dir: ???
+  walltime: "02:00:00"
+  mounts:
+    mount_home: false
+deployment:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  checkpoint_path: ???
+  hf_model_handle:
+  served_model_name: ???
+  tensor_parallel_size: 1
+  data_parallel_size: 1
+  # For models with custom code, add: --trust-remote-code
+  extra_args: --max-model-len 32768
+evaluation:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  nemo_evaluator_config:
+    config:
+      params:
+        request_timeout: 3600
+        max_retries: 5
+        parallelism: 16
+    target:
+      api_endpoint:
+        api_key_name: DUMMY_API_KEY
+  tasks:
+    - name: ns_ifbench
+      nemo_evaluator_config:
+        config:
+          params:
+            extra:
+              num_repeats: 8
+        target:
+          api_endpoint:
+            adapter_config:
+              params_to_remove:
+                - max_new_tokens
+                - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml b/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml
new file mode 100644
index 00000000000..b56b500df83
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml
@@ -0,0 +1,62 @@
+# LiveCodeBench v6 (NeMo Skills, chat)
+#
+# Code generation benchmark with recent problems. Uses the chat endpoint.
+# Primary metric: pass@1[avg-of-3] accuracy
+# Run time: Medium
+# Repeats: 3
+#
+# Usage:
+#   nel run --config recipes/tasks/livecodebench.yaml \
+#     -o deployment.checkpoint_path=/path/to/checkpoint \
+#     -o execution.hostname=<slurm_host> \
+#     -o execution.account=<slurm_account> \
+#     -o execution.output_dir=/path/to/output \
+#     -o deployment.served_model_name=<model_name>
+defaults:
+  - execution: slurm/default
+  - deployment: vllm
+  - _self_
+execution:
+  hostname: ???
+  username: ${oc.env:USER}
+  account: ???
+  output_dir: ???
+  walltime: "04:00:00"
+  mounts:
+    mount_home: false
+deployment:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  checkpoint_path: ???
+  hf_model_handle:
+  served_model_name: ???
+  tensor_parallel_size: 1
+  data_parallel_size: 1
+  # For models with custom code, add: --trust-remote-code
+  extra_args: --max-model-len 32768
+evaluation:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  nemo_evaluator_config:
+    config:
+      params:
+        request_timeout: 3600
+        max_retries: 10
+        parallelism: 16
+    target:
+      api_endpoint:
+        api_key_name: DUMMY_API_KEY
+  tasks:
+    - name: ns_livecodebench
+      nemo_evaluator_config:
+        config:
+          params:
+            extra:
+              dataset_split: test_v6_2408_2505
+              num_repeats: 3
+        target:
+          api_endpoint:
+            adapter_config:
+              params_to_remove:
+                - max_new_tokens
+                - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu.yaml b/.claude/skills/evaluation/recipes/tasks/mmlu.yaml
new file mode 100644
index 00000000000..cccbe5ff78c
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/mmlu.yaml
@@ -0,0 +1,53 @@
+# MMLU (ADLR, completions)
+#
+# Massive Multitask Language Understanding. Uses the completions endpoint.
+# Primary metric: exact_match
+# Run time: Short
+#
+# Usage:
+#   nel run --config recipes/tasks/mmlu.yaml \
+#     -o deployment.checkpoint_path=/path/to/checkpoint \
+#     -o execution.hostname=<slurm_host> \
+#     -o execution.account=<slurm_account> \
+#     -o execution.output_dir=/path/to/output \
+#     -o deployment.served_model_name=<model_name>
+defaults:
+  - execution: slurm/default
+  - deployment: vllm
+  - _self_
+execution:
+  hostname: ???
+  username: ${oc.env:USER}
+  account: ???
+  output_dir: ???
+  walltime: "02:00:00"
+  mounts:
+    mount_home: false
+deployment:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  checkpoint_path: ???
+  hf_model_handle:
+  served_model_name: ???
+  tensor_parallel_size: 1
+  data_parallel_size: 1
+  # For models with custom code, add: --trust-remote-code
+  extra_args: --max-model-len 32768
+evaluation:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  nemo_evaluator_config:
+    config:
+      params:
+        request_timeout: 3600
+        max_retries: 5
+        parallelism: 16
+  tasks:
+    - name: adlr_mmlu
+      nemo_evaluator_config:
+        config:
+          params:
+            max_new_tokens: 2
+        target:
+          api_endpoint:
+            type: completions
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml
new file mode 100644
index 00000000000..749cba572f6
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml
@@ -0,0 +1,46 @@
+# MMLU-Pro (5-shot, base/completions)
+#
+# Knowledge benchmark. Uses the completions endpoint (not chat).
+# Primary metric: exact_match
+# Run time: Short
+#
+# Usage:
+#   nel run --config recipes/tasks/mmlu_pro.yaml \
+#     -o deployment.checkpoint_path=/path/to/checkpoint \
+#     -o execution.hostname=<slurm_host> \
+#     -o execution.account=<slurm_account> \
+#     -o execution.output_dir=/path/to/output \
+#     -o deployment.served_model_name=<model_name>
+defaults:
+  - execution: slurm/default
+  - deployment: vllm
+  - _self_
+execution:
+  hostname: ???
+  username: ${oc.env:USER}
+  account: ???
+  output_dir: ???
+  walltime: "02:00:00"
+  mounts:
+    mount_home: false
+deployment:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  checkpoint_path: ???
+  hf_model_handle:
+  served_model_name: ???
+  tensor_parallel_size: 1
+  data_parallel_size: 1
+  # For models with custom code, add: --trust-remote-code
+  extra_args: --max-model-len 32768
+evaluation:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  nemo_evaluator_config:
+    config:
+      params:
+        request_timeout: 3600
+        max_retries: 5
+        parallelism: 16
+  tasks:
+    - name: adlr_mmlu_pro_5_shot_base
diff --git a/.claude/skills/evaluation/recipes/tasks/scicode.yaml b/.claude/skills/evaluation/recipes/tasks/scicode.yaml
new file mode 100644
index 00000000000..27bdd39eb53
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/scicode.yaml
@@ -0,0 +1,61 @@
+# SciCode (NeMo Skills, chat)
+#
+# Science + code benchmark. Uses the chat endpoint.
+# Primary metric: pass@1[avg-of-3] subtask_accuracy
+# Run time: Long
+# Repeats: 3
+#
+# Usage:
+#   nel run --config recipes/tasks/scicode.yaml \
+#     -o deployment.checkpoint_path=/path/to/checkpoint \
+#     -o execution.hostname=<slurm_host> \
+#     -o execution.account=<slurm_account> \
+#     -o execution.output_dir=/path/to/output \
+#     -o deployment.served_model_name=<model_name>
+defaults:
+  - execution: slurm/default
+  - deployment: vllm
+  - _self_
+execution:
+  hostname: ???
+  username: ${oc.env:USER}
+  account: ???
+  output_dir: ???
+  walltime: "04:00:00"
+  mounts:
+    mount_home: false
+deployment:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  checkpoint_path: ???
+  hf_model_handle:
+  served_model_name: ???
+  tensor_parallel_size: 1
+  data_parallel_size: 1
+  # For models with custom code, add: --trust-remote-code
+  extra_args: --max-model-len 32768
+evaluation:
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  nemo_evaluator_config:
+    config:
+      params:
+        request_timeout: 3600
+        max_retries: 10
+        parallelism: 16
+    target:
+      api_endpoint:
+        api_key_name: DUMMY_API_KEY
+  tasks:
+    - name: ns_scicode
+      nemo_evaluator_config:
+        config:
+          params:
+            extra:
+              num_repeats: 3
+        target:
+          api_endpoint:
+            adapter_config:
+              params_to_remove:
+                - max_new_tokens
+                - max_completion_tokens

From 585a7a154b22dc2e08f10e67e5cec378132a99ac Mon Sep 17 00:00:00 2001
From: Kai Xu <kaix@nvidia.com>
Date: Wed, 29 Apr 2026 14:42:52 -0700
Subject: [PATCH 2/4] Auto-detect deployment settings from checkpoint

Signed-off-by: Kai Xu <kaix@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md            | 25 ++++++++++++++++++-
 .../evaluation/recipes/tasks/aime2025.yaml    |  4 +++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index 1ceff205e8e..f458afdc8ea 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -40,7 +40,7 @@ Test that `nel` is installed with `nel --version`. If not, instruct the user to
 
 If the user already has a config file (e.g., "run this config", "evaluate with my-config.yaml"), skip to Step 8. Optionally review it for common issues (missing `???` values, quantization flags) before running.
 
-**Shortcut: use a pre-built recipe.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching recipe. Available: mmlu, mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. If found, skip Steps 2-5 — go directly to the recipe, fill in deployment overrides, and proceed to Step 7.5/8.
+**Shortcut: use a pre-built recipe.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching recipe. Available: mmlu, mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. If found, skip Steps 2 and 5 (config generation and task confirmation), but still do Step 3 (auto-detect model settings from checkpoint) and Step 4 (fill in required `???` values from user input), then proceed to Step 7.5/8.
 
 **Step 2: Build the base config file**
 
@@ -125,6 +125,29 @@ If no `hf_quant_config.json`, also check `config.json` for a `quantization_confi
 
 > **Note:** Some models require additional env vars for deployment (e.g., `VLLM_NVFP4_GEMM_BACKEND=marlin` for Nemotron Super). These are not in `hf_quant_config.json` — they are discovered during model card research below.
 
+**Auto-detect deployment settings from checkpoint:**
+
+Read `config.json` from the checkpoint (or HF model card) and build `deployment.extra_args` dynamically:
+
+```bash
+cat <checkpoint_path>/config.json 2>/dev/null
+```
+
+| Field in `config.json` | What to set | Example |
+| --- | --- | --- |
+| `max_position_embeddings` | `--max-model-len <value>` | `131072` → `--max-model-len 131072` |
+| `auto_map` exists | `--trust-remote-code` | Only add if model has custom code |
+
+Then use WebSearch to check the model card (HuggingFace page) for deployment-specific settings:
+
+| Model card signal | What to set |
+| --- | --- |
+| Reasoning model (thinking/CoT) | `--reasoning-parser <parser>`, plus `--reasoning-parser-plugin <path>` if the model card provides a custom parser |
+| Tool-calling support | `--enable-auto-tool-choice --tool-call-parser <parser>` |
+| Custom vLLM flags documented | Add as specified (e.g., `--mamba_ssm_cache_dtype float32`) |
+
+Combine all detected flags into a single `deployment.extra_args` override. The recipe's default `--max-model-len 32768` is a fallback — always prefer the value from `config.json`.
+
 **Quantization-aware benchmark defaults:**
 
 When a quantized checkpoint is detected, read `references/quantization-benchmarks.md` for benchmark sensitivity rankings and recommended sets. Present recommendations to the user and ask which to include.
diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml
index 07c242079a0..a34bef675c3 100644
--- a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml
+++ b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml
@@ -5,6 +5,10 @@
 # Run time: Long (reasoning models generate lengthy thinking traces)
 # Repeats: 16
 #
+# Note: The AA variant (simple_evals.AIME_2025) requires JUDGE_API_KEY
+# for LLM-based scoring. This NeMo Skills variant uses symbolic scoring
+# and does not require external API keys.
+#
 # Usage:
 #   nel run --config recipes/tasks/aime2025.yaml \
 #     -o deployment.checkpoint_path=/path/to/checkpoint \

From 0f608a895cf16044cddcbbc6271bed4f68958361 Mon Sep 17 00:00:00 2001
From: Kai Xu <kaix@nvidia.com>
Date: Wed, 29 Apr 2026 14:52:35 -0700
Subject: [PATCH 3/4] Add env.example with all possible API keys

Signed-off-by: Kai Xu <kaix@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md            |  8 ++++-
 .claude/skills/evaluation/recipes/env.example | 29 +++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 .claude/skills/evaluation/recipes/env.example

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index f458afdc8ea..871396d3dd6 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -243,7 +243,13 @@ ssh <host> "grep -E '^\s*machine\s+' ~/.config/enroot/.credentials 2>/dev/null"
 
 Print the following commands to the user. Propose to execute them in order to confirm the config works as expected before the full run.
 
-**Important**: Export required environment variables based on your config. If any tokens or keys are missing (e.g. `HF_TOKEN`, `NGC_API_KEY`, `api_key_name` from the config), ask the user to put them in a `.env` file in the project root so you can run `set -a && source .env && set +a` (or equivalent) before executing `nel run` commands.
+**Important**: Export required environment variables based on your config. If any tokens or keys are missing, point the user to `recipes/env.example` — it lists all possible keys with notes on which tasks need them. Ask the user to copy it, fill in their keys, and source it:
+
+```bash
+cp recipes/env.example .env
+# Edit .env with your keys
+set -a && source .env && set +a
+```
 
 ```bash
 # If using pre_cmd or post_cmd (review pre_cmd content before enabling — it runs arbitrary commands):
diff --git a/.claude/skills/evaluation/recipes/env.example b/.claude/skills/evaluation/recipes/env.example
new file mode 100644
index 00000000000..8d9b9bfa6d9
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/env.example
@@ -0,0 +1,29 @@
+# Evaluation API Keys
+#
+# Copy this file and fill in the keys you need:
+#   cp recipes/env.example .env
+#   # Edit .env with your keys
+#   set -a && source .env && set +a
+#
+# Not all keys are required — only fill in what your tasks need.
+
+# Required for all tasks (model/dataset downloads)
+HF_TOKEN=hf_...
+
+# Required for nemo_skills.* tasks (dummy value, not a real key)
+DUMMY_API_KEY=dummy
+
+# Needed when the config uses pre_cmd/post_cmd; these run arbitrary commands, so review them before enabling
+NEMO_EVALUATOR_TRUST_PRE_CMD=1
+
+# --- Optional: task-specific keys ---
+
+# AIME 2025 (simple_evals variant only, not ns_aime2025)
+# JUDGE_API_KEY=
+
+# tau2_bench_telecom (LLM judge)
+# JUDGE_API_KEY_NVDEV_QWEN235B=
+
+# terminal-bench-hard (AWS sandbox)
+# AWS_ACCESS_KEY_ID=
+# AWS_SECRET_ACCESS_KEY=

From 4e5db923d1e43bf608e7a53c641a077d87811092 Mon Sep 17 00:00:00 2001
From: Kai Xu <kaix@nvidia.com>
Date: Wed, 29 Apr 2026 16:53:33 -0700
Subject: [PATCH 4/4] Strip each task file to just the task config, and create
 one shared base config

Signed-off-by: Kai Xu <kaix@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md            |  8 +-
 .../recipes/examples/example_eval.yaml        | 18 ++++-
 .../evaluation/recipes/tasks/aime2025.yaml    | 80 ++++---------------
 .../skills/evaluation/recipes/tasks/gpqa.yaml | 74 ++++-------------
 .../evaluation/recipes/tasks/ifbench.yaml     | 72 +++--------------
 .../recipes/tasks/livecodebench.yaml          | 75 ++++-------------
 .../skills/evaluation/recipes/tasks/mmlu.yaml | 53 ------------
 .../evaluation/recipes/tasks/mmlu_pro.yaml    | 62 ++++----------
 .../evaluation/recipes/tasks/scicode.yaml     | 73 ++++-------------
 9 files changed, 112 insertions(+), 403 deletions(-)
 delete mode 100644 .claude/skills/evaluation/recipes/tasks/mmlu.yaml

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index 871396d3dd6..69920814828 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -40,7 +40,13 @@ Test that `nel` is installed with `nel --version`. If not, instruct the user to
 
 If the user already has a config file (e.g., "run this config", "evaluate with my-config.yaml"), skip to Step 8. Optionally review it for common issues (missing `???` values, quantization flags) before running.
 
-**Shortcut: use a pre-built recipe.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching recipe. Available: mmlu, mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. If found, skip Steps 2 and 5 (config generation and task confirmation), but still do Step 3 (auto-detect model settings from checkpoint) and Step 4 (fill in required `???` values from user input), then proceed to Step 7.5/8.
+**Shortcut: use pre-built task snippets.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching task snippet. Available: mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. Task snippets contain only the task-specific config (name, params, repeats) — not the full NEL config. To use them:
+
+1. Read the task snippet(s) the user wants
+2. Use `recipes/examples/example_eval.yaml` as the base config template
+3. Replace the list entries under `evaluation.tasks:` with the selected snippet(s), keeping the `tasks:` key itself
+4. Do Step 3 (auto-detect model settings from checkpoint) and Step 4 (fill in `???` values)
+5. Proceed to Step 7.5/8
 
 **Step 2: Build the base config file**
 
diff --git a/.claude/skills/evaluation/recipes/examples/example_eval.yaml b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
index b7f68bd7f1c..77887b3f8c3 100644
--- a/.claude/skills/evaluation/recipes/examples/example_eval.yaml
+++ b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
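@@ -2,9 +2,11 @@
 #
 # A balanced set of benchmarks for validating quantized model quality.
 # Copy this file and customize for your needs.
+# Task snippets in recipes/tasks/ define per-task configs — the agent
+# composes them into a runnable config like this one.
 #
 # Includes:
-#   - MMLU-Pro (knowledge, completions)
+#   - MMLU-Pro (knowledge, chat)
 #   - GPQA Diamond (reasoning, chat, 5 repeats)
 #   - LiveCodeBench v6 (code, chat, 3 repeats)
 #   - IFBench (instruction following, chat, 8 repeats)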
@@ -60,8 +62,20 @@ evaluation:
       api_endpoint:
         api_key_name: DUMMY_API_KEY
   tasks:
-  # Knowledge (completions endpoint, short)
-    - name: adlr_mmlu_pro_5_shot_base
+  # Knowledge (chat endpoint, short)
+    - name: ns_mmlu_pro
+      nemo_evaluator_config:
+        config:
+          params:
+            extra:
+              num_repeats: 1
+              args: ++prompt_config=eval/aai/mcq-10choices-boxed ++inference.tokens_to_generate=null
+        target:
+          api_endpoint:
+            adapter_config:
+              params_to_remove:
+                - max_new_tokens
+                - max_completion_tokens
 
   # Reasoning (chat endpoint, 5 repeats, short)
     - name: ns_gpqa
diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml
index a34bef675c3..1cf5643f481 100644
--- a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml
+++ b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml
@@ -1,65 +1,19 @@
 # AIME 2025 (NeMo Skills, chat)
-#
-# Math competition benchmark. Uses the chat endpoint.
 # Primary metric: pass@1[avg-of-16] symbolic_correct
-# Run time: Long (reasoning models generate lengthy thinking traces)
-# Repeats: 16
-#
-# Note: The AA variant (simple_evals.AIME_2025) requires JUDGE_API_KEY
-# for LLM-based scoring. This NeMo Skills variant uses symbolic scoring
-# and does not require external API keys.
-#
-# Usage:
-#   nel run --config recipes/tasks/aime2025.yaml \
-#     -o deployment.checkpoint_path=/path/to/checkpoint \
-#     -o execution.hostname=<slurm_host> \
-#     -o execution.account=<slurm_account> \
-#     -o execution.output_dir=/path/to/output \
-#     -o deployment.served_model_name=<model_name>
-defaults:
-  - execution: slurm/default
-  - deployment: vllm
-  - _self_
-execution:
-  hostname: ???
-  username: ${oc.env:USER}
-  account: ???
-  output_dir: ???
-  walltime: "04:00:00"
-  mounts:
-    mount_home: false
-deployment:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  checkpoint_path: ???
-  hf_model_handle:
-  served_model_name: ???
-  tensor_parallel_size: 1
-  data_parallel_size: 1
-  # For models with custom code, add: --trust-remote-code
-  extra_args: --max-model-len 32768
-evaluation:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 100000
-        max_retries: 10
-        parallelism: 16
-    target:
-      api_endpoint:
-        api_key_name: DUMMY_API_KEY
-  tasks:
-    - name: ns_aime2025
-      nemo_evaluator_config:
-        config:
-          params:
-            extra:
-              num_repeats: 16
-        target:
-          api_endpoint:
-            adapter_config:
-              params_to_remove:
-                - max_new_tokens
-                - max_completion_tokens
+# Run time: Long (reasoning models generate lengthy thinking traces) | Repeats: 16
+# Note: The AA variant (simple_evals.AIME_2025) requires JUDGE_API_KEY.
+#       This NeMo Skills variant uses symbolic scoring — no external API keys needed.
+  - name: ns_aime2025
+    nemo_evaluator_config:
+      config:
+        params:
+          request_timeout: 100000
+          max_retries: 10
+          extra:
+            num_repeats: 16
+      target:
+        api_endpoint:
+          adapter_config:
+            params_to_remove:
+              - max_new_tokens
+              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.yaml b/.claude/skills/evaluation/recipes/tasks/gpqa.yaml
index ba8cf2a720a..3692175d987 100644
--- a/.claude/skills/evaluation/recipes/tasks/gpqa.yaml
+++ b/.claude/skills/evaluation/recipes/tasks/gpqa.yaml
@@ -1,62 +1,16 @@
 # GPQA Diamond (NeMo Skills, chat)
-#
-# Graduate-level reasoning benchmark. Uses the chat endpoint.
 # Primary metric: pass@1[avg-of-5] symbolic_correct
-# Run time: Short
-# Repeats: 5
-#
-# Usage:
-#   nel run --config recipes/tasks/gpqa.yaml \
-#     -o deployment.checkpoint_path=/path/to/checkpoint \
-#     -o execution.hostname=<slurm_host> \
-#     -o execution.account=<slurm_account> \
-#     -o execution.output_dir=/path/to/output \
-#     -o deployment.served_model_name=<model_name>
-defaults:
-  - execution: slurm/default
-  - deployment: vllm
-  - _self_
-execution:
-  hostname: ???
-  username: ${oc.env:USER}
-  account: ???
-  output_dir: ???
-  walltime: "02:00:00"
-  mounts:
-    mount_home: false
-deployment:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  checkpoint_path: ???
-  hf_model_handle:
-  served_model_name: ???
-  tensor_parallel_size: 1
-  data_parallel_size: 1
-  # For models with custom code, add: --trust-remote-code
-  extra_args: --max-model-len 32768
-evaluation:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 3600
-        max_retries: 5
-        parallelism: 16
-    target:
-      api_endpoint:
-        api_key_name: DUMMY_API_KEY
-  tasks:
-    - name: ns_gpqa
-      nemo_evaluator_config:
-        config:
-          params:
-            extra:
-              args: ++prompt_config=eval/aai/mcq-4choices
-              num_repeats: 5
-        target:
-          api_endpoint:
-            adapter_config:
-              params_to_remove:
-                - max_new_tokens
-                - max_completion_tokens
+# Run time: Short | Repeats: 5 | Fragment: an evaluation.tasks entry, not a standalone config
+  - name: ns_gpqa
+    nemo_evaluator_config:
+      config:
+        params:
+          extra:
+            args: ++prompt_config=eval/aai/mcq-4choices
+            num_repeats: 5
+      target:
+        api_endpoint:
+          adapter_config:
+            params_to_remove:
+              - max_new_tokens
+              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.yaml b/.claude/skills/evaluation/recipes/tasks/ifbench.yaml
index 0876c332e7a..46cbc2db085 100644
--- a/.claude/skills/evaluation/recipes/tasks/ifbench.yaml
+++ b/.claude/skills/evaluation/recipes/tasks/ifbench.yaml
@@ -1,61 +1,15 @@
 # IFBench (NeMo Skills, chat)
-#
-# Instruction following benchmark. Uses the chat endpoint.
 # Primary metric: pass@1[avg-of-8] prompt_strict_accuracy
-# Run time: Super Short
-# Repeats: 8
-#
-# Usage:
-#   nel run --config recipes/tasks/ifbench.yaml \
-#     -o deployment.checkpoint_path=/path/to/checkpoint \
-#     -o execution.hostname=<slurm_host> \
-#     -o execution.account=<slurm_account> \
-#     -o execution.output_dir=/path/to/output \
-#     -o deployment.served_model_name=<model_name>
-defaults:
-  - execution: slurm/default
-  - deployment: vllm
-  - _self_
-execution:
-  hostname: ???
-  username: ${oc.env:USER}
-  account: ???
-  output_dir: ???
-  walltime: "02:00:00"
-  mounts:
-    mount_home: false
-deployment:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  checkpoint_path: ???
-  hf_model_handle:
-  served_model_name: ???
-  tensor_parallel_size: 1
-  data_parallel_size: 1
-  # For models with custom code, add: --trust-remote-code
-  extra_args: --max-model-len 32768
-evaluation:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 3600
-        max_retries: 5
-        parallelism: 16
-    target:
-      api_endpoint:
-        api_key_name: DUMMY_API_KEY
-  tasks:
-    - name: ns_ifbench
-      nemo_evaluator_config:
-        config:
-          params:
-            extra:
-              num_repeats: 8
-        target:
-          api_endpoint:
-            adapter_config:
-              params_to_remove:
-                - max_new_tokens
-                - max_completion_tokens
+# Run time: Super Short | Repeats: 8 | Fragment: an evaluation.tasks entry, not a standalone config
+  - name: ns_ifbench
+    nemo_evaluator_config:
+      config:
+        params:
+          extra:
+            num_repeats: 8
+      target:
+        api_endpoint:
+          adapter_config:
+            params_to_remove:
+              - max_new_tokens
+              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml b/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml
index b56b500df83..202387a1eb6 100644
--- a/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml
+++ b/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml
@@ -1,62 +1,17 @@
 # LiveCodeBench v6 (NeMo Skills, chat)
-#
-# Code generation benchmark with recent problems. Uses the chat endpoint.
 # Primary metric: pass@1[avg-of-3] accuracy
-# Run time: Medium
-# Repeats: 3
-#
-# Usage:
-#   nel run --config recipes/tasks/livecodebench.yaml \
-#     -o deployment.checkpoint_path=/path/to/checkpoint \
-#     -o execution.hostname=<slurm_host> \
-#     -o execution.account=<slurm_account> \
-#     -o execution.output_dir=/path/to/output \
-#     -o deployment.served_model_name=<model_name>
-defaults:
-  - execution: slurm/default
-  - deployment: vllm
-  - _self_
-execution:
-  hostname: ???
-  username: ${oc.env:USER}
-  account: ???
-  output_dir: ???
-  walltime: "04:00:00"
-  mounts:
-    mount_home: false
-deployment:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  checkpoint_path: ???
-  hf_model_handle:
-  served_model_name: ???
-  tensor_parallel_size: 1
-  data_parallel_size: 1
-  # For models with custom code, add: --trust-remote-code
-  extra_args: --max-model-len 32768
-evaluation:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 3600
-        max_retries: 10
-        parallelism: 16
-    target:
-      api_endpoint:
-        api_key_name: DUMMY_API_KEY
-  tasks:
-    - name: ns_livecodebench
-      nemo_evaluator_config:
-        config:
-          params:
-            extra:
-              dataset_split: test_v6_2408_2505
-              num_repeats: 3
-        target:
-          api_endpoint:
-            adapter_config:
-              params_to_remove:
-                - max_new_tokens
-                - max_completion_tokens
+# Run time: Medium | Repeats: 3 | Fragment: an evaluation.tasks entry, not a standalone config
+  - name: ns_livecodebench
+    nemo_evaluator_config:
+      config:
+        params:
+          max_retries: 10
+          extra:
+            dataset_split: test_v6_2408_2505
+            num_repeats: 3
+      target:
+        api_endpoint:
+          adapter_config:
+            params_to_remove:
+              - max_new_tokens
+              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu.yaml b/.claude/skills/evaluation/recipes/tasks/mmlu.yaml
deleted file mode 100644
index cccbe5ff78c..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/mmlu.yaml
+++ /dev/null
@@ -1,53 +0,0 @@
-# MMLU (ADLR, completions)
-#
-# Massive Multitask Language Understanding. Uses the completions endpoint.
-# Primary metric: exact_match
-# Run time: Short
-#
-# Usage:
-#   nel run --config recipes/tasks/mmlu.yaml \
-#     -o deployment.checkpoint_path=/path/to/checkpoint \
-#     -o execution.hostname=<slurm_host> \
-#     -o execution.account=<slurm_account> \
-#     -o execution.output_dir=/path/to/output \
-#     -o deployment.served_model_name=<model_name>
-defaults:
-  - execution: slurm/default
-  - deployment: vllm
-  - _self_
-execution:
-  hostname: ???
-  username: ${oc.env:USER}
-  account: ???
-  output_dir: ???
-  walltime: "02:00:00"
-  mounts:
-    mount_home: false
-deployment:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  checkpoint_path: ???
-  hf_model_handle:
-  served_model_name: ???
-  tensor_parallel_size: 1
-  data_parallel_size: 1
-  # For models with custom code, add: --trust-remote-code
-  extra_args: --max-model-len 32768
-evaluation:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 3600
-        max_retries: 5
-        parallelism: 16
-  tasks:
-    - name: adlr_mmlu
-      nemo_evaluator_config:
-        config:
-          params:
-            max_new_tokens: 2
-        target:
-          api_endpoint:
-            type: completions
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml
index 749cba572f6..be16a546a39 100644
--- a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml
+++ b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml
@@ -1,46 +1,16 @@
-# MMLU-Pro (5-shot, base/completions)
-#
-# Knowledge benchmark. Uses the completions endpoint (not chat).
-# Primary metric: exact_match
-# Run time: Short
-#
-# Usage:
-#   nel run --config recipes/tasks/mmlu_pro.yaml \
-#     -o deployment.checkpoint_path=/path/to/checkpoint \
-#     -o execution.hostname=<slurm_host> \
-#     -o execution.account=<slurm_account> \
-#     -o execution.output_dir=/path/to/output \
-#     -o deployment.served_model_name=<model_name>
-defaults:
-  - execution: slurm/default
-  - deployment: vllm
-  - _self_
-execution:
-  hostname: ???
-  username: ${oc.env:USER}
-  account: ???
-  output_dir: ???
-  walltime: "02:00:00"
-  mounts:
-    mount_home: false
-deployment:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  checkpoint_path: ???
-  hf_model_handle:
-  served_model_name: ???
-  tensor_parallel_size: 1
-  data_parallel_size: 1
-  # For models with custom code, add: --trust-remote-code
-  extra_args: --max-model-len 32768
-evaluation:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 3600
-        max_retries: 5
-        parallelism: 16
-  tasks:
-    - name: adlr_mmlu_pro_5_shot_base
+# MMLU-Pro (NeMo Skills, chat)
+# Primary metric: symbolic_correct
+# Run time: Short | Repeats: 1 | Fragment: an evaluation.tasks entry, not a standalone config
+  - name: ns_mmlu_pro
+    nemo_evaluator_config:
+      config:
+        params:
+          extra:
+            num_repeats: 1
+            args: ++prompt_config=eval/aai/mcq-10choices-boxed ++inference.tokens_to_generate=null
+      target:
+        api_endpoint:
+          adapter_config:
+            params_to_remove:
+              - max_new_tokens
+              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/scicode.yaml b/.claude/skills/evaluation/recipes/tasks/scicode.yaml
index 27bdd39eb53..724b6935759 100644
--- a/.claude/skills/evaluation/recipes/tasks/scicode.yaml
+++ b/.claude/skills/evaluation/recipes/tasks/scicode.yaml
@@ -1,61 +1,16 @@
 # SciCode (NeMo Skills, chat)
-#
-# Science + code benchmark. Uses the chat endpoint.
 # Primary metric: pass@1[avg-of-3] subtask_accuracy
-# Run time: Long
-# Repeats: 3
-#
-# Usage:
-#   nel run --config recipes/tasks/scicode.yaml \
-#     -o deployment.checkpoint_path=/path/to/checkpoint \
-#     -o execution.hostname=<slurm_host> \
-#     -o execution.account=<slurm_account> \
-#     -o execution.output_dir=/path/to/output \
-#     -o deployment.served_model_name=<model_name>
-defaults:
-  - execution: slurm/default
-  - deployment: vllm
-  - _self_
-execution:
-  hostname: ???
-  username: ${oc.env:USER}
-  account: ???
-  output_dir: ???
-  walltime: "04:00:00"
-  mounts:
-    mount_home: false
-deployment:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  checkpoint_path: ???
-  hf_model_handle:
-  served_model_name: ???
-  tensor_parallel_size: 1
-  data_parallel_size: 1
-  # For models with custom code, add: --trust-remote-code
-  extra_args: --max-model-len 32768
-evaluation:
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  nemo_evaluator_config:
-    config:
-      params:
-        request_timeout: 3600
-        max_retries: 10
-        parallelism: 16
-    target:
-      api_endpoint:
-        api_key_name: DUMMY_API_KEY
-  tasks:
-    - name: ns_scicode
-      nemo_evaluator_config:
-        config:
-          params:
-            extra:
-              num_repeats: 3
-        target:
-          api_endpoint:
-            adapter_config:
-              params_to_remove:
-                - max_new_tokens
-                - max_completion_tokens
+# Run time: Long | Repeats: 3 | Fragment: an evaluation.tasks entry, not a standalone config
+  - name: ns_scicode
+    nemo_evaluator_config:
+      config:
+        params:
+          max_retries: 10
+          extra:
+            num_repeats: 3
+      target:
+        api_endpoint:
+          adapter_config:
+            params_to_remove:
+              - max_new_tokens
+              - max_completion_tokens