Skip to content

Commit b805c9a

Browse files
committed
Add pre-built evaluation recipes for common benchmarks
Signed-off-by: Kai Xu <kaix@nvidia.com>
1 parent 6e08b13 commit b805c9a

9 files changed

Lines changed: 516 additions & 0 deletions

File tree

.claude/skills/evaluation/SKILL.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ Test that `nel` is installed with `nel --version`. If not, instruct the user to
4040

4141
If the user already has a config file (e.g., "run this config", "evaluate with my-config.yaml"), skip to Step 8. Optionally review it for common issues (missing `???` values, quantization flags) before running.
4242

43+
**Shortcut: use a pre-built recipe.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching recipe. Available: mmlu, mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. If found, skip Steps 2-5 — go directly to the recipe, fill in deployment overrides, and proceed to Step 7.5/8.
44+
4345
**Step 2: Build the base config file**
4446

4547
Prompt the user with "I'll ask you 5 questions to build the base config we'll adjust in the next steps". Guide the user through the 5 questions using AskUserQuestion:
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# Example: Quantization Validation Suite
2+
#
3+
# A balanced set of benchmarks for validating quantized model quality.
4+
# Copy this file and customize for your needs.
5+
#
6+
# Includes:
7+
# - MMLU-Pro (knowledge, completions)
8+
# - GPQA Diamond (reasoning, chat, 5 repeats)
9+
# - LiveCodeBench v6 (code, chat, 3 repeats)
10+
# - IFBench (instruction following, chat, 8 repeats)
11+
#
12+
# Usage:
13+
# nel run --config recipes/examples/example_eval.yaml \
14+
# -o deployment.checkpoint_path=/path/to/quantized/checkpoint \
15+
# -o deployment.served_model_name=my-model-nvfp4 \
16+
# -o execution.hostname=<slurm_host> \
17+
# -o execution.account=<slurm_account> \
18+
# -o execution.output_dir=/path/to/output
19+
#
20+
# For quantized checkpoints, also add the quantization flag:
21+
# -o 'deployment.extra_args=--max-model-len 32768 --trust-remote-code --quantization modelopt_fp4'
22+
#
23+
# Run a single task:
24+
# nel run --config ... -t ns_gpqa
25+
#
26+
# Smoke test (2 samples):
27+
# nel run --config ... -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=2
28+
defaults:
29+
- execution: slurm/default
30+
- deployment: vllm
31+
- _self_
32+
execution:
33+
hostname: ???
34+
username: ${oc.env:USER}
35+
account: ???
36+
output_dir: ???
37+
walltime: "04:00:00"
38+
mounts:
39+
mount_home: false
40+
deployment:
41+
env_vars:
42+
HF_TOKEN: host:HF_TOKEN
43+
checkpoint_path: ???
44+
hf_model_handle:
45+
served_model_name: ???
46+
tensor_parallel_size: 1
47+
data_parallel_size: 1
48+
# For models with custom code, add: --trust-remote-code
49+
extra_args: --max-model-len 32768
50+
evaluation:
51+
env_vars:
52+
HF_TOKEN: host:HF_TOKEN
53+
nemo_evaluator_config:
54+
config:
55+
params:
56+
request_timeout: 3600
57+
max_retries: 10
58+
parallelism: 16
59+
target:
60+
api_endpoint:
61+
api_key_name: DUMMY_API_KEY
62+
tasks:
63+
# Knowledge (completions endpoint, short)
64+
- name: adlr_mmlu_pro_5_shot_base
65+
66+
# Reasoning (chat endpoint, 5 repeats, short)
67+
- name: ns_gpqa
68+
nemo_evaluator_config:
69+
config:
70+
params:
71+
extra:
72+
args: ++prompt_config=eval/aai/mcq-4choices
73+
num_repeats: 5
74+
target:
75+
api_endpoint:
76+
adapter_config:
77+
params_to_remove:
78+
- max_new_tokens
79+
- max_completion_tokens
80+
81+
# Code (chat endpoint, 3 repeats, medium)
82+
- name: ns_livecodebench
83+
nemo_evaluator_config:
84+
config:
85+
params:
86+
extra:
87+
dataset_split: test_v6_2408_2505
88+
num_repeats: 3
89+
target:
90+
api_endpoint:
91+
adapter_config:
92+
params_to_remove:
93+
- max_new_tokens
94+
- max_completion_tokens
95+
96+
# Instruction following (chat endpoint, 8 repeats, super short)
97+
- name: ns_ifbench
98+
nemo_evaluator_config:
99+
config:
100+
params:
101+
extra:
102+
num_repeats: 8
103+
target:
104+
api_endpoint:
105+
adapter_config:
106+
params_to_remove:
107+
- max_new_tokens
108+
- max_completion_tokens
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# AIME 2025 (NeMo Skills, chat)
2+
#
3+
# Math competition benchmark. Uses the chat endpoint.
4+
# Primary metric: pass@1[avg-of-16] symbolic_correct
5+
# Run time: Long (reasoning models generate lengthy thinking traces)
6+
# Repeats: 16
7+
#
8+
# Usage:
9+
# nel run --config recipes/tasks/aime2025.yaml \
10+
# -o deployment.checkpoint_path=/path/to/checkpoint \
11+
# -o execution.hostname=<slurm_host> \
12+
# -o execution.account=<slurm_account> \
13+
# -o execution.output_dir=/path/to/output \
14+
# -o deployment.served_model_name=<model_name>
15+
defaults:
16+
- execution: slurm/default
17+
- deployment: vllm
18+
- _self_
19+
execution:
20+
hostname: ???
21+
username: ${oc.env:USER}
22+
account: ???
23+
output_dir: ???
24+
walltime: "04:00:00"
25+
mounts:
26+
mount_home: false
27+
deployment:
28+
env_vars:
29+
HF_TOKEN: host:HF_TOKEN
30+
checkpoint_path: ???
31+
hf_model_handle:
32+
served_model_name: ???
33+
tensor_parallel_size: 1
34+
data_parallel_size: 1
35+
# For models with custom code, add: --trust-remote-code
36+
extra_args: --max-model-len 32768
37+
evaluation:
38+
env_vars:
39+
HF_TOKEN: host:HF_TOKEN
40+
nemo_evaluator_config:
41+
config:
42+
params:
43+
request_timeout: 100000
44+
max_retries: 10
45+
parallelism: 16
46+
target:
47+
api_endpoint:
48+
api_key_name: DUMMY_API_KEY
49+
tasks:
50+
- name: ns_aime2025
51+
nemo_evaluator_config:
52+
config:
53+
params:
54+
extra:
55+
num_repeats: 16
56+
target:
57+
api_endpoint:
58+
adapter_config:
59+
params_to_remove:
60+
- max_new_tokens
61+
- max_completion_tokens
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# GPQA Diamond (NeMo Skills, chat)
2+
#
3+
# Graduate-level reasoning benchmark. Uses the chat endpoint.
4+
# Primary metric: pass@1[avg-of-5] symbolic_correct
5+
# Run time: Short
6+
# Repeats: 5
7+
#
8+
# Usage:
9+
# nel run --config recipes/tasks/gpqa.yaml \
10+
# -o deployment.checkpoint_path=/path/to/checkpoint \
11+
# -o execution.hostname=<slurm_host> \
12+
# -o execution.account=<slurm_account> \
13+
# -o execution.output_dir=/path/to/output \
14+
# -o deployment.served_model_name=<model_name>
15+
defaults:
16+
- execution: slurm/default
17+
- deployment: vllm
18+
- _self_
19+
execution:
20+
hostname: ???
21+
username: ${oc.env:USER}
22+
account: ???
23+
output_dir: ???
24+
walltime: "02:00:00"
25+
mounts:
26+
mount_home: false
27+
deployment:
28+
env_vars:
29+
HF_TOKEN: host:HF_TOKEN
30+
checkpoint_path: ???
31+
hf_model_handle:
32+
served_model_name: ???
33+
tensor_parallel_size: 1
34+
data_parallel_size: 1
35+
# For models with custom code, add: --trust-remote-code
36+
extra_args: --max-model-len 32768
37+
evaluation:
38+
env_vars:
39+
HF_TOKEN: host:HF_TOKEN
40+
nemo_evaluator_config:
41+
config:
42+
params:
43+
request_timeout: 3600
44+
max_retries: 5
45+
parallelism: 16
46+
target:
47+
api_endpoint:
48+
api_key_name: DUMMY_API_KEY
49+
tasks:
50+
- name: ns_gpqa
51+
nemo_evaluator_config:
52+
config:
53+
params:
54+
extra:
55+
args: ++prompt_config=eval/aai/mcq-4choices
56+
num_repeats: 5
57+
target:
58+
api_endpoint:
59+
adapter_config:
60+
params_to_remove:
61+
- max_new_tokens
62+
- max_completion_tokens
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# IFBench (NeMo Skills, chat)
2+
#
3+
# Instruction following benchmark. Uses the chat endpoint.
4+
# Primary metric: pass@1[avg-of-8] prompt_strict_accuracy
5+
# Run time: Super Short
6+
# Repeats: 8
7+
#
8+
# Usage:
9+
# nel run --config recipes/tasks/ifbench.yaml \
10+
# -o deployment.checkpoint_path=/path/to/checkpoint \
11+
# -o execution.hostname=<slurm_host> \
12+
# -o execution.account=<slurm_account> \
13+
# -o execution.output_dir=/path/to/output \
14+
# -o deployment.served_model_name=<model_name>
15+
defaults:
16+
- execution: slurm/default
17+
- deployment: vllm
18+
- _self_
19+
execution:
20+
hostname: ???
21+
username: ${oc.env:USER}
22+
account: ???
23+
output_dir: ???
24+
walltime: "02:00:00"
25+
mounts:
26+
mount_home: false
27+
deployment:
28+
env_vars:
29+
HF_TOKEN: host:HF_TOKEN
30+
checkpoint_path: ???
31+
hf_model_handle:
32+
served_model_name: ???
33+
tensor_parallel_size: 1
34+
data_parallel_size: 1
35+
# For models with custom code, add: --trust-remote-code
36+
extra_args: --max-model-len 32768
37+
evaluation:
38+
env_vars:
39+
HF_TOKEN: host:HF_TOKEN
40+
nemo_evaluator_config:
41+
config:
42+
params:
43+
request_timeout: 3600
44+
max_retries: 5
45+
parallelism: 16
46+
target:
47+
api_endpoint:
48+
api_key_name: DUMMY_API_KEY
49+
tasks:
50+
- name: ns_ifbench
51+
nemo_evaluator_config:
52+
config:
53+
params:
54+
extra:
55+
num_repeats: 8
56+
target:
57+
api_endpoint:
58+
adapter_config:
59+
params_to_remove:
60+
- max_new_tokens
61+
- max_completion_tokens
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# LiveCodeBench v6 (NeMo Skills, chat)
2+
#
3+
# Code generation benchmark with recent problems. Uses the chat endpoint.
4+
# Primary metric: pass@1[avg-of-3] accuracy
5+
# Run time: Medium
6+
# Repeats: 3
7+
#
8+
# Usage:
9+
# nel run --config recipes/tasks/livecodebench.yaml \
10+
# -o deployment.checkpoint_path=/path/to/checkpoint \
11+
# -o execution.hostname=<slurm_host> \
12+
# -o execution.account=<slurm_account> \
13+
# -o execution.output_dir=/path/to/output \
14+
# -o deployment.served_model_name=<model_name>
15+
defaults:
16+
- execution: slurm/default
17+
- deployment: vllm
18+
- _self_
19+
execution:
20+
hostname: ???
21+
username: ${oc.env:USER}
22+
account: ???
23+
output_dir: ???
24+
walltime: "04:00:00"
25+
mounts:
26+
mount_home: false
27+
deployment:
28+
env_vars:
29+
HF_TOKEN: host:HF_TOKEN
30+
checkpoint_path: ???
31+
hf_model_handle:
32+
served_model_name: ???
33+
tensor_parallel_size: 1
34+
data_parallel_size: 1
35+
# For models with custom code, add: --trust-remote-code
36+
extra_args: --max-model-len 32768
37+
evaluation:
38+
env_vars:
39+
HF_TOKEN: host:HF_TOKEN
40+
nemo_evaluator_config:
41+
config:
42+
params:
43+
request_timeout: 3600
44+
max_retries: 10
45+
parallelism: 16
46+
target:
47+
api_endpoint:
48+
api_key_name: DUMMY_API_KEY
49+
tasks:
50+
- name: ns_livecodebench
51+
nemo_evaluator_config:
52+
config:
53+
params:
54+
extra:
55+
dataset_split: test_v6_2408_2505
56+
num_repeats: 3
57+
target:
58+
api_endpoint:
59+
adapter_config:
60+
params_to_remove:
61+
- max_new_tokens
62+
- max_completion_tokens

0 commit comments

Comments (0)