From 78e9c6120ced3ce6cfe5cc7ac3f19973af6d4fb0 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Thu, 23 Apr 2026 09:30:55 -0700 Subject: [PATCH 1/3] feat(recipes): add KV cache cast variants (fp8_cast / nvfp4_cast) Introduce three built-in PTQ recipes that express the cast variants of KV cache quantization directly in YAML by setting use_constant_amax: true on the *[kv]_bmm_quantizer entry: - general/ptq/fp8_default-fp8_cast_kv - general/ptq/nvfp4_default-fp8_cast_kv - general/ptq/nvfp4_default-nvfp4_cast_kv Previously, cast semantics could only be activated via --kv_cache_qformat={fp8_cast,nvfp4_cast} in hf_ptq.py, which was layered on top of any --recipe-supplied config and silently overrode it. With these recipes, the YAML is self-contained and authoritative: hf_ptq.py now skips the post-hoc _set_kv_cache_constant_amax override when --recipe is provided, and --kv_cache_qformat is documented as ignored in that case. Signed-off-by: Chenjie Luo --- examples/llm_ptq/hf_ptq.py | 13 ++- .../general/ptq/fp8_default-fp8_cast_kv.yaml | 70 ++++++++++++++++ .../ptq/nvfp4_default-fp8_cast_kv.yaml | 78 ++++++++++++++++++ .../ptq/nvfp4_default-nvfp4_cast_kv.yaml | 82 +++++++++++++++++++ tests/unit/recipe/test_loader.py | 3 + 5 files changed, 243 insertions(+), 3 deletions(-) create mode 100644 modelopt_recipes/general/ptq/fp8_default-fp8_cast_kv.yaml create mode 100644 modelopt_recipes/general/ptq/nvfp4_default-fp8_cast_kv.yaml create mode 100644 modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 831d230a672..e55f565648c 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -1107,7 +1107,10 @@ def quantize_main( print(f"Excluding MTP layer from quantization: {pattern}") # Use constant amax for KV quantizers when a cast format is selected. 
- if args.kv_cache_qformat in _KV_CAST_FORMATS: + # Recipes are authoritative for KV cache config (including use_constant_amax), + # so skip this post-hoc override when --recipe is used; rely on the YAML instead + # (see modelopt_recipes/general/ptq/*_cast_kv.yaml). + if args.recipe is None and args.kv_cache_qformat in _KV_CAST_FORMATS: quant_cfg = copy.deepcopy(quant_cfg) _set_kv_cache_constant_amax(quant_cfg["quant_cfg"]) @@ -1163,7 +1166,9 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--recipe", help=( - "PTQ recipe YAML file or name without suffix (e.g. general/ptq/nvfp4_default-fp8_kv)." + "PTQ recipe YAML file or name without suffix (e.g. general/ptq/nvfp4_default-fp8_kv, " + "general/ptq/nvfp4_default-fp8_cast_kv, general/ptq/nvfp4_default-nvfp4_cast_kv). " + "When set, --kv_cache_qformat is ignored; the recipe fully determines KV cache config." ), default=None, ) @@ -1252,7 +1257,9 @@ def parse_args() -> argparse.Namespace: "Specify KV cache quantization format. Default: fp8_cast. " "Formats ending in '_cast' (fp8_cast, nvfp4_cast) set the amax to FP8 range " "without data-driven calibration. " - "Other formats (fp8, nvfp4, etc.) use data-driven calibration." + "Other formats (fp8, nvfp4, etc.) use data-driven calibration. " + "Ignored when --recipe is given: the recipe YAML is authoritative for KV " + "cache config (use the *_cast_kv.yaml recipes for the cast variants)." ), ) parser.add_argument( diff --git a/modelopt_recipes/general/ptq/fp8_default-fp8_cast_kv.yaml b/modelopt_recipes/general/ptq/fp8_default-fp8_cast_kv.yaml new file mode 100644 index 00000000000..dcf842728e0 --- /dev/null +++ b/modelopt_recipes/general/ptq/fp8_default-fp8_cast_kv.yaml @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +metadata: + recipe_type: ptq + description: >- + FP8 per-tensor weight and activation (W8A8), FP8 KV cache with constant amax + (skips KV calibration; amax hardcoded to FP8 E4M3 max 448.0), max calibration. +quantize: + algorithm: max + quant_cfg: + - quantizer_name: '*' + enable: false + - quantizer_name: '*input_quantizer' + cfg: + num_bits: e4m3 + axis: + - quantizer_name: '*weight_quantizer' + cfg: + num_bits: e4m3 + axis: + - quantizer_name: '*[kv]_bmm_quantizer' + enable: true + cfg: + num_bits: e4m3 + use_constant_amax: true + - quantizer_name: '*block_sparse_moe.gate*' + enable: false + - quantizer_name: '*linear_attn.conv1d*' + enable: false + - quantizer_name: '*lm_head*' + enable: false + - quantizer_name: '*mixer.conv1d*' + enable: false + - quantizer_name: '*mlp.gate.*' + enable: false + - quantizer_name: '*mlp.shared_expert_gate.*' + enable: false + - quantizer_name: '*output_layer*' + enable: false + - quantizer_name: '*proj_out.*' + enable: false + - quantizer_name: '*router*' + enable: false + - quantizer_name: 'output.*' + enable: false + - parent_class: 'nn.BatchNorm1d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.BatchNorm2d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.BatchNorm3d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.LeakyReLU' + quantizer_name: '*' + enable: false diff --git 
a/modelopt_recipes/general/ptq/nvfp4_default-fp8_cast_kv.yaml b/modelopt_recipes/general/ptq/nvfp4_default-fp8_cast_kv.yaml new file mode 100644 index 00000000000..b1770883468 --- /dev/null +++ b/modelopt_recipes/general/ptq/nvfp4_default-fp8_cast_kv.yaml @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +metadata: + recipe_type: ptq + description: >- + NVFP4 W4A4, FP8 KV cache with constant amax (skips KV calibration; amax + hardcoded to FP8 E4M3 max 448.0), max calibration. 
+quantize: + algorithm: max + quant_cfg: + - quantizer_name: '*' + enable: false + - quantizer_name: '*weight_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*input_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*[kv]_bmm_quantizer' + enable: true + cfg: + num_bits: e4m3 + use_constant_amax: true + - quantizer_name: '*block_sparse_moe.gate*' + enable: false + - quantizer_name: '*linear_attn.conv1d*' + enable: false + - quantizer_name: '*lm_head*' + enable: false + - quantizer_name: '*mixer.conv1d*' + enable: false + - quantizer_name: '*mlp.gate.*' + enable: false + - quantizer_name: '*mlp.shared_expert_gate.*' + enable: false + - quantizer_name: '*output_layer*' + enable: false + - quantizer_name: '*proj_out.*' + enable: false + - quantizer_name: '*router*' + enable: false + - quantizer_name: 'output.*' + enable: false + - parent_class: 'nn.BatchNorm1d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.BatchNorm2d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.BatchNorm3d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.LeakyReLU' + quantizer_name: '*' + enable: false diff --git a/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml b/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml new file mode 100644 index 00000000000..4150243916d --- /dev/null +++ b/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +metadata: + recipe_type: ptq + description: >- + NVFP4 W4A4, NVFP4 KV cache with constant amax (skips KV calibration; amax + hardcoded to FP8 E4M3 max 448.0), max calibration. +quantize: + algorithm: max + quant_cfg: + - quantizer_name: '*' + enable: false + - quantizer_name: '*weight_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*input_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + - quantizer_name: '*[kv]_bmm_quantizer' + enable: true + cfg: + block_sizes: + -1: 16 + type: dynamic + scale_bits: e4m3 + num_bits: e2m1 + use_constant_amax: true + - quantizer_name: '*block_sparse_moe.gate*' + enable: false + - quantizer_name: '*linear_attn.conv1d*' + enable: false + - quantizer_name: '*lm_head*' + enable: false + - quantizer_name: '*mixer.conv1d*' + enable: false + - quantizer_name: '*mlp.gate.*' + enable: false + - quantizer_name: '*mlp.shared_expert_gate.*' + enable: false + - quantizer_name: '*output_layer*' + enable: false + - quantizer_name: '*proj_out.*' + enable: false + - quantizer_name: '*router*' + enable: false + - quantizer_name: 'output.*' + enable: false + - parent_class: 'nn.BatchNorm1d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.BatchNorm2d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.BatchNorm3d' + quantizer_name: '*' + enable: false + - parent_class: 'nn.LeakyReLU' + quantizer_name: '*' + enable: false diff --git a/tests/unit/recipe/test_loader.py 
b/tests/unit/recipe/test_loader.py index b8da2d140f8..6926d89a5d2 100644 --- a/tests/unit/recipe/test_loader.py +++ b/tests/unit/recipe/test_loader.py @@ -106,7 +106,10 @@ def test_load_recipe_builtin_description(): _BUILTIN_PTQ_RECIPES = [ "general/ptq/fp8_default-fp8_kv", + "general/ptq/fp8_default-fp8_cast_kv", "general/ptq/nvfp4_default-fp8_kv", + "general/ptq/nvfp4_default-fp8_cast_kv", + "general/ptq/nvfp4_default-nvfp4_cast_kv", "general/ptq/nvfp4_mlp_only-fp8_kv", "general/ptq/nvfp4_omlp_only-fp8_kv", ] From 1c2a68f0d5c159f83ecb126b1d663491627dc2d8 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Thu, 23 Apr 2026 10:21:33 -0700 Subject: [PATCH 2/3] Reorder description Signed-off-by: Chenjie Luo --- examples/llm_ptq/hf_ptq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index e55f565648c..136d221e37e 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -1166,8 +1166,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--recipe", help=( - "PTQ recipe YAML file or name without suffix (e.g. general/ptq/nvfp4_default-fp8_kv, " - "general/ptq/nvfp4_default-fp8_cast_kv, general/ptq/nvfp4_default-nvfp4_cast_kv). " + "PTQ recipe YAML file or name without suffix (e.g. general/ptq/nvfp4_default-fp8_cast_kv, " + "general/ptq/nvfp4_default-fp8_kv, general/ptq/nvfp4_default-nvfp4_cast_kv). " "When set, --kv_cache_qformat is ignored; the recipe fully determines KV cache config." 
), default=None, From 177ec6e76e6e5b2f85d58407e838a4aa531baf52 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Thu, 23 Apr 2026 10:55:16 -0700 Subject: [PATCH 3/3] Explain FP8-range amax in nvfp4_cast_kv recipe description Signed-off-by: Chenjie Luo --- modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml b/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml index 4150243916d..95a191113e4 100644 --- a/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml +++ b/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml @@ -17,7 +17,9 @@ metadata: recipe_type: ptq description: >- NVFP4 W4A4, NVFP4 KV cache with constant amax (skips KV calibration; amax - hardcoded to FP8 E4M3 max 448.0), max calibration. + hardcoded to FP8 E4M3 max 448.0 — the deployment kernel upcasts NVFP4 KV + values to FP8 before attention, so the scale must land in the FP8 range), + max calibration. quantize: algorithm: max quant_cfg: