From 78e9c6120ced3ce6cfe5cc7ac3f19973af6d4fb0 Mon Sep 17 00:00:00 2001
From: Chenjie Luo <chenjiel@nvidia.com>
Date: Thu, 23 Apr 2026 09:30:55 -0700
Subject: [PATCH 1/3] feat(recipes): add KV cache cast variants (fp8_cast /
 nvfp4_cast)

Introduce three built-in PTQ recipes that express the cast variants of
KV cache quantization directly in YAML by setting
use_constant_amax: true on the *[kv]_bmm_quantizer entry:

  - general/ptq/fp8_default-fp8_cast_kv
  - general/ptq/nvfp4_default-fp8_cast_kv
  - general/ptq/nvfp4_default-nvfp4_cast_kv

Previously, cast semantics could only be activated via
--kv_cache_qformat={fp8_cast,nvfp4_cast} in hf_ptq.py, which was
layered on top of any --recipe-supplied config and silently overrode
it. With these recipes, the YAML is self-contained and authoritative:
hf_ptq.py now skips the post-hoc _set_kv_cache_constant_amax override
when --recipe is provided, and --kv_cache_qformat is documented as
ignored in that case.

Signed-off-by: Chenjie Luo <chenjiel@nvidia.com>
---
 examples/llm_ptq/hf_ptq.py                    | 13 ++-
 .../general/ptq/fp8_default-fp8_cast_kv.yaml  | 70 ++++++++++++++++
 .../ptq/nvfp4_default-fp8_cast_kv.yaml        | 78 ++++++++++++++++++
 .../ptq/nvfp4_default-nvfp4_cast_kv.yaml      | 82 +++++++++++++++++++
 tests/unit/recipe/test_loader.py              |  3 +
 5 files changed, 243 insertions(+), 3 deletions(-)
 create mode 100644 modelopt_recipes/general/ptq/fp8_default-fp8_cast_kv.yaml
 create mode 100644 modelopt_recipes/general/ptq/nvfp4_default-fp8_cast_kv.yaml
 create mode 100644 modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index 831d230a672..e55f565648c 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -1107,7 +1107,10 @@ def quantize_main(
                 print(f"Excluding MTP layer from quantization: {pattern}")
 
         # Use constant amax for KV quantizers when a cast format is selected.
-        if args.kv_cache_qformat in _KV_CAST_FORMATS:
+        # Recipes are authoritative for KV cache config (including use_constant_amax),
+        # so skip this post-hoc override when --recipe is used; rely on the YAML instead
+        # (see modelopt_recipes/general/ptq/*_cast_kv.yaml).
+        if args.recipe is None and args.kv_cache_qformat in _KV_CAST_FORMATS:
             quant_cfg = copy.deepcopy(quant_cfg)
             _set_kv_cache_constant_amax(quant_cfg["quant_cfg"])
 
@@ -1163,7 +1166,9 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--recipe",
         help=(
-            "PTQ recipe YAML file or name without suffix (e.g. general/ptq/nvfp4_default-fp8_kv)."
+            "PTQ recipe YAML file or name without suffix (e.g. general/ptq/nvfp4_default-fp8_kv, "
+            "general/ptq/nvfp4_default-fp8_cast_kv, general/ptq/nvfp4_default-nvfp4_cast_kv). "
+            "When set, --kv_cache_qformat is ignored; the recipe fully determines KV cache config."
         ),
         default=None,
     )
@@ -1252,7 +1257,9 @@ def parse_args() -> argparse.Namespace:
             "Specify KV cache quantization format. Default: fp8_cast. "
             "Formats ending in '_cast' (fp8_cast, nvfp4_cast) set the amax to FP8 range "
             "without data-driven calibration. "
-            "Other formats (fp8, nvfp4, etc.) use data-driven calibration."
+            "Other formats (fp8, nvfp4, etc.) use data-driven calibration. "
+            "Ignored when --recipe is given: the recipe YAML is authoritative for KV "
+            "cache config (use the *_cast_kv.yaml recipes for the cast variants)."
         ),
     )
     parser.add_argument(
diff --git a/modelopt_recipes/general/ptq/fp8_default-fp8_cast_kv.yaml b/modelopt_recipes/general/ptq/fp8_default-fp8_cast_kv.yaml
new file mode 100644
index 00000000000..dcf842728e0
--- /dev/null
+++ b/modelopt_recipes/general/ptq/fp8_default-fp8_cast_kv.yaml
@@ -0,0 +1,70 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+metadata:
+  recipe_type: ptq
+  description: >-
+    FP8 per-tensor weight and activation (W8A8), FP8 KV cache with constant amax
+    (skips KV calibration; amax hardcoded to FP8 E4M3 max 448.0), max calibration.
+quantize:
+  algorithm: max
+  quant_cfg:
+    - quantizer_name: '*'
+      enable: false
+    - quantizer_name: '*input_quantizer'
+      cfg:
+        num_bits: e4m3
+        axis:
+    - quantizer_name: '*weight_quantizer'
+      cfg:
+        num_bits: e4m3
+        axis:
+    - quantizer_name: '*[kv]_bmm_quantizer'
+      enable: true
+      cfg:
+        num_bits: e4m3
+        use_constant_amax: true
+    - quantizer_name: '*block_sparse_moe.gate*'
+      enable: false
+    - quantizer_name: '*linear_attn.conv1d*'
+      enable: false
+    - quantizer_name: '*lm_head*'
+      enable: false
+    - quantizer_name: '*mixer.conv1d*'
+      enable: false
+    - quantizer_name: '*mlp.gate.*'
+      enable: false
+    - quantizer_name: '*mlp.shared_expert_gate.*'
+      enable: false
+    - quantizer_name: '*output_layer*'
+      enable: false
+    - quantizer_name: '*proj_out.*'
+      enable: false
+    - quantizer_name: '*router*'
+      enable: false
+    - quantizer_name: 'output.*'
+      enable: false
+    - parent_class: 'nn.BatchNorm1d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm2d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm3d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.LeakyReLU'
+      quantizer_name: '*'
+      enable: false
diff --git a/modelopt_recipes/general/ptq/nvfp4_default-fp8_cast_kv.yaml b/modelopt_recipes/general/ptq/nvfp4_default-fp8_cast_kv.yaml
new file mode 100644
index 00000000000..b1770883468
--- /dev/null
+++ b/modelopt_recipes/general/ptq/nvfp4_default-fp8_cast_kv.yaml
@@ -0,0 +1,78 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+metadata:
+  recipe_type: ptq
+  description: >-
+    NVFP4 W4A4, FP8 KV cache with constant amax (skips KV calibration; amax
+    hardcoded to FP8 E4M3 max 448.0), max calibration.
+quantize:
+  algorithm: max
+  quant_cfg:
+    - quantizer_name: '*'
+      enable: false
+    - quantizer_name: '*weight_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*input_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*[kv]_bmm_quantizer'
+      enable: true
+      cfg:
+        num_bits: e4m3
+        use_constant_amax: true
+    - quantizer_name: '*block_sparse_moe.gate*'
+      enable: false
+    - quantizer_name: '*linear_attn.conv1d*'
+      enable: false
+    - quantizer_name: '*lm_head*'
+      enable: false
+    - quantizer_name: '*mixer.conv1d*'
+      enable: false
+    - quantizer_name: '*mlp.gate.*'
+      enable: false
+    - quantizer_name: '*mlp.shared_expert_gate.*'
+      enable: false
+    - quantizer_name: '*output_layer*'
+      enable: false
+    - quantizer_name: '*proj_out.*'
+      enable: false
+    - quantizer_name: '*router*'
+      enable: false
+    - quantizer_name: 'output.*'
+      enable: false
+    - parent_class: 'nn.BatchNorm1d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm2d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm3d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.LeakyReLU'
+      quantizer_name: '*'
+      enable: false
diff --git a/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml b/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml
new file mode 100644
index 00000000000..4150243916d
--- /dev/null
+++ b/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+metadata:
+  recipe_type: ptq
+  description: >-
+    NVFP4 W4A4, NVFP4 KV cache with constant amax (skips KV calibration; amax
+    hardcoded to FP8 E4M3 max 448.0), max calibration.
+quantize:
+  algorithm: max
+  quant_cfg:
+    - quantizer_name: '*'
+      enable: false
+    - quantizer_name: '*weight_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*input_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*[kv]_bmm_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+        use_constant_amax: true
+    - quantizer_name: '*block_sparse_moe.gate*'
+      enable: false
+    - quantizer_name: '*linear_attn.conv1d*'
+      enable: false
+    - quantizer_name: '*lm_head*'
+      enable: false
+    - quantizer_name: '*mixer.conv1d*'
+      enable: false
+    - quantizer_name: '*mlp.gate.*'
+      enable: false
+    - quantizer_name: '*mlp.shared_expert_gate.*'
+      enable: false
+    - quantizer_name: '*output_layer*'
+      enable: false
+    - quantizer_name: '*proj_out.*'
+      enable: false
+    - quantizer_name: '*router*'
+      enable: false
+    - quantizer_name: 'output.*'
+      enable: false
+    - parent_class: 'nn.BatchNorm1d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm2d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm3d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.LeakyReLU'
+      quantizer_name: '*'
+      enable: false
diff --git a/tests/unit/recipe/test_loader.py b/tests/unit/recipe/test_loader.py
index b8da2d140f8..6926d89a5d2 100644
--- a/tests/unit/recipe/test_loader.py
+++ b/tests/unit/recipe/test_loader.py
@@ -106,7 +106,10 @@ def test_load_recipe_builtin_description():
 
 _BUILTIN_PTQ_RECIPES = [
     "general/ptq/fp8_default-fp8_kv",
+    "general/ptq/fp8_default-fp8_cast_kv",
     "general/ptq/nvfp4_default-fp8_kv",
+    "general/ptq/nvfp4_default-fp8_cast_kv",
+    "general/ptq/nvfp4_default-nvfp4_cast_kv",
     "general/ptq/nvfp4_mlp_only-fp8_kv",
     "general/ptq/nvfp4_omlp_only-fp8_kv",
 ]

From 1c2a68f0d5c159f83ecb126b1d663491627dc2d8 Mon Sep 17 00:00:00 2001
From: Chenjie Luo <chenjiel@nvidia.com>
Date: Thu, 23 Apr 2026 10:21:33 -0700
Subject: [PATCH 2/3] docs(hf_ptq): reorder recipe examples in --recipe help text

Signed-off-by: Chenjie Luo <chenjiel@nvidia.com>
---
 examples/llm_ptq/hf_ptq.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index e55f565648c..136d221e37e 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -1166,8 +1166,8 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--recipe",
         help=(
-            "PTQ recipe YAML file or name without suffix (e.g. general/ptq/nvfp4_default-fp8_kv, "
-            "general/ptq/nvfp4_default-fp8_cast_kv, general/ptq/nvfp4_default-nvfp4_cast_kv). "
+            "PTQ recipe YAML file or name without suffix (e.g. general/ptq/nvfp4_default-fp8_cast_kv, "
+            "general/ptq/nvfp4_default-fp8_kv, general/ptq/nvfp4_default-nvfp4_cast_kv). "
             "When set, --kv_cache_qformat is ignored; the recipe fully determines KV cache config."
         ),
         default=None,

From 177ec6e76e6e5b2f85d58407e838a4aa531baf52 Mon Sep 17 00:00:00 2001
From: Chenjie Luo <chenjiel@nvidia.com>
Date: Thu, 23 Apr 2026 10:55:16 -0700
Subject: [PATCH 3/3] docs(recipes): explain FP8-range amax in nvfp4_cast_kv description

Signed-off-by: Chenjie Luo <chenjiel@nvidia.com>
---
 modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml b/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml
index 4150243916d..95a191113e4 100644
--- a/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml
+++ b/modelopt_recipes/general/ptq/nvfp4_default-nvfp4_cast_kv.yaml
@@ -17,7 +17,9 @@ metadata:
   recipe_type: ptq
   description: >-
     NVFP4 W4A4, NVFP4 KV cache with constant amax (skips KV calibration; amax
-    hardcoded to FP8 E4M3 max 448.0), max calibration.
+    hardcoded to FP8 E4M3 max 448.0 — the deployment kernel upcasts NVFP4 KV
+    values to FP8 before attention, so the scale must land in the FP8 range),
+    max calibration.
 quantize:
   algorithm: max
   quant_cfg: