fix: layerwise calibration backward-compat, recipe split, batch-size guard #1310
base: main
```diff
@@ -955,6 +955,25 @@ def quantize_main(
     default_pad_token,
     device: torch.device,
 ):
+    # Load the recipe up front so we can detect layerwise calibration before batch-size probing.
+    recipe = None
+    if args.recipe is not None and not args.auto_quantize_bits:
+        print(f"Use recipe {args.recipe} for quantization")
+        recipe = load_recipe(args.recipe)
+        if not isinstance(recipe, ModelOptPTQRecipe):
+            raise TypeError(
+                f"Expected PTQ recipe, but got {type(recipe).__name__} from {args.recipe}"
+            )
+
+    def _is_layerwise(obj):
+        if isinstance(obj, ModelOptPTQRecipe):
+            return _is_layerwise(obj.quantize.algorithm)
+        if isinstance(obj, list):
+            return any(_is_layerwise(a) for a in obj)
+        return bool(getattr(obj, "layerwise", False))
```
**Comment on lines +968 to +973** (Contributor)

Handle dict-form algorithms in `_is_layerwise`. A recipe's `algorithm` can arrive as a plain dict (the shape a YAML recipe loads to, e.g. `{"method": "max", "layerwise": true}`). At line 973 such a dict falls through to `getattr(obj, "layerwise", False)`, which looks up attributes rather than keys and therefore always returns `False`.

Suggested fix:

```diff
 def _is_layerwise(obj):
     if isinstance(obj, ModelOptPTQRecipe):
         return _is_layerwise(obj.quantize.algorithm)
+    if isinstance(obj, dict):
+        if "layerwise" in obj:
+            return bool(obj["layerwise"])
+        if "algorithm" in obj:
+            return _is_layerwise(obj["algorithm"])
+        return False
     if isinstance(obj, list):
         return any(_is_layerwise(a) for a in obj)
     return bool(getattr(obj, "layerwise", False))
```
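The fall-through the reviewer describes is easy to see in isolation. A minimal standalone repro (no ModelOpt imports; the dict mirrors the shape the recipe YAML below would load to):

```python
# getattr() reads attributes, not mapping keys, so a dict-form algorithm
# never reports layerwise=True under the original helper.
algorithm = {"method": "max", "layerwise": True}

print(getattr(algorithm, "layerwise", False))   # False -- the bug the reviewer flags
print(bool(algorithm.get("layerwise", False)))  # True  -- what the suggested fix checks
```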
```diff
+    is_layerwise = _is_layerwise(recipe)
+
     if args.batch_size == 0:
         # For VL models with image-text calibration, skip automatic batch size detection
         # since get_max_batch_size can't handle multimodal inputs
@@ -968,6 +987,11 @@ def quantize_main(
             "Offline speculative decoding calibration enabled. Using default batch_size=1 for calibration."
         )
         args.batch_size = 1
+    # Layerwise calibration processes one layer at a time; auto batch-size probing runs a
+    # full-model forward which defeats the point and can OOM on very large models.
+    elif is_layerwise:
+        print("Layerwise calibration enabled. Using default batch_size=1 for calibration.")
+        args.batch_size = 1
     else:
         # Calibration/sparsification will actually take much more memory than regular inference
         # due to intermediate tensors for fake quantization. Setting sample_memory_usage_ratio
```
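The guard order matters: an explicit `--batch_size` wins, then the special cases that must skip probing, then the full-model probe. A hypothetical distillation of that precedence (illustrative names only, not the script's actual API):

```python
def resolve_batch_size(batch_size: int, is_vl: bool, is_speculative: bool,
                       is_layerwise: bool) -> int:
    """Illustrative sketch of the precedence in the diff above; not the real CLI."""
    if batch_size != 0:  # 0 means "auto"; any explicit value is respected
        return batch_size
    if is_vl or is_speculative or is_layerwise:
        return 1  # cases where a full-model probe is unsupported or self-defeating
    return _probe_max_batch_size()  # stand-in for the script's get_max_batch_size probe

def _probe_max_batch_size() -> int:
    # Placeholder: the real probe runs forward passes to find the largest batch that fits.
    return 8

print(resolve_batch_size(0, is_vl=False, is_speculative=False, is_layerwise=True))  # 1
```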
```diff
@@ -1027,12 +1051,7 @@ def quantize_main(
     else:
         # mono quantization

-        if args.recipe is not None:
-            print(f"Use recipe {args.recipe} for quantization")
-            recipe = load_recipe(args.recipe)
-            assert isinstance(recipe, ModelOptPTQRecipe), (
-                f"Expected PTQ recipe, but got {type(recipe).__name__} from {args.recipe}"
-            )
+        if recipe is not None:
             quant_cfg = recipe.quantize.model_dump()

         else:
```
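The `recipe.quantize.model_dump()` call suggests the recipe is a Pydantic-style model: in Pydantic v2, `model_dump()` serializes a model to a plain dict, which is the form the downstream `quant_cfg` consumer expects. A sketch under that assumption, with hypothetical field names standing in for the real recipe schema:

```python
from pydantic import BaseModel

class QuantizeSection(BaseModel):
    # Hypothetical subset of the real recipe schema.
    algorithm: dict
    quant_cfg: list

class PTQRecipe(BaseModel):
    quantize: QuantizeSection

recipe = PTQRecipe(quantize=QuantizeSection(
    algorithm={"method": "max", "layerwise": True},
    quant_cfg=[{"quantizer_name": "*", "enable": False}],
))
print(recipe.quantize.model_dump())
# {'algorithm': {'method': 'max', 'layerwise': True},
#  'quant_cfg': [{'quantizer_name': '*', 'enable': False}]}
```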
New file (`@@ -0,0 +1,94 @@`): an NVFP4 expert-only layerwise PTQ recipe.

Contributor (on line 1): Okay for now. Can you compose this yaml based on the …

```yaml
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

metadata:
  recipe_type: ptq
  description: NVFP4 static weight and dynamic activation for expert layers only (W4A4), FP8 KV cache, max layerwise calibration.
quantize:
  algorithm:
    method: max
    # Max calibration is fast and does not typically need checkpointing.
    layerwise: true
  quant_cfg:
    - quantizer_name: '*'
      enable: false
    - quantizer_name: '*mlp.experts*weight_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
    - quantizer_name: '*mlp.experts*input_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
    - quantizer_name: '*block_sparse_moe*weight_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
    - quantizer_name: '*block_sparse_moe*input_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
    - quantizer_name: '*[kv]_bmm_quantizer'
      enable: true
      cfg:
        num_bits: e4m3
    - quantizer_name: '*block_sparse_moe.gate*'
      enable: false
    - quantizer_name: '*linear_attn.conv1d*'
      enable: false
    - quantizer_name: '*lm_head*'
      enable: false
    - quantizer_name: '*mixer.conv1d*'
      enable: false
    - quantizer_name: '*mlp.gate.*'
      enable: false
    - quantizer_name: '*mlp.shared_expert_gate.*'
      enable: false
    - quantizer_name: '*output_layer*'
      enable: false
    - quantizer_name: '*proj_out.*'
      enable: false
    - quantizer_name: '*router*'
      enable: false
    - quantizer_name: 'output.*'
      enable: false
    - parent_class: 'nn.BatchNorm1d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.BatchNorm2d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.BatchNorm3d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.LeakyReLU'
      quantizer_name: '*'
      enable: false
```