31 changes: 25 additions & 6 deletions examples/llm_ptq/hf_ptq.py
@@ -955,6 +955,25 @@ def quantize_main(
    default_pad_token,
    device: torch.device,
):
    # Load the recipe up front so we can detect layerwise calibration before batch-size probing.
    recipe = None
    if args.recipe is not None and not args.auto_quantize_bits:
        print(f"Use recipe {args.recipe} for quantization")
        recipe = load_recipe(args.recipe)
        if not isinstance(recipe, ModelOptPTQRecipe):
            raise TypeError(
                f"Expected PTQ recipe, but got {type(recipe).__name__} from {args.recipe}"
            )

    def _is_layerwise(obj):
        if isinstance(obj, ModelOptPTQRecipe):
            return _is_layerwise(obj.quantize.algorithm)
        if isinstance(obj, list):
            return any(_is_layerwise(a) for a in obj)
        return bool(getattr(obj, "layerwise", False))
Comment on lines +968 to +973
⚠️ Potential issue | 🟠 Major

Handle dict-form algorithms in _is_layerwise.

At Line 973, getattr(obj, "layerwise", False) makes dict algorithms evaluate as non-layerwise. That can bypass the Line 990-994 guard and fall back to full-model batch probing.

Suggested fix
     def _is_layerwise(obj):
         if isinstance(obj, ModelOptPTQRecipe):
             return _is_layerwise(obj.quantize.algorithm)
+        if isinstance(obj, dict):
+            if "layerwise" in obj:
+                return bool(obj["layerwise"])
+            if "algorithm" in obj:
+                return _is_layerwise(obj["algorithm"])
+            return False
         if isinstance(obj, list):
             return any(_is_layerwise(a) for a in obj)
         return bool(getattr(obj, "layerwise", False))
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@examples/llm_ptq/hf_ptq.py` around lines 968-973, the helper _is_layerwise
currently treats dict-form algorithms as non-layerwise because getattr(obj,
"layerwise", False) returns False for dicts. Update _is_layerwise to handle dicts
explicitly: return True when obj.get("layerwise") is truthy, or when any nested
algorithm entry is layerwise (i.e., recurse into dict values the same way the list
branch does). Keep the existing branches for ModelOptPTQRecipe and list, and check
for dicts before the final getattr fallback so dict algorithms are not
misclassified and the layerwise guard is not bypassed.
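For reference, the helper with the suggested dict branch folded in, as one self-contained sketch; the ModelOptPTQRecipe class below is a placeholder stub so the snippet runs standalone, and the dict shapes in the asserts are assumed examples of dict-form algorithms, not a confirmed recipe schema.

class ModelOptPTQRecipe:  # placeholder stub; the real class comes from the recipe-loading module
    pass

def _is_layerwise(obj):
    if isinstance(obj, ModelOptPTQRecipe):
        return _is_layerwise(obj.quantize.algorithm)
    if isinstance(obj, dict):
        # Dict-form algorithms: honor an explicit "layerwise" key, else recurse
        # into a nested "algorithm" entry, mirroring the list branch below.
        if "layerwise" in obj:
            return bool(obj["layerwise"])
        if "algorithm" in obj:
            return _is_layerwise(obj["algorithm"])
        return False
    if isinstance(obj, list):
        return any(_is_layerwise(a) for a in obj)
    return bool(getattr(obj, "layerwise", False))

# A dict-form algorithm no longer falls through to the getattr fallback:
assert _is_layerwise({"method": "max", "layerwise": True})
assert not _is_layerwise([{"method": "awq_lite"}])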


    is_layerwise = _is_layerwise(recipe)

    if args.batch_size == 0:
        # For VL models with image-text calibration, skip automatic batch size detection
        # since get_max_batch_size can't handle multimodal inputs
@@ -968,6 +987,11 @@ def quantize_main(
"Offline speculative decoding calibration enabled. Using default batch_size=1 for calibration."
)
args.batch_size = 1
# Layerwise calibration processes one layer at a time; auto batch-size probing runs a
# full-model forward which defeats the point and can OOM on very large models.
elif is_layerwise:
print("Layerwise calibration enabled. Using default batch_size=1 for calibration.")
args.batch_size = 1
else:
# Calibration/sparsification will actually take much more memory than regular inference
# due to intermediate tensors for fake quantization. Setting sample_memory_usage_ratio
@@ -1027,12 +1051,7 @@ def quantize_main(
     else:
         # mono quantization

-        if args.recipe is not None:
-            print(f"Use recipe {args.recipe} for quantization")
-            recipe = load_recipe(args.recipe)
-            assert isinstance(recipe, ModelOptPTQRecipe), (
-                f"Expected PTQ recipe, but got {type(recipe).__name__} from {args.recipe}"
-            )
+        if recipe is not None:
             quant_cfg = recipe.quantize.model_dump()

         else:
3 changes: 2 additions & 1 deletion modelopt/torch/quantization/config.py
@@ -154,7 +154,7 @@
 import warnings
 from typing import Any, Literal, cast

-from pydantic import ValidationInfo, field_validator, model_validator
+from pydantic import AliasChoices, ValidationInfo, field_validator, model_validator
 from typing_extensions import Required, TypedDict

 from modelopt.torch.opt.config import ModeloptBaseConfig, ModeloptField
@@ -1219,6 +1219,7 @@ class QuantizeAlgorithmConfig(ModeloptBaseConfig):

    layerwise: bool = ModeloptField(
        default=False,
        validation_alias=AliasChoices("layerwise", "use_sequential"),
title="Enable layerwise (layer-by-layer) calibration.",
description=(
"If True, the calibration algorithm is applied layer by layer. "
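As a minimal illustration of what the alias buys, a self-contained pydantic sketch; the LegacyAliasDemo model is hypothetical, the real field lives on QuantizeAlgorithmConfig via ModeloptField.

from pydantic import AliasChoices, BaseModel, Field

class LegacyAliasDemo(BaseModel):
    # Input may use either name because both are listed in AliasChoices;
    # output (model_dump) always uses the field name, i.e. "layerwise".
    layerwise: bool = Field(default=False, validation_alias=AliasChoices("layerwise", "use_sequential"))

old_checkpoint = {"use_sequential": True}  # legacy serialized form
cfg = LegacyAliasDemo.model_validate(old_checkpoint)
assert cfg.layerwise is True
assert cfg.model_dump() == {"layerwise": True}  # re-serializes under the current name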
@@ -15,12 +15,10 @@

 metadata:
   recipe_type: ptq
-  description: NVFP4 static weight and dynamic activation for expert layers only (W4A4), FP8 KV cache, max layerwise calibration.
+  description: NVFP4 static weight and dynamic activation for expert layers only (W4A4), FP8 KV cache, max calibration.
 quantize:
   algorithm:
     method: max
-    # Max calibration is fast and does not typically need checkpointing.
-    layerwise: true
   quant_cfg:
     - quantizer_name: '*'
       enable: false
@@ -0,0 +1,94 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Okay for now. Can you compose this yaml based on the modelopt_recipes/general/ptq/nvfp4_experts_only-fp8_kv.yaml once the composable recipes PR is merged?

# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

metadata:
  recipe_type: ptq
  description: NVFP4 static weight and dynamic activation for expert layers only (W4A4), FP8 KV cache, max layerwise calibration.
quantize:
  algorithm:
    method: max
    # Max calibration is fast and does not typically need checkpointing.
    layerwise: true
  quant_cfg:
    - quantizer_name: '*'
      enable: false
    - quantizer_name: '*mlp.experts*weight_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
    - quantizer_name: '*mlp.experts*input_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
    - quantizer_name: '*block_sparse_moe*weight_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
    - quantizer_name: '*block_sparse_moe*input_quantizer'
      enable: true
      cfg:
        block_sizes:
          -1: 16
          type: dynamic
          scale_bits: e4m3
        num_bits: e2m1
    - quantizer_name: '*[kv]_bmm_quantizer'
      enable: true
      cfg:
        num_bits: e4m3
    - quantizer_name: '*block_sparse_moe.gate*'
      enable: false
    - quantizer_name: '*linear_attn.conv1d*'
      enable: false
    - quantizer_name: '*lm_head*'
      enable: false
    - quantizer_name: '*mixer.conv1d*'
      enable: false
    - quantizer_name: '*mlp.gate.*'
      enable: false
    - quantizer_name: '*mlp.shared_expert_gate.*'
      enable: false
    - quantizer_name: '*output_layer*'
      enable: false
    - quantizer_name: '*proj_out.*'
      enable: false
    - quantizer_name: '*router*'
      enable: false
    - quantizer_name: 'output.*'
      enable: false
    - parent_class: 'nn.BatchNorm1d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.BatchNorm2d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.BatchNorm3d'
      quantizer_name: '*'
      enable: false
    - parent_class: 'nn.LeakyReLU'
      quantizer_name: '*'
      enable: false
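For orientation, a small sketch of how a recipe file like this could be inspected directly; PyYAML is assumed to be available, the file name below is hypothetical, and load_recipe in hf_ptq.py remains the real entry point.

import yaml  # PyYAML, assumed available

# Hypothetical file name; substitute the actual recipe path from this PR.
with open("nvfp4_experts_only-fp8_kv_layerwise.yaml") as f:
    recipe = yaml.safe_load(f)

algorithm = recipe["quantize"]["algorithm"]
print(algorithm["method"], algorithm["layerwise"])  # "max" True -> hf_ptq.py forces batch_size=1

# Everything is disabled by the '*' wildcard, then expert weights/activations and
# the KV-cache bmm quantizers are re-enabled:
enabled = [e["quantizer_name"] for e in recipe["quantize"]["quant_cfg"] if e.get("enable")]
print(enabled)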
33 changes: 33 additions & 0 deletions tests/unit/torch/quantization/test_config_validation.py
@@ -25,6 +25,7 @@
    INT4_AWQ_CFG,
    NVFP4_DEFAULT_CFG,
    W4A8_AWQ_BETA_CFG,
    MaxCalibConfig,
    QuantizeConfig,
    find_quant_cfg_entry_by_path,
    need_calibration,
@@ -525,3 +526,35 @@ def test_validate_quant_cfg_entries_accepts_valid_cfg(self):
            algorithm="max",
        )
        assert len(cfg.quant_cfg) == 2


class TestLayerwiseUseSequentialAlias:
    """`layerwise` accepts the legacy `use_sequential` name via validation_alias.

    Old PTQ checkpoints serialized the field as `use_sequential` before #1251 renamed
    it to `layerwise`. AliasChoices lets those checkpoints load without a migration
    validator while still serializing under the current name.
    """

    def test_use_sequential_true_sets_layerwise(self):
        cfg = MaxCalibConfig(use_sequential=True)
        assert cfg.layerwise is True

    def test_use_sequential_false_sets_layerwise(self):
        cfg = MaxCalibConfig(use_sequential=False)
        assert cfg.layerwise is False

    def test_layerwise_name_still_accepted(self):
        cfg = MaxCalibConfig(layerwise=True)
        assert cfg.layerwise is True

    def test_serializes_under_current_name(self):
        """Dump must use `layerwise`, not the legacy alias."""
        dumped = MaxCalibConfig(use_sequential=True).model_dump()
        assert dumped["layerwise"] is True
        assert "use_sequential" not in dumped

    def test_unknown_field_still_rejected(self):
        """extra='forbid' must still reject unrelated unknown fields."""
        with pytest.raises(ValidationError):
            MaxCalibConfig(not_a_real_field=True)