diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 1dbc33e6b20..d80829261ed 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,6 +6,8 @@ Changelog

 **New Features**

+- Add NVFP4 W4A16 weight-only quantization (``nvfp4_w4a16``): FP4 weights with group_size=16, BF16 activations, and no calibration forward pass required. Use ``mtq.NVFP4_W4A16_CFG`` or ``--qformat nvfp4_w4a16`` in ``hf_ptq.py``. Exported checkpoints can be served on vLLM after conversion to compressed-tensors format.
+- Register ``nn.Embedding`` with ``QuantModuleRegistry`` (weight-only wrapper) and extend the unified HF exporter to pack quantized embedding weights. This enables NVFP4 quantization of ``lm_head`` and the input token embedding on hybrid SSM+Attention models such as Nemotron-H, where those two tables are a sizeable fraction of the parameters and leaving them in bf16 forfeits most of the compression savings. Use ``--recipe models/Nemotron-H/nvfp4_w4a16`` (see `modelopt_recipes/models/Nemotron-H/nvfp4_w4a16.yaml `_) to opt in. The ``--exclude_modules`` CLI flag in ``examples/llm_ptq/hf_ptq.py`` lets users selectively exclude individual modules from the recipe's coverage.
 - Support full Transformer Engine spec for Minitron pruning (``mcore_minitron``). Now we no longer need to use custom ModelOpt spec. Note that this does not affect the usage of the pruning workflow but makes pruning slightly faster and may result in slightly different pruned model because of different kernel and numerics.
 - Add Puzzletron - a new algorithm for heterogeneous pruning of LLM and VLM models. See `examples/puzzletron/README.md `_ for more details.
 - Added iterator interface using CalibrationDataReader in ONNX quantization workflow.
diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py
index 90532efe38d..86c0a299a08 100755
--- a/examples/llm_ptq/example_utils.py
+++ b/examples/llm_ptq/example_utils.py
@@ -23,6 +23,7 @@
 import shutil
 import sys
 import warnings
+from contextlib import contextmanager
 from pathlib import Path
 from typing import Any

@@ -709,6 +710,38 @@ def has_pack_quantized_config(config):
     return model


+@contextmanager
+def normalized_generation_config_for_export(model):
+    """Temporarily swap in a normalized generation_config for export.
+
+    Some model cards ship a ``generation_config.json`` that sets sampling hyperparameters
+    (``top_p``/``top_k``/``temperature``) without ``do_sample=True`` (e.g.
+    ``NVIDIA-Nemotron-3-Nano-4B-BF16``). transformers 5.x strictly validates this on
+    ``save_pretrained``, so the export step fails. We normalize by swapping in a deep copy
+    with ``do_sample=True`` for the duration of the export and restoring the original
+    afterwards, leaving ``model.generation_config`` unchanged so any ``.generate()`` calls
+    outside the export window (e.g. the pre-/post-PTQ previews) remain deterministic.
+    """
+    original = getattr(model, "generation_config", None)
+    normalized = None
+    if original is not None and not getattr(original, "do_sample", False):
+        has_sampling_hyperparam = (
+            getattr(original, "top_p", None) not in (None, 1.0)
+            or getattr(original, "top_k", None) not in (None, 0, 50)
+            or getattr(original, "temperature", None) not in (None, 1.0)
+        )
+        if has_sampling_hyperparam:
+            normalized = copy.deepcopy(original)
+            normalized.do_sample = True
+    try:
+        if normalized is not None:
+            model.generation_config = normalized
+        yield
+    finally:
+        if normalized is not None:
+            model.generation_config = original
+
+
 def is_model_on_gpu(model) -> bool:
     """Returns if the model is fully loaded on GPUs."""
     return all("cuda" in str(param.device) for param in model.parameters())
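For illustration, a minimal usage sketch of the context manager above (the checkpoint id is hypothetical; as documented, the swap is a no-op unless the config sets sampling hyperparameters while ``do_sample`` is unset):

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("org/some-checkpoint")  # hypothetical id

# Inside the window, save_pretrained sees a deep-copied generation_config with
# do_sample=True and passes transformers 5.x's strict validation.
with normalized_generation_config_for_export(model):
    model.save_pretrained("./exported")

# Outside the window the original config is restored, so .generate() previews
# remain deterministic (greedy) exactly as before.
```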
+ """ + original = getattr(model, "generation_config", None) + normalized = None + if original is not None and not getattr(original, "do_sample", False): + has_sampling_hyperparam = ( + getattr(original, "top_p", None) not in (None, 1.0) + or getattr(original, "top_k", None) not in (None, 0, 50) + or getattr(original, "temperature", None) not in (None, 1.0) + ) + if has_sampling_hyperparam: + normalized = copy.deepcopy(original) + normalized.do_sample = True + try: + if normalized is not None: + model.generation_config = normalized + yield + finally: + if normalized is not None: + model.generation_config = original + + def is_model_on_gpu(model) -> bool: """Returns if the model is fully loaded on GPUs.""" return all("cuda" in str(param.device) for param in model.parameters()) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 831d230a672..d7a8e3dfea0 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -35,6 +35,7 @@ is_nemotron_vl, load_mtp_weights, needs_checkpoint_path_update, + normalized_generation_config_for_export, resolve_checkpoint_dir, run_nemotron_vl_preview, ) @@ -107,6 +108,7 @@ def _set_kv_cache_constant_amax(quant_cfg: list) -> None: "int4_awq": mtq.INT4_AWQ_CFG, "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG, "nvfp4": mtq.NVFP4_DEFAULT_CFG, + "nvfp4_w4a16": mtq.NVFP4_W4A16_CFG, "nvfp4_awq": mtq.NVFP4_AWQ_LITE_CFG, "nvfp4_mse": mtq.NVFP4_W4A4_WEIGHT_MSE_FP8_SWEEP_CFG, "fp8_pb_wo": mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG, @@ -331,6 +333,7 @@ def auto_quantize( "nvfp4", "nvfp4_awq", "nvfp4_mse", + "nvfp4_w4a16", "w4a8_awq", "fp8_pb_wo", "w4a8_mxfp4_fp8", @@ -629,6 +632,12 @@ def mono_quantize( ) # Nemotron-Parse specific print("Quantization will only be applied to the decoder (text generation) component") + # Model-specific quantization extensions (e.g. quantizing lm_head + input embedding for + # Nemotron-H, where those tables are a large fraction of parameters and leaving them at + # bf16 wastes most of the memory savings) are now expressed as recipes under + # ``modelopt_recipes/models//``. Pass ``--recipe models//`` + # (e.g. ``--recipe models/Nemotron-H/nvfp4_w4a16``) to opt in. + if not model_is_already_quantized or calibration_only: # quantize the model @@ -677,7 +686,14 @@ def export_quantized( default_padding_side, default_pad_token, ): - with torch.inference_mode(): + # ``normalized_generation_config_for_export`` swaps ``model.generation_config`` with + # a deep-copied ``do_sample=True`` variant for the duration of the export so + # ``save_pretrained`` passes transformers 5.x's strict validation without affecting + # any ``.generate()`` callers outside the export window. + with ( + torch.inference_mode(), + normalized_generation_config_for_export(full_model), + ): if model_type is None: print(f"Unknown model type {type(language_model).__name__}. Continue exporting...") model_type = f"unknown:{type(language_model).__name__}" @@ -781,6 +797,12 @@ def export_quantized( extra_state_dict=mtp_state_dict, ) + if args.qformat == "nvfp4_w4a16": + warnings.warn( + "TensorRT-LLM and SGLang do not support this format. " + "To serve on vLLM, convert the NVFP4 W4A16 checkpoint to compressed-tensors format." + ) + # Restore default padding and export the tokenizer as well. 
diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh
index d9c4ff8a7a0..bcf29a7e8ff 100755
--- a/examples/llm_ptq/scripts/huggingface_example.sh
+++ b/examples/llm_ptq/scripts/huggingface_example.sh
@@ -53,9 +53,9 @@ esac
 IFS=","
 for qformat in $QFORMAT; do
     case $qformat in
-    fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | nvfp4_mse | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_experts_only | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8 | nvfp4_local_hessian) ;;
+    fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | nvfp4_mse | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_experts_only | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8 | nvfp4_local_hessian | nvfp4_w4a16) ;;
     *)
-        echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, nvfp4_mse, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_experts_only, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8, nvfp4_local_hessian]" >&2
+        echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, nvfp4_mse, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_experts_only, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8, nvfp4_local_hessian, nvfp4_w4a16]" >&2
         exit 1
         ;;
     esac
@@ -199,6 +199,12 @@ if [[ $TASKS =~ "quant" ]] || [[ ! -d "$SAVE_PATH" ]] || [[ ! $(ls -A $SAVE_PATH
         exit 0
     fi

+    if [ "$QFORMAT" = "nvfp4_w4a16" ]; then
+        echo "nvfp4_w4a16 checkpoint exported to $SAVE_PATH"
+        echo "To serve on vLLM, convert to compressed-tensors"
+        exit 0
+    fi
+
     if [[ "$QFORMAT" == *"nvfp4"* ]] || [[ "$KV_CACHE_QUANT" == *"nvfp4"* ]]; then

         cuda_major=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader -i 0 | cut -d. -f1)
diff --git a/modelopt/torch/export/convert_hf_config.py b/modelopt/torch/export/convert_hf_config.py
index 5f8c3f3b55c..8afb6a32be6 100644
--- a/modelopt/torch/export/convert_hf_config.py
+++ b/modelopt/torch/export/convert_hf_config.py
@@ -57,6 +57,11 @@ def _quant_algo_to_group_config(quant_algo: str, group_size: int | None = None)
         return {
             "weights": {"dynamic": False, "num_bits": 4, "type": "int", "group_size": gs},
         }
+    elif quant_algo == "NVFP4_W4A16":
+        gs = group_size or 16
+        return {
+            "weights": {"dynamic": False, "num_bits": 4, "type": "float", "group_size": gs},
+        }
     elif quant_algo in ("NVFP4_AWQ", "W4A8_AWQ"):
         gs = group_size or 128
         return {
@@ -183,6 +188,18 @@ def convert_hf_quant_config_format(input_config: dict[str, Any]) -> dict[str, Any]:
             "targets": ["Linear"],
         }
         new_config["config_groups"] = {"group_0": config_group_details}
+    elif quant_algo_value == "NVFP4_W4A16":
+        # Weight-only FP4. Embedding is included alongside Linear because
+        # ``NVFP4_W4A16_CFG`` targets ``["*"]`` with ``weight_only=True``, so any registered
+        # ``QuantEmbedding`` gets weight-quantized too. Compressed-tensors dispatches on the
+        # module's ``__class__.__name__``, so omitting ``Embedding`` would leave quantized
+        # embedding weights orphaned on the consumer side.
+        group_size = original_quantization_details.get("group_size", 16)
+        config_group_details = {
+            "weights": {"dynamic": False, "num_bits": 4, "type": "float", "group_size": group_size},
+            "targets": ["Linear", "Embedding"],
+        }
+        new_config["config_groups"] = {"group_0": config_group_details}
     elif quant_algo_value == "MIXED_PRECISION":
         quantized_layers = original_quantization_details.get("quantized_layers", {})
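For reference, the config group this branch emits, shown as the resulting dict (surrounding ``quantization_config`` scaffolding omitted):

```python
# Output of convert_hf_quant_config_format for quant_algo == "NVFP4_W4A16"
# with the default group_size of 16.
config_groups = {
    "group_0": {
        "weights": {"dynamic": False, "num_bits": 4, "type": "float", "group_size": 16},
        "targets": ["Linear", "Embedding"],
    }
}
```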
diff --git a/modelopt/torch/export/model_config.py b/modelopt/torch/export/model_config.py
index dce39767c76..ba4220a40d8 100755
--- a/modelopt/torch/export/model_config.py
+++ b/modelopt/torch/export/model_config.py
@@ -39,6 +39,7 @@
 QUANTIZATION_MXFP8 = "mxfp8"
 QUANTIZATION_W4A8_MXFP4_FP8 = "w4a8_mxfp4_fp8"
 QUANTIZATION_NVFP4_AWQ = "nvfp4_awq"
+QUANTIZATION_NVFP4_W4A16 = "nvfp4_w4a16"  # weight-only FP4
 QUANTIZATION_FP8_PB_REAL = "fp8_pb_real"
 QUANTIZATION_FP8_PB_WO = "fp8_pb_wo"
 QUANTIZATION_FP8_PC_PT = "fp8_pc_pt"
diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
index 4ceb51cd2c0..0a5a73b2cbd 100755
--- a/modelopt/torch/export/quant_utils.py
+++ b/modelopt/torch/export/quant_utils.py
@@ -65,6 +65,7 @@
     QUANTIZATION_NVFP4,
     QUANTIZATION_NVFP4_AWQ,
     QUANTIZATION_NVFP4_SVDQUANT,
+    QUANTIZATION_NVFP4_W4A16,
     QUANTIZATION_W4A8_AWQ,
     QUANTIZATION_W4A8_MXFP4_FP8,
     QUANTIZATION_W4A8_NVFP4_FP8,
@@ -358,6 +359,7 @@ def get_weight_scaling_factor(module: nn.Module, weight_name: str = "weight") ->
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_NVFP4_SVDQUANT,
+        QUANTIZATION_NVFP4_W4A16,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
         # Calibrate weight quantizer if amax is not set
@@ -402,6 +404,7 @@ def get_weight_scaling_factor_2(module: nn.Module, weight_name: str = "weight")
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_NVFP4_SVDQUANT,
+        QUANTIZATION_NVFP4_W4A16,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
         # Calibrate weight quantizer if amax is not set
@@ -636,6 +639,10 @@ def _get_quantization_from_layer(layer, quantizer_attr_names: QuantizerAttrNames
                 return QUANTIZATION_NVFP4_AWQ
             if getattr(layer, "fused_with_prequant", False):
                 return QUANTIZATION_NVFP4_AWQ
+            # W4A16 weight-only: input_quantizer absent or disabled
+            if input_quantizer is None or not input_quantizer.is_enabled:
+                if scale_bits == (4, 3):
+                    return QUANTIZATION_NVFP4_W4A16
             assert input_quantizer is not None, (
                 f"input_quantizer is None for {quantizer_attr_names}"
             )
@@ -803,6 +810,11 @@ def process_layer_quant_config(layer_config_dict):
                 "quant_algo": "NVFP4",
                 "group_size": block_size_value,
             }
+        elif v == "nvfp4_w4a16":
+            layer_config = {
+                "quant_algo": "NVFP4_W4A16",
+                "group_size": block_size_value,
+            }
         elif v == "nvfp4_awq":
             layer_config = {
                 "quant_algo": "NVFP4_AWQ",
@@ -980,6 +992,7 @@ def to_quantized_weight(
     if quantization in [
         QUANTIZATION_NVFP4,
         QUANTIZATION_NVFP4_AWQ,
+        QUANTIZATION_NVFP4_W4A16,
         QUANTIZATION_W4A8_NVFP4_FP8,
         QUANTIZATION_NVFP4_SVDQUANT,
     ]:
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index af936a3002a..68b723028f3 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -69,6 +69,7 @@
 from .layer_utils import (
     get_expert_linear_names,
     get_experts_list,
+    is_embedding,
     is_layernorm,
     is_moe,
     is_quantlinear,
@@ -84,6 +85,7 @@
     QUANTIZATION_NVFP4,
     QUANTIZATION_NVFP4_AWQ,
     QUANTIZATION_NVFP4_SVDQUANT,
+    QUANTIZATION_NVFP4_W4A16,
     QUANTIZATION_W4A8_AWQ,
     QUANTIZATION_W4A8_NVFP4_FP8,
 )
@@ -520,6 +522,7 @@ def _export_quantized_weight(
         QUANTIZATION_NVFP4_AWQ,
         QUANTIZATION_NVFP4_SVDQUANT,
         QUANTIZATION_NVFP4,
+        QUANTIZATION_NVFP4_W4A16,
         QUANTIZATION_W4A8_AWQ,
         QUANTIZATION_W4A8_NVFP4_FP8,
     ]:
@@ -548,6 +551,7 @@ def _export_quantized_weight(
             QUANTIZATION_NVFP4,
             QUANTIZATION_NVFP4_AWQ,
             QUANTIZATION_NVFP4_SVDQUANT,
+            QUANTIZATION_NVFP4_W4A16,
         ]:
             # Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim)
             # for NVFP4 quantization functions that expect input_dim as the last dimension for block quantization
@@ -650,7 +654,7 @@ def _process_quantized_modules(
         # Skip QuantMoELinear - it's handled separately in _reconstruct_fused_moe_linear
         if type(sub_module).__name__ == "QuantMoELinear":
             continue
-        if is_quantlinear(sub_module):
+        if is_quantlinear(sub_module) or is_embedding(sub_module):
             try:
                 with fsdp2_aware_weight_update(model, sub_module, reshard=False):
                     _export_quantized_weight(sub_module, dtype)
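The detection rule added to ``_get_quantization_from_layer`` can be read as a standalone predicate (a hypothetical helper, not part of the diff; attribute access mirrors the code above):

```python
def looks_like_nvfp4_w4a16(layer) -> bool:
    """NVFP4 weight quantizer active while the input quantizer is absent or disabled."""
    input_quantizer = getattr(layer, "input_quantizer", None)
    weight_quantizer = getattr(layer, "weight_quantizer", None)
    if weight_quantizer is None or not weight_quantizer.is_enabled:
        return False
    # scale_bits == (4, 3) is the E4M3 per-block scale that marks NVFP4 weights.
    block_sizes = weight_quantizer.block_sizes or {}
    scale_bits = block_sizes.get("scale_bits")
    return scale_bits == (4, 3) and (input_quantizer is None or not input_quantizer.is_enabled)
```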
diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py
index 186ff1c7edd..15acefe4a12 100644
--- a/modelopt/torch/quantization/config.py
+++ b/modelopt/torch/quantization/config.py
@@ -794,6 +794,7 @@ def _nvfp4_selective_quant_cfg(
 NVFP4_EXPERTS_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp.experts*", "*block_sparse_moe*"])
 NVFP4_MLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*mlp*", "*block_sparse_moe*"])
 NVFP4_OMLP_ONLY_CFG = _nvfp4_selective_quant_cfg(["*o_proj*", "*mlp*", "*block_sparse_moe*"])
+NVFP4_W4A16_CFG = _nvfp4_selective_quant_cfg(["*"], weight_only=True)

 # DO NOT ADD NEW CONFIGS HERE. If you want to add a new general recipe, add it to
 # modelopt_recipes/general/ptq/ as a yaml file
@@ -828,6 +829,7 @@
     "NVFP4_MLP_ONLY_CFG",
     "NVFP4_EXPERTS_ONLY_CFG",
     "NVFP4_OMLP_ONLY_CFG",
+    "NVFP4_W4A16_CFG",
     "MAMBA_MOE_NVFP4_CONSERVATIVE_CFG",
     "MAMBA_MOE_NVFP4_AGGRESSIVE_CFG",
     "MAMBA_MOE_FP8_CONSERVATIVE_CFG",
diff --git a/modelopt/torch/quantization/nn/__init__.py b/modelopt/torch/quantization/nn/__init__.py
index ca7082eb1cb..972fa23bb4d 100644
--- a/modelopt/torch/quantization/nn/__init__.py
+++ b/modelopt/torch/quantization/nn/__init__.py
@@ -18,6 +18,7 @@
 from .modules.quant_activations import *
 from .modules.quant_batchnorm import *
 from .modules.quant_conv import *
+from .modules.quant_embedding import *
 from .modules.quant_instancenorm import *
 from .modules.quant_linear import *
 from .modules.quant_module import *
diff --git a/modelopt/torch/quantization/nn/modules/quant_embedding.py b/modelopt/torch/quantization/nn/modules/quant_embedding.py
new file mode 100644
index 00000000000..6f7fc7bb7dc
--- /dev/null
+++ b/modelopt/torch/quantization/nn/modules/quant_embedding.py
@@ -0,0 +1,50 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Quantized Embedding.
+
+``nn.Embedding`` quantization is weight-only: only the lookup table (``weight``) is
+fake-quantized. Embedding inputs are integer indices; their ``input_quantizer`` is
+registered (so config entries like ``"*input_quantizer"`` can still target it) but is
+disabled by default so integer tensors pass through untouched.
+"""
+
+import torch.nn as nn
+
+from ... import tensor_quant
+from .quant_module import QuantLinearConvBase, QuantModuleRegistry
+
+__all__ = ["QuantEmbedding"]
+
+
+@QuantModuleRegistry.register({nn.Embedding: "nn.Embedding"})
+class _QuantEmbedding(QuantLinearConvBase):
+    """Quantized base class for ``nn.Embedding``.
+
+    Weight-only quantization. Input/output quantizers are created (so wildcard configs
+    still resolve cleanly) but are disabled; an embedding's input is an index tensor.
+    """
+
+    default_quant_desc_weight = tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW
+
+    def _setup(self):
+        super()._setup()
+        # Embedding inputs are integer indices; never fake-quantize them.
+        self.input_quantizer.disable()
+        # output_quantizer is already disabled by QuantInputBase._setup().
+
+
+# Alias to follow the naming convention of QuantLinear.
+QuantEmbedding = _QuantEmbedding
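A usage sketch of the new wrapper (toy module; class and registry names as in the file above):

```python
import torch.nn as nn
import modelopt.torch.quantization as mtq

class Tiny(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(1000, 64)
        self.proj = nn.Linear(64, 1000)

    def forward(self, idx):
        return self.proj(self.embed(idx))

# nn.Embedding is now in QuantModuleRegistry, so quantize() replaces it with
# the weight-only QuantEmbedding; input/output quantizers stay disabled.
model = mtq.quantize(Tiny(), mtq.NVFP4_W4A16_CFG, forward_loop=None)
print(type(model.embed).__name__)  # expected: _QuantEmbedding
```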
diff --git a/modelopt_recipes/models/Nemotron-H/nvfp4_w4a16.yaml b/modelopt_recipes/models/Nemotron-H/nvfp4_w4a16.yaml
new file mode 100644
index 00000000000..dc7b0169094
--- /dev/null
+++ b/modelopt_recipes/models/Nemotron-H/nvfp4_w4a16.yaml
@@ -0,0 +1,126 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NVFP4 W4A16 (weight-only) recipe for Nemotron-H hybrid models.
+#
+# Mirrors the general ``nvfp4_w4a16`` qformat (NVFP4_W4A16_CFG) but additionally
+# re-enables quantization of ``lm_head`` and the input token embedding. On
+# Nemotron-3-Nano-4B those two 131072x3136 tables account for ~21% of model
+# parameters, so leaving them at bf16 wastes most of the NVFP4 memory benefit.
+#
+# Coverage:
+#   * Linear layers in attention + MLP: NVFP4 W4A16 weight-only.
+#   * lm_head: NVFP4 W4A16 weight-only (re-enabled here; the default disables it).
+#   * Input embedding (``backbone.embeddings`` / ``model.embed_tokens``):
+#     NVFP4 W4A16 weight-only via ``QuantEmbedding``. Embedding inputs are
+#     integer indices, so the input quantizer is intentionally not enabled.
+#   * Mamba ``*mixer.conv1d*``: kept at bf16 (default exclusion).
+#
+# Notes for vLLM consumption:
+#   * ``vllm.compressed-tensors`` consumes packed NVFP4 weights for ``Linear``
+#     and ``Embedding`` layers when the corresponding kernels are present. As
+#     of vLLM 0.19, ``ParallelLMHead`` and ``VocabParallelEmbedding`` need an
+#     additional patch to dispatch ``CompressedTensorsLinearMethod``; see the
+#     PR notes for details. If the target deployment is stock vLLM and you
+#     can't apply that patch, use the general ``nvfp4_w4a16`` qformat
+#     instead, which leaves ``lm_head`` and embeddings at bf16.
+
+metadata:
+  recipe_type: ptq
+  description: NVFP4 W4A16 weight-only for Nemotron-H, including lm_head and input embedding.
+
+quantize:
+  algorithm: max
+  quant_cfg:
+    # Start with everything disabled, then enable layers explicitly.
+    - quantizer_name: '*'
+      enable: false
+
+    # Quantize all Linear weight quantizers (attention q/k/v/o + MLP up/down).
+    - quantizer_name: '*weight_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+
+    # Standard exclusions copied from ``_default_disabled_quantizer_cfg``.
+    # Order matters: later entries override earlier ones in
+    # ``modelopt.torch.quantization.set_quantizer_by_cfg``.
+    - quantizer_name: '*lm_head*'
+      enable: false
+    - quantizer_name: '*proj_out.*'
+      enable: false
+    - quantizer_name: '*block_sparse_moe.gate*'
+      enable: false
+    - quantizer_name: '*router*'
+      enable: false
+    - quantizer_name: '*mlp.gate.*'
+      enable: false
+    - quantizer_name: '*mlp.shared_expert_gate.*'
+      enable: false
+    - quantizer_name: '*linear_attn.conv1d*'
+      enable: false
+    - quantizer_name: '*mixer.conv1d*'
+      enable: false
+    - quantizer_name: '*output_layer*'
+      enable: false
+    - quantizer_name: 'output.*'
+      enable: false
+    - parent_class: 'nn.BatchNorm1d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm2d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm3d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.LeakyReLU'
+      quantizer_name: '*'
+      enable: false
+
+    # Nemotron-H specific overrides: re-enable the weight quantizer for
+    # ``lm_head`` and the input embedding. These come AFTER the default
+    # disables above so they take precedence (last matching entry wins).
+    - quantizer_name: '*lm_head*weight_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+
+    # Two embedding patterns cover both the Nemotron-H remote-code path
+    # (``backbone.embeddings``) and the standard transformers naming
+    # (``model.embed_tokens``).
+    - quantizer_name: '*embeddings*weight_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*embed_tokens*weight_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
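The recipe's layering has a direct in-Python equivalent: since later entries win, the same effect comes from extending the general config's ``quant_cfg`` dict (a sketch; the ``NVFP4_W4A16_CFG`` structure is assumed from ``config.py`` above):

```python
import copy
import modelopt.torch.quantization as mtq

nvfp4_weight = {
    "enable": True,
    "num_bits": (2, 1),  # E2M1 values
    "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)},  # E4M3 block scales
}

cfg = copy.deepcopy(mtq.NVFP4_W4A16_CFG)
# Entries added after the default lm_head/embedding disables override them,
# mirroring the yaml's last-match-wins ordering.
cfg["quant_cfg"]["*lm_head*weight_quantizer"] = nvfp4_weight
cfg["quant_cfg"]["*embeddings*weight_quantizer"] = nvfp4_weight    # Nemotron-H remote code
cfg["quant_cfg"]["*embed_tokens*weight_quantizer"] = nvfp4_weight  # standard transformers naming
```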
diff --git a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py
index 8bdf3f5e659..638aee0899c 100644
--- a/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py
+++ b/tests/gpu/torch/export/test_unified_hf_export_and_check_safetensors.py
@@ -47,6 +47,7 @@
         ("w4a8_awq", "tiny_llama-w4a8-awq", True, False, True, True, False),
         ("int8_wo", "tiny_llama-int8-wo", False, False, False, False, False),
         ("nvfp4_svdquant", "tiny_llama-nvfp4-svdquant", True, False, True, True, True),
+        ("nvfp4_w4a16", "tiny_llama-nvfp4-w4a16", False, False, False, False, False),
         # MoE models (fused experts: Qwen3 MoE, GPT-OSS)
         ("nvfp4", "tiny_qwen3_moe-nvfp4", True, False, True, True, False),
         ("fp8", "tiny_gpt_oss-fp8", True, False, True, True, False),
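End to end, the pieces in this diff compose as follows (a sketch; the path is hypothetical, and ``export_hf_checkpoint`` is the unified exporter the test above exercises):

```python
import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_hf_checkpoint
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("path/to/nemotron-h")  # hypothetical path

# Weight-only NVFP4 quantization; no calibration loop required.
model = mtq.quantize(model, mtq.NVFP4_W4A16_CFG, forward_loop=None)

# Packs Linear (and any quantized Embedding) weights and writes a config with
# quant_algo == "NVFP4_W4A16" for downstream compressed-tensors conversion.
export_hf_checkpoint(model, export_dir="./nvfp4_w4a16_ckpt")
```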