From 35dad9a9bca1254c8e461cf097c84d7af55207bd Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Fri, 1 May 2026 17:06:14 -0700
Subject: [PATCH] fixes for fused moe (qwen3.6, GLM5.1) + MSE calibration

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
---
 examples/llm_ptq/hf_ptq.py                    |   7 +-
 modelopt/torch/export/moe_utils.py            | 106 ++++++++++++--
 modelopt/torch/quantization/model_calib.py    |  18 ++-
 modelopt/torch/quantization/model_quant.py    |   1 +
 .../nn/modules/tensor_quantizer.py            |   4 +-
 .../quantization/qtensor/nvfp4_tensor.py      |  13 +-
 .../general/ptq/nvfp4_experts_only_mse.yaml   | 130 ++++++++++++++++++
 7 files changed, 263 insertions(+), 16 deletions(-)
 create mode 100644 modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml

diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
index edf9c4d6f19..5726be4c89b 100755
--- a/examples/llm_ptq/hf_ptq.py
+++ b/examples/llm_ptq/hf_ptq.py
@@ -797,6 +797,11 @@ def pre_quantize(
     preview_input_ids = next(iter(calib_dataloader))[
         "input_features" if model_type == "whisper" else "input_ids"
     ][0:1]
+    # Strip leading padding tokens so the preview input shows real content
+    if model_type not in ("whisper",) and tokenizer is not None and tokenizer.pad_token_id is not None:
+        first_non_pad = (preview_input_ids[0] != tokenizer.pad_token_id).nonzero(as_tuple=True)[0]
+        if first_non_pad.numel() > 0:
+            preview_input_ids = preview_input_ids[:, first_non_pad[0]:]

     # Generate preview before quantization
     if args.skip_generate:
@@ -897,7 +902,7 @@ def input_decode(input_ids):
         if processor is not None and isinstance(processor, WhisperProcessor):
             return first_text_speech_dataset
         elif tokenizer is not None:
-            return tokenizer.batch_decode(input_ids)
+            return tokenizer.batch_decode(input_ids, skip_special_tokens=True)
         else:
             raise ValueError("The processor or tokenizer must be set")
diff --git a/modelopt/torch/export/moe_utils.py b/modelopt/torch/export/moe_utils.py
index 952ed1e39c1..1b780923456 100644
--- a/modelopt/torch/export/moe_utils.py
+++ b/modelopt/torch/export/moe_utils.py
@@ -59,9 +59,26 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
     # 2-3. Split + export each per-expert projection.
     fused_dim0 = gate_up.shape[1]  # 2 * expert_dim

+    def _safe_cpu_amax(quantizer_src: nn.Module) -> torch.Tensor | None:
+        """Extract _amax to CPU float32, surfacing and clearing any pending CUDA error first."""
+        amax = getattr(quantizer_src, "_amax", None)
+        if amax is None or not isinstance(amax, torch.Tensor):
+            return None
+        try:
+            if amax.is_cuda:
+                torch.cuda.synchronize(amax.device)
+            return amax.detach().cpu().float()
+        except Exception:
+            return None
+
     for idx in range(n):
         expert = nn.Module()

+        # Extract amaxes to CPU before deepcopy: cloning a corrupt CUDA _amax tensor
+        # (e.g. from an under-calibrated expert) triggers an async CUDA error.
+        gu_amax_cpu = _safe_cpu_amax(module.gate_up_proj_weight_quantizers[idx])
+        down_amax_cpu = _safe_cpu_amax(module.down_proj_weight_quantizers[idx])
+
         projections = [
             ("gate_proj", gate_up[idx, :expert_dim, :], 0, fused_dim0, True),
             ("up_proj", gate_up[idx, expert_dim:, :], expert_dim, fused_dim0, True),
@@ -76,8 +93,17 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
             )
             i_quantizer = gate_up_input_q if is_gate_up else down_input_q

-            # gate/up share a weight quantizer — clone so each gets independent amax.
-            w_quantizer = copy.deepcopy(w_quantizer_src) if is_gate_up else w_quantizer_src
+            # gate/up share a quantizer — deepcopy with _amax nulled to avoid cloning
+            # the corrupt CUDA tensor, then inject the pre-extracted CPU amax.
+            if is_gate_up:
+                _saved_amax = getattr(w_quantizer_src, "_amax", None)
+                w_quantizer_src._amax = None
+                w_quantizer = copy.deepcopy(w_quantizer_src)
+                w_quantizer_src._amax = _saved_amax
+                w_quantizer._amax = gu_amax_cpu
+            else:
+                w_quantizer = w_quantizer_src
+                w_quantizer._amax = down_amax_cpu

             # For per-channel amax (dim >= 1), proportionally slice dim-0
             # to match the split weight.
@@ -86,12 +112,14 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
                 and w_quantizer._amax is not None
                 and w_quantizer._amax.dim() >= 1
             ):
-                amax = w_quantizer._amax
+                amax = w_quantizer._amax  # CPU float32
                 amax_dim0 = amax.shape[0]
-                if fused_total % amax_dim0 == 0:
+                if amax_dim0 % fused_total == 0:
                     slice_start = fused_start * amax_dim0 // fused_total
                     slice_end = (fused_start + weight_slice.shape[0]) * amax_dim0 // fused_total
-                    w_quantizer.amax = amax[slice_start:slice_end].contiguous()
+                    # Bypass amax.setter (which forbids shape changes); w_quantizer is a
+                    # deepcopy for gate/up so mutating it is safe.
+                    w_quantizer._amax = amax[slice_start:slice_end].contiguous()
                 else:
                     warnings.warn(
                         f"Expert {idx} {proj_name}: fused amax dim0 ({amax_dim0}) does not "
@@ -100,20 +128,68 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
                         stacklevel=2,
                     )

-            # If the weight quantizer was never calibrated, compute amax from weights.
+            # Patch invalid per-block amax entries (NaN/inf/negative/zero/too-small/too-large)
+            # with weight-derived fallback values.
+            _MIN_VALID_AMAX = 1e-4
+            _MAX_VALID_AMAX = 1e6
+            if (
+                hasattr(w_quantizer, "_amax")
+                and w_quantizer._amax is not None
+                and w_quantizer._amax.numel() > 1
+            ):
+                amax_cpu = w_quantizer._amax
+                invalid_mask = ~(
+                    torch.isfinite(amax_cpu)
+                    & (amax_cpu >= _MIN_VALID_AMAX)
+                    & (amax_cpu <= _MAX_VALID_AMAX)
+                )
+                if invalid_mask.any():
+                    per_block_fallback = (
+                        weight_slice.detach()
+                        .reshape(-1, 16)
+                        .abs()
+                        .amax(dim=1, keepdim=True)
+                        .cpu()
+                        .float()
+                        .clamp(min=2e-3)
+                        .reshape(amax_cpu.shape)
+                    )
+                    amax_cpu[invalid_mask] = per_block_fallback[invalid_mask]
+                    w_quantizer._amax = amax_cpu
+
+            # For uncalibrated experts (amax missing or invalid scalar), fall back to
+            # per-block amax from weights so the static export path can reshape it correctly.
             if (
                 hasattr(w_quantizer, "is_enabled")
                 and w_quantizer.is_enabled
                 and (
                     not hasattr(w_quantizer, "_amax")
                     or w_quantizer._amax is None
-                    or torch.all(w_quantizer._amax == 0)
+                    or (
+                        w_quantizer._amax.numel() == 1
+                        and not (
+                            torch.isfinite(w_quantizer._amax)
+                            and w_quantizer._amax >= _MIN_VALID_AMAX
+                            and w_quantizer._amax <= _MAX_VALID_AMAX
+                        )
+                    )
                 )
             ):
-                w_quantizer.amax = weight_slice.abs().amax().to(torch.float32)
+                _block_size = 16
+                fallback_per_block = (
+                    weight_slice.detach()
+                    .reshape(-1, _block_size)
+                    .abs()
+                    .amax(dim=1, keepdim=True)
+                    .cpu()
+                    .float()
+                    .clamp(min=2e-3)
+                    .reshape(*weight_slice.shape[:-1], weight_slice.shape[-1] // _block_size)
+                )
+                w_quantizer._amax = fallback_per_block
                 warnings.warn(
                     f"Expert {idx} {proj_name} weight quantizer was not calibrated "
-                    f"(amax missing or zero). Using weight-derived amax as fallback. "
+                    f"(amax missing or invalid). Using weight-derived per-block amax as fallback. "
                     f"Consider using more calibration data to activate all experts.",
                     stacklevel=2,
                 )
@@ -123,6 +199,18 @@ def _export_fused_experts(module: nn.Module, dtype: torch.dtype) -> None:
             wrapper.weight_quantizer = w_quantizer
             wrapper.input_quantizer = i_quantizer

+            # Set global_amax to route to the static NVFP4 export path (reads per-block _amax).
+            # Always recompute from the current (possibly patched) _amax — a stale zero
+            # global_amax causes division-by-zero in the per-block scale formula.
+            wq = wrapper.weight_quantizer
+            if (
+                hasattr(wq, "_amax")
+                and wq._amax is not None
+                and wq._amax.numel() > 1
+            ):
+                wq._amax = wq._amax.to(weight_slice.device)
+                wq.global_amax = wq._amax.float().amax().clamp(min=2e-3)
+
             _export_quantized_weight(wrapper, dtype)

             proj = nn.Module()
diff --git a/modelopt/torch/quantization/model_calib.py b/modelopt/torch/quantization/model_calib.py
index 4ce0f62a75d..1048738ef65 100644
--- a/modelopt/torch/quantization/model_calib.py
+++ b/modelopt/torch/quantization/model_calib.py
@@ -421,6 +421,18 @@ def mse_calibrate(
             if isinstance(weight_quantizer, TensorQuantizer) and weight_quantizer.is_enabled:
                 if getattr(weight_quantizer, "_calibrator", None) is not None:
                     weight_quantizers.append((parent_module, weight_name, weight_quantizer))
+        # _QuantFusedExperts stores per-expert weight quantizers as nn.ModuleList named
+        # {param_name}_weight_quantizers (plural). Detect this pattern and enqueue each
+        # per-expert quantizer individually.
+        for param_name, _ in parent_module.named_parameters(recurse=False):
+            qlist = getattr(parent_module, f"{param_name}_weight_quantizers", None)
+            if not isinstance(qlist, nn.ModuleList):
+                continue
+            for expert_idx, wq in enumerate(qlist):
+                if isinstance(wq, TensorQuantizer) and wq.is_enabled:
+                    if getattr(wq, "_calibrator", None) is not None:
+                        weight_quantizers.append((parent_module, (param_name, expert_idx), wq))
+
         seen_modules.add(parent_module)

     # Step 3: Calibrate weight quantizers ONE AT A TIME with immediate amax computation
@@ -432,7 +444,11 @@ def mse_calibrate(
         weight_quantizer.disable_quant()
         weight_quantizer.enable_calib()
         with enable_weight_access_and_writeback(parent_module, model, name_to_module):
-            weight = getattr(parent_module, weight_name)
+            if isinstance(weight_name, tuple):
+                param_name, expert_idx = weight_name
+                weight = getattr(parent_module, param_name)[expert_idx]
+            else:
+                weight = getattr(parent_module, weight_name)
             weight_quantizer(weight)

         # IMMEDIATELY compute amax and reset calibrator to free memory
diff --git a/modelopt/torch/quantization/model_quant.py b/modelopt/torch/quantization/model_quant.py
index 5e65f9cc1d4..3582223c4d3 100644
--- a/modelopt/torch/quantization/model_quant.py
+++ b/modelopt/torch/quantization/model_quant.py
@@ -595,6 +595,7 @@ def print_quant_summary(model: nn.Module, output_dir: str | None = None):
     lines.append(f"{len(lines)} TensorQuantizers found in model")

     if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
         path = os.path.join(output_dir, ".quant_summary.txt")
         with open(path, "w", encoding="utf-8") as f:
             f.write("\n".join(lines) + "\n")
diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py
index 3ff7401ec3e..928e319dfff 100644
--- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py
+++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py
@@ -1112,7 +1112,7 @@ def forward(self, inputs):

         return outputs

-    def _short_amax(self, fmt=".4f"):
+    def _short_amax(self, fmt=".2e"):
         """Short description of amax.

         Returns:
@@ -1130,7 +1130,7 @@ def _short_amax(self, fmt=".4f"):
             return "meta"
         return self._short_tensor(self._amax, fmt)

-    def _short_tensor(self, tensor: torch.Tensor, fmt=".4f"):
+    def _short_tensor(self, tensor: torch.Tensor, fmt=".2e"):
         """Short description of tensor."""
         if tensor.numel() == 1:
             return f"{tensor.item():{fmt}}"
diff --git a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
index fe30e283c2d..1a9855dfeb5 100644
--- a/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
+++ b/modelopt/torch/quantization/qtensor/nvfp4_tensor.py
@@ -124,9 +124,10 @@ def get_weights_scaling_factor_from_quantizer(
         # Quantize scales to FP8
         if not keep_high_precision:
-            per_block_scale = (per_block_scale * 448.0 / per_block_scale_max).to(
-                torch.float8_e4m3fn
-            )
+            _FP8_E4M3FN_MIN = 2**-9  # 0.001953125 — smallest positive subnormal
+            per_block_scale = (per_block_scale * 448.0 / per_block_scale_max).clamp(
+                min=_FP8_E4M3FN_MIN
+            ).to(torch.float8_e4m3fn)
         return per_block_scale, weights_scaling_factor_2
     else:
         # Dynamic path: compute from weight tensor
@@ -167,6 +168,12 @@ def get_weights_scaling_factor(
     per_block_scale[per_block_scale == 0] = 1.0
     # Convert to torch.float8_e4m3fn
     if not keep_high_precision:
+        # Clamp to the minimum positive FP8 E4M3FN subnormal (~0.00195 = 2^-9) before
+        # casting. Without this, blocks whose scale falls below the FP8 representable
+        # range silently underflow to 0, causing those blocks to produce zero output at
+        # inference even when the weights are non-trivial.
+        _FP8_E4M3FN_MIN = 2**-9  # 0.001953125 — smallest positive subnormal
+        per_block_scale = per_block_scale.clamp(min=_FP8_E4M3FN_MIN)
         per_block_scale = per_block_scale.to(torch.float8_e4m3fn)
     return per_block_scale, weights_scaling_factor_2
diff --git a/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml
new file mode 100644
index 00000000000..0fbdb421075
--- /dev/null
+++ b/modelopt_recipes/general/ptq/nvfp4_experts_only_mse.yaml
@@ -0,0 +1,130 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+metadata:
+  recipe_type: ptq
+  description: >
+    NVFP4 W4A4 for MoE routed experts only. Static weight scales via MSE + FP8 scale sweep;
+    dynamic activation scales. Supports sequential experts (nn.Linear-based) and fused experts
+    (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style).
+
+quantize:
+  algorithm:
+    method: mse
+    fp8_scale_sweep: true
+    layerwise: false
+  quant_cfg:
+    # ── Disable everything first ─────────────────────────────────────────────
+    - quantizer_name: '*'
+      enable: false
+
+    # ── Sequential experts (nn.Linear per expert) ────────────────────────────
+    - quantizer_name: '*mlp.experts*weight_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: static
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*mlp.experts*input_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+
+    # ── Sequential experts: Mixtral / block_sparse_moe style ────────────────
+    - quantizer_name: '*block_sparse_moe*weight_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: static
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*block_sparse_moe*input_quantizer'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+
+    # ── Fused experts (_QuantFusedExperts, HF transformers 5.0+ 3D nn.Parameter style) ──
+    - quantizer_name: '*gate_up_proj_weight_quantizers*'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: static
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*gate_up_proj_input_quantizer*'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*down_proj_weight_quantizers*'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: static
+          scale_bits: e4m3
+        num_bits: e2m1
+    - quantizer_name: '*down_proj_input_quantizer*'
+      enable: true
+      cfg:
+        block_sizes:
+          -1: 16
+          type: dynamic
+          scale_bits: e4m3
+        num_bits: e2m1
+
+    # ── Exclusions: shared experts, attention, routers, lm_head ─────────────
+    - quantizer_name: '*block_sparse_moe.gate*'
+      enable: false
+    - quantizer_name: '*linear_attn.conv1d*'
+      enable: false
+    - quantizer_name: '*lm_head*'
+      enable: false
+    - quantizer_name: '*mlp.gate.*'
+      enable: false
+    - quantizer_name: '*mlp.shared_expert*'
+      enable: false
+    - quantizer_name: '*mlp.shared_expert_gate.*'
+      enable: false
+    - quantizer_name: '*router*'
+      enable: false
+    - quantizer_name: 'output.*'
+      enable: false
+    - parent_class: 'nn.BatchNorm1d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm2d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.BatchNorm3d'
+      quantizer_name: '*'
+      enable: false
+    - parent_class: 'nn.LeakyReLU'
+      quantizer_name: '*'
+      enable: false
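
The weight-derived per-block amax fallback used in moe_utils.py for uncalibrated or corrupted experts is easiest to check in isolation. The snippet below is a standalone sketch with toy shapes, not part of the patch: it reproduces the reshape/amax/clamp chain for a single 2-D weight slice and the NVFP4 block size of 16.

    import torch

    def per_block_amax(weight_slice: torch.Tensor, block_size: int = 16) -> torch.Tensor:
        """Per-block absolute max over the last dim, mirroring the fallback in _export_fused_experts."""
        # (out_dim, in_dim) -> (out_dim * in_dim / block_size, block_size) -> one amax per block
        return (
            weight_slice.detach()
            .reshape(-1, block_size)
            .abs()
            .amax(dim=1, keepdim=True)
            .cpu()
            .float()
            .clamp(min=2e-3)  # keep every block amax away from zero
            .reshape(*weight_slice.shape[:-1], weight_slice.shape[-1] // block_size)
        )

    w = torch.randn(8, 64)  # toy expert weight slice
    amax = per_block_amax(w)
    print(amax.shape)       # torch.Size([8, 4]): one amax per 16-wide block
    assert torch.all(amax >= 2e-3)

The resulting tensor has one entry per 16-element block, which is the shape the static NVFP4 export path expects for a per-block _amax.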
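The motivation for the clamp added in nvfp4_tensor.py can also be seen with a few made-up scale values: per-block scales below roughly 1e-3 round to zero when cast to float8_e4m3fn, and a zero scale wipes out the whole block at dequantization. This is a toy sketch, not the modelopt code path, and it assumes a PyTorch build with float8 support (2.1+).

    import torch

    FP8_E4M3FN_MIN = 2**-9  # 0.001953125, smallest positive subnormal of torch.float8_e4m3fn

    scales = torch.tensor([3.0e-1, 2.5e-3, 7.0e-4, 9.0e-5])  # toy per-block scales

    unclamped = scales.to(torch.float8_e4m3fn).float()
    clamped = scales.clamp(min=FP8_E4M3FN_MIN).to(torch.float8_e4m3fn).float()

    print(unclamped)  # the last two entries underflow to 0.0, so those blocks would dequantize to all zeros
    print(clamped)    # every entry stays at or above 2**-9, so no block is silently zeroed

Clamping before the cast trades a slightly too-large scale for the affected blocks against losing their weights entirely, which is why both the quantizer path and the dynamic path in the patch apply the same minimum.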