From 04847b3c82114891ac9e2083195c17ab218c7b10 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Wed, 4 Mar 2026 18:05:14 +0000 Subject: [PATCH 01/12] Add omlp_only config Signed-off-by: Chenjie Luo --- examples/llm_ptq/hf_ptq.py | 1 + modelopt/torch/quantization/config.py | 54 ++++++++++++++++++++------- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 6b29be4eb0..264d0580ca 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -87,6 +87,7 @@ "w4a8_nvfp4_fp8": mtq.W4A8_NVFP4_FP8_CFG, "w4a8_mxfp4_fp8": mtq.W4A8_MXFP4_FP8_CFG, "nvfp4_mlp_only": mtq.NVFP4_MLP_ONLY_CFG, + "nvfp4_omlp_only": mtq.NVFP4_OMLP_ONLY_CFG, "nvfp4_svdquant": mtq.NVFP4_SVDQUANT_DEFAULT_CFG, "mxfp8": mtq.MXFP8_DEFAULT_CFG, } diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 80a2a68761..f939b22f11 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -699,6 +699,12 @@ "enable": True, "pass_through_bwd": True, }, + "*block_sparse_moe*weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, + "pass_through_bwd": True, + }, **_default_disabled_quantizer_cfg, }, "algorithm": None, @@ -716,26 +722,46 @@ "enable": True, "pass_through_bwd": True, }, + "*block_sparse_moe*weight_quantizer": { + "num_bits": (2, 1), + "block_sizes": { + -1: 32, + "type": "dynamic", + "scale_bits": (4, 3), + }, # Note: block_size is 32 here + "enable": True, + "pass_through_bwd": True, + }, **_default_disabled_quantizer_cfg, }, "algorithm": "max", } +_nvfp4_quantizer = { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "enable": True, + "pass_through_bwd": True, +} + +_nvfp4_mlp_only_quant_cfg = { + "*mlp*weight_quantizer": _nvfp4_quantizer, + "*mlp*input_quantizer": _nvfp4_quantizer, + "*block_sparse_moe*weight_quantizer": _nvfp4_quantizer, + "*block_sparse_moe*input_quantizer": _nvfp4_quantizer, + **_default_disabled_quantizer_cfg, +} + NVFP4_MLP_ONLY_CFG = { + "quant_cfg": _nvfp4_mlp_only_quant_cfg, + "algorithm": "max", +} + +NVFP4_OMLP_ONLY_CFG = { "quant_cfg": { - "*mlp*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "enable": True, - "pass_through_bwd": True, - }, - "*mlp*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "enable": True, - "pass_through_bwd": True, - }, - **_default_disabled_quantizer_cfg, + "*o_proj*weight_quantizer": _nvfp4_quantizer, + "*o_proj*input_quantizer": _nvfp4_quantizer, + **_nvfp4_mlp_only_quant_cfg, }, "algorithm": "max", } @@ -1081,7 +1107,7 @@ def validate_calibrator(cls, v, info: ValidationInfo): 'Straight-Through Estimator (STE)'. STE does not require saving of the input tensor for performing backward pass and hence consumes less memory. - If set to False, we will use STE with zeroed outlier gradients. This setting could + If set to False, we will use STE with zeroed outlier gradients. This setting may yield better QAT accuracy depending on the quantization format. However, this setting requires saving of the input tensor for computing gradients which uses more memory. 
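[Editor's note] A quick usage sketch for the `nvfp4_omlp_only` config added in this patch, following the `mtq.quantize` pattern already shown in `examples/llm_ptq/README.md`. The model name and `calib_dataloader` below are placeholders for illustration and are not part of the patch.

```python
# Illustrative sketch only (placeholder model name and calib_dataloader):
# quantize a Hugging Face model with the new NVFP4 "MLP + o_proj" config.
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

def forward_loop(model):
    # Run a small calibration set so the "max" algorithm can collect amax
    # statistics for the enabled weight/input quantizers.
    for batch in calib_dataloader:  # placeholder calibration dataloader
        model(**batch)

# NVFP4_OMLP_ONLY_CFG enables NVFP4 weight/input quantizers on *mlp*,
# *block_sparse_moe* and *o_proj* modules; everything else stays disabled.
model = mtq.quantize(model, mtq.NVFP4_OMLP_ONLY_CFG, forward_loop)
```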
From 69edc7e58bf12837c9b9c01ef53fdc6b0852b816 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Wed, 4 Mar 2026 18:11:09 +0000 Subject: [PATCH 02/12] Set pass_through_bwd to True by default Signed-off-by: Chenjie Luo --- modelopt/torch/quantization/config.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index f939b22f11..897c1015ac 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -697,13 +697,11 @@ "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - "pass_through_bwd": True, }, "*block_sparse_moe*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, "enable": True, - "pass_through_bwd": True, }, **_default_disabled_quantizer_cfg, }, @@ -720,7 +718,6 @@ "scale_bits": (4, 3), }, # Note: block_size is 32 here "enable": True, - "pass_through_bwd": True, }, "*block_sparse_moe*weight_quantizer": { "num_bits": (2, 1), @@ -730,7 +727,6 @@ "scale_bits": (4, 3), }, # Note: block_size is 32 here "enable": True, - "pass_through_bwd": True, }, **_default_disabled_quantizer_cfg, }, @@ -741,7 +737,6 @@ "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, "enable": True, - "pass_through_bwd": True, } _nvfp4_mlp_only_quant_cfg = { @@ -1100,7 +1095,7 @@ def validate_calibrator(cls, v, info: ValidationInfo): ) pass_through_bwd: bool = ModeloptField( - default=False, + default=True, title="If set to true, fake quantization will be a pass through for gradient computation.", description=""" Gradient computation where fake quantization is pass through is called From ec4e2aa3c3c656d59945456aed11999e76c7d784 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Wed, 4 Mar 2026 18:15:50 +0000 Subject: [PATCH 03/12] Remove axis, enable if default Signed-off-by: Chenjie Luo --- modelopt/torch/quantization/config.py | 94 ++++----------------------- 1 file changed, 14 insertions(+), 80 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 897c1015ac..66113ed8c1 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -248,7 +248,6 @@ "*weight_quantizer": { "num_bits": (4, 3), "block_sizes": {-1: 128, -2: 128}, - "enable": True, }, "*input_quantizer": {"enable": False}, **_default_disabled_quantizer_cfg, @@ -258,7 +257,10 @@ INT4_BLOCKWISE_WEIGHT_ONLY_CFG = { "quant_cfg": { - "*weight_quantizer": {"num_bits": 4, "block_sizes": {-1: 128}, "enable": True}, + "*weight_quantizer": { + "num_bits": 4, + "block_sizes": {-1: 128}, + }, "*input_quantizer": {"enable": False}, **_default_disabled_quantizer_cfg, }, @@ -271,7 +273,6 @@ "*weight_quantizer": { "num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, - "enable": True, }, "*input_quantizer": {"enable": False}, **_default_disabled_quantizer_cfg, @@ -286,10 +287,17 @@ W4A8_AWQ_BETA_CFG = { "quant_cfg": { "*weight_quantizer": [ - {"num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, "enable": True}, - {"num_bits": (4, 3), "axis": None, "enable": True}, + { + "num_bits": 4, + "block_sizes": {-1: 128, "type": "static"}, + }, + { + "num_bits": (4, 3), + }, ], - "*input_quantizer": {"num_bits": (4, 3), "axis": None, "enable": True}, + "*input_quantizer": { + "num_bits": (4, 3), + }, **_default_disabled_quantizer_cfg, }, "algorithm": "awq_lite", @@ -300,12 +308,10 @@ "*weight_quantizer": { 
"num_bits": (4, 3), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, "*input_quantizer": { "num_bits": (4, 3), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -317,12 +323,10 @@ "*weight_quantizer": { "num_bits": (3, 2), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, "*input_quantizer": { "num_bits": (3, 2), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -334,12 +338,10 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -351,7 +353,6 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, "*input_quantizer": {"num_bits": (4, 3), "axis": None}, **_default_disabled_quantizer_cfg, @@ -364,12 +365,10 @@ "*weight_quantizer": { "num_bits": 8, "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, "*input_quantizer": { "num_bits": 8, "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -380,8 +379,6 @@ "quant_cfg": { "*[kv]_bmm_quantizer": { "num_bits": (4, 3), - "axis": None, - "enable": True, }, "default": {"enable": False}, }, @@ -392,7 +389,6 @@ "quant_cfg": { "*[kv]_bmm_quantizer": { "num_bits": (4, 3), - "axis": None, "bias": {-2: None, -4: None, "type": "static"}, }, "default": {"enable": False}, @@ -405,14 +401,10 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -424,14 +416,10 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -446,14 +434,10 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -468,14 +452,10 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, **_default_disabled_quantizer_cfg, **_mamba_moe_disabled_quantizer_cfg, @@ -487,14 +467,10 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, 
**_default_disabled_quantizer_cfg, **_mamba_moe_disabled_quantizer_cfg, @@ -510,14 +486,10 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -529,14 +501,10 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -548,14 +516,10 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -568,8 +532,6 @@ "*[kv]_bmm_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, "bias": {-2: None, -4: None, "type": "static"}, }, "default": {"enable": False}, @@ -582,8 +544,6 @@ "*[kv]_bmm_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "default": {"enable": False}, }, @@ -596,35 +556,26 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "*output_quantizer": {"enable": False}, "*q_bmm_quantizer": { "num_bits": (4, 3), - "axis": None, }, "*k_bmm_quantizer": { "num_bits": (4, 3), - "axis": None, }, "*v_bmm_quantizer": { "num_bits": (4, 3), - "axis": None, }, "*softmax_quantizer": { "num_bits": (4, 3), - "axis": None, }, "transformer_blocks*bmm2_output_quantizer": { "num_bits": (4, 3), - "axis": None, }, "default": {"enable": False}, }, @@ -640,15 +591,11 @@ "*k_bmm_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, "rotate": True, }, "*v_bmm_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, }, "algorithm": "max", @@ -659,14 +606,10 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -678,13 +621,9 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (4, 3)}, - "axis": None, - "enable": True, }, "*input_quantizer": { "num_bits": (4, 3), - "axis": None, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -696,12 +635,10 @@ "*mlp*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, - "enable": True, }, "*block_sparse_moe*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 
0)}, - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -717,7 +654,6 @@ "type": "dynamic", "scale_bits": (4, 3), }, # Note: block_size is 32 here - "enable": True, }, "*block_sparse_moe*weight_quantizer": { "num_bits": (2, 1), @@ -726,7 +662,6 @@ "type": "dynamic", "scale_bits": (4, 3), }, # Note: block_size is 32 here - "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -736,7 +671,6 @@ _nvfp4_quantizer = { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - "enable": True, } _nvfp4_mlp_only_quant_cfg = { From 9c2fabed7a77f92e9b422bb397cee1e9434a228c Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Wed, 4 Mar 2026 18:21:31 +0000 Subject: [PATCH 04/12] Simplify nvfp4 quantizer Signed-off-by: Chenjie Luo --- modelopt/torch/quantization/config.py | 116 ++++++-------------------- 1 file changed, 27 insertions(+), 89 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 66113ed8c1..6cafe60d03 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -396,16 +396,15 @@ "algorithm": "max", } +_nvfp4_quantizer = { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, +} + NVFP4_DEFAULT_CFG = { "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, + "*weight_quantizer": _nvfp4_quantizer, + "*input_quantizer": _nvfp4_quantizer, **_default_disabled_quantizer_cfg, }, "algorithm": "max", @@ -417,10 +416,7 @@ "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, + "*input_quantizer": _nvfp4_quantizer, **_default_disabled_quantizer_cfg, }, "algorithm": { @@ -435,10 +431,7 @@ "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, + "*input_quantizer": _nvfp4_quantizer, **_default_disabled_quantizer_cfg, }, "algorithm": { @@ -449,14 +442,8 @@ MAMBA_MOE_NVFP4_AGGRESSIVE_CFG = { "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, + "*weight_quantizer": _nvfp4_quantizer, + "*input_quantizer": _nvfp4_quantizer, **_default_disabled_quantizer_cfg, **_mamba_moe_disabled_quantizer_cfg, }, @@ -464,14 +451,8 @@ } MAMBA_MOE_NVFP4_CONSERVATIVE_CFG = { "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, + "*weight_quantizer": _nvfp4_quantizer, + "*input_quantizer": _nvfp4_quantizer, **_default_disabled_quantizer_cfg, **_mamba_moe_disabled_quantizer_cfg, "*mixer.in_proj*": {"enable": False}, # Skip mamba linear @@ -483,14 +464,8 @@ NVFP4_AWQ_LITE_CFG = { "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, - "*input_quantizer": { - "num_bits": (2, 1), - 
"block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, + "*weight_quantizer": _nvfp4_quantizer, + "*input_quantizer": _nvfp4_quantizer, **_default_disabled_quantizer_cfg, }, "algorithm": "awq_lite", @@ -498,14 +473,8 @@ NVFP4_AWQ_CLIP_CFG = { "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, + "*weight_quantizer": _nvfp4_quantizer, + "*input_quantizer": _nvfp4_quantizer, **_default_disabled_quantizer_cfg, }, "algorithm": {"method": "awq_clip"}, @@ -513,14 +482,8 @@ NVFP4_AWQ_FULL_CFG = { "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, + "*weight_quantizer": _nvfp4_quantizer, + "*input_quantizer": _nvfp4_quantizer, **_default_disabled_quantizer_cfg, }, "algorithm": {"method": "awq_full", "alpha_step": 0.1}, @@ -530,8 +493,7 @@ NVFP4_AFFINE_KV_CFG = { "quant_cfg": { "*[kv]_bmm_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + **_nvfp4_quantizer, "bias": {-2: None, -4: None, "type": "static"}, }, "default": {"enable": False}, @@ -541,10 +503,7 @@ NVFP4_KV_CFG = { "quant_cfg": { - "*[kv]_bmm_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, + "*[kv]_bmm_quantizer": _nvfp4_quantizer, "default": {"enable": False}, }, "algorithm": "max", @@ -553,14 +512,8 @@ # Moved from examples/diffusers/quantization/config.py to here NVFP4_FP8_MHA_CONFIG = { "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, + "*weight_quantizer": _nvfp4_quantizer, + "*input_quantizer": _nvfp4_quantizer, "*output_quantizer": {"enable": False}, "*q_bmm_quantizer": { "num_bits": (4, 3), @@ -589,28 +542,18 @@ "rotate": True, }, "*k_bmm_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + **_nvfp4_quantizer, "rotate": True, }, - "*v_bmm_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, + "*v_bmm_quantizer": _nvfp4_quantizer, }, "algorithm": "max", } NVFP4_SVDQUANT_DEFAULT_CFG = { "quant_cfg": { - "*weight_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, - "*input_quantizer": { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - }, + "*weight_quantizer": _nvfp4_quantizer, + "*input_quantizer": _nvfp4_quantizer, **_default_disabled_quantizer_cfg, }, "algorithm": {"method": "svdquant", "lowrank": 32}, @@ -668,11 +611,6 @@ "algorithm": "max", } -_nvfp4_quantizer = { - "num_bits": (2, 1), - "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, -} - _nvfp4_mlp_only_quant_cfg = { "*mlp*weight_quantizer": _nvfp4_quantizer, "*mlp*input_quantizer": _nvfp4_quantizer, From 5a7c7ccc4b42abebe5fc014c569eb5a130966f37 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Wed, 4 Mar 2026 18:23:45 +0000 Subject: [PATCH 05/12] Update config Signed-off-by: Chenjie Luo --- examples/llm_ptq/example_utils.py | 
12 ------------ examples/llm_ptq/scripts/huggingface_example.sh | 4 ++-- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 0234e731ea..50ac51aace 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -258,18 +258,6 @@ def build_quant_cfg( quant_cfg["quant_cfg"]["*image*"] = {"enable": False} quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} - if model_type in ["qwen3moe", "qwen3next", "minimax"] and qformat == "nvfp4": - # Disable the attention projection layers to retain accuracy - quant_cfg["quant_cfg"]["model*.*attn*in_proj*"] = {"enable": False} - quant_cfg["quant_cfg"]["model*.*attn*q_proj*"] = {"enable": False} - quant_cfg["quant_cfg"]["model*.*attn*k_proj*"] = {"enable": False} - quant_cfg["quant_cfg"]["model*.*attn*v_proj*"] = {"enable": False} - - if model_type == "deepseek": - # Disable MLA quantization for accuracy. - quant_cfg["quant_cfg"]["*self_attn.q*"] = {"enable": False} - quant_cfg["quant_cfg"]["*self_attn.kv*"] = {"enable": False} - return quant_cfg diff --git a/examples/llm_ptq/scripts/huggingface_example.sh b/examples/llm_ptq/scripts/huggingface_example.sh index a74a1671a8..d49fb4005a 100755 --- a/examples/llm_ptq/scripts/huggingface_example.sh +++ b/examples/llm_ptq/scripts/huggingface_example.sh @@ -53,9 +53,9 @@ esac IFS="," for qformat in $QFORMAT; do case $qformat in - fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_mlp_only | nvfp4_svdquant | mxfp8) ;; + fp8 | fp8_pc_pt | fp8_pb_wo | int8_wo | int8_sq | int4_awq | w4a8_awq | fp16 | bf16 | nvfp4 | nvfp4_awq | w4a8_nvfp4_fp8 | w4a8_mxfp4_fp8 | nvfp4_mlp_only | nvfp4_omlp_only | nvfp4_svdquant | mxfp8) ;; *) - echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_mlp_only, nvfp4_svdquant, mxfp8]" >&2 + echo "Unknown quant argument: Expected one of: [fp8, fp8_pc_pt, fp8_pb_wo, int8_wo, int8_sq, int4_awq, w4a8_awq, fp16, bf16, nvfp4, nvfp4_awq, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, nvfp4_mlp_only, nvfp4_omlp_only, nvfp4_svdquant, mxfp8]" >&2 exit 1 ;; esac From 9aa775ab5db3933d2672ea0d53e36e8fccc4758b Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Wed, 4 Mar 2026 18:30:04 +0000 Subject: [PATCH 06/12] Update readme Signed-off-by: Chenjie Luo --- CHANGELOG.rst | 1 + examples/llm_ptq/README.md | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4b3ee96fb0..5d35916278 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -17,6 +17,7 @@ NVIDIA Model Optimizer Changelog - Add support for rotating the input before quantization for RHT. - Add support for advanced weight scale search for NVFP4 quantization and its export path. - Enable PTQ workflow for Qwen3.5 MoE models. +- Add ``nvfp4_omlp_only`` quantization format for NVFP4 quantization. This is similar to ``nvfp4_mlp_only`` but also quantizes the output projection layer in attention. 
**Misc** diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md index 7a9a71f885..ade5acea15 100755 --- a/examples/llm_ptq/README.md +++ b/examples/llm_ptq/README.md @@ -69,6 +69,8 @@ def forward_loop(model): model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop) ``` +> *For higher NVFP4 PTQ accuracy, we recommend using `mtq.NVFP4_MLP_ONLY_CFG` or `mtq.NVFP4_OMLP_ONLY_CFG` instead of `mtq.NVFP4_DEFAULT_CFG`. `NVFP4_MLP_ONLY_CFG` applies NVFP4 quantization only to MLP (and MoE) layers, leaving attention layers unquantized. `NVFP4_OMLP_ONLY_CFG` additionally quantizes the `o_proj` layer. Both preserve accuracy in the sensitive attention QKV projections while still providing significant compression.* + ### 2. Export Quantized Model Once your model is quantized, you can now export that model to a checkpoint for easy deployment. \ @@ -126,7 +128,7 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http > *7.[PTQ for DeepSeek](../deepseek/README.md)* \ > *8.GLM-4.7 has MTP (Multi-Token Prediction) layers that are automatically loaded and excluded from quantization.* -> *The accuracy loss after PTQ may vary depending on the actual model and the quantization method. Different models may have different accuracy loss and usually the accuracy loss is more significant when the base model is small. If the accuracy after PTQ is not meeting the requirement, please try either modifying [hf_ptq.py](./hf_ptq.py) and disabling the KV cache quantization or using the [QAT](./../llm_qat/README.md) instead.* +> *The accuracy loss after PTQ may vary depending on the actual model and the quantization method. Different models may have different accuracy loss and usually the accuracy loss is more significant when the base model is small. If the accuracy after PTQ is not meeting the requirement, please try either modifying [hf_ptq.py](./hf_ptq.py) and disabling the KV cache quantization or using the [QAT](./../llm_qat/README.md) instead. For NVFP4 quantization specifically, we recommend `nvfp4_mlp_only` or `nvfp4_omlp_only` to achieve higher accuracy by restricting quantization to the MLP layers (and optionally the `o_proj` layer) while keeping the attention QKV projections unquantized.* > You can also create your own custom config using [this](https://nvidia.github.io/Model-Optimizer/guides/_pytorch_quantization.html#custom-calibration-algorithm) guide. @@ -144,7 +146,7 @@ For LLM models like [Llama-3](https://huggingface.co/meta-llama): # Install model specific pip dependencies if needed export HF_PATH= -scripts/huggingface_example.sh --model $HF_PATH --quant [fp8|nvfp4|int8_sq|int4_awq|w4a8_awq] --tp [1|2|4|8] +scripts/huggingface_example.sh --model $HF_PATH --quant [fp8|nvfp4|nvfp4_mlp_only|nvfp4_omlp_only|int8_sq|int4_awq|w4a8_awq] --tp [1|2|4|8] ``` > *By default `trust_remote_code` is set to false. Please turn it on if model calibration and eval requires it using `--trust_remote_code`.* @@ -295,7 +297,7 @@ accelerate launch --config_file fsdp2.yaml \ --fsdp_transformer_layer_cls_to_wrap= multinode_ptq.py \ --pyt_ckpt_path \ - --qformat \ + --qformat \ --kv_cache_qformat \ --batch_size \ --calib_size \ @@ -460,4 +462,4 @@ There are many quantization schemes supported in the example scripts: 1. The W4A8 AWQ is an extension of the INT4 AWQ quantization that it also uses FP8 for activation for more speed up and acceleration. -1. 
The [NVFP4](https://blogs.nvidia.com/blog/generative-ai-studio-ces-geforce-rtx-50-series/) is one of the new FP4 formats supported by NVIDIA Blackwell GPU and demonstrates good accuracy compared with other 4-bit alternatives. NVFP4 can be applied to both model weights as well as activations, providing the potential for both a significant increase in math throughput and reductions in memory footprint and memory bandwidth usage compared to the FP8 data format on Blackwell.
+1. The [NVFP4](https://blogs.nvidia.com/blog/generative-ai-studio-ces-geforce-rtx-50-series/) is one of the new FP4 formats supported by NVIDIA Blackwell GPU and demonstrates good accuracy compared with other 4-bit alternatives. NVFP4 can be applied to both model weights as well as activations, providing the potential for both a significant increase in math throughput and reductions in memory footprint and memory bandwidth usage compared to the FP8 data format on Blackwell. For higher accuracy with NVFP4 PTQ, we recommend `nvfp4_mlp_only` or `nvfp4_omlp_only`. `nvfp4_mlp_only` restricts NVFP4 quantization to MLP (and MoE) layers only, leaving attention layers in higher precision. `nvfp4_omlp_only` extends this by also quantizing the `o_proj` layer, providing a middle ground between full NVFP4 and MLP-only quantization.

From 34952a76e0cfecbeb98816fd7a880071dfdb01ee Mon Sep 17 00:00:00 2001
From: Chenjie Luo
Date: Wed, 4 Mar 2026 19:30:25 +0000
Subject: [PATCH 07/12] Fix

Signed-off-by: Chenjie Luo
---
 CHANGELOG.rst                         | 1 +
 modelopt/torch/quantization/config.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 5d35916278..47349527c7 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -18,6 +18,7 @@ NVIDIA Model Optimizer Changelog
 - Add support for advanced weight scale search for NVFP4 quantization and its export path.
 - Enable PTQ workflow for Qwen3.5 MoE models.
 - Add ``nvfp4_omlp_only`` quantization format for NVFP4 quantization. This is similar to ``nvfp4_mlp_only`` but also quantizes the output projection layer in attention.
+- ``pass_through_bwd`` in the quantization config now defaults to ``True``. Set it to ``False`` to use STE with zeroed outlier gradients for potentially better QAT accuracy.
**Misc** diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 6cafe60d03..ff56227136 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -662,6 +662,7 @@ "NVFP4_MLP_WEIGHT_ONLY_CFG", "MXFP4_MLP_WEIGHT_ONLY_CFG", "NVFP4_MLP_ONLY_CFG", + "NVFP4_OMLP_ONLY_CFG", "MAMBA_MOE_NVFP4_CONSERVATIVE_CFG", "MAMBA_MOE_NVFP4_AGGRESSIVE_CFG", "MAMBA_MOE_FP8_CONSERVATIVE_CFG", From 98fdd3bdd1e74af602395db560c36db50119bfd3 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Wed, 4 Mar 2026 21:34:35 +0000 Subject: [PATCH 08/12] Update Signed-off-by: Chenjie Luo --- tests/gpu/torch/export/test_fsdp2_export.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/gpu/torch/export/test_fsdp2_export.py b/tests/gpu/torch/export/test_fsdp2_export.py index ba59324dd7..2a493d4412 100644 --- a/tests/gpu/torch/export/test_fsdp2_export.py +++ b/tests/gpu/torch/export/test_fsdp2_export.py @@ -225,6 +225,7 @@ def test_fsdp2_weight_update_context_for_export(dist_workers): # mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG, #TODO: Fix unit test for this case mtq.W4A8_MXFP4_FP8_CFG, mtq.NVFP4_MLP_ONLY_CFG, + mtq.NVFP4_OMLP_ONLY_CFG, ], ) @pytest.mark.parametrize("bias", [True, False]) @@ -244,6 +245,7 @@ def test_fsdp2_weight_update_context_for_fuse_layers(dist_workers, quant_config, # mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG, #TODO: Fix unit test for this case mtq.W4A8_MXFP4_FP8_CFG, mtq.NVFP4_MLP_ONLY_CFG, + mtq.NVFP4_OMLP_ONLY_CFG, ], ) @pytest.mark.parametrize("bias", [True, False]) From 5b943f9992ba0ba02624e52d9df0d0dc4f613c42 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Thu, 5 Mar 2026 17:51:43 +0000 Subject: [PATCH 09/12] Fix Signed-off-by: Chenjie Luo --- examples/llm_ptq/README.md | 2 +- examples/llm_ptq/hf_ptq.py | 2 ++ modelopt/torch/quantization/conversion.py | 20 ++++++++++++++++++-- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/examples/llm_ptq/README.md b/examples/llm_ptq/README.md index ade5acea15..6576242152 100755 --- a/examples/llm_ptq/README.md +++ b/examples/llm_ptq/README.md @@ -69,7 +69,7 @@ def forward_loop(model): model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop) ``` -> *For higher NVFP4 PTQ accuracy, we recommend using `mtq.NVFP4_MLP_ONLY_CFG` or `mtq.NVFP4_OMLP_ONLY_CFG` instead of `mtq.NVFP4_DEFAULT_CFG`. `NVFP4_MLP_ONLY_CFG` applies NVFP4 quantization only to MLP (and MoE) layers, leaving attention layers unquantized. `NVFP4_OMLP_ONLY_CFG` additionally quantizes the `o_proj` layer. Both preserve accuracy in the sensitive attention QKV projections while still providing significant compression.* +> *For higher NVFP4 PTQ accuracy, we recommend using `mtq.NVFP4_MLP_ONLY_CFG` or `mtq.NVFP4_OMLP_ONLY_CFG` instead of `mtq.NVFP4_DEFAULT_CFG`. `NVFP4_MLP_ONLY_CFG` applies NVFP4 quantization to MLP (and MoE) layers, leaving attention layers unquantized. `NVFP4_OMLP_ONLY_CFG` additionally quantizes the `o_proj` layer. Both preserve accuracy in the sensitive attention QKV projections while still providing significant compression.* ### 2. 
Export Quantized Model diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 264d0580ca..67e7016a82 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -253,6 +253,7 @@ def auto_quantize( "fp8_pb_wo", "w4a8_mxfp4_fp8", "nvfp4_mlp_only", + "nvfp4_omlp_only", "mxfp8", ] for args.qformat in qformat_list @@ -901,6 +902,7 @@ def quantize_main( "fp8_pb_wo", "w4a8_mxfp4_fp8", "nvfp4_mlp_only", + "nvfp4_omlp_only", "mxfp8", ] or args.kv_cache_qformat in KV_QUANT_CFG_CHOICES diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index f7ef704eec..ed09ad9fb9 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -211,6 +211,20 @@ def _replace_quant_module(model: nn.Module, version=None, registry=QuantModuleRe _replace_quant_module(getattr(model, name), version=version, registry=registry) +def _auto_enable_cfg(cfg): + """Add ``enable=True`` to cfgs that set quantization properties without an explicit ``enable``. + + When ``"default": {"enable": False}`` disables all quantizers and a specific pattern then + sets properties like ``num_bits`` / ``block_sizes``, the quantizer should be re-enabled + implicitly rather than requiring an explicit ``"enable": True`` in every config entry. + """ + if isinstance(cfg, dict) and "enable" not in cfg and cfg: + cfg = {**cfg, "enable": True} + elif isinstance(cfg, list): + cfg = [_auto_enable_cfg(c) for c in cfg] + return cfg + + def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType | dict): """Update the quantizer attributes based on the specified `quant_cfg`. @@ -240,9 +254,11 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType f"Expected a dictionary for quantizer configuration for child tensor quantizers of {parent_class}." 
) for sub_pattern, sub_cfg in cfg.items(): - set_quantizer_attribute(quant_model, sub_pattern, sub_cfg, parent_class) + set_quantizer_attribute( + quant_model, sub_pattern, _auto_enable_cfg(sub_cfg), parent_class + ) continue - set_quantizer_attribute(quant_model, pattern, cfg) + set_quantizer_attribute(quant_model, pattern, _auto_enable_cfg(cfg)) def set_quantizer_attribute( From 81c827f54a21be6c387d1e7105a8d20904c81566 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Thu, 5 Mar 2026 18:35:06 +0000 Subject: [PATCH 10/12] Revert Signed-off-by: Chenjie Luo --- modelopt/torch/quantization/config.py | 25 +++++++++++++++++++++++ modelopt/torch/quantization/conversion.py | 20 ++---------------- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index ff56227136..a9b3574c4d 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -248,6 +248,7 @@ "*weight_quantizer": { "num_bits": (4, 3), "block_sizes": {-1: 128, -2: 128}, + "enable": True, }, "*input_quantizer": {"enable": False}, **_default_disabled_quantizer_cfg, @@ -260,6 +261,7 @@ "*weight_quantizer": { "num_bits": 4, "block_sizes": {-1: 128}, + "enable": True, }, "*input_quantizer": {"enable": False}, **_default_disabled_quantizer_cfg, @@ -273,6 +275,7 @@ "*weight_quantizer": { "num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, + "enable": True, }, "*input_quantizer": {"enable": False}, **_default_disabled_quantizer_cfg, @@ -290,13 +293,16 @@ { "num_bits": 4, "block_sizes": {-1: 128, "type": "static"}, + "enable": True, }, { "num_bits": (4, 3), + "enable": True, }, ], "*input_quantizer": { "num_bits": (4, 3), + "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -308,10 +314,12 @@ "*weight_quantizer": { "num_bits": (4, 3), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, }, "*input_quantizer": { "num_bits": (4, 3), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -323,10 +331,12 @@ "*weight_quantizer": { "num_bits": (3, 2), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, }, "*input_quantizer": { "num_bits": (3, 2), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -338,10 +348,12 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, }, "*input_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -353,6 +365,7 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, }, "*input_quantizer": {"num_bits": (4, 3), "axis": None}, **_default_disabled_quantizer_cfg, @@ -365,10 +378,12 @@ "*weight_quantizer": { "num_bits": 8, "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, }, "*input_quantizer": { "num_bits": 8, "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -379,6 +394,7 @@ "quant_cfg": { "*[kv]_bmm_quantizer": { "num_bits": (4, 3), + "enable": True, }, "default": {"enable": False}, }, @@ -399,6 +415,7 @@ _nvfp4_quantizer = { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + 
"enable": True, } NVFP4_DEFAULT_CFG = { @@ -415,6 +432,7 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, + "enable": True, }, "*input_quantizer": _nvfp4_quantizer, **_default_disabled_quantizer_cfg, @@ -430,6 +448,7 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 16, "type": "static", "scale_bits": (4, 3)}, + "enable": True, }, "*input_quantizer": _nvfp4_quantizer, **_default_disabled_quantizer_cfg, @@ -564,9 +583,11 @@ "*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (4, 3)}, + "enable": True, }, "*input_quantizer": { "num_bits": (4, 3), + "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -578,10 +599,12 @@ "*mlp*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, }, "*block_sparse_moe*weight_quantizer": { "num_bits": (2, 1), "block_sizes": {-1: 32, "type": "dynamic", "scale_bits": (8, 0)}, + "enable": True, }, **_default_disabled_quantizer_cfg, }, @@ -597,6 +620,7 @@ "type": "dynamic", "scale_bits": (4, 3), }, # Note: block_size is 32 here + "enable": True, }, "*block_sparse_moe*weight_quantizer": { "num_bits": (2, 1), @@ -605,6 +629,7 @@ "type": "dynamic", "scale_bits": (4, 3), }, # Note: block_size is 32 here + "enable": True, }, **_default_disabled_quantizer_cfg, }, diff --git a/modelopt/torch/quantization/conversion.py b/modelopt/torch/quantization/conversion.py index ed09ad9fb9..f7ef704eec 100644 --- a/modelopt/torch/quantization/conversion.py +++ b/modelopt/torch/quantization/conversion.py @@ -211,20 +211,6 @@ def _replace_quant_module(model: nn.Module, version=None, registry=QuantModuleRe _replace_quant_module(getattr(model, name), version=version, registry=registry) -def _auto_enable_cfg(cfg): - """Add ``enable=True`` to cfgs that set quantization properties without an explicit ``enable``. - - When ``"default": {"enable": False}`` disables all quantizers and a specific pattern then - sets properties like ``num_bits`` / ``block_sizes``, the quantizer should be re-enabled - implicitly rather than requiring an explicit ``"enable": True`` in every config entry. - """ - if isinstance(cfg, dict) and "enable" not in cfg and cfg: - cfg = {**cfg, "enable": True} - elif isinstance(cfg, list): - cfg = [_auto_enable_cfg(c) for c in cfg] - return cfg - - def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType | dict): """Update the quantizer attributes based on the specified `quant_cfg`. @@ -254,11 +240,9 @@ def set_quantizer_by_cfg(quant_model: nn.Module, quant_cfg: QuantizeQuantCfgType f"Expected a dictionary for quantizer configuration for child tensor quantizers of {parent_class}." 
) for sub_pattern, sub_cfg in cfg.items(): - set_quantizer_attribute( - quant_model, sub_pattern, _auto_enable_cfg(sub_cfg), parent_class - ) + set_quantizer_attribute(quant_model, sub_pattern, sub_cfg, parent_class) continue - set_quantizer_attribute(quant_model, pattern, _auto_enable_cfg(cfg)) + set_quantizer_attribute(quant_model, pattern, cfg) def set_quantizer_attribute( From 9d5463533c30dfe6b3867a017f9721e24f45e3d2 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Thu, 5 Mar 2026 20:33:56 +0000 Subject: [PATCH 11/12] Fix Signed-off-by: Chenjie Luo --- tests/gpu/torch/quantization/test_tensor_quantizer_cuda.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/gpu/torch/quantization/test_tensor_quantizer_cuda.py b/tests/gpu/torch/quantization/test_tensor_quantizer_cuda.py index ba7b522ae3..b0757b331c 100644 --- a/tests/gpu/torch/quantization/test_tensor_quantizer_cuda.py +++ b/tests/gpu/torch/quantization/test_tensor_quantizer_cuda.py @@ -84,7 +84,9 @@ def test_fp4(self): def test_fp4_backward(self): fp4_quantizer = tensor_quantizer.TensorQuantizer( QuantizerAttributeConfig( - num_bits=(2, 1), block_sizes={-1: 16, "type": "dynamic", "scale_bits": (4, 3)} + num_bits=(2, 1), + block_sizes={-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + pass_through_bwd=False, ) ).cuda() From db185e39069120933b000c029d5664ba473a0ac6 Mon Sep 17 00:00:00 2001 From: Chenjie Luo Date: Thu, 5 Mar 2026 20:36:39 +0000 Subject: [PATCH 12/12] Fix Signed-off-by: Chenjie Luo --- .../torch/quantization/test_tensor_quantizer_cuda.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/gpu/torch/quantization/test_tensor_quantizer_cuda.py b/tests/gpu/torch/quantization/test_tensor_quantizer_cuda.py index b0757b331c..af884c878b 100644 --- a/tests/gpu/torch/quantization/test_tensor_quantizer_cuda.py +++ b/tests/gpu/torch/quantization/test_tensor_quantizer_cuda.py @@ -81,12 +81,13 @@ def test_fp4(self): assert fp4_quantizer._get_amax(x) == x.abs().amax() - def test_fp4_backward(self): + @pytest.mark.parametrize("pass_through_bwd", [True, False]) + def test_fp4_backward(self, pass_through_bwd): fp4_quantizer = tensor_quantizer.TensorQuantizer( QuantizerAttributeConfig( num_bits=(2, 1), block_sizes={-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, - pass_through_bwd=False, + pass_through_bwd=pass_through_bwd, ) ).cuda() @@ -98,7 +99,11 @@ def test_fp4_backward(self): loss = fp4_quantizer(x).sum() loss.backward() - assert torch.allclose(x.grad, torch.ones_like(x.grad) * (x.abs() <= fp4_quantizer.amax)) + if pass_through_bwd: + expected_grad = torch.ones_like(x.grad) + else: + expected_grad = torch.ones_like(x.grad) * (x.abs() <= fp4_quantizer.amax) + assert torch.allclose(x.grad, expected_grad) def test_fp4_non_contiguous_input(self): contiguous_tensor = torch.ones(2, 16).cuda()
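
[Editor's note] Since patch 02 flips the ``pass_through_bwd`` default to ``True``, callers that relied on the previous STE-with-zeroed-outlier-gradients behavior need to opt back in explicitly. A minimal sketch of one way to do that is shown below; the deep-copied config and the `model`/`forward_loop` names are illustrative assumptions, not part of this series.

```python
# Illustrative sketch only: restore pass_through_bwd=False (STE with zeroed
# outlier gradients) on top of a built-in config after this series.
import copy

import modelopt.torch.quantization as mtq

my_cfg = copy.deepcopy(mtq.NVFP4_DEFAULT_CFG)
for pattern in ("*weight_quantizer", "*input_quantizer"):
    # pass_through_bwd now defaults to True; override it per quantizer pattern.
    my_cfg["quant_cfg"][pattern] = {
        **my_cfg["quant_cfg"][pattern],
        "pass_through_bwd": False,
    }

# model and forward_loop are placeholders, as in the README example.
model = mtq.quantize(model, my_cfg, forward_loop)
```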