From e17c3c9ef0fad9afdb681d8cb225fb570a1e6f80 Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Mon, 27 Apr 2026 17:29:20 +0000
Subject: [PATCH] fix(onnx): restore llm_export_utils as deprecated shim for edgellm compat

PR #1210 (b3feebfe) removed the modelopt.onnx.llm_export_utils package in
0.44.0rc1, pointing users at TensorRT-Edge-LLM as the migration target.
However, TensorRT-Edge-LLM 0.6.1 itself imports
modelopt.onnx.llm_export_utils.surgeon_utils.fold_fp8_qdq_to_dq at module
load time, so every tensorrt-edgellm-* CLI fails immediately with
ModuleNotFoundError - even tensorrt-edgellm-quantize-llm --help. The
"unused" framing in the original removal commit only held inside this repo;
the public API surface had an external consumer.

Restore the four original submodules verbatim under
modelopt/onnx/llm_export_utils/ and emit a DeprecationWarning on package
import directing users to modelopt.onnx.export, modelopt.onnx.graph_surgery,
or TensorRT-Edge-LLM. The new exporter / graph-surgery packages do not
expose fold_fp8_qdq_to_dq, so a pure import-redirect would not have worked -
the function itself has to come back.

Fixes nvbug 6106576 / OMNIML-3995.

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 modelopt/onnx/llm_export_utils/__init__.py    |  42 +++++
 .../onnx/llm_export_utils/export_utils.py     | 162 ++++++++++++++++++
 .../llm_export_utils/quantization_utils.py    | 146 ++++++++++++++++
 .../onnx/llm_export_utils/surgeon_utils.py    | 120 +++++++++++++
 4 files changed, 470 insertions(+)
 create mode 100644 modelopt/onnx/llm_export_utils/__init__.py
 create mode 100644 modelopt/onnx/llm_export_utils/export_utils.py
 create mode 100644 modelopt/onnx/llm_export_utils/quantization_utils.py
 create mode 100644 modelopt/onnx/llm_export_utils/surgeon_utils.py

diff --git a/modelopt/onnx/llm_export_utils/__init__.py b/modelopt/onnx/llm_export_utils/__init__.py
new file mode 100644
index 0000000000..8ea066d865
--- /dev/null
+++ b/modelopt/onnx/llm_export_utils/__init__.py
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Deprecated shim for the legacy ``modelopt.onnx.llm_export_utils`` package.
+
+The in-repo LLM ONNX export pipeline (formerly ``examples/torch_onnx/llm_export.py``
+plus this package) was removed in 0.44.0rc1 in favor of
+`TensorRT-Edge-LLM <https://github.com/NVIDIA/TensorRT-Edge-LLM>`_, which provides
+a more complete and actively maintained pipeline.
+
+This package is preserved only as a compatibility shim so external consumers that
+still import ``modelopt.onnx.llm_export_utils`` (notably TensorRT-Edge-LLM 0.6.1
+and earlier) continue to work. It will be removed in a future release.
+
+New code should migrate to:
+
+* ``modelopt.onnx.export`` — quant exporters (``FP8QuantExporter``, ``NVFP4QuantExporter``, etc.)
+* ``modelopt.onnx.graph_surgery`` — graph transforms (GQA replacement, BF16 conversion, etc.)
+* `TensorRT-Edge-LLM <https://github.com/NVIDIA/TensorRT-Edge-LLM>`_ — end-to-end LLM export.
+"""
+
+import warnings
+
+warnings.warn(
+    "modelopt.onnx.llm_export_utils is deprecated and will be removed in a future "
+    "release. Use modelopt.onnx.export and modelopt.onnx.graph_surgery, or migrate "
+    "to TensorRT-Edge-LLM (https://github.com/NVIDIA/TensorRT-Edge-LLM).",
+    DeprecationWarning,
+    stacklevel=2,
+)
diff --git a/modelopt/onnx/llm_export_utils/export_utils.py b/modelopt/onnx/llm_export_utils/export_utils.py
new file mode 100644
index 0000000000..2016e872e2
--- /dev/null
+++ b/modelopt/onnx/llm_export_utils/export_utils.py
@@ -0,0 +1,162 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for exporting LLM models to ONNX."""
+
+import json
+import os
+import time
+from enum import Enum
+
+import torch
+from transformers import AutoModelForCausalLM, DynamicCache
+
+
+class RopeType(Enum):
+    """Rope type enum."""
+
+    K_NONE = 0
+    K_ROPE_ROTATE_GPTJ = 1
+    K_ROPE_ROTATE_NEOX = 2
+    K_MROPE = 3
+
+
+class ModelLoader:
+    """A class to handle HuggingFace model loading and configuration."""
+
+    def __init__(self, hf_model_path: str, config_path: str):
+        """Initialize the ModelLoader."""
+        self.config_path = config_path
+        self.hf_model_path = hf_model_path
+        self.model_type = self.get_model_type()
+        self.hf_model = None
+        self.rope_type = RopeType.K_ROPE_ROTATE_NEOX
+
+    def get_model_type(self):
+        """Get model type from config file."""
+        with open(self.config_path) as f:
+            return json.load(f).get("model_type")
+
+    def load_model(self, trust_remote_code: bool = False) -> AutoModelForCausalLM:
+        """Load HuggingFace model based on model type."""
+        print(f"Loading HF model from {self.hf_model_path} with model type {self.model_type}")
+        self.hf_model = AutoModelForCausalLM.from_pretrained(
+            self.hf_model_path, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
+        )
+
+        return self.hf_model.eval().cuda()  # type: ignore[attr-defined]
+
+    def get_rope_type(self):
+        """Get rope type."""
+        return self.rope_type
+
+
+class WrapperModelForCausalLM(torch.nn.Module):
+    """Wrapper Model to ensure all models have the same I/O."""
+
+    def __init__(self, model):
+        """Initialize the WrapperModelForCausalLM."""
+        super().__init__()
+        try:
+            self.model = model.model
+        except Exception:
+            self.model = model
+        self.lm_head = model.lm_head
+        self.config = model.config
+
+    def forward(self, input_ids: torch.Tensor | None, past_key_values: tuple):
+        """Forward pass."""
+        # Convert tuple cache to DynamicCache for models that require it (e.g., Qwen3)
+        cache = DynamicCache(config=self.config)
+        cache.key_cache = [kv[0] for kv in past_key_values]
+        cache.value_cache = [kv[1] for kv in past_key_values]
+        past_key_values = cache
+
+        outputs = self.model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True)
+        hidden_states = outputs[0]
+        past_key_values = outputs.past_key_values.to_legacy_cache()
+        logits = self.lm_head(hidden_states)
+        return logits, past_key_values
+
+
+def llm_to_onnx(model, output_dir, extra_inputs={}, extra_dyn_axes={}):
+    """Export the WrapperModelForCausalLM to ONNX with fixed I/O names and shape definitions and save to `output_dir`.
+
+    Parameters:
+        model: torch.Module
+        output_dir: str, the output_dir of the original ONNX.
+        extra_inputs: dict, append additional inputs after kv_cache. Usually for VL models
+        extra_dyn_axes: dict. Usually for VL models
+    """
+    start_time = time.time()
+    config = model.config
+    num_layers = config.num_hidden_layers
+    num_attention_heads = config.num_attention_heads
+    num_key_value_heads = config.num_key_value_heads
+    hidden_size = config.hidden_size
+    hidden_size_per_layer = hidden_size // num_attention_heads
+
+    dummy_bs = 1
+    dummy_len = 10
+    dummy_input_ids = torch.randint(100, (dummy_bs, dummy_len), dtype=torch.int64).cuda()
+    input_names = ["input_ids"]
+    output_names = ["logits"]
+    dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}}
+    dummy_kv_cache = ()
+    for i in range(num_layers):
+        dummy_k = torch.rand(
+            (dummy_bs, num_key_value_heads, dummy_len, hidden_size_per_layer), dtype=torch.float16
+        ).cuda()
+        dummy_v = torch.rand(
+            (dummy_bs, num_key_value_heads, dummy_len, hidden_size_per_layer), dtype=torch.float16
+        ).cuda()
+        dummy_kv_cache = (*dummy_kv_cache, (dummy_k, dummy_v))
+        input_names.extend([f"past_key_values.{i}.key", f"past_key_values.{i}.value"])
+        output_names.extend([f"present_key_values.{i}.key", f"present_key_values.{i}.value"])
+        input_dynamic_axes = {0: "batch_size", 2: "past_len"}
+        dynamic_axes[f"past_key_values.{i}.key"] = input_dynamic_axes
+        dynamic_axes[f"past_key_values.{i}.value"] = input_dynamic_axes
+
+    torch_to_onnx(
+        model,
+        (dummy_input_ids, {"past_key_values": dummy_kv_cache, **extra_inputs}),
+        output_dir,
+        "model.onnx",
+        input_names=input_names + list(extra_inputs.keys()),
+        output_names=output_names,
+        dynamic_axes=dynamic_axes | extra_dyn_axes,
+    )
+
+    end_time = time.time()
+    print(
+        f"Native ONNX Export from torch completed in {end_time - start_time}s. ONNX file is saved to {output_dir}."
+    )
+
+
+def torch_to_onnx(model, inputs, onnx_dir, onnx_name, input_names, output_names, dynamic_axes):
+    """Export the model to ONNX."""
+    os.makedirs(onnx_dir, exist_ok=True)
+    with torch.inference_mode():
+        torch.onnx.export(
+            model,
+            inputs,
+            f"{onnx_dir}/{onnx_name}",
+            input_names=input_names,
+            output_names=output_names,
+            dynamic_axes=dynamic_axes,
+            opset_version=19,
+            do_constant_folding=True,
+            dynamo=False,
+        )
diff --git a/modelopt/onnx/llm_export_utils/quantization_utils.py b/modelopt/onnx/llm_export_utils/quantization_utils.py
new file mode 100644
index 0000000000..ac24c24a53
--- /dev/null
+++ b/modelopt/onnx/llm_export_utils/quantization_utils.py
@@ -0,0 +1,146 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Quantization utilities for LLM models.""" + +import copy +import time + +import modelopt.torch.quantization as mtq +from modelopt.torch.utils.dataset_utils import get_dataset_dataloader + + +def _quantize_model(model, quant_config, calib_dataloader=None): + """The calibration loop for the model can be setup using the modelopt API. + + Example usage: + from modelopt.torch.utils.dataset_utils import create_forward_loop + model = ... # Initialize the model + tokenizer = ... # Initialize the tokenizer + quant_cfg = ... # Setup quantization configuration + forward_loop = create_forward_loop(model=model, dataset_name="cnn_dailymail", tokenizer=tokenizer) + mtq.quantize(model, quant_cfg, forward_loop=forward_loop) + """ + + def calibrate_loop(model): + """Adjusts weights and scaling factors based on selected algorithms.""" + for idx, data in enumerate(calib_dataloader): + if idx % 10 == 0: + print(f"Calibrating batch {idx}...") + if isinstance(data, dict): + data = {k: v.to(model.device) for k, v in data.items()} + model(**data) + else: + data = data.to(model.device) + model(data) + + print("Starting quantization...") + start_time = time.time() + mtq.quantize(model, quant_config, forward_loop=calibrate_loop) + end_time = time.time() + print(f"Quantization finishes in {end_time - start_time}s.") + + return model + + +def get_quant_config(precision, lm_head_precision="fp16"): + """Get the quantization configuration.""" + if precision == "fp8": + quant_cfg = copy.deepcopy(mtq.FP8_DEFAULT_CFG) + + elif precision == "nvfp4": + quant_cfg = copy.deepcopy(mtq.NVFP4_DEFAULT_CFG) + + elif precision == "int4_awq": + quant_cfg = copy.deepcopy(mtq.INT4_AWQ_CFG) # type: ignore[arg-type] + + else: + raise ValueError(f"Unsupported precision: {precision}") + + quant_cfg_list: list = [ + e for e in quant_cfg["quant_cfg"] if isinstance(e, dict) and "quantizer_name" in e + ] + + if lm_head_precision == "fp8": + quant_cfg_list.append( + { + "quantizer_name": "*lm_head.input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + } + ) + quant_cfg_list.append( + { + "quantizer_name": "*lm_head.weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + } + ) + elif lm_head_precision == "nvfp4": + quant_cfg_list.append( + { + "quantizer_name": "*lm_head.input_quantizer", + "cfg": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + }, + "enable": True, + } + ) + quant_cfg_list.append( + { + "quantizer_name": "*lm_head.weight_quantizer", + "cfg": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + }, + "enable": True, + } + ) + quant_cfg["quant_cfg"] = quant_cfg_list + return quant_cfg + + +def quantize( + model, tokenizer, precision, lm_head_precision="fp16", dataset_dir=None, calib_size=512 +): + """Quantize the PyTorch model to fp8 or int4_awq.""" + assert precision in [ + "fp8", + "int4_awq", + "nvfp4", + ], ( + f"Only fp8(W8A8), int4_awq(W4A16), nvfp4(W4A4) is supported. You passed an unsupported precision: {precision}." + ) + + assert lm_head_precision in ["fp16"], ( + f"Only fp16(unquantized) is supported for lm_head. You passed an unsupported precision: {lm_head_precision}." 
+ ) + + if tokenizer.pad_token != "": # nosec B105 + tokenizer.pad_token = tokenizer.eos_token + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + if not dataset_dir: + dataset_dir = "cnn_dailymail" + + batch_size = 1 + data_loader = get_dataset_dataloader( + dataset_name=dataset_dir, tokenizer=tokenizer, batch_size=batch_size, num_samples=calib_size + ) + quant_config = get_quant_config(precision, lm_head_precision) + quantized_model = _quantize_model(model, quant_config, data_loader) + mtq.print_quant_summary(quantized_model) + return quantized_model diff --git a/modelopt/onnx/llm_export_utils/surgeon_utils.py b/modelopt/onnx/llm_export_utils/surgeon_utils.py new file mode 100644 index 0000000000..2937f6ad0c --- /dev/null +++ b/modelopt/onnx/llm_export_utils/surgeon_utils.py @@ -0,0 +1,120 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities to surgeon ONNX graph after export.""" + +import re +import time + +import onnx +import onnx_graphsurgeon as gs +import torch +from onnx_graphsurgeon.ir.tensor import LazyValues + + +def clear_inputs(node: gs.Node | gs.Tensor): + """Clear all inputs for a node or tensor in ONNX.""" + for i in node.inputs: + i.outputs.clear() + node.inputs.clear() + return node + + +def clear_outputs(node: gs.Node | gs.Tensor): + """Clear all outputs for a node or tensor in ONNX.""" + for o in node.outputs: + o.inputs.clear() + node.outputs.clear() + return node + + +def extract_layer_id(name: str): + """Extract layer id from certain ONNX layer name. + + Parameters: + name: str + The name of ONNX layer. e.g. /model/layer.0/q_proj/... + + Returns: + The layer id for the layer as int. In the example above, it returns 0 + """ + match = re.search(r"layers\.(\d+)", name) + if match: + return int(match.group(1)) + raise Exception(f"{name} does not contain layer info!") + + +def no_none_elements(elements: list): + """Check if all elements in the list are not None.""" + return all(i is not None for i in elements) + + +def fold_fp8_qdq_to_dq(graph: gs.Graph): + """Convert FP32/FP16 weights of the given ONNX model to FP8 weights. + + Even though modelopt supports FP8 onnx export, the weights are represented in fp32 + QDQ. + The storage is therefore very bad. In this function, + Q nodes will get removed from the weights and have only DQ nodes with those converted FP8 + weights in the output model. + + Parameters: + graph: gs.Graph. + + Returns: + gs.Graph with only DQ nodes for weights and same QDQ nodes for activations. + """ + start_time = time.time() + print("Replacing all (fp32 weights + fp8 QDQ) with (fp8 weights + DQ)...") + # Fold constants is required since the scale is not constant yet. 
+ graph.cleanup().toposort().fold_constants().cleanup() + + for node in graph.nodes: + if node.op == "TRT_FP8QuantizeLinear": + # Should not remove input QDQ + if not isinstance(node.inputs[0], gs.Constant): + continue + + weights = node.inputs[0] + scale = node.inputs[1] + torch_weights = torch.from_numpy(weights.values) + torch_scale = torch.from_numpy(scale.values) + quantizer_name = scale.name.rsplit("/", 1)[0] + dq_op = node.outputs[0].outputs[0] + assert dq_op.op == "TRT_FP8DequantizeLinear", ( + f"QDQ does not occur in pairs. You reached {dq_op.op}" + ) + + # Replace it with Dequantize with FP8 weights. This is a WAR because numpy does not support fp8. + numpy_weights = ( + (torch_weights / torch_scale).to(torch.float8_e4m3fn).view(torch.uint8).numpy() + ) + tensor = onnx.TensorProto() + tensor.data_type = onnx.TensorProto.FLOAT8E4M3FN + tensor.dims.extend(numpy_weights.shape) + tensor.raw_data = numpy_weights.tobytes() + values = LazyValues(tensor) + onnx_weights_fp8 = gs.Constant(quantizer_name + "/fp8_weights", values) + + node.outputs.clear() + # DQ Op is separated out + dq_op.inputs[0] = onnx_weights_fp8 + dq_op.op = "DequantizeLinear" + dq_op.outputs[0].dtype = dq_op.inputs[1].dtype + + graph.cleanup().toposort() + end_time = time.time() + print(f"fp8 qdq replaced with only dq completed in {end_time - start_time}s.") + + return graph
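
Verification sketch (illustrative, not part of the applied diff). Assuming the shim lands as
above and onnx, onnx_graphsurgeon, and torch are installed, the legacy import path that
TensorRT-Edge-LLM 0.6.1 uses and the new deprecation signal can be exercised directly. Only the
module path, the DeprecationWarning category, and fold_fp8_qdq_to_dq come from the patch; the
harness itself is a hypothetical check, not shipped code:

    import importlib
    import warnings

    # Run in a process that has not imported the package yet: the DeprecationWarning
    # fires only when llm_export_utils/__init__.py first executes.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # The same legacy import TensorRT-Edge-LLM performs at module load time.
        surgeon_utils = importlib.import_module("modelopt.onnx.llm_export_utils.surgeon_utils")

    # The symbol edgellm needs is back on its old path...
    assert callable(surgeon_utils.fold_fp8_qdq_to_dq)
    # ...and importing the package raised the deprecation pointing at the new homes.
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)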