From e17c3c9ef0fad9afdb681d8cb225fb570a1e6f80 Mon Sep 17 00:00:00 2001
From: ajrasane <131806219+ajrasane@users.noreply.github.com>
Date: Mon, 27 Apr 2026 17:29:20 +0000
Subject: [PATCH] fix(onnx): restore llm_export_utils as deprecated shim for edgellm compat

PR #1210 (b3feebfe) removed the modelopt.onnx.llm_export_utils package in
0.44.0rc1, pointing users at TensorRT-Edge-LLM as the migration target.
However, TensorRT-Edge-LLM 0.6.1 itself imports
modelopt.onnx.llm_export_utils.surgeon_utils.fold_fp8_qdq_to_dq at module
load time, so every tensorrt-edgellm-* CLI fails immediately with
ModuleNotFoundError - even tensorrt-edgellm-quantize-llm --help. The
"unused" framing in the original removal commit only held inside this repo;
the public API surface had an external consumer.

Restore the four original submodules verbatim under
modelopt/onnx/llm_export_utils/ and emit a DeprecationWarning on package
import directing users to modelopt.onnx.export, modelopt.onnx.graph_surgery,
or TensorRT-Edge-LLM. The new exporter / graph-surgery packages do not
expose fold_fp8_qdq_to_dq, so a pure import-redirect would not have worked -
the function itself has to come back.

Fixes nvbug 6106576 / OMNIML-3995.

Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
---
 modelopt/onnx/llm_export_utils/__init__.py    |  42 +++++
 .../onnx/llm_export_utils/export_utils.py     | 162 ++++++++++++++++++
 .../llm_export_utils/quantization_utils.py    | 146 ++++++++++++++++
 .../onnx/llm_export_utils/surgeon_utils.py    | 120 +++++++++++++
 4 files changed, 470 insertions(+)
 create mode 100644 modelopt/onnx/llm_export_utils/__init__.py
 create mode 100644 modelopt/onnx/llm_export_utils/export_utils.py
 create mode 100644 modelopt/onnx/llm_export_utils/quantization_utils.py
 create mode 100644 modelopt/onnx/llm_export_utils/surgeon_utils.py

diff --git a/modelopt/onnx/llm_export_utils/__init__.py b/modelopt/onnx/llm_export_utils/__init__.py
new file mode 100644
index 0000000000..8ea066d865
--- /dev/null
+++ b/modelopt/onnx/llm_export_utils/__init__.py
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Deprecated shim for the legacy ``modelopt.onnx.llm_export_utils`` package.
+
+The in-repo LLM ONNX export pipeline (formerly ``examples/torch_onnx/llm_export.py``
+plus this package) was removed in 0.44.0rc1 in favor of
+`TensorRT-Edge-LLM <https://github.com/NVIDIA/TensorRT-Edge-LLM>`_, which provides
+a more complete and actively maintained pipeline.
+
+This package is preserved only as a compatibility shim so external consumers that
+still import ``modelopt.onnx.llm_export_utils`` (notably TensorRT-Edge-LLM 0.6.1
+and earlier) continue to work. It will be removed in a future release.
+
+New code should migrate to:
+
+* ``modelopt.onnx.export`` — quant exporters (``FP8QuantExporter``, ``NVFP4QuantExporter``, etc.)
+* ``modelopt.onnx.graph_surgery`` — graph transforms (GQA replacement, BF16 conversion, etc.)
+* `TensorRT-Edge-LLM <https://github.com/NVIDIA/TensorRT-Edge-LLM>`_ — end-to-end LLM export.
+"""
+
+import warnings
+
+warnings.warn(
+    "modelopt.onnx.llm_export_utils is deprecated and will be removed in a future "
+    "release. Use modelopt.onnx.export and modelopt.onnx.graph_surgery, or migrate "
+    "to TensorRT-Edge-LLM (https://github.com/NVIDIA/TensorRT-Edge-LLM).",
+    DeprecationWarning,
+    stacklevel=2,
+)
diff --git a/modelopt/onnx/llm_export_utils/export_utils.py b/modelopt/onnx/llm_export_utils/export_utils.py
new file mode 100644
index 0000000000..2016e872e2
--- /dev/null
+++ b/modelopt/onnx/llm_export_utils/export_utils.py
@@ -0,0 +1,162 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for exporting LLM models to ONNX."""
+
+import json
+import os
+import time
+from enum import Enum
+
+import torch
+from transformers import AutoModelForCausalLM, DynamicCache
+
+
+class RopeType(Enum):
+    """Rope type enum."""
+
+    K_NONE = 0
+    K_ROPE_ROTATE_GPTJ = 1
+    K_ROPE_ROTATE_NEOX = 2
+    K_MROPE = 3
+
+
+class ModelLoader:
+    """A class to handle HuggingFace model loading and configuration."""
+
+    def __init__(self, hf_model_path: str, config_path: str):
+        """Initialize the ModelLoader."""
+        self.config_path = config_path
+        self.hf_model_path = hf_model_path
+        self.model_type = self.get_model_type()
+        self.hf_model = None
+        self.rope_type = RopeType.K_ROPE_ROTATE_NEOX
+
+    def get_model_type(self):
+        """Get model type from config file."""
+        with open(self.config_path) as f:
+            return json.load(f).get("model_type")
+
+    def load_model(self, trust_remote_code: bool = False) -> AutoModelForCausalLM:
+        """Load HuggingFace model based on model type."""
+        print(f"Loading HF model from {self.hf_model_path} with model type {self.model_type}")
+        self.hf_model = AutoModelForCausalLM.from_pretrained(
+            self.hf_model_path, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
+        )
+
+        return self.hf_model.eval().cuda()  # type: ignore[attr-defined]
+
+    def get_rope_type(self):
+        """Get rope type."""
+        return self.rope_type
+
+
+class WrapperModelForCausalLM(torch.nn.Module):
+    """Wrapper Model to ensure all models have the same I/O."""
+
+    def __init__(self, model):
+        """Initialize the WrapperModelForCausalLM."""
+        super().__init__()
+        try:
+            self.model = model.model
+        except Exception:
+            self.model = model
+        self.lm_head = model.lm_head
+        self.config = model.config
+
+    def forward(self, input_ids: torch.Tensor | None, past_key_values: tuple):
+        """Forward pass."""
+        # Convert tuple cache to DynamicCache for models that require it (e.g., Qwen3)
+        cache = DynamicCache(config=self.config)
+        cache.key_cache = [kv[0] for kv in past_key_values]
+        cache.value_cache = [kv[1] for kv in past_key_values]
+        past_key_values = cache
+
+        outputs = self.model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True)
+        hidden_states = outputs[0]
+        past_key_values = outputs.past_key_values.to_legacy_cache()
+        logits = self.lm_head(hidden_states)
+        return logits, past_key_values
+
+
+def llm_to_onnx(model, output_dir, extra_inputs={}, extra_dyn_axes={}):
+    """Export the WrapperModelForCausalLM to ONNX with fixed I/O names and shape definitions and save to `output_dir`.
+
+    Parameters:
+        model: torch.Module
+        output_dir: str, the output_dir of the original ONNX.
+        extra_inputs: dict, append additional inputs after kv_cache. Usually for VL models
+        extra_dyn_axes: dict. Usually for VL models
+    """
+    start_time = time.time()
+    config = model.config
+    num_layers = config.num_hidden_layers
+    num_attention_heads = config.num_attention_heads
+    num_key_value_heads = config.num_key_value_heads
+    hidden_size = config.hidden_size
+    hidden_size_per_layer = hidden_size // num_attention_heads
+
+    dummy_bs = 1
+    dummy_len = 10
+    dummy_input_ids = torch.randint(100, (dummy_bs, dummy_len), dtype=torch.int64).cuda()
+    input_names = ["input_ids"]
+    output_names = ["logits"]
+    dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}}
+    dummy_kv_cache = ()
+    for i in range(num_layers):
+        dummy_k = torch.rand(
+            (dummy_bs, num_key_value_heads, dummy_len, hidden_size_per_layer), dtype=torch.float16
+        ).cuda()
+        dummy_v = torch.rand(
+            (dummy_bs, num_key_value_heads, dummy_len, hidden_size_per_layer), dtype=torch.float16
+        ).cuda()
+        dummy_kv_cache = (*dummy_kv_cache, (dummy_k, dummy_v))
+        input_names.extend([f"past_key_values.{i}.key", f"past_key_values.{i}.value"])
+        output_names.extend([f"present_key_values.{i}.key", f"present_key_values.{i}.value"])
+        input_dynamic_axes = {0: "batch_size", 2: "past_len"}
+        dynamic_axes[f"past_key_values.{i}.key"] = input_dynamic_axes
+        dynamic_axes[f"past_key_values.{i}.value"] = input_dynamic_axes
+
+    torch_to_onnx(
+        model,
+        (dummy_input_ids, {"past_key_values": dummy_kv_cache, **extra_inputs}),
+        output_dir,
+        "model.onnx",
+        input_names=input_names + list(extra_inputs.keys()),
+        output_names=output_names,
+        dynamic_axes=dynamic_axes | extra_dyn_axes,
+    )
+
+    end_time = time.time()
+    print(
+        f"Native ONNX Export from torch completed in {end_time - start_time}s. ONNX file is saved to {output_dir}."
+    )
+
+
+def torch_to_onnx(model, inputs, onnx_dir, onnx_name, input_names, output_names, dynamic_axes):
+    """Export the model to ONNX."""
+    os.makedirs(onnx_dir, exist_ok=True)
+    with torch.inference_mode():
+        torch.onnx.export(
+            model,
+            inputs,
+            f"{onnx_dir}/{onnx_name}",
+            input_names=input_names,
+            output_names=output_names,
+            dynamic_axes=dynamic_axes,
+            opset_version=19,
+            do_constant_folding=True,
+            dynamo=False,
+        )
diff --git a/modelopt/onnx/llm_export_utils/quantization_utils.py b/modelopt/onnx/llm_export_utils/quantization_utils.py
new file mode 100644
index 0000000000..ac24c24a53
--- /dev/null
+++ b/modelopt/onnx/llm_export_utils/quantization_utils.py
@@ -0,0 +1,146 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Quantization utilities for LLM models.""" + +import copy +import time + +import modelopt.torch.quantization as mtq +from modelopt.torch.utils.dataset_utils import get_dataset_dataloader + + +def _quantize_model(model, quant_config, calib_dataloader=None): + """The calibration loop for the model can be setup using the modelopt API. + + Example usage: + from modelopt.torch.utils.dataset_utils import create_forward_loop + model = ... # Initialize the model + tokenizer = ... # Initialize the tokenizer + quant_cfg = ... # Setup quantization configuration + forward_loop = create_forward_loop(model=model, dataset_name="cnn_dailymail", tokenizer=tokenizer) + mtq.quantize(model, quant_cfg, forward_loop=forward_loop) + """ + + def calibrate_loop(model): + """Adjusts weights and scaling factors based on selected algorithms.""" + for idx, data in enumerate(calib_dataloader): + if idx % 10 == 0: + print(f"Calibrating batch {idx}...") + if isinstance(data, dict): + data = {k: v.to(model.device) for k, v in data.items()} + model(**data) + else: + data = data.to(model.device) + model(data) + + print("Starting quantization...") + start_time = time.time() + mtq.quantize(model, quant_config, forward_loop=calibrate_loop) + end_time = time.time() + print(f"Quantization finishes in {end_time - start_time}s.") + + return model + + +def get_quant_config(precision, lm_head_precision="fp16"): + """Get the quantization configuration.""" + if precision == "fp8": + quant_cfg = copy.deepcopy(mtq.FP8_DEFAULT_CFG) + + elif precision == "nvfp4": + quant_cfg = copy.deepcopy(mtq.NVFP4_DEFAULT_CFG) + + elif precision == "int4_awq": + quant_cfg = copy.deepcopy(mtq.INT4_AWQ_CFG) # type: ignore[arg-type] + + else: + raise ValueError(f"Unsupported precision: {precision}") + + quant_cfg_list: list = [ + e for e in quant_cfg["quant_cfg"] if isinstance(e, dict) and "quantizer_name" in e + ] + + if lm_head_precision == "fp8": + quant_cfg_list.append( + { + "quantizer_name": "*lm_head.input_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + } + ) + quant_cfg_list.append( + { + "quantizer_name": "*lm_head.weight_quantizer", + "cfg": {"num_bits": (4, 3), "axis": None}, + } + ) + elif lm_head_precision == "nvfp4": + quant_cfg_list.append( + { + "quantizer_name": "*lm_head.input_quantizer", + "cfg": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + }, + "enable": True, + } + ) + quant_cfg_list.append( + { + "quantizer_name": "*lm_head.weight_quantizer", + "cfg": { + "num_bits": (2, 1), + "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)}, + "axis": None, + }, + "enable": True, + } + ) + quant_cfg["quant_cfg"] = quant_cfg_list + return quant_cfg + + +def quantize( + model, tokenizer, precision, lm_head_precision="fp16", dataset_dir=None, calib_size=512 +): + """Quantize the PyTorch model to fp8 or int4_awq.""" + assert precision in [ + "fp8", + "int4_awq", + "nvfp4", + ], ( + f"Only fp8(W8A8), int4_awq(W4A16), nvfp4(W4A4) is supported. You passed an unsupported precision: {precision}." + ) + + assert lm_head_precision in ["fp16"], ( + f"Only fp16(unquantized) is supported for lm_head. You passed an unsupported precision: {lm_head_precision}." 
+ ) + + if tokenizer.pad_token != "": # nosec B105 + tokenizer.pad_token = tokenizer.eos_token + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + if not dataset_dir: + dataset_dir = "cnn_dailymail" + + batch_size = 1 + data_loader = get_dataset_dataloader( + dataset_name=dataset_dir, tokenizer=tokenizer, batch_size=batch_size, num_samples=calib_size + ) + quant_config = get_quant_config(precision, lm_head_precision) + quantized_model = _quantize_model(model, quant_config, data_loader) + mtq.print_quant_summary(quantized_model) + return quantized_model diff --git a/modelopt/onnx/llm_export_utils/surgeon_utils.py b/modelopt/onnx/llm_export_utils/surgeon_utils.py new file mode 100644 index 0000000000..2937f6ad0c --- /dev/null +++ b/modelopt/onnx/llm_export_utils/surgeon_utils.py @@ -0,0 +1,120 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities to surgeon ONNX graph after export.""" + +import re +import time + +import onnx +import onnx_graphsurgeon as gs +import torch +from onnx_graphsurgeon.ir.tensor import LazyValues + + +def clear_inputs(node: gs.Node | gs.Tensor): + """Clear all inputs for a node or tensor in ONNX.""" + for i in node.inputs: + i.outputs.clear() + node.inputs.clear() + return node + + +def clear_outputs(node: gs.Node | gs.Tensor): + """Clear all outputs for a node or tensor in ONNX.""" + for o in node.outputs: + o.inputs.clear() + node.outputs.clear() + return node + + +def extract_layer_id(name: str): + """Extract layer id from certain ONNX layer name. + + Parameters: + name: str + The name of ONNX layer. e.g. /model/layer.0/q_proj/... + + Returns: + The layer id for the layer as int. In the example above, it returns 0 + """ + match = re.search(r"layers\.(\d+)", name) + if match: + return int(match.group(1)) + raise Exception(f"{name} does not contain layer info!") + + +def no_none_elements(elements: list): + """Check if all elements in the list are not None.""" + return all(i is not None for i in elements) + + +def fold_fp8_qdq_to_dq(graph: gs.Graph): + """Convert FP32/FP16 weights of the given ONNX model to FP8 weights. + + Even though modelopt supports FP8 onnx export, the weights are represented in fp32 + QDQ. + The storage is therefore very bad. In this function, + Q nodes will get removed from the weights and have only DQ nodes with those converted FP8 + weights in the output model. + + Parameters: + graph: gs.Graph. + + Returns: + gs.Graph with only DQ nodes for weights and same QDQ nodes for activations. + """ + start_time = time.time() + print("Replacing all (fp32 weights + fp8 QDQ) with (fp8 weights + DQ)...") + # Fold constants is required since the scale is not constant yet. 
+ graph.cleanup().toposort().fold_constants().cleanup() + + for node in graph.nodes: + if node.op == "TRT_FP8QuantizeLinear": + # Should not remove input QDQ + if not isinstance(node.inputs[0], gs.Constant): + continue + + weights = node.inputs[0] + scale = node.inputs[1] + torch_weights = torch.from_numpy(weights.values) + torch_scale = torch.from_numpy(scale.values) + quantizer_name = scale.name.rsplit("/", 1)[0] + dq_op = node.outputs[0].outputs[0] + assert dq_op.op == "TRT_FP8DequantizeLinear", ( + f"QDQ does not occur in pairs. You reached {dq_op.op}" + ) + + # Replace it with Dequantize with FP8 weights. This is a WAR because numpy does not support fp8. + numpy_weights = ( + (torch_weights / torch_scale).to(torch.float8_e4m3fn).view(torch.uint8).numpy() + ) + tensor = onnx.TensorProto() + tensor.data_type = onnx.TensorProto.FLOAT8E4M3FN + tensor.dims.extend(numpy_weights.shape) + tensor.raw_data = numpy_weights.tobytes() + values = LazyValues(tensor) + onnx_weights_fp8 = gs.Constant(quantizer_name + "/fp8_weights", values) + + node.outputs.clear() + # DQ Op is separated out + dq_op.inputs[0] = onnx_weights_fp8 + dq_op.op = "DequantizeLinear" + dq_op.outputs[0].dtype = dq_op.inputs[1].dtype + + graph.cleanup().toposort() + end_time = time.time() + print(f"fp8 qdq replaced with only dq completed in {end_time - start_time}s.") + + return graph
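
Verification sketch (illustrative, not part of the applied diff). Assuming the shim lands as
above and onnx, onnx_graphsurgeon, and torch are installed, the legacy import path that
TensorRT-Edge-LLM 0.6.1 uses and the new deprecation signal can be exercised directly. Only the
module path, the DeprecationWarning category, and fold_fp8_qdq_to_dq come from the patch; the
harness itself is a hypothetical check, not shipped code:

    import importlib
    import warnings

    # Run in a process that has not imported the package yet: the DeprecationWarning
    # fires only when llm_export_utils/__init__.py first executes.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # The same legacy import TensorRT-Edge-LLM performs at module load time.
        surgeon_utils = importlib.import_module("modelopt.onnx.llm_export_utils.surgeon_utils")

    # The symbol edgellm needs is back on its old path...
    assert callable(surgeon_utils.fold_fp8_qdq_to_dq)
    # ...and importing the package raised the deprecation pointing at the new homes.
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)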