42 changes: 42 additions & 0 deletions modelopt/onnx/llm_export_utils/__init__.py
@@ -0,0 +1,42 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Deprecated shim for the legacy ``modelopt.onnx.llm_export_utils`` package.

The in-repo LLM ONNX export pipeline (formerly ``examples/torch_onnx/llm_export.py``
plus this package) was removed in 0.44.0rc1 in favor of
`TensorRT-Edge-LLM <https://github.com/NVIDIA/TensorRT-Edge-LLM>`_, which provides
a more complete and actively maintained pipeline.

This package is preserved only as a compatibility shim so external consumers that
still import ``modelopt.onnx.llm_export_utils`` (notably TensorRT-Edge-LLM 0.6.1
and earlier) continue to work. It will be removed in a future release.

New code should migrate to:

* ``modelopt.onnx.export`` — quant exporters (``FP8QuantExporter``, ``NVFP4QuantExporter``, etc.)
* ``modelopt.onnx.graph_surgery`` — graph transforms (GQA replacement, BF16 conversion, etc.)
* `TensorRT-Edge-LLM <https://github.com/NVIDIA/TensorRT-Edge-LLM>`_ — end-to-end LLM export.
"""

import warnings

warnings.warn(
"modelopt.onnx.llm_export_utils is deprecated and will be removed in a future "
"release. Use modelopt.onnx.export and modelopt.onnx.graph_surgery, or migrate "
"to TensorRT-Edge-LLM (https://github.com/NVIDIA/TensorRT-Edge-LLM).",
DeprecationWarning,
stacklevel=2,
)
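
For downstream consumers still on the shim, a minimal sketch of what the import now does, assuming ModelOpt is installed; the `catch_warnings` scaffolding below is illustrative and not part of this file:

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always", DeprecationWarning)
    # First import in the process; subsequent imports are cached and do not re-warn.
    import modelopt.onnx.llm_export_utils  # noqa: F401  # emits the shim's DeprecationWarning

for w in caught:
    if issubclass(w.category, DeprecationWarning):
        print(f"deprecated import: {w.message}")  # points at modelopt.onnx.export / graph_surgery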
162 changes: 162 additions & 0 deletions modelopt/onnx/llm_export_utils/export_utils.py
@@ -0,0 +1,162 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for exporting LLM models to ONNX."""

import json
import os
import time
from enum import Enum

import torch
from transformers import AutoModelForCausalLM, DynamicCache


class RopeType(Enum):
"""Rope type enum."""

K_NONE = 0
K_ROPE_ROTATE_GPTJ = 1
K_ROPE_ROTATE_NEOX = 2
K_MROPE = 3


class ModelLoader:
"""A class to handle HuggingFace model loading and configuration."""

def __init__(self, hf_model_path: str, config_path: str):
"""Initialize the ModelLoader."""
self.config_path = config_path
self.hf_model_path = hf_model_path
self.model_type = self.get_model_type()
self.hf_model = None
self.rope_type = RopeType.K_ROPE_ROTATE_NEOX

def get_model_type(self):
"""Get model type from config file."""
with open(self.config_path) as f:
return json.load(f).get("model_type")

def load_model(self, trust_remote_code: bool = False) -> AutoModelForCausalLM:
"""Load HuggingFace model based on model type."""
print(f"Loading HF model from {self.hf_model_path} with model type {self.model_type}")
self.hf_model = AutoModelForCausalLM.from_pretrained(
self.hf_model_path, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
)

return self.hf_model.eval().cuda() # type: ignore[attr-defined]

def get_rope_type(self):
"""Get rope type."""
return self.rope_type


class WrapperModelForCausalLM(torch.nn.Module):
"""Wrapper Model to ensure all models have the same I/O."""

def __init__(self, model):
"""Initialize the WrapperModelForCausalLM."""
super().__init__()
try:
self.model = model.model
except Exception:
self.model = model
self.lm_head = model.lm_head
self.config = model.config

def forward(self, input_ids: torch.Tensor | None, past_key_values: tuple):
"""Forward pass."""
# Convert tuple cache to DynamicCache for models that require it (e.g., Qwen3)
cache = DynamicCache(config=self.config)
cache.key_cache = [kv[0] for kv in past_key_values]
cache.value_cache = [kv[1] for kv in past_key_values]
past_key_values = cache

outputs = self.model(input_ids=input_ids, past_key_values=past_key_values, use_cache=True)
hidden_states = outputs[0]
past_key_values = outputs.past_key_values.to_legacy_cache()
logits = self.lm_head(hidden_states)
return logits, past_key_values


def llm_to_onnx(model, output_dir, extra_inputs={}, extra_dyn_axes={}):
⚠️ Potential issue | 🟡 Minor

Mutable default argument is a Python antipattern.

Using {} as a default value for extra_inputs and extra_dyn_axes can lead to subtle bugs since the same dict object is shared across all calls. Use None and initialize inside the function.

🐛 Suggested fix
-def llm_to_onnx(model, output_dir, extra_inputs={}, extra_dyn_axes={}):
+def llm_to_onnx(model, output_dir, extra_inputs=None, extra_dyn_axes=None):
     """Export the WrapperModelForCausalLM to ONNX with fixed I/O names and shape definitions and save to `output_dir`.

     Parameters:
         model: torch.Module
         output_dir: str, the output_dir of the original ONNX.
         extra_inputs: dict, append additional inputs after kv_cache. Usually for VL models
         extra_dyn_axes: dict. Usually for VL models
     """
+    if extra_inputs is None:
+        extra_inputs = {}
+    if extra_dyn_axes is None:
+        extra_dyn_axes = {}
     start_time = time.time()
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelopt/onnx/llm_export_utils/export_utils.py` at line 94, the function
llm_to_onnx uses mutable default arguments extra_inputs={} and extra_dyn_axes={}
which can cause shared-state bugs; change the signature to use None for those
defaults and inside llm_to_onnx initialize them with empty dicts (e.g., if
extra_inputs is None: extra_inputs = {} and similarly for extra_dyn_axes) so
each call gets a fresh mapping while preserving current behavior.
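
As a quick standalone illustration of the pitfall flagged above (not from this PR), the default dict is created once at function definition and is then shared by every call that omits the argument:

def collect(item, bucket={}):  # the {} is evaluated once, at definition time
    bucket[item] = True
    return bucket

print(collect("a"))  # {'a': True}
print(collect("b"))  # {'a': True, 'b': True}  <- state leaked from the first call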

"""Export the WrapperModelForCausalLM to ONNX with fixed I/O names and shape definitions and save to `output_dir`.

Parameters:
model: torch.Module
output_dir: str, the output_dir of the original ONNX.
extra_inputs: dict, append additional inputs after kv_cache. Usually for VL models
extra_dyn_axes: dict. Usually for VL models
"""
start_time = time.time()
config = model.config
num_layers = config.num_hidden_layers
num_attention_heads = config.num_attention_heads
num_key_value_heads = config.num_key_value_heads
hidden_size = config.hidden_size
hidden_size_per_layer = hidden_size // num_attention_heads

dummy_bs = 1
dummy_len = 10
dummy_input_ids = torch.randint(100, (dummy_bs, dummy_len), dtype=torch.int64).cuda()
input_names = ["input_ids"]
output_names = ["logits"]
dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}}
dummy_kv_cache = ()
for i in range(num_layers):
dummy_k = torch.rand(
(dummy_bs, num_key_value_heads, dummy_len, hidden_size_per_layer), dtype=torch.float16
).cuda()
dummy_v = torch.rand(
(dummy_bs, num_key_value_heads, dummy_len, hidden_size_per_layer), dtype=torch.float16
).cuda()
dummy_kv_cache = (*dummy_kv_cache, (dummy_k, dummy_v))
input_names.extend([f"past_key_values.{i}.key", f"past_key_values.{i}.value"])
output_names.extend([f"present_key_values.{i}.key", f"present_key_values.{i}.value"])
input_dynamic_axes = {0: "batch_size", 2: "past_len"}
dynamic_axes[f"past_key_values.{i}.key"] = input_dynamic_axes
dynamic_axes[f"past_key_values.{i}.value"] = input_dynamic_axes

torch_to_onnx(
model,
(dummy_input_ids, {"past_key_values": dummy_kv_cache, **extra_inputs}),
output_dir,
"model.onnx",
input_names=input_names + list(extra_inputs.keys()),
output_names=output_names,
dynamic_axes=dynamic_axes | extra_dyn_axes,
)

end_time = time.time()
print(
f"Native ONNX Export from torch completed in {end_time - start_time}s. ONNX file is saved to {output_dir}."
)


def torch_to_onnx(model, inputs, onnx_dir, onnx_name, input_names, output_names, dynamic_axes):
"""Export the model to ONNX."""
os.makedirs(onnx_dir, exist_ok=True)
with torch.inference_mode():
torch.onnx.export(
model,
inputs,
f"{onnx_dir}/{onnx_name}",
input_names=input_names,
output_names=output_names,
dynamic_axes=dynamic_axes,
opset_version=19,
do_constant_folding=True,
dynamo=False,
)
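
Taken together, a hedged usage sketch of the helpers in this file; the checkpoint and output paths are hypothetical placeholders, and it assumes a CUDA device plus a standard HF checkpoint directory containing config.json:

from modelopt.onnx.llm_export_utils.export_utils import (
    ModelLoader,
    WrapperModelForCausalLM,
    llm_to_onnx,
)

hf_dir = "my_llama_checkpoint"  # hypothetical local HF checkpoint directory
loader = ModelLoader(hf_dir, f"{hf_dir}/config.json")
model = loader.load_model(trust_remote_code=False)  # fp16 model moved to CUDA

wrapped = WrapperModelForCausalLM(model)  # fixed logits / KV-cache I/O
llm_to_onnx(wrapped, "onnx_out")  # writes onnx_out/model.onnx with dynamic batch/seq axes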
146 changes: 146 additions & 0 deletions modelopt/onnx/llm_export_utils/quantization_utils.py
@@ -0,0 +1,146 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Quantization utilities for LLM models."""

import copy
import time

import modelopt.torch.quantization as mtq
from modelopt.torch.utils.dataset_utils import get_dataset_dataloader


def _quantize_model(model, quant_config, calib_dataloader=None):
"""The calibration loop for the model can be setup using the modelopt API.

Example usage:
from modelopt.torch.utils.dataset_utils import create_forward_loop
model = ... # Initialize the model
tokenizer = ... # Initialize the tokenizer
quant_cfg = ... # Setup quantization configuration
forward_loop = create_forward_loop(model=model, dataset_name="cnn_dailymail", tokenizer=tokenizer)
mtq.quantize(model, quant_cfg, forward_loop=forward_loop)
"""

def calibrate_loop(model):
"""Adjusts weights and scaling factors based on selected algorithms."""
for idx, data in enumerate(calib_dataloader):
if idx % 10 == 0:
print(f"Calibrating batch {idx}...")
if isinstance(data, dict):
data = {k: v.to(model.device) for k, v in data.items()}
model(**data)
else:
data = data.to(model.device)
model(data)

print("Starting quantization...")
start_time = time.time()
mtq.quantize(model, quant_config, forward_loop=calibrate_loop)
end_time = time.time()
print(f"Quantization finishes in {end_time - start_time}s.")

return model


def get_quant_config(precision, lm_head_precision="fp16"):
"""Get the quantization configuration."""
if precision == "fp8":
quant_cfg = copy.deepcopy(mtq.FP8_DEFAULT_CFG)

elif precision == "nvfp4":
quant_cfg = copy.deepcopy(mtq.NVFP4_DEFAULT_CFG)

elif precision == "int4_awq":
quant_cfg = copy.deepcopy(mtq.INT4_AWQ_CFG) # type: ignore[arg-type]

else:
raise ValueError(f"Unsupported precision: {precision}")

quant_cfg_list: list = [
e for e in quant_cfg["quant_cfg"] if isinstance(e, dict) and "quantizer_name" in e
]

if lm_head_precision == "fp8":
quant_cfg_list.append(
{
"quantizer_name": "*lm_head.input_quantizer",
"cfg": {"num_bits": (4, 3), "axis": None},
}
)
quant_cfg_list.append(
{
"quantizer_name": "*lm_head.weight_quantizer",
"cfg": {"num_bits": (4, 3), "axis": None},
}
)
elif lm_head_precision == "nvfp4":
quant_cfg_list.append(
{
"quantizer_name": "*lm_head.input_quantizer",
"cfg": {
"num_bits": (2, 1),
"block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)},
"axis": None,
},
"enable": True,
}
)
quant_cfg_list.append(
{
"quantizer_name": "*lm_head.weight_quantizer",
"cfg": {
"num_bits": (2, 1),
"block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)},
"axis": None,
},
"enable": True,
}
)
quant_cfg["quant_cfg"] = quant_cfg_list
return quant_cfg


def quantize(
model, tokenizer, precision, lm_head_precision="fp16", dataset_dir=None, calib_size=512
):
"""Quantize the PyTorch model to fp8 or int4_awq."""
assert precision in [
"fp8",
"int4_awq",
"nvfp4",
], (
f"Only fp8(W8A8), int4_awq(W4A16), nvfp4(W4A4) is supported. You passed an unsupported precision: {precision}."
)

assert lm_head_precision in ["fp16"], (
f"Only fp16(unquantized) is supported for lm_head. You passed an unsupported precision: {lm_head_precision}."
)

if tokenizer.pad_token != "<unk>": # nosec B105
tokenizer.pad_token = tokenizer.eos_token
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
if not dataset_dir:
dataset_dir = "cnn_dailymail"

batch_size = 1
data_loader = get_dataset_dataloader(
dataset_name=dataset_dir, tokenizer=tokenizer, batch_size=batch_size, num_samples=calib_size
)
quant_config = get_quant_config(precision, lm_head_precision)
quantized_model = _quantize_model(model, quant_config, data_loader)
mtq.print_quant_summary(quantized_model)
return quantized_model
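
And a hedged end-to-end sketch for this file: FP8 calibration on the default cnn_dailymail dataset followed by ONNX export; the checkpoint path and output directory are illustrative assumptions rather than part of this PR:

from transformers import AutoTokenizer

from modelopt.onnx.llm_export_utils.export_utils import (
    ModelLoader,
    WrapperModelForCausalLM,
    llm_to_onnx,
)
from modelopt.onnx.llm_export_utils.quantization_utils import quantize

hf_dir = "my_llama_checkpoint"  # hypothetical checkpoint path
tokenizer = AutoTokenizer.from_pretrained(hf_dir)
model = ModelLoader(hf_dir, f"{hf_dir}/config.json").load_model()  # fp16 model on CUDA

quantized = quantize(model, tokenizer, precision="fp8", calib_size=512)  # calibrates on cnn_dailymail by default
llm_to_onnx(WrapperModelForCausalLM(quantized), "onnx_out_fp8")  # exports the quantized graph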