Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
94c1347
Initial autotune codebase
gcunhase Jan 24, 2026
7c00172
Add more tests
gcunhase Jan 26, 2026
bd85051
Refactor: PR #702
gcunhase Jan 26, 2026
ce35655
Remove python path in tests
gcunhase Jan 26, 2026
ab3ea21
Recover docstrings and simplify code (->, , )
gcunhase Jan 27, 2026
0d4166c
Added unittest for workflows.py (failing)
gcunhase Jan 27, 2026
5201fae
Fix: 'Autotuning failed: 'PatternSchemes' object has no attribute 'no…
gcunhase Jan 27, 2026
33ece8c
Updated workflow test to test TRT and PythonTRT benchmarking
gcunhase Jan 27, 2026
0e2ed76
Fix test: use_trtexec flag
gcunhase Jan 28, 2026
6a97bea
Add real scales to Q/DQ nodes
gcunhase Feb 19, 2026
7f93106
fix precommit failures
gcunhase Feb 24, 2026
b6c7ac3
Fix: Add->Q/DQ->Activation(Relu)
gcunhase Feb 24, 2026
29c209d
Fix: correctly dequantize Add input with shared Q/DQ
gcunhase Feb 24, 2026
ecf6148
[5916893] Fix weighted ops quantization logic: both input and weights…
gcunhase Feb 24, 2026
12b4d6b
Changed keep_output_dir to True as default
gcunhase Mar 2, 2026
30f8df8
test_workflow was moved to 'tests/gpu/onnx'
gcunhase Mar 2, 2026
c72862d
Removed cli.py, moved into __main__.py
gcunhase Mar 2, 2026
6ca4878
Removed PatternSchemes import from region_pattern.py: no longer needed.
gcunhase Mar 2, 2026
97aed99
Added intermediate Autotune model to be removed at the end of the qua…
gcunhase Mar 2, 2026
e422b85
Removed _MUTATION_SPECS from autotuner.py: moved to autotuner_base.py
gcunhase Mar 2, 2026
0dfffe0
Removed test_config and test_pattern_cache. Should be added in the or…
gcunhase Mar 2, 2026
2eea491
Fixed minor coderabbit suggestions
gcunhase Mar 2, 2026
4e788c5
Moved autotune imports to the top of the file
gcunhase Mar 2, 2026
8e5430c
Eliminate intermediate ONNX export in _find_nodes_to_quantize_autotun…
gcunhase Mar 3, 2026
be54609
Add support for Add->Q/DQ->Relu patterns by including those 'Add' nod…
gcunhase Mar 3, 2026
9d95481
Add integration test
gcunhase Mar 3, 2026
7d29c91
Remove 'keep_output_dir' arg (no longer needed due to tmp path)
gcunhase Mar 3, 2026
10d816e
Remove 'get_quantized_nodes' and other comments that are no longer ne…
gcunhase Mar 3, 2026
24029b2
Added docstring for 'default_dq_dtype' in workflows.py
gcunhase Mar 3, 2026
11480df
Added mode presets and additional autotune configurations
gcunhase Mar 5, 2026
d4f19c2
Fixed tmp_path in test
gcunhase Mar 5, 2026
5e413f5
Fixed copilot comments
gcunhase Mar 5, 2026
c573ccd
Fix: skip rewiring in graph_utils if no index is found. This prevents…
gcunhase Mar 5, 2026
b5eaf17
Match args for preset mode default
gcunhase Mar 5, 2026
631eaa0
Exposed _StoreWithExplicitFlag
gcunhase Mar 9, 2026
c2f1d05
Renamed new_ips to new_insertion_points
gcunhase Mar 9, 2026
e953ec5
Address coderabbit and copilot issues + other minor issues
gcunhase Mar 9, 2026
c23eb70
Address additional coderabbit and copilot issues
gcunhase Mar 9, 2026
b03a170
Added real scales test in the integration workflow
gcunhase Mar 9, 2026
45d375c
Address additional copilot issues: includes fix for op_types_to_quant…
gcunhase Mar 9, 2026
b5550f3
nit: added docstring and comment
gcunhase Mar 9, 2026
9ca4e96
Created autotune utils
gcunhase Mar 10, 2026
de2ef53
Added 'bf16' as an option in workflows
gcunhase Mar 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions modelopt/onnx/op_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,3 +386,25 @@ def get_symmetric_ops():
"BitwiseOr",
"BitwiseXor",
}


def get_activation_ops():
    """Returns set of activation operations."""
    # Keep this in sync with the activation ops recognized by the quantizer's
    # pattern matching (e.g. Add->Q/DQ->Activation fusions).
    activation_op_types = (
        "Relu",
        "LeakyRelu",
        "PRelu",
        "Elu",
        "Selu",
        "ThresholdedRelu",
        "Sigmoid",
        "Tanh",
        "HardSigmoid",
        "Softmax",
        "LogSoftmax",
        "Clip",
        "Softplus",
        "Softsign",
        "Swish",
        "HardSwish",
    )
    return set(activation_op_types)
145 changes: 145 additions & 0 deletions modelopt/onnx/quantization/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@

import numpy as np

from modelopt.onnx.quantization.autotune import (
MODE_PRESETS,
StoreWithExplicitFlag,
get_node_filter_list,
)
from modelopt.onnx.quantization.quantize import quantize

__all__ = ["main"]
Expand Down Expand Up @@ -295,9 +300,128 @@ def get_parser() -> argparse.ArgumentParser:
"if certain operations require a higher version."
),
)
argparser.add_argument(
"--autotune",
nargs="?",
const="default",
default=None,
choices=["quick", "default", "extensive"],
help=(
"If set, enable Autotune to detect optimal Q/DQ node placements according to TensorRT runtimes. "
"Available modes (presets 'schemes_per_region', 'warmup_runs', and 'timing_runs' values): "
" - 'quick': fewer schemes and benchmark runs for quick exploration; "
" - 'default': balanced, recommended for most cases; "
" - 'extensive': more schemes and runs for extensive search and thorough tuning. "
"Explicit --autotune_schemes_per_region/warmup_runs/timing_runs override the preset."
),
Comment thread
gcunhase marked this conversation as resolved.
)

autotune_group = argparser.add_argument_group(
"Autotune (only applicable when --autotune is set)"
)
autotune_group.add_argument(
"--autotune_output_dir",
type=str,
default=None,
help="Output directory for autotune results (state file, logs). Default: temp directory.",
)
autotune_group.add_argument(
"--autotune_schemes_per_region",
type=int,
default=MODE_PRESETS["default"]["schemes_per_region"],
help="Number of Q/DQ schemes to test per region.",
action=StoreWithExplicitFlag,
explicit_attr="_explicit_autotune_schemes_per_region",
)
autotune_group.add_argument(
"--autotune_pattern_cache",
type=str,
default=None,
dest="autotune_pattern_cache_file",
help="Path to pattern cache YAML for warm-start.",
)
autotune_group.add_argument(
"--autotune_qdq_baseline",
type=str,
default=None,
help="Path to a pre-quantized ONNX model to import Q/DQ patterns as warm-start.",
)
autotune_group.add_argument(
"--autotune_state_file",
type=str,
default=None,
help="State file path for crash recovery and resume capability (default: <output_dir>/autotuner_state.yaml).",
)
autotune_group.add_argument(
"--autotune_node_filter_list",
type=str,
default=None,
help=(
"Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). "
"Regions without any matching nodes are skipped during autotuning."
),
)
autotune_group.add_argument(
"--autotune_verbose",
action="store_true",
help="Enable verbose logging in the autotuner.",
)
autotune_group.add_argument(
"--autotune_use_trtexec",
action="store_true",
Comment thread
gcunhase marked this conversation as resolved.
help="Use trtexec for benchmarking instead of the TensorRT Python API.",
)
autotune_group.add_argument(
"--autotune_timing_cache",
type=str,
default=None,
help="TensorRT timing cache file for faster engine builds.",
)
autotune_group.add_argument(
"--autotune_warmup_runs",
type=int,
default=MODE_PRESETS["default"]["warmup_runs"],
help="Number of warmup runs before timing.",
action=StoreWithExplicitFlag,
explicit_attr="_explicit_autotune_warmup_runs",
)
autotune_group.add_argument(
"--autotune_timing_runs",
type=int,
default=MODE_PRESETS["default"]["timing_runs"],
help="Number of timed runs for latency measurement.",
action=StoreWithExplicitFlag,
explicit_attr="_explicit_autotune_timing_runs",
)
autotune_group.add_argument(
"--autotune_trtexec_args",
type=str,
default=None,
help=(
"Additional trtexec arguments as a single quoted string. "
"Example: --autotune_trtexec_args '--fp16 --workspace=4096'"
),
)
return argparser


def apply_mode_presets(args) -> None:
    """Apply --autotune=mode preset to schemes_per_region, warmup_runs, timing_runs.

    Only applies preset for an option when that option was not explicitly set on the
    command line (explicit flags override the preset).
    """
    preset = MODE_PRESETS.get(args.autotune)
    if preset is None:
        # Not a recognized preset mode (e.g. --autotune was not given): nothing to apply.
        return
    # (namespace attribute, preset key, "explicitly set on CLI" sentinel attribute)
    preset_options = (
        ("autotune_schemes_per_region", "schemes_per_region", "_explicit_autotune_schemes_per_region"),
        ("autotune_warmup_runs", "warmup_runs", "_explicit_autotune_warmup_runs"),
        ("autotune_timing_runs", "timing_runs", "_explicit_autotune_timing_runs"),
    )
    for attr_name, preset_key, explicit_flag in preset_options:
        if not getattr(args, explicit_flag, False):
            setattr(args, attr_name, preset[preset_key])


def main():
"""Command-line entrypoint for ONNX PTQ."""
args = get_parser().parse_args()
Expand Down Expand Up @@ -331,6 +455,14 @@ def main():
else:
raise

# Autotune configs
autotune_enabled = args.autotune is not None
if autotune_enabled:
apply_mode_presets(args)
autotune_node_filter_list = (
get_node_filter_list(args.autotune_node_filter_list) if autotune_enabled else None
)

Comment thread
gcunhase marked this conversation as resolved.
quantize(
args.onnx_path,
quantize_mode=args.quantize_mode,
Expand Down Expand Up @@ -362,6 +494,19 @@ def main():
calibrate_per_node=args.calibrate_per_node,
direct_io_types=args.direct_io_types,
opset=args.opset,
autotune=autotune_enabled,
autotune_output_dir=args.autotune_output_dir,
autotune_num_schemes_per_region=args.autotune_schemes_per_region,
autotune_pattern_cache_file=args.autotune_pattern_cache_file,
autotune_state_file=args.autotune_state_file,
autotune_qdq_baseline=args.autotune_qdq_baseline,
autotune_node_filter_list=autotune_node_filter_list,
autotune_verbose=args.autotune_verbose,
autotune_use_trtexec=args.autotune_use_trtexec,
autotune_timing_cache=args.autotune_timing_cache,
autotune_warmup_runs=args.autotune_warmup_runs,
autotune_timing_runs=args.autotune_timing_runs,
autotune_trtexec_args=args.autotune_trtexec_args,
)


Expand Down
7 changes: 7 additions & 0 deletions modelopt/onnx/quantization/autotune/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
region analysis to efficiently explore and optimize Q/DQ insertion strategies.
"""

# Expose Autotune modes
from .__main__ import MODE_PRESETS

# Core data structures
from .autotuner import QDQAutotuner
from .benchmark import TensorRTPyBenchmark, TrtExecBenchmark
Expand All @@ -42,8 +45,10 @@
)
from .region_pattern import RegionPattern
from .region_search import CombinedRegionSearch
from .utils import StoreWithExplicitFlag, get_node_filter_list

__all__ = [
"MODE_PRESETS",
"AutotunerError",
"AutotunerNotInitializedError",
"ChildRegionInputInsertionPoint",
Expand All @@ -60,6 +65,8 @@
"RegionPattern",
"RegionType",
"ResolvedInsertionPoint",
"StoreWithExplicitFlag",
"TensorRTPyBenchmark",
"TrtExecBenchmark",
"get_node_filter_list",
]
62 changes: 10 additions & 52 deletions modelopt/onnx/quantization/autotune/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
from pathlib import Path

from modelopt.onnx.logging_config import logger
from modelopt.onnx.quantization.autotune.utils import (
StoreWithExplicitFlag,
get_node_filter_list,
validate_file_path,
)
from modelopt.onnx.quantization.autotune.workflows import (
init_benchmark_instance,
region_pattern_autotuning_workflow,
Expand All @@ -44,18 +49,6 @@
}


class _StoreWithExplicitFlag(argparse.Action):
"""Store the value and set an 'explicit' flag on the namespace so mode presets do not override."""

def __init__(self, explicit_attr: str, *args, **kwargs):
self._explicit_attr = explicit_attr
super().__init__(*args, **kwargs)

def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, values)
setattr(namespace, self._explicit_attr, True)


def apply_mode_presets(args) -> None:
"""Apply --mode preset to schemes_per_region, warmup_runs, timing_runs.

Expand All @@ -73,30 +66,6 @@ def apply_mode_presets(args) -> None:
args.timing_runs = preset["timing_runs"]


def validate_file_path(path: str | None, description: str) -> Path | None:
"""Validate that a file path exists.

Args:
path: Path string to validate (can be None)
description: Description of the file for error messages

Returns:
Path object if valid, None if path is None

Raises:
SystemExit: If path is provided but doesn't exist
"""
if path is None:
return None

path_obj = Path(path)
if not path_obj.exists():
logger.error(f"{description} not found: {path_obj}")
sys.exit(1)

return path_obj


def log_benchmark_config(args):
"""Log TensorRT benchmark configuration for transparency.

Expand Down Expand Up @@ -155,20 +124,9 @@ def run_autotune() -> int:
return 1

try:
node_filter_list = None
if args.node_filter_list:
filter_file = validate_file_path(args.node_filter_list, "Node filter list file")
if filter_file:
with open(filter_file) as f:
node_filter_list = [
line.strip()
for line in f
if line.strip() and not line.strip().startswith("#")
]
logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}")

node_filter_list = get_node_filter_list(args.node_filter_list)
region_pattern_autotuning_workflow(
model_path=str(model_path),
model_or_path=str(model_path),
output_dir=output_dir,
num_schemes_per_region=args.num_schemes,
pattern_cache_file=args.pattern_cache_file,
Expand Down Expand Up @@ -265,7 +223,7 @@ def get_parser() -> argparse.ArgumentParser:
type=int,
default=DEFAULT_NUM_SCHEMES,
dest="num_schemes",
action=_StoreWithExplicitFlag,
action=StoreWithExplicitFlag,
explicit_attr="_explicit_num_schemes",
help=f"Schemes per region (default: {DEFAULT_NUM_SCHEMES}; preset from --mode if not set)",
)
Expand Down Expand Up @@ -331,15 +289,15 @@ def get_parser() -> argparse.ArgumentParser:
"--warmup_runs",
type=int,
default=DEFAULT_WARMUP_RUNS,
action=_StoreWithExplicitFlag,
action=StoreWithExplicitFlag,
explicit_attr="_explicit_warmup_runs",
help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS}; preset from --mode applies if not set)",
)
trt_group.add_argument(
"--timing_runs",
type=int,
default=DEFAULT_TIMING_RUNS,
action=_StoreWithExplicitFlag,
action=StoreWithExplicitFlag,
explicit_attr="_explicit_timing_runs",
help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS}; preset from --mode applies if not set)",
)
Expand Down
Loading
Loading