Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
94c1347
Initial autotune codebase
gcunhase Jan 24, 2026
7c00172
Add more tests
gcunhase Jan 26, 2026
bd85051
Refactor: PR #702
gcunhase Jan 26, 2026
ce35655
Remove python path in tests
gcunhase Jan 26, 2026
ab3ea21
Recover docstrings and simplify code (->, , )
gcunhase Jan 27, 2026
0d4166c
Added unittest for workflows.py (failing)
gcunhase Jan 27, 2026
5201fae
Fix: 'Autotuning failed: 'PatternSchemes' object has no attribute 'no…
gcunhase Jan 27, 2026
33ece8c
Updated workflow test to test TRT and PythonTRT benchmarking
gcunhase Jan 27, 2026
0e2ed76
Fix test: use_trtexec flag
gcunhase Jan 28, 2026
6a97bea
Add real scales to Q/DQ nodes
gcunhase Feb 19, 2026
7f93106
fix precommit failures
gcunhase Feb 24, 2026
b6c7ac3
Fix: Add->Q/DQ->Activation(Relu)
gcunhase Feb 24, 2026
29c209d
Fix: correctly dequantize Add input with shared Q/DQ
gcunhase Feb 24, 2026
ecf6148
[5916893] Fix weighted ops quantization logic: both input and weights…
gcunhase Feb 24, 2026
12b4d6b
Changed keep_output_dir to True as default
gcunhase Mar 2, 2026
30f8df8
test_workflow was moved to 'tests/gpu/onnx'
gcunhase Mar 2, 2026
c72862d
Removed cli.py, moved into __main__.py
gcunhase Mar 2, 2026
6ca4878
Removed PatternSchemes import from region_pattern.py: no longer needed.
gcunhase Mar 2, 2026
97aed99
Added intermediate Autotune model to be removed at the end of the qua…
gcunhase Mar 2, 2026
e422b85
Removed _MUTATION_SPECS from autotuner.py: moved to autotuner_base.py
gcunhase Mar 2, 2026
0dfffe0
Removed test_config and test_pattern_cache. Should be added in the or…
gcunhase Mar 2, 2026
2eea491
Fixed minor coderabbit suggestions
gcunhase Mar 2, 2026
4e788c5
Moved autotune imports to the top of the file
gcunhase Mar 2, 2026
8e5430c
Eliminate intermediate ONNX export in _find_nodes_to_quantize_autotun…
gcunhase Mar 3, 2026
be54609
Add support for Add->Q/DQ->Relu patterns by including those 'Add' nod…
gcunhase Mar 3, 2026
9d95481
Add integration test
gcunhase Mar 3, 2026
7d29c91
Remove 'keep_output_dir' arg (no longer needed due to tmp path)
gcunhase Mar 3, 2026
10d816e
Remove 'get_quantized_nodes' and other comments that are no longer ne…
gcunhase Mar 3, 2026
24029b2
Added docstring for 'default_dq_dtype' in workflows.py
gcunhase Mar 3, 2026
11480df
Added mode presets and additional autotune configurations
gcunhase Mar 5, 2026
d4f19c2
Fixed tmp_path in test
gcunhase Mar 5, 2026
5e413f5
Fixed copilot comments
gcunhase Mar 5, 2026
c573ccd
Fix: skip rewiring in graph_utils if no index is found. This prevents…
gcunhase Mar 5, 2026
b5eaf17
Match args for preset mode default
gcunhase Mar 5, 2026
631eaa0
Exposed _StoreWithExplicitFlag
gcunhase Mar 9, 2026
c2f1d05
Renamed new_ips to new_insertion_points
gcunhase Mar 9, 2026
e953ec5
Address coderabbit and copilot issues + other minor issues
gcunhase Mar 9, 2026
c23eb70
Address additional coderabbit and copilot issues
gcunhase Mar 9, 2026
b03a170
Added real scales test in the integration workflow
gcunhase Mar 9, 2026
45d375c
Address additional copilot issues: includes fix for op_types_to_quant…
gcunhase Mar 9, 2026
b5550f3
nit: added docstring and comment
gcunhase Mar 9, 2026
9ca4e96
Created autotune utils
gcunhase Mar 10, 2026
de2ef53
Added 'bf16' as an option in workflows
gcunhase Mar 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions modelopt/onnx/op_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,3 +386,25 @@ def get_symmetric_ops():
"BitwiseOr",
"BitwiseXor",
}


def get_activation_ops():
    """Returns set of activation operations."""
    # Keep this in sync with the activation ops recognized by the quantizer's
    # pattern matching (e.g. Add->Q/DQ->Activation fusions).
    activation_op_types = (
        "Relu",
        "LeakyRelu",
        "PRelu",
        "Elu",
        "Selu",
        "ThresholdedRelu",
        "Sigmoid",
        "Tanh",
        "HardSigmoid",
        "Softmax",
        "LogSoftmax",
        "Clip",
        "Softplus",
        "Softsign",
        "Swish",
        "HardSwish",
    )
    return set(activation_op_types)
145 changes: 145 additions & 0 deletions modelopt/onnx/quantization/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@

import numpy as np

from modelopt.onnx.quantization.autotune import (
MODE_PRESETS,
StoreWithExplicitFlag,
get_node_filter_list,
)
from modelopt.onnx.quantization.quantize import quantize

__all__ = ["main"]
Expand Down Expand Up @@ -295,9 +300,128 @@ def get_parser() -> argparse.ArgumentParser:
"if certain operations require a higher version."
),
)
argparser.add_argument(
"--autotune",
nargs="?",
const="default",
default=None,
choices=["quick", "default", "extensive"],
help=(
"If set, enable Autotune to detect optimal Q/DQ node placements according to TensorRT runtimes. "
"Available modes (presets 'schemes_per_region', 'warmup_runs', and 'timing_runs' values): "
" - 'quick': fewer schemes and benchmark runs for quick exploration; "
" - 'default': balanced, recommended for most cases; "
" - 'extensive': more schemes and runs for extensive search and thorough tuning. "
"Explicit --autotune_schemes_per_region/warmup_runs/timing_runs override the preset."
),
Comment thread
gcunhase marked this conversation as resolved.
)

autotune_group = argparser.add_argument_group(
"Autotune (only applicable when --autotune is set)"
)
autotune_group.add_argument(
"--autotune_output_dir",
type=str,
default=None,
help="Output directory for autotune results (state file, logs). Default: temp directory.",
)
autotune_group.add_argument(
"--autotune_schemes_per_region",
type=int,
default=MODE_PRESETS["default"]["schemes_per_region"],
help="Number of Q/DQ schemes to test per region.",
action=StoreWithExplicitFlag,
explicit_attr="_explicit_autotune_schemes_per_region",
)
autotune_group.add_argument(
"--autotune_pattern_cache",
type=str,
default=None,
dest="autotune_pattern_cache_file",
help="Path to pattern cache YAML for warm-start.",
)
autotune_group.add_argument(
"--autotune_qdq_baseline",
type=str,
default=None,
help="Path to a pre-quantized ONNX model to import Q/DQ patterns as warm-start.",
)
autotune_group.add_argument(
"--autotune_state_file",
type=str,
default=None,
help="State file path for crash recovery and resume capability (default: <output_dir>/autotuner_state.yaml).",
)
autotune_group.add_argument(
"--autotune_node_filter_list",
type=str,
default=None,
help=(
"Path to a file containing wildcard patterns to filter ONNX nodes (one pattern per line). "
"Regions without any matching nodes are skipped during autotuning."
),
)
autotune_group.add_argument(
"--autotune_verbose",
action="store_true",
help="Enable verbose logging in the autotuner.",
)
autotune_group.add_argument(
"--autotune_use_trtexec",
action="store_true",
Comment thread
gcunhase marked this conversation as resolved.
help="Use trtexec for benchmarking instead of the TensorRT Python API.",
)
autotune_group.add_argument(
"--autotune_timing_cache",
type=str,
default=None,
help="TensorRT timing cache file for faster engine builds.",
)
autotune_group.add_argument(
"--autotune_warmup_runs",
type=int,
default=MODE_PRESETS["default"]["warmup_runs"],
help="Number of warmup runs before timing.",
action=StoreWithExplicitFlag,
explicit_attr="_explicit_autotune_warmup_runs",
)
autotune_group.add_argument(
"--autotune_timing_runs",
type=int,
default=MODE_PRESETS["default"]["timing_runs"],
help="Number of timed runs for latency measurement.",
action=StoreWithExplicitFlag,
explicit_attr="_explicit_autotune_timing_runs",
)
autotune_group.add_argument(
"--autotune_trtexec_args",
type=str,
default=None,
help=(
"Additional trtexec arguments as a single quoted string. "
"Example: --autotune_trtexec_args '--fp16 --workspace=4096'"
),
)
return argparser


def apply_mode_presets(args) -> None:
    """Apply --autotune=mode preset to schemes_per_region, warmup_runs, timing_runs.

    Only applies preset for an option when that option was not explicitly set on the
    command line (explicit flags override the preset).
    """
    preset = MODE_PRESETS.get(args.autotune)
    if preset is None:
        # Not a recognized preset mode (e.g. --autotune was not given): nothing to apply.
        return
    # (namespace attribute, preset key, "explicitly set on CLI" sentinel attribute)
    preset_options = (
        ("autotune_schemes_per_region", "schemes_per_region", "_explicit_autotune_schemes_per_region"),
        ("autotune_warmup_runs", "warmup_runs", "_explicit_autotune_warmup_runs"),
        ("autotune_timing_runs", "timing_runs", "_explicit_autotune_timing_runs"),
    )
    for attr_name, preset_key, explicit_flag in preset_options:
        if not getattr(args, explicit_flag, False):
            setattr(args, attr_name, preset[preset_key])


def main():
"""Command-line entrypoint for ONNX PTQ."""
args = get_parser().parse_args()
Expand Down Expand Up @@ -331,6 +455,14 @@ def main():
else:
raise

# Autotune configs
autotune_enabled = args.autotune is not None
if autotune_enabled:
apply_mode_presets(args)
autotune_node_filter_list = (
get_node_filter_list(args.autotune_node_filter_list) if autotune_enabled else None
)

Comment thread
gcunhase marked this conversation as resolved.
quantize(
args.onnx_path,
quantize_mode=args.quantize_mode,
Expand Down Expand Up @@ -362,6 +494,19 @@ def main():
calibrate_per_node=args.calibrate_per_node,
direct_io_types=args.direct_io_types,
opset=args.opset,
autotune=autotune_enabled,
autotune_output_dir=args.autotune_output_dir,
autotune_num_schemes_per_region=args.autotune_schemes_per_region,
autotune_pattern_cache_file=args.autotune_pattern_cache_file,
autotune_state_file=args.autotune_state_file,
autotune_qdq_baseline=args.autotune_qdq_baseline,
autotune_node_filter_list=autotune_node_filter_list,
autotune_verbose=args.autotune_verbose,
autotune_use_trtexec=args.autotune_use_trtexec,
autotune_timing_cache=args.autotune_timing_cache,
autotune_warmup_runs=args.autotune_warmup_runs,
autotune_timing_runs=args.autotune_timing_runs,
autotune_trtexec_args=args.autotune_trtexec_args,
)


Expand Down
7 changes: 7 additions & 0 deletions modelopt/onnx/quantization/autotune/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
region analysis to efficiently explore and optimize Q/DQ insertion strategies.
"""

# Expose Autotune modes
from .__main__ import MODE_PRESETS

# Core data structures
from .autotuner import QDQAutotuner
from .benchmark import TensorRTPyBenchmark, TrtExecBenchmark
Expand All @@ -42,8 +45,10 @@
)
from .region_pattern import RegionPattern
from .region_search import CombinedRegionSearch
from .utils import StoreWithExplicitFlag, get_node_filter_list

__all__ = [
"MODE_PRESETS",
"AutotunerError",
"AutotunerNotInitializedError",
"ChildRegionInputInsertionPoint",
Expand All @@ -60,6 +65,8 @@
"RegionPattern",
"RegionType",
"ResolvedInsertionPoint",
"StoreWithExplicitFlag",
"TensorRTPyBenchmark",
"TrtExecBenchmark",
"get_node_filter_list",
]
62 changes: 10 additions & 52 deletions modelopt/onnx/quantization/autotune/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@
from pathlib import Path

from modelopt.onnx.logging_config import logger
from modelopt.onnx.quantization.autotune.utils import (
StoreWithExplicitFlag,
get_node_filter_list,
validate_file_path,
)
from modelopt.onnx.quantization.autotune.workflows import (
init_benchmark_instance,
region_pattern_autotuning_workflow,
Expand All @@ -44,18 +49,6 @@
}


class _StoreWithExplicitFlag(argparse.Action):
"""Store the value and set an 'explicit' flag on the namespace so mode presets do not override."""

def __init__(self, explicit_attr: str, *args, **kwargs):
self._explicit_attr = explicit_attr
super().__init__(*args, **kwargs)

def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, values)
setattr(namespace, self._explicit_attr, True)


def apply_mode_presets(args) -> None:
"""Apply --mode preset to schemes_per_region, warmup_runs, timing_runs.

Expand All @@ -73,30 +66,6 @@ def apply_mode_presets(args) -> None:
args.timing_runs = preset["timing_runs"]


def validate_file_path(path: str | None, description: str) -> Path | None:
"""Validate that a file path exists.

Args:
path: Path string to validate (can be None)
description: Description of the file for error messages

Returns:
Path object if valid, None if path is None

Raises:
SystemExit: If path is provided but doesn't exist
"""
if path is None:
return None

path_obj = Path(path)
if not path_obj.exists():
logger.error(f"{description} not found: {path_obj}")
sys.exit(1)

return path_obj


def log_benchmark_config(args):
"""Log TensorRT benchmark configuration for transparency.

Expand Down Expand Up @@ -155,20 +124,9 @@ def run_autotune() -> int:
return 1

try:
node_filter_list = None
if args.node_filter_list:
filter_file = validate_file_path(args.node_filter_list, "Node filter list file")
if filter_file:
with open(filter_file) as f:
node_filter_list = [
line.strip()
for line in f
if line.strip() and not line.strip().startswith("#")
]
logger.info(f"Loaded {len(node_filter_list)} filter patterns from {filter_file}")

node_filter_list = get_node_filter_list(args.node_filter_list)
region_pattern_autotuning_workflow(
model_path=str(model_path),
model_or_path=str(model_path),
output_dir=output_dir,
num_schemes_per_region=args.num_schemes,
pattern_cache_file=args.pattern_cache_file,
Expand Down Expand Up @@ -265,7 +223,7 @@ def get_parser() -> argparse.ArgumentParser:
type=int,
default=DEFAULT_NUM_SCHEMES,
dest="num_schemes",
action=_StoreWithExplicitFlag,
action=StoreWithExplicitFlag,
explicit_attr="_explicit_num_schemes",
help=f"Schemes per region (default: {DEFAULT_NUM_SCHEMES}; preset from --mode if not set)",
)
Expand Down Expand Up @@ -331,15 +289,15 @@ def get_parser() -> argparse.ArgumentParser:
"--warmup_runs",
type=int,
default=DEFAULT_WARMUP_RUNS,
action=_StoreWithExplicitFlag,
action=StoreWithExplicitFlag,
explicit_attr="_explicit_warmup_runs",
help=f"Number of warmup runs (default: {DEFAULT_WARMUP_RUNS}; preset from --mode applies if not set)",
)
trt_group.add_argument(
"--timing_runs",
type=int,
default=DEFAULT_TIMING_RUNS,
action=_StoreWithExplicitFlag,
action=StoreWithExplicitFlag,
explicit_attr="_explicit_timing_runs",
help=f"Number of timing runs (default: {DEFAULT_TIMING_RUNS}; preset from --mode applies if not set)",
)
Expand Down
Loading
Loading