From 06ce30b05ca39cd2de0c79cfb1fc5f8b1e394681 Mon Sep 17 00:00:00 2001
From: Will Guo <willg@nvidia.com>
Date: Thu, 5 Mar 2026 02:57:14 +0000
Subject: [PATCH 1/2] Fix benchmark pinned host allocation failure for float16/bfloat16

Signed-off-by: Will Guo <willg@nvidia.com>
---
 modelopt/onnx/quantization/autotune/benchmark.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py
index 700cb97eb5..f627caf237 100644
--- a/modelopt/onnx/quantization/autotune/benchmark.py
+++ b/modelopt/onnx/quantization/autotune/benchmark.py
@@ -534,13 +534,22 @@ def _alloc_pinned_host(size: int, dtype: np.dtype) -> tuple[Any, np.ndarray, Any
             (host_ptr, arr, err): On success err is cudaSuccess; on failure host_ptr/arr
             may be None and err is the CUDA error code.
         """
-        nbytes = size * np.dtype(dtype).itemsize
+        dtype = np.dtype(dtype)
+        nbytes = size * dtype.itemsize
         err, host_ptr = cudart.cudaMallocHost(nbytes)
         if err != cudart.cudaError_t.cudaSuccess:
             return (None, None, err)
         addr = int(host_ptr) if hasattr(host_ptr, "__int__") else host_ptr
-        ctype = np.ctypeslib.as_ctypes_type(dtype)
-        arr = np.ctypeslib.as_array((ctype * size).from_address(addr))
+        try:
+            ctype = np.ctypeslib.as_ctypes_type(dtype)
+            arr = np.ctypeslib.as_array((ctype * size).from_address(addr))
+        except NotImplementedError:
+            # float16/bfloat16 have no ctypes equivalent; use same-size type and view
+            if dtype.itemsize == 2:
+                ctype = ctypes.c_uint16
+            else:
+                raise
+            arr = np.ctypeslib.as_array((ctype * size).from_address(addr)).view(dtype)
         return (host_ptr, arr, cudart.cudaError_t.cudaSuccess)
 
     @staticmethod

From 7345b99101dfd6a600a4a0808e29e9370005df0d Mon Sep 17 00:00:00 2001
From: Will Guo <willg@nvidia.com>
Date: Fri, 6 Mar 2026 06:18:30 +0000
Subject: [PATCH 2/2] Resolve review comments: raise TypeError for unsupported pinned host dtypes

Signed-off-by: Will Guo <willg@nvidia.com>
---
 modelopt/onnx/quantization/autotune/benchmark.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py
index f627caf237..7613a119af 100644
--- a/modelopt/onnx/quantization/autotune/benchmark.py
+++ b/modelopt/onnx/quantization/autotune/benchmark.py
@@ -543,12 +543,15 @@ def _alloc_pinned_host(size: int, dtype: np.dtype) -> tuple[Any, np.ndarray, Any
         try:
             ctype = np.ctypeslib.as_ctypes_type(dtype)
             arr = np.ctypeslib.as_array((ctype * size).from_address(addr))
-        except NotImplementedError:
+        except NotImplementedError as e:
             # float16/bfloat16 have no ctypes equivalent; use same-size type and view
             if dtype.itemsize == 2:
                 ctype = ctypes.c_uint16
             else:
-                raise
+                raise TypeError(
+                    f"Pinned host allocation for dtype {dtype} is not supported: "
+                    "no ctypes mapping and no fallback for this itemsize"
+                ) from e
             arr = np.ctypeslib.as_array((ctype * size).from_address(addr)).view(dtype)
         return (host_ptr, arr, cudart.cudaError_t.cudaSuccess)