From 06ce30b05ca39cd2de0c79cfb1fc5f8b1e394681 Mon Sep 17 00:00:00 2001 From: Will Guo Date: Thu, 5 Mar 2026 02:57:14 +0000 Subject: [PATCH 1/2] Fix benchmark allocation failure Signed-off-by: Will Guo --- modelopt/onnx/quantization/autotune/benchmark.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py index 700cb97eb5..f627caf237 100644 --- a/modelopt/onnx/quantization/autotune/benchmark.py +++ b/modelopt/onnx/quantization/autotune/benchmark.py @@ -534,13 +534,22 @@ def _alloc_pinned_host(size: int, dtype: np.dtype) -> tuple[Any, np.ndarray, Any (host_ptr, arr, err): On success err is cudaSuccess; on failure host_ptr/arr may be None and err is the CUDA error code. """ - nbytes = size * np.dtype(dtype).itemsize + dtype = np.dtype(dtype) + nbytes = size * dtype.itemsize err, host_ptr = cudart.cudaMallocHost(nbytes) if err != cudart.cudaError_t.cudaSuccess: return (None, None, err) addr = int(host_ptr) if hasattr(host_ptr, "__int__") else host_ptr - ctype = np.ctypeslib.as_ctypes_type(dtype) - arr = np.ctypeslib.as_array((ctype * size).from_address(addr)) + try: + ctype = np.ctypeslib.as_ctypes_type(dtype) + arr = np.ctypeslib.as_array((ctype * size).from_address(addr)) + except NotImplementedError: + # float16/bfloat16 have no ctypes equivalent; use same-size type and view + if dtype.itemsize == 2: + ctype = ctypes.c_uint16 + else: + raise + arr = np.ctypeslib.as_array((ctype * size).from_address(addr)).view(dtype) return (host_ptr, arr, cudart.cudaError_t.cudaSuccess) @staticmethod From 7345b99101dfd6a600a4a0808e29e9370005df0d Mon Sep 17 00:00:00 2001 From: Will Guo Date: Fri, 6 Mar 2026 06:18:30 +0000 Subject: [PATCH 2/2] resolve comments Signed-off-by: Will Guo --- modelopt/onnx/quantization/autotune/benchmark.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py index f627caf237..7613a119af 100644 --- a/modelopt/onnx/quantization/autotune/benchmark.py +++ b/modelopt/onnx/quantization/autotune/benchmark.py @@ -543,12 +543,15 @@ def _alloc_pinned_host(size: int, dtype: np.dtype) -> tuple[Any, np.ndarray, Any try: ctype = np.ctypeslib.as_ctypes_type(dtype) arr = np.ctypeslib.as_array((ctype * size).from_address(addr)) - except NotImplementedError: + except NotImplementedError as e: # float16/bfloat16 have no ctypes equivalent; use same-size type and view if dtype.itemsize == 2: ctype = ctypes.c_uint16 else: - raise + raise TypeError( + f"Pinned host allocation for dtype {dtype} is not supported: " + "no ctypes mapping and no fallback for this itemsize" + ) from e arr = np.ctypeslib.as_array((ctype * size).from_address(addr)).view(dtype) return (host_ptr, arr, cudart.cudaError_t.cudaSuccess)