From a84c403328a0997b5d86c34292802d3b779d73da Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 9 Jan 2026 10:20:07 -0500 Subject: [PATCH] cuda.bindings._nvml changes for cuda.core.system work --- cuda_bindings/cuda/bindings/_nvml.pxd | 5 - cuda_bindings/cuda/bindings/_nvml.pyx | 338 +++++++++++++++++------- cuda_bindings/tests/nvml/test_nvlink.py | 4 +- 3 files changed, 251 insertions(+), 96 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_nvml.pxd b/cuda_bindings/cuda/bindings/_nvml.pxd index d08b087b38..ddf9ab2b28 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pxd +++ b/cuda_bindings/cuda/bindings/_nvml.pxd @@ -34,7 +34,6 @@ ctypedef nvmlViolationTime_t ViolationTime ctypedef nvmlUUIDValue_t UUIDValue ctypedef nvmlVgpuPlacementList_v1_t VgpuPlacementList_v1 ctypedef nvmlNvLinkPowerThres_t NvLinkPowerThres -ctypedef nvmlSystemEventData_v1_t SystemEventData_v1 ctypedef nvmlGpuInstanceProfileInfo_t GpuInstanceProfileInfo ctypedef nvmlComputeInstanceProfileInfo_t ComputeInstanceProfileInfo ctypedef nvmlMask255_t Mask255 @@ -329,10 +328,6 @@ cpdef device_register_events(intptr_t device, unsigned long long event_types, in cpdef unsigned long long device_get_supported_event_types(intptr_t device) except? 0 cpdef object event_set_wait_v2(intptr_t set, unsigned int timeoutms) cpdef event_set_free(intptr_t set) -cpdef system_event_set_create(intptr_t request) -cpdef system_event_set_free(intptr_t request) -cpdef system_register_events(intptr_t request) -cpdef system_event_set_wait(intptr_t request) cpdef device_modify_drain_state(intptr_t pci_info, int new_state) cpdef int device_query_drain_state(intptr_t pci_info) except? -1 cpdef device_remove_gpu_v2(intptr_t pci_info, int gpu_state, int link_state) diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index d9bddcc4bc..524a7e3d0f 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -787,7 +787,7 @@ class AffinityScope(_IntEnum): SOCKET = 1 # Scope of processor socket for affinity queries -class FI(_IntEnum): +class FieldId(_IntEnum): DEV_ECC_CURRENT = 1 # Current ECC mode. 1=Active. 0=Inactive DEV_ECC_PENDING = 2 # Pending ECC mode. 1=Active. 
0=Inactive # ECC Count Totals @@ -1778,7 +1778,7 @@ cdef _get_pci_info_ext_v1_dtype_offsets(): cdef nvmlPciInfoExt_v1_t pod = nvmlPciInfoExt_v1_t() return _numpy.dtype({ 'names': ['version', 'domain', 'bus', 'device_', 'pci_device_id', 'pci_sub_system_id', 'base_class', 'sub_class', 'bus_id'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, (_numpy.int8, 32)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.domain)) - (&pod), @@ -1998,7 +1998,7 @@ cdef _get_pci_info_dtype_offsets(): cdef nvmlPciInfo_t pod = nvmlPciInfo_t() return _numpy.dtype({ 'names': ['bus_id_legacy', 'domain', 'bus', 'device_', 'pci_device_id', 'pci_sub_system_id', 'bus_id'], - 'formats': [_numpy.int8, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.int8], + 'formats': [(_numpy.int8, 16), _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, (_numpy.int8, 32)], 'offsets': [ (&(pod.busIdLegacy)) - (&pod), (&(pod.domain)) - (&pod), @@ -4901,7 +4901,7 @@ cdef _get_device_perf_modes_v1_dtype_offsets(): cdef nvmlDevicePerfModes_v1_t pod = nvmlDevicePerfModes_v1_t() return _numpy.dtype({ 'names': ['version', 'str'], - 'formats': [_numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, (_numpy.int8, 2048)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.str)) - (&pod), @@ -5037,7 +5037,7 @@ cdef _get_device_current_clock_freqs_v1_dtype_offsets(): cdef nvmlDeviceCurrentClockFreqs_v1_t pod = nvmlDeviceCurrentClockFreqs_v1_t() return _numpy.dtype({ 'names': ['version', 'str'], - 'formats': [_numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, (_numpy.int8, 2048)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.str)) - (&pod), @@ -5849,7 +5849,7 @@ cdef _get_platform_info_v1_dtype_offsets(): cdef nvmlPlatformInfo_v1_t pod = nvmlPlatformInfo_v1_t() return _numpy.dtype({ 'names': ['version', 'ib_guid', 'rack_guid', 'chassis_physical_slot_number', 'compute_slot_ind_ex', 'node_ind_ex', 'peer_type', 'module_id'], - 'formats': [_numpy.uint32, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8], + 'formats': [_numpy.uint32, (_numpy.uint8, 16), (_numpy.uint8, 16), _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.ibGuid)) - (&pod), @@ -6061,7 +6061,7 @@ cdef _get_platform_info_v2_dtype_offsets(): cdef nvmlPlatformInfo_v2_t pod = nvmlPlatformInfo_v2_t() return _numpy.dtype({ 'names': ['version', 'ib_guid', 'chassis_serial_number', 'slot_number', 'tray_ind_ex', 'host_id', 'peer_type', 'module_id'], - 'formats': [_numpy.uint32, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8], + 'formats': [_numpy.uint32, (_numpy.uint8, 16), (_numpy.uint8, 16), _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8, _numpy.uint8], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.ibGuid)) - (&pod), @@ -6993,7 +6993,7 @@ cdef _get_vgpu_process_utilization_info_v1_dtype_offsets(): cdef nvmlVgpuProcessUtilizationInfo_v1_t pod = nvmlVgpuProcessUtilizationInfo_v1_t() return _numpy.dtype({ 'names': ['process_name', 'time_stamp', 'vgpu_instance', 'pid', 'sm_util', 'mem_util', 'enc_util', 'dec_util', 'jpg_util', 'ofa_util'], - 'formats': [_numpy.int8, _numpy.uint64, _numpy.uint32, _numpy.uint32, _numpy.uint32, 
_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32], + 'formats': [(_numpy.int8, 64), _numpy.uint64, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32], 'offsets': [ (&(pod.processName)) - (&pod), (&(pod.timeStamp)) - (&pod), @@ -8063,7 +8063,7 @@ cdef _get_vgpu_scheduler_capabilities_dtype_offsets(): cdef nvmlVgpuSchedulerCapabilities_t pod = nvmlVgpuSchedulerCapabilities_t() return _numpy.dtype({ 'names': ['supported_schedulers', 'max_timeslice', 'min_timeslice', 'is_arr_mode_supported', 'max_frequency_for_arr', 'min_frequency_for_arr', 'max_avg_factor_for_arr', 'min_avg_factor_for_arr'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32], + 'formats': [(_numpy.uint32, 3), _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32], 'offsets': [ (&(pod.supportedSchedulers)) - (&pod), (&(pod.maxTimeslice)) - (&pod), @@ -9233,7 +9233,7 @@ cdef _get_hwbc_entry_dtype_offsets(): cdef nvmlHwbcEntry_t pod = nvmlHwbcEntry_t() return _numpy.dtype({ 'names': ['hwbc_id', 'firmware_version'], - 'formats': [_numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, (_numpy.int8, 32)], 'offsets': [ (&(pod.hwbcId)) - (&pod), (&(pod.firmwareVersion)) - (&pod), @@ -9377,7 +9377,7 @@ cdef _get_led_state_dtype_offsets(): cdef nvmlLedState_t pod = nvmlLedState_t() return _numpy.dtype({ 'names': ['cause', 'color'], - 'formats': [_numpy.int8, _numpy.int32], + 'formats': [(_numpy.int8, 256), _numpy.int32], 'offsets': [ (&(pod.cause)) - (&pod), (&(pod.color)) - (&pod), @@ -9513,7 +9513,7 @@ cdef _get_unit_info_dtype_offsets(): cdef nvmlUnitInfo_t pod = nvmlUnitInfo_t() return _numpy.dtype({ 'names': ['name', 'id', 'serial', 'firmware_version'], - 'formats': [_numpy.int8, _numpy.int8, _numpy.int8, _numpy.int8], + 'formats': [(_numpy.int8, 96), (_numpy.int8, 96), (_numpy.int8, 96), (_numpy.int8, 96)], 'offsets': [ (&(pod.name)) - (&pod), (&(pod.id)) - (&pod), @@ -9685,7 +9685,7 @@ cdef _get_psu_info_dtype_offsets(): cdef nvmlPSUInfo_t pod = nvmlPSUInfo_t() return _numpy.dtype({ 'names': ['state', 'current', 'voltage', 'power'], - 'formats': [_numpy.int8, _numpy.uint32, _numpy.uint32, _numpy.uint32], + 'formats': [(_numpy.int8, 256), _numpy.uint32, _numpy.uint32, _numpy.uint32], 'offsets': [ (&(pod.state)) - (&pod), (&(pod.current)) - (&pod), @@ -10155,11 +10155,157 @@ cdef class EventData: return obj +cdef _get_system_event_data_v1_dtype_offsets(): + cdef nvmlSystemEventData_v1_t pod = nvmlSystemEventData_v1_t() + return _numpy.dtype({ + 'names': ['event_type', 'gpu_id'], + 'formats': [_numpy.uint64, _numpy.uint32], + 'offsets': [ + (&(pod.eventType)) - (&pod), + (&(pod.gpuId)) - (&pod), + ], + 'itemsize': sizeof(nvmlSystemEventData_v1_t), + }) + +system_event_data_v1_dtype = _get_system_event_data_v1_dtype_offsets() + +cdef class SystemEventData_v1: + """Empty-initialize an array of `nvmlSystemEventData_v1_t`. + + The resulting object is of length `size` and of dtype `system_event_data_v1_dtype`. + If default-constructed, the instance represents a single struct. + + Args: + size (int): number of structs, default=1. + + + .. 
seealso:: `nvmlSystemEventData_v1_t` + """ + cdef: + readonly object _data + + + + def __init__(self, size=1): + arr = _numpy.empty(size, dtype=system_event_data_v1_dtype) + self._data = arr.view(_numpy.recarray) + assert self._data.itemsize == sizeof(nvmlSystemEventData_v1_t), \ + f"itemsize {self._data.itemsize} mismatches struct size { sizeof(nvmlSystemEventData_v1_t) }" + + def __repr__(self): + if self._data.size > 1: + return f"<{__name__}.SystemEventData_v1_Array_{self._data.size} object at {hex(id(self))}>" + else: + return f"<{__name__}.SystemEventData_v1 object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return self._data.ctypes.data + + cdef intptr_t _get_ptr(self): + return self._data.ctypes.data + + def __int__(self): + if self._data.size > 1: + raise TypeError("int() argument must be a bytes-like object of size 1. " + "To get the pointer address of an array, use .ptr") + return self._data.ctypes.data + + def __len__(self): + return self._data.size + + def __eq__(self, other): + cdef object self_data = self._data + if (not isinstance(other, SystemEventData_v1)) or self_data.size != other._data.size or self_data.dtype != other._data.dtype: + return False + return bool((self_data == other._data).all()) + + @property + def event_type(self): + """Union[~_numpy.uint64, int]: Information about what specific system event occurred.""" + if self._data.size == 1: + return int(self._data.event_type[0]) + return self._data.event_type + + @event_type.setter + def event_type(self, val): + self._data.event_type = val + + @property + def gpu_id(self): + """Union[~_numpy.uint32, int]: gpuId in PCI format""" + if self._data.size == 1: + return int(self._data.gpu_id[0]) + return self._data.gpu_id + + @gpu_id.setter + def gpu_id(self, val): + self._data.gpu_id = val + + def __getitem__(self, key): + cdef ssize_t key_ + cdef ssize_t size + if isinstance(key, int): + key_ = key + size = self._data.size + if key_ >= size or key_ <= -(size+1): + raise IndexError("index is out of bounds") + if key_ < 0: + key_ += size + return SystemEventData_v1.from_data(self._data[key_:key_+1]) + out = self._data[key] + if isinstance(out, _numpy.recarray) and out.dtype == system_event_data_v1_dtype: + return SystemEventData_v1.from_data(out) + return out + + def __setitem__(self, key, val): + self._data[key] = val + + @staticmethod + def from_data(data): + """Create an SystemEventData_v1 instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a 1D array of dtype `system_event_data_v1_dtype` holding the data. + """ + cdef SystemEventData_v1 obj = SystemEventData_v1.__new__(SystemEventData_v1) + if not isinstance(data, _numpy.ndarray): + raise TypeError("data argument must be a NumPy ndarray") + if data.ndim != 1: + raise ValueError("data array must be 1D") + if data.dtype != system_event_data_v1_dtype: + raise ValueError("data array must be of dtype system_event_data_v1_dtype") + obj._data = data.view(_numpy.recarray) + + return obj + + @staticmethod + def from_ptr(intptr_t ptr, size_t size=1, bint readonly=False): + """Create an SystemEventData_v1 instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + size (int): number of structs, default=1. + readonly (bool): whether the data is read-only (to the user). default is `False`. 
+ """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef SystemEventData_v1 obj = SystemEventData_v1.__new__(SystemEventData_v1) + cdef flag = cpython.buffer.PyBUF_READ if readonly else cpython.buffer.PyBUF_WRITE + cdef object buf = cpython.memoryview.PyMemoryView_FromMemory( + ptr, sizeof(nvmlSystemEventData_v1_t) * size, flag) + data = _numpy.ndarray(size, buffer=buf, dtype=system_event_data_v1_dtype) + obj._data = data.view(_numpy.recarray) + + return obj + + cdef _get_accounting_stats_dtype_offsets(): cdef nvmlAccountingStats_t pod = nvmlAccountingStats_t() return _numpy.dtype({ 'names': ['gpu_utilization', 'memory_utilization', 'max_memory_usage', 'time', 'start_time', 'is_running', 'reserved'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint64, _numpy.uint64, _numpy.uint64, _numpy.uint32, _numpy.uint32], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint64, _numpy.uint64, _numpy.uint64, _numpy.uint32, (_numpy.uint32, 5)], 'offsets': [ (&(pod.gpuUtilization)) - (&pod), (&(pod.memoryUtilization)) - (&pod), @@ -11544,7 +11690,7 @@ cdef _get_conf_compute_gpu_certificate_dtype_offsets(): cdef nvmlConfComputeGpuCertificate_t pod = nvmlConfComputeGpuCertificate_t() return _numpy.dtype({ 'names': ['cert_chain_size', 'attestation_cert_chain_size', 'cert_chain', 'attestation_cert_chain'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint8, _numpy.uint8], + 'formats': [_numpy.uint32, _numpy.uint32, (_numpy.uint8, 4096), (_numpy.uint8, 5120)], 'offsets': [ (&(pod.certChainSize)) - (&pod), (&(pod.attestationCertChainSize)) - (&pod), @@ -11708,7 +11854,7 @@ cdef _get_conf_compute_gpu_attestation_report_dtype_offsets(): cdef nvmlConfComputeGpuAttestationReport_t pod = nvmlConfComputeGpuAttestationReport_t() return _numpy.dtype({ 'names': ['is_cec_attestation_report_present', 'attestation_report_size', 'cec_attestation_report_size', 'nonce', 'attestation_report', 'cec_attestation_report'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint8, _numpy.uint8, _numpy.uint8], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, (_numpy.uint8, 32), (_numpy.uint8, 8192), (_numpy.uint8, 4096)], 'offsets': [ (&(pod.isCecAttestationReportPresent)) - (&pod), (&(pod.attestationReportSize)) - (&pod), @@ -12032,7 +12178,7 @@ cdef _get_gpu_fabric_info_v2_dtype_offsets(): cdef nvmlGpuFabricInfo_v2_t pod = nvmlGpuFabricInfo_v2_t() return _numpy.dtype({ 'names': ['version', 'cluster_uuid', 'status', 'clique_id', 'state', 'health_mask'], - 'formats': [_numpy.uint32, _numpy.uint8, _numpy.int32, _numpy.uint32, _numpy.uint8, _numpy.uint32], + 'formats': [_numpy.uint32, (_numpy.uint8, 16), _numpy.int32, _numpy.uint32, _numpy.uint8, _numpy.uint32], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.clusterUuid)) - (&pod), @@ -12216,7 +12362,7 @@ cdef _get_nvlink_supported_bw_modes_v1_dtype_offsets(): cdef nvmlNvlinkSupportedBwModes_v1_t pod = nvmlNvlinkSupportedBwModes_v1_t() return _numpy.dtype({ 'names': ['version', 'bw_modes', 'total_bw_modes'], - 'formats': [_numpy.uint32, _numpy.uint8, _numpy.uint8], + 'formats': [_numpy.uint32, (_numpy.uint8, 23), _numpy.uint8], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.bwModes)) - (&pod), @@ -12784,7 +12930,7 @@ cdef _get_vgpu_metadata_dtype_offsets(): cdef nvmlVgpuMetadata_t pod = nvmlVgpuMetadata_t() return _numpy.dtype({ 'names': ['version', 'revision', 'guest_info_state', 'guest_driver_version', 'host_driver_version', 'reserved', 'vgpu_virtualization_caps', 'guest_vgpu_version', 'opaque_data_size', 'opaque_data'], - 
'formats': [_numpy.uint32, _numpy.uint32, _numpy.int32, _numpy.int8, _numpy.int8, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.int32, (_numpy.int8, 80), (_numpy.int8, 80), (_numpy.uint32, 6), _numpy.uint32, _numpy.uint32, _numpy.uint32, (_numpy.int8, 4)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.revision)) - (&pod), @@ -13291,7 +13437,7 @@ cdef _get_gpu_instance_profile_info_v2_dtype_offsets(): cdef nvmlGpuInstanceProfileInfo_v2_t pod = nvmlGpuInstanceProfileInfo_v2_t() return _numpy.dtype({ 'names': ['version', 'id', 'is_p2p_supported', 'slice_count', 'instance_count', 'multiprocessor_count', 'copy_engine_count', 'decoder_count', 'encoder_count', 'jpeg_count', 'ofa_count', 'memory_size_mb', 'name'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint64, _numpy.int8], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint64, (_numpy.int8, 96)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.id)) - (&pod), @@ -13559,7 +13705,7 @@ cdef _get_gpu_instance_profile_info_v3_dtype_offsets(): cdef nvmlGpuInstanceProfileInfo_v3_t pod = nvmlGpuInstanceProfileInfo_v3_t() return _numpy.dtype({ 'names': ['version', 'id', 'slice_count', 'instance_count', 'multiprocessor_count', 'copy_engine_count', 'decoder_count', 'encoder_count', 'jpeg_count', 'ofa_count', 'memory_size_mb', 'name', 'capabilities'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint64, _numpy.int8, _numpy.uint32], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint64, (_numpy.int8, 96), _numpy.uint32], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.id)) - (&pod), @@ -13973,7 +14119,7 @@ cdef _get_compute_instance_profile_info_v2_dtype_offsets(): cdef nvmlComputeInstanceProfileInfo_v2_t pod = nvmlComputeInstanceProfileInfo_v2_t() return _numpy.dtype({ 'names': ['version', 'id', 'slice_count', 'instance_count', 'multiprocessor_count', 'shared_copy_engine_count', 'shared_decoder_count', 'shared_encoder_count', 'shared_jpeg_count', 'shared_ofa_count', 'name'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, (_numpy.int8, 96)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.id)) - (&pod), @@ -14217,7 +14363,7 @@ cdef _get_compute_instance_profile_info_v3_dtype_offsets(): cdef nvmlComputeInstanceProfileInfo_v3_t pod = nvmlComputeInstanceProfileInfo_v3_t() return _numpy.dtype({ 'names': ['version', 'id', 'slice_count', 'instance_count', 'multiprocessor_count', 'shared_copy_engine_count', 'shared_decoder_count', 'shared_encoder_count', 'shared_jpeg_count', 'shared_ofa_count', 'name', 'capabilities'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, 
_numpy.uint32, _numpy.int8, _numpy.uint32], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, (_numpy.int8, 96), _numpy.uint32], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.id)) - (&pod), @@ -15495,7 +15641,7 @@ cdef _get_gpu_fabric_info_v3_dtype_offsets(): cdef nvmlGpuFabricInfo_v3_t pod = nvmlGpuFabricInfo_v3_t() return _numpy.dtype({ 'names': ['version', 'cluster_uuid', 'status', 'clique_id', 'state', 'health_mask', 'health_summary'], - 'formats': [_numpy.uint32, _numpy.uint8, _numpy.int32, _numpy.uint32, _numpy.uint8, _numpy.uint32, _numpy.uint8], + 'formats': [_numpy.uint32, (_numpy.uint8, 16), _numpy.int32, _numpy.uint32, _numpy.uint8, _numpy.uint32, _numpy.uint8], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.clusterUuid)) - (&pod), @@ -16351,7 +16497,7 @@ cdef _get_excluded_device_info_dtype_offsets(): cdef nvmlExcludedDeviceInfo_t pod = nvmlExcludedDeviceInfo_t() return _numpy.dtype({ 'names': ['pci_info', 'uuid'], - 'formats': [pci_info_dtype, _numpy.int8], + 'formats': [pci_info_dtype, (_numpy.int8, 80)], 'offsets': [ (&(pod.pciInfo)) - (&pod), (&(pod.uuid)) - (&pod), @@ -16641,7 +16787,7 @@ cdef _get_bridge_chip_hierarchy_dtype_offsets(): cdef nvmlBridgeChipHierarchy_t pod = nvmlBridgeChipHierarchy_t() return _numpy.dtype({ 'names': ['bridge_count', 'bridge_chip_info'], - 'formats': [_numpy.uint8, bridge_chip_info_dtype], + 'formats': [_numpy.uint8, (bridge_chip_info_dtype, 128)], 'offsets': [ (&(pod.bridgeCount)) - (&pod), (&(pod.bridgeChipInfo)) - (&pod), @@ -17475,7 +17621,7 @@ cdef _get_gpu_thermal_settings_dtype_offsets(): cdef nvmlGpuThermalSettings_t pod = nvmlGpuThermalSettings_t() return _numpy.dtype({ 'names': ['count', 'sensor'], - 'formats': [_numpy.uint32, _py_anon_pod0_dtype], + 'formats': [_numpy.uint32, (_py_anon_pod0_dtype, 3)], 'offsets': [ (&(pod.count)) - (&pod), (&(pod.sensor)) - (&pod), @@ -17610,7 +17756,7 @@ cdef _get_clk_mon_status_dtype_offsets(): cdef nvmlClkMonStatus_t pod = nvmlClkMonStatus_t() return _numpy.dtype({ 'names': ['b_global_status', 'clk_mon_list_size', 'clk_mon_list'], - 'formats': [_numpy.uint32, _numpy.uint32, clk_mon_fault_info_dtype], + 'formats': [_numpy.uint32, _numpy.uint32, (clk_mon_fault_info_dtype, 32)], 'offsets': [ (&(pod.bGlobalStatus)) - (&pod), (&(pod.clkMonListSize)) - (&pod), @@ -17910,7 +18056,7 @@ cdef _get_gpu_dynamic_pstates_info_dtype_offsets(): cdef nvmlGpuDynamicPstatesInfo_t pod = nvmlGpuDynamicPstatesInfo_t() return _numpy.dtype({ 'names': ['flags_', 'utilization'], - 'formats': [_numpy.uint32, _py_anon_pod1_dtype], + 'formats': [_numpy.uint32, (_py_anon_pod1_dtype, 8)], 'offsets': [ (&(pod.flags)) - (&pod), (&(pod.utilization)) - (&pod), @@ -18601,7 +18747,7 @@ cdef _get_grid_licensable_feature_dtype_offsets(): cdef nvmlGridLicensableFeature_t pod = nvmlGridLicensableFeature_t() return _numpy.dtype({ 'names': ['feature_code', 'feature_state', 'license_info', 'product_name', 'feature_enabled', 'license_expiry'], - 'formats': [_numpy.int32, _numpy.uint32, _numpy.int8, _numpy.int8, _numpy.uint32, grid_license_expiry_dtype], + 'formats': [_numpy.int32, _numpy.uint32, (_numpy.int8, 128), (_numpy.int8, 128), _numpy.uint32, grid_license_expiry_dtype], 'offsets': [ (&(pod.featureCode)) - (&pod), (&(pod.featureState)) - (&pod), @@ -18789,7 +18935,7 @@ cdef _get_unit_fan_speeds_dtype_offsets(): cdef nvmlUnitFanSpeeds_t pod = nvmlUnitFanSpeeds_t() return _numpy.dtype({ 'names': ['fans', 'count'], - 
'formats': [unit_fan_info_dtype, _numpy.uint32], + 'formats': [(unit_fan_info_dtype, 24), _numpy.uint32], 'offsets': [ (&(pod.fans)) - (&pod), (&(pod.count)) - (&pod), @@ -18924,7 +19070,7 @@ cdef _get_vgpu_pgpu_metadata_dtype_offsets(): cdef nvmlVgpuPgpuMetadata_t pod = nvmlVgpuPgpuMetadata_t() return _numpy.dtype({ 'names': ['version', 'revision', 'host_driver_version', 'pgpu_virtualization_caps', 'reserved', 'host_supported_vgpu_range', 'opaque_data_size', 'opaque_data'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.int8, _numpy.uint32, _numpy.uint32, vgpu_version_dtype, _numpy.uint32, _numpy.int8], + 'formats': [_numpy.uint32, _numpy.uint32, (_numpy.int8, 80), _numpy.uint32, (_numpy.uint32, 5), vgpu_version_dtype, _numpy.uint32, (_numpy.int8, 4)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.revision)) - (&pod), @@ -19593,7 +19739,7 @@ cdef _get_nvlink_firmware_info_dtype_offsets(): cdef nvmlNvlinkFirmwareInfo_t pod = nvmlNvlinkFirmwareInfo_t() return _numpy.dtype({ 'names': ['firmware_version', 'num_valid_entries'], - 'formats': [nvlink_firmware_version_dtype, _numpy.uint32], + 'formats': [(nvlink_firmware_version_dtype, 100), _numpy.uint32], 'offsets': [ (&(pod.firmwareVersion)) - (&pod), (&(pod.numValidEntries)) - (&pod), @@ -20039,7 +20185,7 @@ cdef _get_vgpu_scheduler_log_dtype_offsets(): cdef nvmlVgpuSchedulerLog_t pod = nvmlVgpuSchedulerLog_t() return _numpy.dtype({ 'names': ['engine_id', 'scheduler_policy', 'arr_mode', 'scheduler_params', 'entries_count', 'log_entries'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, vgpu_scheduler_params_dtype, _numpy.uint32, vgpu_scheduler_log_entry_dtype], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, vgpu_scheduler_params_dtype, _numpy.uint32, (vgpu_scheduler_log_entry_dtype, 200)], 'offsets': [ (&(pod.engineId)) - (&pod), (&(pod.schedulerPolicy)) - (&pod), @@ -20537,7 +20683,7 @@ cdef _get_vgpu_scheduler_log_info_v1_dtype_offsets(): cdef nvmlVgpuSchedulerLogInfo_v1_t pod = nvmlVgpuSchedulerLogInfo_v1_t() return _numpy.dtype({ 'names': ['version', 'engine_id', 'scheduler_policy', 'arr_mode', 'scheduler_params', 'entries_count', 'log_entries'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, vgpu_scheduler_params_dtype, _numpy.uint32, vgpu_scheduler_log_entry_dtype], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32, _numpy.uint32, vgpu_scheduler_params_dtype, _numpy.uint32, (vgpu_scheduler_log_entry_dtype, 200)], 'offsets': [ (&(pod.version)) - (&pod), (&(pod.engineId)) - (&pod), @@ -20902,7 +21048,7 @@ cdef _get_grid_licensable_features_dtype_offsets(): cdef nvmlGridLicensableFeatures_t pod = nvmlGridLicensableFeatures_t() return _numpy.dtype({ 'names': ['is_grid_license_supported', 'licensable_features_count', 'grid_licensable_features'], - 'formats': [_numpy.int32, _numpy.uint32, grid_licensable_feature_dtype], + 'formats': [_numpy.int32, _numpy.uint32, (grid_licensable_feature_dtype, 3)], 'offsets': [ (&(pod.isGridLicenseSupported)) - (&pod), (&(pod.licensableFeaturesCount)) - (&pod), @@ -24913,58 +25059,6 @@ cpdef event_set_free(intptr_t set): check_status(__status__) -cpdef system_event_set_create(intptr_t request): - """Create an empty set of system events. Event set should be freed by ``nvmlSystemEventSetFree``. - - Args: - request (intptr_t): Reference to nvmlSystemEventSetCreateRequest_t. - - .. 
seealso:: `nvmlSystemEventSetCreate` - """ - with nogil: - __status__ = nvmlSystemEventSetCreate(request) - check_status(__status__) - - -cpdef system_event_set_free(intptr_t request): - """Releases system event set. - - Args: - request (intptr_t): Reference to nvmlSystemEventSetFreeRequest_t. - - .. seealso:: `nvmlSystemEventSetFree` - """ - with nogil: - __status__ = nvmlSystemEventSetFree(request) - check_status(__status__) - - -cpdef system_register_events(intptr_t request): - """Starts recording of events on system and add the events to specified ``nvmlSystemEventSet_t``. - - Args: - request (intptr_t): Reference to the struct nvmlSystemRegisterEventRequest_t. - - .. seealso:: `nvmlSystemRegisterEvents` - """ - with nogil: - __status__ = nvmlSystemRegisterEvents(request) - check_status(__status__) - - -cpdef system_event_set_wait(intptr_t request): - """Waits on system events and delivers events. - - Args: - request (intptr_t): Reference in which to nvmlSystemEventSetWaitRequest_t. - - .. seealso:: `nvmlSystemEventSetWait` - """ - with nogil: - __status__ = nvmlSystemEventSetWait(request) - check_status(__status__) - - cpdef device_modify_drain_state(intptr_t pci_info, int new_state): """Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests. Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before this call is made. Must be called as administrator. For Linux only. @@ -27299,10 +27393,16 @@ cdef FieldValue _cast_field_values(values): values_ = FieldValue(valuesCount) for i, v in enumerate(values): if isinstance(v, tuple): + if len(v) != 2: + raise ValueError("FieldValue tuple must be of length 2") + if not isinstance(v[0], int) or not isinstance(v[1], int): + raise ValueError("FieldValue tuple elements must be integers") values_[i].field_id = v[0] values_[i].scope_id = v[1] - else: + elif isinstance(v, int): values_[i].field_id = v + else: + raise ValueError("Each entry must be an integer field ID, or a tuple of (field ID, scope ID)") return values_ @@ -27901,3 +28001,63 @@ cpdef object device_get_nvlink_info(intptr_t device): __status__ = nvmlDeviceGetNvLinkInfo(device, info) check_status(__status__) return info_v1_py + + +cpdef intptr_t system_event_set_create(): + """Create an empty set of system events. Event set should be freed by ``nvmlSystemEventSetFree``.""" + cdef nvmlSystemEventSetCreateRequest_v1_t[1] request + with nogil: + request[0].version = sizeof(nvmlSystemEventSetCreateRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemEventSetCreate(request) + check_status(__status__) + return (request[0].set) + + +cpdef system_event_set_free(intptr_t event_set): + """Frees an event set.""" + cdef nvmlSystemEventSetFreeRequest_v1_t[1] request + request[0].set = event_set + with nogil: + request[0].version = sizeof(nvmlSystemEventSetFreeRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemEventSetFree(request) + check_status(__status__) + + +cpdef system_register_events(intptr_t event_set, unsigned long long event_types): + """Starts recording of events on system and add the events to specified ``nvmlSystemEventSet_t``. + + Args: + event_set (intptr_t): The system event set handle. + event_types (unsigned long long): Bitmask of nvmlSystemEventType_t values representing the events to register. 
+ """ + cdef nvmlSystemRegisterEventRequest_v1_t[1] request + request[0].set = event_set + request[0].eventTypes = event_types + with nogil: + request[0].version = sizeof(nvmlSystemRegisterEventRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemRegisterEvents(request) + check_status(__status__) + + +cpdef object system_event_set_wait(intptr_t event_set, unsigned int timeout_ms, unsigned int buffer_size): + """Waits for events to occur on the system event set. + + Args: + event_set (intptr_t): The system event set handle. + timeout_ms (unsigned int): The maximum amount of time in milliseconds to wait for an event. + buffer_size (unsigned int): The size of the event buffer. + + Returns: + SystemEvent: The system event that occurred. + """ + cdef nvmlSystemEventSetWaitRequest_v1_t[1] request + cdef SystemEventData_v1 event_data = SystemEventData_v1(buffer_size) + request[0].timeoutms = timeout_ms + request[0].set = event_set + request[0].data = (event_data._get_ptr()) + request[0].dataSize = buffer_size + with nogil: + request[0].version = sizeof(nvmlSystemEventSetWaitRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemEventSetWait(request) + check_status(__status__) + return SystemEventData_v1.from_ptr(event_data._get_ptr(), size=request[0].numEvent) diff --git a/cuda_bindings/tests/nvml/test_nvlink.py b/cuda_bindings/tests/nvml/test_nvlink.py index 14799898be..99407abc19 100644 --- a/cuda_bindings/tests/nvml/test_nvlink.py +++ b/cuda_bindings/tests/nvml/test_nvlink.py @@ -11,14 +11,14 @@ def test_nvlink_get_link_count(all_devices): """ for device in all_devices: fields = nvml.FieldValue(1) - fields[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT + fields[0].field_id = nvml.FieldId.DEV_NVLINK_LINK_COUNT value = nvml.device_get_field_values(device, fields)[0] assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, ( f"Unexpected return {value.nvml_return} for link count field query" ) # Use the alternative argument to device_get_field_values - value = nvml.device_get_field_values(device, [nvml.FI.DEV_NVLINK_LINK_COUNT])[0] + value = nvml.device_get_field_values(device, [nvml.FieldId.DEV_NVLINK_LINK_COUNT])[0] assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, ( f"Unexpected return {value.nvml_return} for link count field query" )
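
Most of the dtype changes in this patch follow one pattern: struct members that are fixed-length C arrays (char name[96], unsigned int reserved[5], nested struct arrays, ...) previously used a scalar format, so the structured dtype exposed only the first element even though the offsets and itemsize already spanned the full struct. The (base_dtype, length) sub-array form fixes that. A self-contained NumPy illustration (not part of the patch, independent of the NVML structs):

    import numpy as np

    # Stand-in for a struct with a 32-byte char member, e.g. char busId[32].
    scalar_fmt = np.dtype({'names': ['bus_id'], 'formats': [np.int8],
                           'offsets': [0], 'itemsize': 32})
    array_fmt = np.dtype({'names': ['bus_id'], 'formats': [(np.int8, 32)],
                          'offsets': [0], 'itemsize': 32})

    rec = np.zeros(1, dtype=array_fmt)
    assert rec['bus_id'].shape == (1, 32)                          # whole member addressable
    assert np.zeros(1, dtype=scalar_fmt)['bus_id'].shape == (1,)   # only byte 0 before the fix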
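
With FI renamed to FieldId and the tightened validation in _cast_field_values, device_get_field_values accepts either bare field IDs or (field ID, scope ID) tuples; anything else now raises ValueError. A usage sketch (not part of the patch); the device handle is a placeholder to be obtained from the existing device lookup bindings:

    from cuda.bindings import _nvml as nvml

    device = ...  # placeholder: an intptr_t device handle from the existing lookup helpers

    fields = [
        nvml.FieldId.DEV_NVLINK_LINK_COUNT,      # bare field ID
        (nvml.FieldId.DEV_ECC_CURRENT, 0),       # (field ID, scope ID) tuple
    ]
    values = nvml.device_get_field_values(device, fields)
    print(values[0].nvml_return, values[1].nvml_return)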
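
SystemEventData_v1 follows the same recarray-backed pattern as the other struct wrappers in this module: one Python object views an array of C structs, scalar access degrades to plain ints, and integer indexing hands back one-element views into the shared buffer. A minimal sketch (not part of the patch):

    from cuda.bindings import _nvml as nvml

    data = nvml.SystemEventData_v1(4)        # empty-initialized array of 4 structs
    data[0].event_type = 7                   # integer indexing yields a one-struct view,
    data[0].gpu_id = 0                       # so writes land in the shared buffer
    assert int(data.event_type[0]) == 7      # size > 1: properties return the whole column
    single = data[1]
    print(data.ptr, len(data), int(single))  # base pointer, element count, element pointer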
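
The new system event entry points replace the raw request-pointer wrappers removed above; the bindings now build the versioned request structs internally. A minimal lifecycle sketch (not part of the patch), assuming NVML has already been initialized through the existing bindings and with EVENT_TYPES standing in for a real bitmask of nvmlSystemEventType_t values:

    from cuda.bindings import _nvml as nvml

    # NVML is assumed to be initialized already via the existing init binding.
    event_set = nvml.system_event_set_create()        # versioned request built internally
    try:
        EVENT_TYPES = 0x1                              # placeholder event-type bitmask
        nvml.system_register_events(event_set, EVENT_TYPES)

        # Block for up to 1000 ms; the internal buffer holds up to 16 events per call.
        events = nvml.system_event_set_wait(event_set, 1000, 16)
        for i in range(len(events)):                   # len() == number of delivered events
            print(events[i].event_type, events[i].gpu_id)
    finally:
        nvml.system_event_set_free(event_set)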
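
Each new wrapper stamps its request with sizeof(struct) | (1 << 24), which mirrors the encoding NVML uses for versioned request structs: struct size in the low 24 bits, API version in the top byte. A sketch of the encoding only (hypothetical helper, not an NVML call):

    def nvml_struct_version(struct_size: int, ver: int = 1) -> int:
        """Illustrative helper: sizeof in the low 24 bits, API version in the top byte."""
        return struct_size | (ver << 24)

    assert nvml_struct_version(16, 1) == 0x01000010   # a 16-byte v1 request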