diff --git a/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc b/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc index 82fd1734..e88acbbe 100644 --- a/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc +++ b/sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc @@ -529,6 +529,7 @@ smi_fill_clock_status_ (aga_gpu_handle_t gpu_handle, amdsmi_gpu_metrics_t *metrics_info) { uint32_t clk_cnt = 0; + uint64_t cur_freq; amdsmi_status_t amdsmi_ret; uint32_t low_freq, high_freq; amdsmi_frequencies_t freq = {}; @@ -642,10 +643,12 @@ smi_fill_clock_status_ (aga_gpu_handle_t gpu_handle, } // data fabric clock amdsmi_ret = amdsmi_get_clk_freq(gpu_handle, AMDSMI_CLK_TYPE_DF, &freq); + cur_freq = current_frequency_hz(&freq); if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { AGA_TRACE_ERR("Failed to get data fabric clock frequencies for GPU {}, " "err {}", gpu_handle, amdsmi_ret); - } else { + } else if (cur_freq != AMDSMI_INVALID_UINT32) { + // skip clock types whose current frequency is reported as NA low_freq = high_freq = 0; clock_status = &status->clock_status[clk_cnt]; // min and max frequencies are per clock type @@ -653,17 +656,19 @@ smi_fill_clock_status_ (aga_gpu_handle_t gpu_handle, &clock_status->low_frequency, &clock_status->high_frequency); clock_status->clock_type = AGA_GPU_CLOCK_TYPE_FABRIC; - clock_status->frequency = freq.frequency[freq.current]/1000000; + clock_status->frequency = cur_freq / 1000000; clock_status->deep_sleep = (clock_status->frequency < clock_status->low_frequency); clk_cnt++; } // DCE clock amdsmi_ret = amdsmi_get_clk_freq(gpu_handle, AMDSMI_CLK_TYPE_DCEF, &freq); + cur_freq = current_frequency_hz(&freq); if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { AGA_TRACE_ERR("Failed to get DCE clock frequencies for GPU {}, err {}", gpu_handle, amdsmi_ret); - } else { + } else if (cur_freq != AMDSMI_INVALID_UINT32) { + // skip clock types whose current frequency is reported as NA low_freq = high_freq = 0; clock_status = &status->clock_status[clk_cnt]; // min and max frequencies are 
per clock type @@ -671,17 +676,19 @@ smi_fill_clock_status_ (aga_gpu_handle_t gpu_handle, &clock_status->low_frequency, &clock_status->high_frequency); clock_status->clock_type = AGA_GPU_CLOCK_TYPE_DCE; - clock_status->frequency = freq.frequency[freq.current]/1000000; + clock_status->frequency = cur_freq / 1000000; clock_status->deep_sleep = (clock_status->frequency < clock_status->low_frequency); clk_cnt++; } // PCIe clock amdsmi_ret = amdsmi_get_clk_freq(gpu_handle, AMDSMI_CLK_TYPE_PCIE, &freq); + cur_freq = current_frequency_hz(&freq); if (unlikely(amdsmi_ret != AMDSMI_STATUS_SUCCESS)) { AGA_TRACE_ERR("Failed to get PCIe clock frequencies for GPU {}, err {}", gpu_handle, amdsmi_ret); - } else { + } else if (cur_freq != AMDSMI_INVALID_UINT32) { + // skip clock types whose current frequency is reported as NA low_freq = high_freq = 0; clock_status = &status->clock_status[clk_cnt]; // min and max frequencies are per clock type @@ -689,7 +696,7 @@ smi_fill_clock_status_ (aga_gpu_handle_t gpu_handle, &clock_status->low_frequency, &clock_status->high_frequency); clock_status->clock_type = AGA_GPU_CLOCK_TYPE_PCIE; - clock_status->frequency = freq.frequency[freq.current]/1000000; + clock_status->frequency = cur_freq / 1000000; clock_status->deep_sleep = (clock_status->frequency < clock_status->low_frequency); clk_cnt++; diff --git a/sw/nic/gpuagent/api/smi/amdsmi/smi_utils.hpp b/sw/nic/gpuagent/api/smi/amdsmi/smi_utils.hpp index 760c8c21..d5929270 100644 --- a/sw/nic/gpuagent/api/smi/amdsmi/smi_utils.hpp +++ b/sw/nic/gpuagent/api/smi/amdsmi/smi_utils.hpp @@ -46,9 +46,10 @@ static inline void find_low_high_frequency (amdsmi_frequencies_t *freq, uint32_t *min, uint32_t *max) { - // create a vector of the valid frequencies - std::vector f(freq->frequency, - freq->frequency + freq->num_supported); + // clamp num_supported to the array bound before building the vector + uint32_t n = (freq->num_supported < AMDSMI_MAX_NUM_FREQUENCIES) ? 
+                 freq->num_supported : AMDSMI_MAX_NUM_FREQUENCIES;
+    std::vector f(freq->frequency, freq->frequency + n);
 
     // sort vector
     std::sort(f.begin(), f.end());
@@ -76,6 +77,32 @@ find_low_high_frequency (amdsmi_frequencies_t *freq,
     return;
 }
 
+/// \brief    pick the frequency entry that amdsmi marks as current
+/// \param[in] freq    frequency table filled in by amdsmi
+/// \return    frequency in Hz of the current entry; 0 if the table reports
+///            no supported frequencies, and the first entry if the current
+///            index does not fall inside the usable part of the table
+static inline uint64_t
+current_frequency_hz (amdsmi_frequencies_t *freq)
+{
+    uint32_t limit = AMDSMI_MAX_NUM_FREQUENCIES;
+    uint32_t sel = 0;
+
+    // empty table, there is no entry to return
+    if (freq->num_supported == 0) {
+        return 0;
+    }
+    // usable entries = min(num_supported, array capacity)
+    if (freq->num_supported < AMDSMI_MAX_NUM_FREQUENCIES) {
+        limit = freq->num_supported;
+    }
+    // an index outside the usable range falls back to the first entry
+    if (freq->current < limit) {
+        sel = freq->current;
+    }
+    return freq->frequency[sel];
+}
+
 /// \brief convert amdsmi virtualization mode to aga vritualization mode
 /// \param[in] virt_mode    amdsmi virtualization mode
 /// \return aga virtualization mode