Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ sw/nic/build/
sw/nic/build*
sw/nic/third-party/libev-4.33/*
sw/nic/gpuagent/vendor
.worktrees/
4 changes: 2 additions & 2 deletions sw/nic/gpuagent/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -224,11 +224,11 @@ gogo-protos:

$(OBJ_DIR)/%.o: $(TOPDIR)/%.cc
@mkdir -p $(dir $@) # Create the necessary subdirectories in temp
$(CC) $(CFLAGS) $(DEFS) $(INCS_AMD_SMI) -DAMD_SMI -D__FNAME__=__FILE__ -DGPUAGENT_VERSION=\"$(GPUAGENT_VERSION)\" -c $< -o $@
$(CC) $(CFLAGS) $(DEFS) $(INCS_AMD_SMI) -DAMD_SMI -DENABLE_ESMI_LIB -D__FNAME__=__FILE__ -DGPUAGENT_VERSION=\"$(GPUAGENT_VERSION)\" -c $< -o $@

$(OBJ_DIR)/%.o: $(TOPDIR)/%.c
@mkdir -p $(dir $@) # Create the necessary subdirectories in temp
$(CC) $(CFLAGS) $(INCS_AMD_SMI) -DAMD_SMI -D__FNAME__=__FILE__ -DGPUAGENT_VERSION=\"$(GPUAGENT_VERSION)\" -c $< -o $@
$(CC) $(CFLAGS) $(INCS_AMD_SMI) -DAMD_SMI -DENABLE_ESMI_LIB -D__FNAME__=__FILE__ -DGPUAGENT_VERSION=\"$(GPUAGENT_VERSION)\" -c $< -o $@

$(OBJ_DIR_GIM)/%.o: $(TOPDIR)/%.cc
@mkdir -p $(dir $@) # Create the necessary subdirectories in temp
Expand Down
6 changes: 6 additions & 0 deletions sw/nic/gpuagent/api/include/aga_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -813,6 +813,12 @@ typedef struct aga_gpu_stats_s {
uint64_t gfx_activity_accumulated;
/// memory activity accumulated in %
uint64_t mem_activity_accumulated;
/// HSMP accumulation counter, incremented when accumulators update
uint64_t accumulation_counter;
/// accumulated socket XCC busy
uint64_t socket_gfx_busy_accumulated;
/// accumulated HBM bandwidth for all HBM stacks in the socket
uint64_t dram_bandwidth_accumulated;
/// XGMI link statistics
aga_gpu_xgmi_link_stats_t xgmi_link_stats[AGA_GPU_MAX_XGMI_LINKS];
/// GPU violation statistics
Expand Down
61 changes: 61 additions & 0 deletions sw/nic/gpuagent/api/smi/amdsmi/smi_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ namespace aga {
/// status and statistics
std::mutex g_gpu_metrics_mutex;
std::unordered_map<aga_gpu_handle_t, amdsmi_gpu_metrics_t> g_gpu_metrics;
/// cache the CPU socket processor handle associated with each GPU, used to
/// read socket level HSMP accumulator metrics
std::mutex g_gpu_cpu_handle_mutex;
std::unordered_map<aga_gpu_handle_t, amdsmi_processor_handle> g_gpu_cpu_handle;
/// counter resolution in uJ; this is a constant value that we get once during
/// init time and use whenever we want to calculate energy accumulated
float g_energy_counter_resolution;
Expand Down Expand Up @@ -1254,6 +1258,36 @@ smi_fill_vram_usage_ (aga_gpu_handle_t gpu_handle,
return SDK_RET_OK;
}

/// populate the socket level HSMP accumulator fields of a GPU stats object
/// using the CPU socket handle cached for the given GPU; these metrics are
/// only available on platforms with HSMP (e.g. APU)
///
/// gpu_handle : GPU handle used as the key into the g_gpu_cpu_handle cache
/// stats      : stats object whose accumulation_counter,
///              socket_gfx_busy_accumulated and dram_bandwidth_accumulated
///              fields are written on success
///
/// stats is left untouched when no CPU handle is cached for the GPU, when the
/// cached handle is NULL, or when the HSMP metrics table read does not return
/// AMDSMI_STATUS_SUCCESS; the whole body compiles to a no-op when
/// ENABLE_ESMI_LIB is not defined
static void
smi_gpu_fill_hsmp_stats_ (aga_gpu_handle_t gpu_handle, aga_gpu_stats_t *stats)
{
#ifdef ENABLE_ESMI_LIB
    amdsmi_processor_handle socket_cpu = NULL;

    // the cache mutex is held only for the lookup; the guard is released
    // before the HSMP table is read
    {
        std::lock_guard<std::mutex> guard(g_gpu_cpu_handle_mutex);
        auto entry = g_gpu_cpu_handle.find(gpu_handle);
        if (entry == g_gpu_cpu_handle.end()) {
            // no CPU socket handle was recorded for this GPU
            return;
        }
        socket_cpu = entry->second;
    }
    if (socket_cpu == NULL) {
        return;
    }

    amdsmi_hsmp_metrics_table_t hsmp_table = {};
    if (amdsmi_get_hsmp_metrics_table(socket_cpu, &hsmp_table) !=
            AMDSMI_STATUS_SUCCESS) {
        // read failed, keep whatever the caller already has in stats
        return;
    }

    // copy over the accumulator fields exposed in the GPU stats
    stats->accumulation_counter = hsmp_table.accumulation_counter;
    stats->socket_gfx_busy_accumulated = hsmp_table.socket_gfx_busy_acc;
    stats->dram_bandwidth_accumulated = hsmp_table.dram_bandwidth_acc;
#endif
}

sdk_ret_t
smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle,
const aga_obj_key_t *gpu_key,
Expand Down Expand Up @@ -1283,6 +1317,10 @@ smi_gpu_fill_stats (aga_gpu_handle_t gpu_handle,
metrics_info = g_gpu_metrics[gpu_handle];
}
}
// fill socket level HSMP accumulator metrics; this is independent of the
// GPU metrics table (it reads the CPU socket HSMP table) so it is filled
// here unconditionally, outside the metrics_info structure-size guard below
smi_gpu_fill_hsmp_stats_(gpu_handle, stats);
if (metrics_info.common_header.structure_size != 0) {
// power and voltage
stats->avg_package_power = metrics_info.average_socket_power;
Expand Down Expand Up @@ -2057,6 +2095,23 @@ smi_discover_gpus (uint32_t *num_gpu, aga_gpu_profile_t *gpu)
"err {}", status);
return amdsmi_ret_to_sdk_ret(status);
}
// enumerate CPU socket handles once; on APU platforms (e.g. MI300A) the
// socket level HSMP accumulator metrics are read from the CPU socket
// handle, which lives in a separate namespace from the GPU socket handles
// and is only reachable via amdsmi_get_cpu_handles()
uint32_t num_cpu = 0;
amdsmi_processor_handle cpu_handles[AGA_MAX_SOCKET] = {};
#ifdef ENABLE_ESMI_LIB
if (amdsmi_get_cpu_handles(&num_cpu, NULL) == AMDSMI_STATUS_SUCCESS) {
if (num_cpu > AGA_MAX_SOCKET) {
num_cpu = AGA_MAX_SOCKET;
}
if (amdsmi_get_cpu_handles(&num_cpu, &cpu_handles[0]) !=
AMDSMI_STATUS_SUCCESS) {
num_cpu = 0;
}
}
#endif
for (uint32_t i = 0; i < num_sockets; i++) {
// for each socket get the number of processors
status = amdsmi_get_processor_handles(socket_handles[i],
Expand Down Expand Up @@ -2093,6 +2148,12 @@ smi_discover_gpus (uint32_t *num_gpu, aga_gpu_profile_t *gpu)
proc_handles[j]);
return ret;
}
// map this GPU to its CPU socket handle (socket index i)
// for socket level HSMP accumulator metrics
if (i < num_cpu && cpu_handles[i] != NULL) {
std::lock_guard<std::mutex> lock(g_gpu_cpu_handle_mutex);
g_gpu_cpu_handle[proc_handles[j]] = cpu_handles[i];
}
(*num_gpu)++;
}
}
Expand Down
14 changes: 10 additions & 4 deletions sw/nic/gpuagent/api/smi/amdsmi/smi_state.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1555,11 +1555,17 @@ smi_state::init(aga_api_init_params_t *init_params) {
amdsmi_status_t status;
aga_gpu_profile_t gpu[AGA_MAX_GPU];

// initialize smi library
status = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
// APUS enables HSMP socket accumulators; fall back to GPUS on non-APU
status = amdsmi_init(AMDSMI_INIT_AMD_APUS);
if (unlikely(status != AMDSMI_STATUS_SUCCESS)) {
AGA_TRACE_ERR("Failed to initialize amd smi library, err {}", status);
return amdsmi_ret_to_sdk_ret(status);
AGA_TRACE_WARN("APU smi init failed (err {}), falling back to GPU init",
status);
status = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
if (unlikely(status != AMDSMI_STATUS_SUCCESS)) {
AGA_TRACE_ERR("Failed to initialize amd smi library, err {}",
status);
return amdsmi_ret_to_sdk_ret(status);
}
}
// discover gpus
ret = aga::smi_discover_gpus(&num_gpu_, gpu);
Expand Down
18 changes: 18 additions & 0 deletions sw/nic/gpuagent/cli/cmd/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -1733,6 +1733,24 @@ func printGPUStats(gpu *aga.GPU, statsOnly bool) {
"Memory activity accumulated",
stats.GetMemoryActivityAccumulated())
}
if (stats.GetAccumulationCounter() != 0) &&
(stats.GetAccumulationCounter() != UINT64_MAX_VAL) {
fmt.Printf(indent+"%-38s : %d\n",
"Accumulation counter",
stats.GetAccumulationCounter())
}
if (stats.GetSocketGFXBusyAccumulated() != 0) &&
(stats.GetSocketGFXBusyAccumulated() != UINT64_MAX_VAL) {
fmt.Printf(indent+"%-38s : %d\n",
"Socket GFX busy accumulated",
stats.GetSocketGFXBusyAccumulated())
}
if (stats.GetDRAMBandwidthAccumulated() != 0) &&
(stats.GetDRAMBandwidthAccumulated() != UINT64_MAX_VAL) {
fmt.Printf(indent+"%-38s : %d\n",
"DRAM bandwidth accumulated",
stats.GetDRAMBandwidthAccumulated())
}
for i, linkStats := range stats.GetXGMILinkStats() {
link := "Link " + fmt.Sprintf("%v", i+1)
if (linkStats.GetDataRead() != 0) &&
Expand Down
6 changes: 6 additions & 0 deletions sw/nic/gpuagent/protos/gpu.proto
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,12 @@ message GPUStats {
uint64 GFXActivityAccumulated = 65;
// memory activity accumulated in %
uint64 MemoryActivityAccumulated = 66;
// accumulated HBM bandwidth for all HBM stacks in the socket
uint64 DRAMBandwidthAccumulated = 89;
// HSMP accumulation counter, incremented when accumulators update
uint64 AccumulationCounter = 90;
// accumulated socket XCC busy
uint64 SocketGFXBusyAccumulated = 91;
// GPU XGMI link statistics
repeated GPUXGMILinkStats XGMILinkStats = 67;
// GPU violation statistics
Expand Down
3 changes: 3 additions & 0 deletions sw/nic/gpuagent/svc/gpu_to_proto.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,9 @@ aga_gpu_api_stats_to_proto (GPUStats *proto_stats,
proto_stats->set_fanspeed(stats->fan_speed);
proto_stats->set_gfxactivityaccumulated(stats->gfx_activity_accumulated);
proto_stats->set_memoryactivityaccumulated(stats->mem_activity_accumulated);
proto_stats->set_accumulationcounter(stats->accumulation_counter);
proto_stats->set_socketgfxbusyaccumulated(stats->socket_gfx_busy_accumulated);
proto_stats->set_drambandwidthaccumulated(stats->dram_bandwidth_accumulated);
for (uint32_t i = 0; i < AGA_GPU_MAX_XGMI_LINKS; i++) {
aga_gpu_xgmi_link_stats_to_proto(proto_stats->add_xgmilinkstats(),
&stats->xgmi_link_stats[i]);
Expand Down
Binary file not shown.
Loading