Skip to content

Commit

Permalink
rocm_smi_lib: Fix gpu_metrics_v1_5 support
Browse files Browse the repository at this point in the history
Adds support and implement APIs for 'gpu_metrics_v1_5'

Code changes related to the following:
  * gpu metrics 1.5 support
  * Unit tests
  * Examples

Build changes related to the following: None

Change-Id: Ie8917dd63c1dd1a94467b100fa44b634cebe62b6
Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
(cherry picked from commit 373621a)
  • Loading branch information
oliveiradan authored and charis-poag-amd committed Jan 11, 2024
1 parent db2d128 commit 0d13f6d
Show file tree
Hide file tree
Showing 7 changed files with 964 additions and 39 deletions.
91 changes: 86 additions & 5 deletions include/rocm_smi/rocm_smi.h
Original file line number Diff line number Diff line change
Expand Up @@ -918,6 +918,9 @@ struct metrics_table_header_t {
uint8_t content_revision;
/// \endcond
};
/// \cond Ignore in docs.
typedef struct metrics_table_header_t metrics_table_header_t;
/// \endcond

/**
* @brief The following structure holds the gpu metrics values for a device.
Expand All @@ -934,9 +937,14 @@ struct metrics_table_header_t {
#define RSMI_NUM_HBM_INSTANCES 4

/**
* @brief This should match kRSMI_MAX_NUM_VCN
* @brief This should match kRSMI_MAX_NUM_VCNS
*/
#define RSMI_MAX_NUM_VCNS 4

/**
* @brief This should match kRSMI_MAX_JPEG_ENGINES
*/
#define RSMI_MAX_NUM_VCN 4
#define RSMI_MAX_NUM_JPEG_ENGS 32

/**
* @brief This should match kRSMI_MAX_NUM_CLKS
Expand Down Expand Up @@ -1057,7 +1065,7 @@ typedef struct {
uint16_t current_socket_power;

// Utilization (%)
uint16_t vcn_activity[RSMI_MAX_NUM_VCN]; // VCN instances activity percent (encode/decode)
uint16_t vcn_activity[RSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode)

// Clock Lock Status. Each bit corresponds to clock instance
uint32_t gfxclk_lock_status;
Expand Down Expand Up @@ -1091,6 +1099,19 @@ typedef struct {
uint16_t current_vclk0s[RSMI_MAX_NUM_CLKS];
uint16_t current_dclk0s[RSMI_MAX_NUM_CLKS];

/*
* v1.5 additions
*/
// JPEG activity percent (encode/decode)
uint16_t jpeg_activity[RSMI_MAX_NUM_JPEG_ENGS];

// PCIE NAK sent accumulated count
uint32_t pcie_nak_sent_count_acc;

// PCIE NAK received accumulated count
uint32_t pcie_nak_rcvd_count_acc;


/// \endcond
} rsmi_gpu_metrics_t;

Expand Down Expand Up @@ -4366,7 +4387,8 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind);
* Metric multi-valued counter types
*/
typedef uint16_t GPUMetricTempHbm_t[RSMI_NUM_HBM_INSTANCES];
typedef uint16_t GPUMetricVcnActivity_t[RSMI_MAX_NUM_VCN];
typedef uint16_t GPUMetricVcnActivity_t[RSMI_MAX_NUM_VCNS];
typedef uint16_t GPUMetricJpegActivity_t[RSMI_MAX_NUM_JPEG_ENGS];
typedef uint64_t GPUMetricXgmiReadDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS];
typedef uint64_t GPUMetricXgmiWriteDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS];
typedef uint16_t GPUMetricCurrGfxClk_t[RSMI_MAX_NUM_GFX_CLKS];
Expand Down Expand Up @@ -4797,6 +4819,44 @@ rsmi_dev_metrics_pcie_replay_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count
rsmi_status_t
rsmi_dev_metrics_pcie_replay_rover_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value);

/**
* @brief Get the 'pcie_nak_sent_count_acc' from the GPU metrics associated with the device
*
* @details Given a device index @p dv_ind and a pointer to a uint32_t
* @p pcie_nak_sent_count_acc_value, the 'pcie_nak_sent_count_acc' metric
* (PCIE NAK sent accumulated count) will be stored in the location pointed to.
*
* @param[in] dv_ind a device index
*
* @param[inout] pcie_nak_sent_count_acc_value a pointer to uint32_t to which the device gpu
* metric unit will be stored
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
* @retval ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit
* does not exist for the given device
*
*/
rsmi_status_t
rsmi_dev_metrics_pcie_nak_sent_count_acc_get(uint32_t dv_ind, uint32_t* pcie_nak_sent_count_acc_value);

/**
* @brief Get the 'pcie_nak_rcvd_count_acc' from the GPU metrics associated with the device
*
* @details Given a device index @p dv_ind and a pointer to a uint32_t
* @p pcie_nak_rcvd_count_acc_value, the 'pcie_nak_rcvd_count_acc' metric
* (PCIE NAK received accumulated count) will be stored in the location pointed to.
*
* @param[in] dv_ind a device index
*
* @param[inout] pcie_nak_rcvd_count_acc_value a pointer to uint32_t to which the device gpu
* metric unit will be stored
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
* @retval ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit
* does not exist for the given device
*
*/
rsmi_status_t
rsmi_dev_metrics_pcie_nak_rcvd_count_acc_get(uint32_t dv_ind, uint32_t* pcie_nak_rcvd_count_acc_value);

/**
* @brief Get the 'curr_uclk' from the GPU metrics associated with the device
*
Expand Down Expand Up @@ -4852,7 +4912,7 @@ rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_valu
*
* @param[inout] vcn_activity_value a pointer to uint16_t to which the device gpu
* metric unit will be stored
* - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_VCN)
* - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_VCNS)
* element array (GPUMetricVcnActivity_t)
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
Expand All @@ -4863,6 +4923,27 @@ rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_valu
rsmi_status_t
rsmi_dev_metrics_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivity_t* vcn_activity_value);

/**
* @brief Get the 'jpeg_activity' from the GPU metrics associated with the device
*
* @details Given a device index @p dv_ind and a pointer to a
* GPUMetricJpegActivity_t (an array of uint16_t) @p jpeg_activity_value,
* the 'jpeg_activity' values will be stored in the array pointed to.
*
* @param[in] dv_ind a device index
*
* @param[inout] jpeg_activity_value a pointer to the uint16_t array to which the
* device gpu metric unit will be stored
* - This is a multi-valued counter holding a 32-element
* (RSMI_MAX_NUM_JPEG_ENGS) array (GPUMetricJpegActivity_t)
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
* @retval ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit
* does not exist for the given device
*
*/
rsmi_status_t
rsmi_dev_metrics_jpeg_activity_get(uint32_t dv_ind, GPUMetricJpegActivity_t* jpeg_activity_value);

/**
* @brief Get the 'xgmi_read_data' from the GPU metrics associated with the device
*
Expand Down
132 changes: 129 additions & 3 deletions include/rocm_smi/rocm_smi_gpu_metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,10 @@ constexpr uint32_t kRSMI_MAX_NUM_GFX_CLKS = 8;
constexpr uint32_t kRSMI_MAX_NUM_CLKS = 4;

// Note: This *must* match NUM_VCN
constexpr uint32_t kRSMI_MAX_NUM_VCN = 4;
constexpr uint32_t kRSMI_MAX_NUM_VCNS = 4;

// Note: This *must* match NUM_JPEG_ENG
constexpr uint32_t kRSMI_MAX_JPEG_ENGINES = 32;


struct AMDGpuMetricsHeader_v1_t
Expand Down Expand Up @@ -326,7 +329,7 @@ struct AMDGpuMetrics_v14_t
// Utilization (%)
uint16_t m_average_gfx_activity;
uint16_t m_average_umc_activity; // memory controller
uint16_t m_vcn_activity[kRSMI_MAX_NUM_VCN]; // VCN instances activity percent (encode/decode)
uint16_t m_vcn_activity[kRSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode)

// Energy (15.259uJ (2^-16) units)
uint64_t m_energy_accumulator;
Expand Down Expand Up @@ -383,7 +386,89 @@ struct AMDGpuMetrics_v14_t

uint16_t m_padding;
};
using AMGpuMetricsLatest_t = AMDGpuMetrics_v14_t;

// Internal representation of the GPU metrics table, revision 1.5 (per the
// struct name). The members marked "v1.5" below are the ones the public header
// lists as v1.5 additions: JPEG activity and the PCIE NAK sent/received
// accumulated counts.
//
// NOTE(review): member order and widths presumably have to mirror the
// driver-provided gpu_metrics v1.5 binary layout -- confirm against the kernel
// definition before adding, removing or reordering any member.
struct AMDGpuMetrics_v15_t
{
// Defaulted destructor; the struct holds only plain integer members and arrays.
~AMDGpuMetrics_v15_t() = default;

// Common table header shared by the metrics table versions.
struct AMDGpuMetricsHeader_v1_t m_common_header;

// Temperature (Celsius). It will be zero (0) if unsupported.
uint16_t m_temperature_hotspot;
uint16_t m_temperature_mem;
uint16_t m_temperature_vrsoc;

// Power (Watts)
uint16_t m_current_socket_power;

// Utilization (%)
uint16_t m_average_gfx_activity;
uint16_t m_average_umc_activity; // memory controller
uint16_t m_vcn_activity[kRSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode)
uint16_t m_jpeg_activity[kRSMI_MAX_JPEG_ENGINES]; // v1.5: JPEG activity percent (encode/decode)

// Energy (15.259uJ (2^-16) units)
uint64_t m_energy_accumulator;

// Driver attached timestamp (in ns)
uint64_t m_system_clock_counter;

// Throttle status
uint32_t m_throttle_status;

// Clock Lock Status. Each bit corresponds to clock instance
uint32_t m_gfxclk_lock_status;

// Link width (number of lanes) and speed (in 0.1 GT/s)
uint16_t m_pcie_link_width;
uint16_t m_pcie_link_speed; // in 0.1 GT/s

// XGMI bus width and bitrate (in Gbps)
uint16_t m_xgmi_link_width;
uint16_t m_xgmi_link_speed;

// Utilization Accumulated (%)
uint32_t m_gfx_activity_acc;
uint32_t m_mem_activity_acc;

// PCIE accumulated bandwidth (GB/sec)
uint64_t m_pcie_bandwidth_acc;

// PCIE instantaneous bandwidth (GB/sec)
uint64_t m_pcie_bandwidth_inst;

// PCIE L0 to recovery state transition accumulated count
uint64_t m_pcie_l0_to_recov_count_acc;

// PCIE replay accumulated count
uint64_t m_pcie_replay_count_acc;

// PCIE replay rollover accumulated count
uint64_t m_pcie_replay_rover_count_acc;

// v1.5: PCIE NAK sent accumulated count
uint32_t m_pcie_nak_sent_count_acc;

// v1.5: PCIE NAK received accumulated count
uint32_t m_pcie_nak_rcvd_count_acc;

// XGMI accumulated data transfer size (KiloBytes), one entry per XGMI link
uint64_t m_xgmi_read_data_acc[kRSMI_MAX_NUM_XGMI_LINKS];
uint64_t m_xgmi_write_data_acc[kRSMI_MAX_NUM_XGMI_LINKS];

// PMFW attached timestamp (10ns resolution)
uint64_t m_firmware_timestamp;

// Current clocks (MHz)
uint16_t m_current_gfxclk[kRSMI_MAX_NUM_GFX_CLKS];
uint16_t m_current_socclk[kRSMI_MAX_NUM_CLKS];
uint16_t m_current_vclk0[kRSMI_MAX_NUM_CLKS];
uint16_t m_current_dclk0[kRSMI_MAX_NUM_CLKS];
uint16_t m_current_uclk;

// NOTE(review): presumably explicit tail padding to keep the table size
// aligned -- confirm; not a metric value.
uint16_t m_padding;
};
using AMGpuMetricsLatest_t = AMDGpuMetrics_v15_t;

/**
* This is GPU Metrics version that gets to public access.
Expand All @@ -410,6 +495,9 @@ using GPUMetricTempHbmTbl_t = GpuMetricU16Tbl_t;
using GPUMetricVcnActivity_t = decltype(AMDGpuMetrics_v14_t::m_vcn_activity);
using GPUMetricVcnActivityTbl_t = GpuMetricU16Tbl_t;

using GPUMetricJpegActivity_t = decltype(AMDGpuMetrics_v15_t::m_jpeg_activity);
using GPUMetricJpegActivityTbl_t = GpuMetricU16Tbl_t;

using GPUMetricXgmiReadDataAcc_t = decltype(AMDGpuMetrics_v14_t::m_xgmi_read_data_acc);
using GPUMetricXgmiWriteDataAcc_t = decltype(AMDGpuMetrics_v14_t::m_xgmi_write_data_acc);
using GPUMetricXgmiAccTbl_t = GpuMetricU64Tbl_t;
Expand Down Expand Up @@ -518,6 +606,7 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t
kMetricGfxActivityAccumulator,
kMetricMemActivityAccumulator,
kMetricVcnActivity, //v1.4
kMetricJpegActivity, //v1.5

// kGpuMetricAverageClock counters
kMetricAvgGfxClockFrequency,
Expand Down Expand Up @@ -559,6 +648,8 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t
kMetricPcieL0RecovCountAccumulator, //v1.4
kMetricPcieReplayCountAccumulator, //v1.4
kMetricPcieReplayRollOverCountAccumulator, //v1.4
kMetricPcieNakSentCountAccumulator, //v1.5
kMetricPcieNakReceivedCountAccumulator, //v1.5

// kGpuMetricPowerEnergy counters
kMetricAvgSocketPower,
Expand Down Expand Up @@ -608,6 +699,7 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t
kGpuMetricV12 = (0x1 << 2),
kGpuMetricV13 = (0x1 << 3),
kGpuMetricV14 = (0x1 << 4),
kGpuMetricV15 = (0x1 << 5),
};
using AMDGpuMetricVersionTranslationTbl_t = std::map<uint16_t, AMDGpuMetricVersionFlags_t>;
using GpuMetricTypePtr_t = std::shared_ptr<void>;
Expand Down Expand Up @@ -780,6 +872,40 @@ class GpuMetricsBase_v14_t final : public GpuMetricsBase_t

};

class GpuMetricsBase_v15_t final : public GpuMetricsBase_t
{
public:
~GpuMetricsBase_v15_t() = default;

size_t sizeof_metric_table() override {
return sizeof(AMDGpuMetrics_v15_t);
}

GpuMetricTypePtr_t get_metrics_table() override
{
if (!m_gpu_metric_ptr) {
m_gpu_metric_ptr.reset(&m_gpu_metrics_tbl, [](AMDGpuMetrics_v15_t*){});
}
assert(m_gpu_metric_ptr != nullptr);
return m_gpu_metric_ptr;
}

void dump_internal_metrics_table() override;

AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override
{
return AMDGpuMetricVersionFlags_t::kGpuMetricV15;
}

rsmi_status_t populate_metrics_dynamic_tbl() override;
AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override;


private:
AMDGpuMetrics_v15_t m_gpu_metrics_tbl;
std::shared_ptr<AMDGpuMetrics_v15_t> m_gpu_metric_ptr;

};

template<typename T>
rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, T& metric_value);
Expand Down
7 changes: 7 additions & 0 deletions rocm_smi/example/rocm_smi_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -990,6 +990,7 @@ int main() {
metrics_table_header_t header_values;
GPUMetricTempHbm_t hbm_values;
GPUMetricVcnActivity_t vcn_values;
GPUMetricJpegActivity_t jpeg_values;
GPUMetricXgmiReadDataAcc_t xgmi_read_values;
GPUMetricXgmiWriteDataAcc_t xgmi_write_values;
GPUMetricCurrGfxClk_t curr_gfxclk_values;
Expand Down Expand Up @@ -1039,6 +1040,8 @@ int main() {
std::cout << "\t -> average_mm_activity(): " << print_error_or_value(ret, val_ui16) << "\n";
ret = rsmi_dev_metrics_vcn_activity_get(i, &vcn_values);
std::cout << "\t -> vcn_activity(): " << print_error_or_value(ret, vcn_values) << "\n";
ret = rsmi_dev_metrics_jpeg_activity_get(i, &jpeg_values);
std::cout << "\t -> jpeg_activity(): " << print_error_or_value(ret, jpeg_values) << "\n";
ret = rsmi_dev_metrics_mem_activity_acc_get(i, &val_ui32);
std::cout << "\t -> mem_activity_accum(): " << print_error_or_value(ret, val_ui32) << "\n";
ret = rsmi_dev_metrics_gfx_activity_acc_get(i, &val_ui32);
Expand Down Expand Up @@ -1119,6 +1122,10 @@ int main() {
std::cout << "\t -> xgmi_read_data(): " << print_error_or_value(ret, xgmi_read_values) << "\n";
ret = rsmi_dev_metrics_xgmi_write_data_get(i, &xgmi_write_values);
std::cout << "\t -> xgmi_write_data(): " << print_error_or_value(ret, xgmi_write_values) << "\n";
ret = rsmi_dev_metrics_pcie_nak_sent_count_acc_get(i, &val_ui32);
std::cout << "\t -> pcie_nak_sent_count_accum(): " << print_error_or_value(ret, val_ui32) << "\n";
ret = rsmi_dev_metrics_pcie_nak_rcvd_count_acc_get(i, &val_ui32);
std::cout << "\t -> pcie_nak_rcvd_count_accum(): " << print_error_or_value(ret, val_ui32) << "\n";

std::cout << "\n";
std::cout << "\t[Voltage]" << "\n";
Expand Down
Loading

0 comments on commit 0d13f6d

Please sign in to comment.