diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 460f69dc6aa..ce098c1a83f 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -7548,11 +7548,18 @@ def node(self, args, multiple_devices=False, nodes=None, power_management=None, if isinstance(npm_info, dict): limit = npm_info.get('limit', "N/A") status = npm_info.get('status', npm_info.get('current', "N/A")) + ubb_power = npm_info.get('ubb_power', "N/A") + ubb_power_limit = npm_info.get('ubb_power_limit', "N/A") - if limit !="N/A": + if limit != "N/A": npm_dict['limit'] = limit status = "DISABLED" if status == amdsmi_interface.amdsmi_wrapper.AMDSMI_NPM_STATUS_DISABLED else "ENABLED" npm_dict.update({"status": status}) + # Add UBB power info if available (not UINT64_MAX sentinel) + if ubb_power != "N/A" and ubb_power != 0xFFFFFFFFFFFFFFFF: + npm_dict['ubb_power'] = ubb_power + if ubb_power_limit != "N/A" and ubb_power_limit != 0xFFFFFFFFFFFFFFFF: + npm_dict['ubb_power_limit'] = ubb_power_limit # Get base board temperatures using node_handle if args.base_board_temps: diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 6da7fc61436..e2ebb4c3e71 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -2234,7 +2234,9 @@ typedef enum { typedef struct { amdsmi_npm_status_t status; //!< NPM status (enabled/disabled). uint64_t limit; //!< Node-level power limit in Watts. - uint64_t reserved[6]; + uint64_t ubb_power; //!< Current UBB (baseboard) power in Watts. + uint64_t ubb_power_limit; //!< UBB power limit threshold in Watts. 
+ uint64_t reserved[4]; } amdsmi_npm_info_t; /** diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 8830a228098..5c7ecce3520 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -4638,6 +4638,8 @@ def amdsmi_get_npm_info(node_handle: processor_handle_t) -> Dict[str, Any]: dict_ret = { "limit": npm_info.limit, "status": npm_info.status, + "ubb_power": npm_info.ubb_power, + "ubb_power_limit": npm_info.ubb_power_limit, } return dict_ret diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index 97f6869cc28..b52d208fae6 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -2353,7 +2353,9 @@ class struct_amdsmi_npm_info_t(Structure): ('status', amdsmi_npm_status_t), ('PADDING_0', ctypes.c_ubyte * 4), ('limit', ctypes.c_uint64), - ('reserved', ctypes.c_uint64 * 6), + ('ubb_power', ctypes.c_uint64), + ('ubb_power_limit', ctypes.c_uint64), + ('reserved', ctypes.c_uint64 * 4), ] amdsmi_npm_info_t = struct_amdsmi_npm_info_t diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index 0eeef15ad40..96089ac37d5 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -613,7 +613,9 @@ typedef struct { rsmi_npm_status_t status; //!< NPM status (enabled/disabled). uint64_t limit; //!< Node-level power limit in Watts. - uint64_t reserved[6]; + uint64_t ubb_power; //!< Current UBB (baseboard) power in Watts. + uint64_t ubb_power_limit; //!< UBB power limit threshold in Watts. 
+ uint64_t reserved[4]; } rsmi_npm_info_t; /** diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h index 7d6674463bb..16c57522914 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -162,6 +162,8 @@ enum DevInfoTypes { kDevBaseBoardTempMetrics, kDevGpuBoardTempMetrics, kDevGpuReset, + kDevBaseBoardPower, + kDevBaseBoardPowerLimit, kDevAvailableComputePartition, kDevComputePartition, kDevMemoryPartition, diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_npm.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_npm.h index f728fceacc6..49efdf7c1fd 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_npm.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_npm.h @@ -1,23 +1,6 @@ /* * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * See LICENSE file for full license text. */ #ifndef ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_NPM_H_ @@ -26,14 +9,15 @@ #include "rocm_smi/rocm_smi.h" #include <string> - namespace amd::smi { -rsmi_status_t get_npm_board_status(const std::string &board_path, - bool *enabled); +// NPM board status and limit queries +rsmi_status_t get_npm_board_status(const std::string &board_path, bool *enabled); +rsmi_status_t get_npm_board_limit(const std::string &board_path, uint64_t *limit); -rsmi_status_t get_npm_board_limit(const std::string &board_path, - uint64_t *limit); +// UBB (baseboard) power queries +rsmi_status_t get_ubb_power(const std::string &board_path, uint64_t *power); +rsmi_status_t get_ubb_power_limit(const std::string &board_path, uint64_t *limit); -} +} // namespace amd::smi #endif // ROCM_SMI_INCLUDE_ROCM_SMI_ROCM_SMI_NPM_H_ diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi.cc b/projects/amdsmi/rocm_smi/src/rocm_smi.cc index 6d604799c8a..bfd9d99b877 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi.cc @@ -3317,10 +3317,18 @@ rsmi_dev_npm_info_get(uint32_t dv_ind, uintptr_t node_handle, npm_limit = UINT64_MAX; } + // Get UBB power and limit (optional - don't fail if not available) + uint64_t ubb_power = UINT64_MAX; + uint64_t ubb_power_limit = UINT64_MAX; + amd::smi::get_ubb_power(*board_path_str, &ubb_power); + amd::smi::get_ubb_power_limit(*board_path_str, &ubb_power_limit); + // fill output std::memset(npm_info, 0, sizeof(*npm_info)); npm_info->status = npm_status ? 
RSMI_NPM_STATUS_ENABLED : RSMI_NPM_STATUS_DISABLED; npm_info->limit = npm_limit; + npm_info->ubb_power = ubb_power; + npm_info->ubb_power_limit = ubb_power_limit; ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning " << getRSMIStatusString(RSMI_STATUS_SUCCESS); diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc index e6258690b88..5d7d5fd5535 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc @@ -119,6 +119,8 @@ static const char *kDevPmMetricsFName = "pm_metrics"; // PM log static const char *kDevRegMetricsFName = "reg_state"; // register table static const char *kDevBaseBoardTempMetricsFName = "board/baseboard_temp"; static const char *kDevGpuBoardTempMetricsFName = "board/gpuboard_temp"; +static const char *kDevBaseBoardPowerFName = "board/baseboard_power"; +static const char *kDevBaseBoardPowerLimitFName = "board/baseboard_power_limit"; static const char *kDevPtlSupportedFName = "ptl/ptl_supported_formats"; // Only used internally for verification static const char *kDevPtlStatusFName = "ptl/ptl_enable"; static const char *kDevPtlFormatFName = "ptl/ptl_format"; @@ -334,6 +336,8 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = { {kDevRegMetrics, kDevRegMetricsFName}, {kDevBaseBoardTempMetrics, kDevBaseBoardTempMetricsFName}, {kDevGpuBoardTempMetrics, kDevGpuBoardTempMetricsFName}, + {kDevBaseBoardPower, kDevBaseBoardPowerFName}, + {kDevBaseBoardPowerLimit, kDevBaseBoardPowerLimitFName}, {kDevPtlSupported, kDevPtlSupportedFName}, {kDevPtlStatus, kDevPtlStatusFName}, {kDevPtlFormat, kDevPtlFormatFName}, @@ -511,6 +515,8 @@ Device::devInfoTypesStrings = { {kDevRegMetrics, "kDevRegMetrics"}, {kDevBaseBoardTempMetrics, "kDevBaseBoardTempMetrics"}, {kDevGpuBoardTempMetrics, "kDevGpuBoardTempMetrics"}, + {kDevBaseBoardPower, "kDevBaseBoardPower"}, + {kDevBaseBoardPowerLimit, "kDevBaseBoardPowerLimit"}, {kDevGpuReset, "kDevGpuReset"}, 
{kDevAvailableComputePartition, "kDevAvailableComputePartition"}, {kDevComputePartition, "kDevComputePartition"}, diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_npm.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_npm.cc index 561271b5830..370dfb43408 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_npm.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_npm.cc @@ -97,4 +97,55 @@ rsmi_status_t get_npm_board_limit(const std::string &board_path, uint64_t *limit } } + +rsmi_status_t get_ubb_power(const std::string &board_path, uint64_t *power) { + if (power == nullptr) return RSMI_STATUS_INVALID_ARGS; + if (board_path.empty()) return RSMI_STATUS_INVALID_ARGS; + + fs::path bd(board_path); + if (!fs::exists(bd) || !fs::is_directory(bd)) return RSMI_STATUS_NOT_SUPPORTED; + + fs::path p = bd / "baseboard_power"; + if (!fs::exists(p) || !fs::is_regular_file(p)) return RSMI_STATUS_NOT_SUPPORTED; + + std::string s; + rsmi_status_t r = read_npm_file(p, s); + if (r != RSMI_STATUS_SUCCESS) return RSMI_STATUS_NOT_SUPPORTED; + + try { + size_t idx = 0; + unsigned long long v = std::stoull(s, &idx, 10); + if (idx != s.size()) return RSMI_STATUS_UNEXPECTED_DATA; + *power = static_cast<uint64_t>(v); + return RSMI_STATUS_SUCCESS; + } catch (...) 
{ + return RSMI_STATUS_UNEXPECTED_DATA; + } +} + +rsmi_status_t get_ubb_power_limit(const std::string &board_path, uint64_t *limit) { + if (limit == nullptr) return RSMI_STATUS_INVALID_ARGS; + if (board_path.empty()) return RSMI_STATUS_INVALID_ARGS; + + fs::path bd(board_path); + if (!fs::exists(bd) || !fs::is_directory(bd)) return RSMI_STATUS_NOT_SUPPORTED; + + fs::path p = bd / "baseboard_power_limit"; + if (!fs::exists(p) || !fs::is_regular_file(p)) return RSMI_STATUS_NOT_SUPPORTED; + + std::string s; + rsmi_status_t r = read_npm_file(p, s); + if (r != RSMI_STATUS_SUCCESS) return RSMI_STATUS_NOT_SUPPORTED; + + try { + size_t idx = 0; + unsigned long long v = std::stoull(s, &idx, 10); + if (idx != s.size()) return RSMI_STATUS_UNEXPECTED_DATA; + *limit = static_cast<uint64_t>(v); + return RSMI_STATUS_SUCCESS; + } catch (...) { + return RSMI_STATUS_UNEXPECTED_DATA; + } +} + } // end namespace