
Commit 1351cd7

SWDEV-539145 - Simplify host memory pool management (#668)
* Remove unnecessary variables and functions; make the code simpler and clearer.
* Change cpu_agent_info_ into a pointer.
* Restore getPreferredNumaNode()
1 parent b568971 commit 1351cd7

4 files changed, +77 -99 lines changed

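At a glance: the commit deletes the per-device pool handles system_segment_, system_coarse_segment_, system_kernarg_segment_, and system_ext_segment_ (plus cpu_agent_) and replaces them with a single cpu_agent_info_ pointer into the static cpu_agents_ table; all pool selection then funnels through one new helper, getHostMemoryPool(). A trimmed sketch of the struct that pointer refers to, limited to the fields this diff actually touches (the real AgentInfo in rocdevice.hpp may carry more members):

// Sketch only -- field list inferred from the uses in this diff.
struct AgentInfo {
  hsa_agent_t agent;                          // CPU agent handle
  hsa_amd_memory_pool_t fine_grain_pool;      // default system pool (was system_segment_)
  hsa_amd_memory_pool_t coarse_grain_pool;    // was system_coarse_segment_
  hsa_amd_memory_pool_t kern_arg_pool;        // was system_kernarg_segment_
  hsa_amd_memory_pool_t ext_fine_grain_pool;  // was system_ext_segment_
};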

rocclr/device/rocm/rocdevice.cpp

Lines changed: 54 additions & 53 deletions
@@ -191,11 +191,8 @@ Device::Device(hsa_agent_t bkendDevice)
     , numOfVgpus_(0)
     , preferred_numa_node_(0)
     , maxSdmaReadMask_(0)
-    , maxSdmaWriteMask_(0) {
+    , maxSdmaWriteMask_(0), cpu_agent_info_(nullptr) {
   group_segment_.handle = 0;
-  system_segment_.handle = 0;
-  system_coarse_segment_.handle = 0;
-  system_kernarg_segment_.handle = 0;
   gpuvm_segment_.handle = 0;
   gpu_fine_grained_segment_.handle = 0;
   gpu_ext_fine_grained_segment_.handle = 0;
@@ -225,20 +222,20 @@ void Device::setupCpuAgent() {
   }
 
   preferred_numa_node_ = index;
-  cpu_agent_ = cpu_agents_[index].agent;
-  system_segment_ = cpu_agents_[index].fine_grain_pool;
-  system_coarse_segment_ = cpu_agents_[index].coarse_grain_pool;
-  system_kernarg_segment_ = cpu_agents_[index].kern_arg_pool;
-  system_ext_segment_ = cpu_agents_[index].ext_fine_grain_pool;
+  cpu_agent_info_ = &cpu_agents_[index];
+
   ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Numa selects cpu agent[%zu]=0x%zx(fine=0x%zx,"
-          "coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index, cpu_agent_.handle,
-          system_segment_.handle, system_coarse_segment_.handle, bkendDevice_.handle, isXgmi_);
+          "coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index,
+          cpu_agent_info_->agent.handle,
+          cpu_agent_info_->fine_grain_pool.handle,
+          cpu_agent_info_->coarse_grain_pool.handle,
+          bkendDevice_.handle, isXgmi_);
 }
 
 void Device::checkAtomicSupport() {
   std::vector<amd::Device::LinkAttrType> link_attrs;
   link_attrs.push_back(std::make_pair(LinkAttribute::kLinkAtomicSupport, 0));
-  if (findLinkInfo(system_segment_, &link_attrs)) {
+  if (findLinkInfo(cpu_agent_info_->fine_grain_pool, &link_attrs)) {
     if (link_attrs[0].second == 1) {
       info_.pcie_atomics_ = true;
     }
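
Note: cpu_agent_info_ now points into the static cpu_agents_ vector, so it stays valid only as long as that vector is never resized after setupCpuAgent() runs; presumably that holds because CPU agents are enumerated once during HSA initialization.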
@@ -863,7 +860,7 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
     // If cpu agent cannot access this pool, the device does not support large bar.
     hsa_amd_memory_pool_access_t tmp{};
     hsa_amd_agent_memory_pool_get_info(
-        dev->cpu_agent_,
+        dev->cpu_agent_info_->agent,
         pool,
         HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
         &tmp);
@@ -1166,7 +1163,7 @@ bool Device::populateOCLDeviceConstants() {
 
   checkAtomicSupport();
 
-  assert(system_segment_.handle != 0);
+  assert(cpu_agent_info_->fine_grain_pool.handle != 0);
   if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
       bkendDevice_, Device::iterateGpuMemoryPoolCallback, this)) {
     return false;
@@ -1286,7 +1283,8 @@ bool Device::populateOCLDeviceConstants() {
 
     if (HSA_STATUS_SUCCESS !=
         hsa_amd_memory_pool_get_info(
-            system_segment_, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &alloc_granularity_)) {
+            cpu_agent_info_->fine_grain_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
+            &alloc_granularity_)) {
       return false;
     }
   }
@@ -2005,46 +2003,54 @@ device::Memory* Device::createMemory(size_t size, size_t alignment) const {
 }
 
 // ================================================================================================
-void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
-  void* ptr = nullptr;
-
+hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,
+                                                const AgentInfo* agentInfo) const {
+  if (agentInfo == nullptr) {
+    agentInfo = cpu_agent_info_;
+  }
   hsa_amd_memory_pool_t segment{0};
   switch (mem_seg) {
     case kKernArg : {
       if (settings().fgs_kernel_arg_) {
-        segment = system_kernarg_segment_;
+        segment = agentInfo->kern_arg_pool;
         break;
       }
       // Falls through on else case.
     }
     case kNoAtomics :
       // If runtime disables barrier, then all host allocations must have L2 disabled
-      if (system_coarse_segment_.handle != 0) {
-        segment = system_coarse_segment_;
+      if (agentInfo->coarse_grain_pool.handle != 0) {
+        segment = agentInfo->coarse_grain_pool;
         break;
       }
       // Falls through on else case.
     case kAtomics :
-      segment = system_segment_;
+      segment = agentInfo->fine_grain_pool;
       break;
     case kUncachedAtomics :
-      if (system_ext_segment_.handle != 0) {
+      if (agentInfo->ext_fine_grain_pool.handle != 0) {
         ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
                 "Using extended fine grained access system memory pool");
-        segment = system_ext_segment_;
+        segment = agentInfo->ext_fine_grain_pool;
       } else {
         ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
                 "Falling through on fine grained access system memory pool");
-        segment = system_segment_;
+        segment = agentInfo->fine_grain_pool;
       }
       break;
     default :
       guarantee(false, "Invalid Memory Segment");
       break;
   }
-
   assert(segment.handle != 0);
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
+  return segment;
+}
+
+// ================================================================================================
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
+  void* ptr = nullptr;
+  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg);
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
   ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx,"
           " numa_node = %d, mem_seg = %d", ptr, size, preferred_numa_node_, static_cast<int>(mem_seg));
   if (stat != HSA_STATUS_SUCCESS) {
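
The case fall-throughs above are easy to misread, so here is the same pool-selection precedence written out linearly as a standalone sketch (stand-in types, not ROCclr code; the real helper also hits guarantee(false) on an out-of-range segment):

#include <cstdint>

struct Pool { uint64_t handle; };            // stand-in for hsa_amd_memory_pool_t
struct Agent {                               // stand-in for AgentInfo
  Pool fine_grain_pool, coarse_grain_pool, kern_arg_pool, ext_fine_grain_pool;
};
enum class Seg { kKernArg, kNoAtomics, kAtomics, kUncachedAtomics };

// Same precedence as the switch in getHostMemoryPool().
Pool pickPool(const Agent& a, Seg s, bool fgs_kernel_arg) {
  if (s == Seg::kKernArg && fgs_kernel_arg) return a.kern_arg_pool;
  if ((s == Seg::kKernArg || s == Seg::kNoAtomics) &&
      a.coarse_grain_pool.handle != 0) return a.coarse_grain_pool;
  if (s == Seg::kUncachedAtomics && a.ext_fine_grain_pool.handle != 0)
    return a.ext_fine_grain_pool;
  return a.fine_grain_pool;                  // kAtomics and all fall-through cases
}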
@@ -2065,28 +2071,8 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) co
 // ================================================================================================
 void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const {
   void* ptr = nullptr;
-  hsa_amd_memory_pool_t segment = agentInfo.fine_grain_pool;
-  switch (mem_seg) {
-    case kNoAtomics :
-      if (agentInfo.coarse_grain_pool.handle != 0) {
-        segment = agentInfo.coarse_grain_pool;
-      }
-      break;
-    case kUncachedAtomics :
-      if (agentInfo.ext_fine_grain_pool.handle != 0) {
-        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
-                "Using extended fine grained access system memory pool in hostAgentAlloc");
-        segment = agentInfo.ext_fine_grain_pool;
-      } else {
-        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
-                "Falling through on fine grained access system memory pool in hostAgentAlloc");
-      }
-      break;
-    default :
-      break;
-  }
-  assert(segment.handle != 0);
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
+  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, &agentInfo);
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
   ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
   if (stat != HSA_STATUS_SUCCESS) {
     LogPrintfError("Fail allocation host memory with err %d", stat);
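
Note a subtle behavioral point visible in the diff: the old hostAgentAlloc switch had no kKernArg arm, so kKernArg requests silently fell back to agentInfo.fine_grain_pool; routed through the shared helper, they now select kern_arg_pool (when fgs_kernel_arg_ is set) or fall through to the coarse-grain pool, matching hostAlloc. Presumably this unification is intentional.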
@@ -2144,6 +2130,21 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
   return ptr;
 }
 
+void* Device::hostLock(void* hostMem, size_t size, const MemorySegment memSegment) const {
+  hsa_amd_memory_pool_t pool = getHostMemoryPool(memSegment);
+  void *deviceMemory = nullptr;
+  hsa_status_t status = hsa_amd_memory_lock_to_pool(hostMem, size,
+      const_cast<hsa_agent_t*>(&bkendDevice_), 1, pool, 0, &deviceMemory);
+  ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool %p, size 0x%zx, hostMem = %p,"
+          " deviceMemory = %p, memSegment = %d", pool, size, hostMem, deviceMemory,
+          static_cast<int>(memSegment));
+  if (status != HSA_STATUS_SUCCESS) {
+    DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status);
+    deviceMemory = nullptr;
+  }
+  return deviceMemory;
+}
+
 void Device::hostFree(void* ptr, size_t size) const { memFree(ptr, size); }
 
 bool Device::deviceAllowAccess(void* ptr) const {
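
A hypothetical call site for the new hostLock helper (not part of this commit; names are illustrative). A pointer locked this way is conventionally released with ROCr's hsa_amd_memory_unlock on the original host pointer:

// Hypothetical usage sketch -- 'device' is a rocm::Device*, error handling elided.
void* dev_ptr = device->hostLock(host_ptr, nbytes, Device::MemorySegment::kNoAtomics);
if (dev_ptr != nullptr) {
  // ... GPU reads/writes go through dev_ptr ...
  hsa_amd_memory_unlock(host_ptr);  // releases the lock taken by hsa_amd_memory_lock_to_pool
}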
@@ -2585,11 +2586,11 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
       case amd::MemRangeAttribute::AccessedBy:
         accessed_by = attr.size();
         // Add all GPU devices into the query
-        for (const auto agent : getGpuAgents()) {
+        for (const auto agent : gpu_agents_) {
           attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent.handle});
         }
         // Add CPU devices
-        for (const auto agent_info : getCpuAgents()) {
+        for (const auto agent_info : cpu_agents_) {
           attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent_info.agent.handle});
         }
         accessed_by = attr.size() - accessed_by;
@@ -2643,7 +2644,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
           }
         }
         // Find CPU agent returned by ROCr
-        for (auto& agent_info : getCpuAgents()) {
+        for (auto& agent_info : cpu_agents_) {
           if (agent_info.agent.handle == it.value) {
             *reinterpret_cast<int32_t*>(data[idx]) = static_cast<int32_t>(amd::CpuDeviceId);
           }
@@ -2678,7 +2679,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
           }
         }
         // Find CPU agent returned by ROCr
-        for (auto& agent_info : getCpuAgents()) {
+        for (auto& agent_info : cpu_agents_) {
           if (agent_info.agent.handle == it.value) {
             reinterpret_cast<int32_t*>(data[idx])[entry] =
                 static_cast<int32_t>(amd::CpuDeviceId);

rocclr/device/rocm/rocdevice.hpp

Lines changed: 11 additions & 16 deletions
@@ -341,10 +341,8 @@ class Device : public NullDevice {
   static bool loadHsaModules();
 
   hsa_agent_t getBackendDevice() const { return bkendDevice_; }
-  const hsa_agent_t &getCpuAgent() const { return cpu_agent_; } // Get the CPU agent with the least NUMA distance to this GPU
-
-  static const std::vector<hsa_agent_t>& getGpuAgents() { return gpu_agents_; }
-  static const std::vector<AgentInfo>& getCpuAgents() { return cpu_agents_; }
+  //! Get the CPU agent with the least NUMA distance to this GPU
+  const hsa_agent_t &getCpuAgent() const { return cpu_agent_info_->agent; }
 
   void setupCpuAgent(); // Setup the CPU agent which has the least NUMA distance to this GPU
 
@@ -408,7 +406,6 @@ class Device : public NullDevice {
   virtual bool globalFreeMemory(size_t* freeMemory) const;
   virtual void* hostAlloc(size_t size, size_t alignment,
                           MemorySegment mem_seg = MemorySegment::kNoAtomics) const;
-
   virtual void hostFree(void* ptr, size_t size = 0) const;
 
   bool deviceAllowAccess(void* dst) const;
@@ -459,6 +456,10 @@ class Device : public NullDevice {
   //! Allocate host memory from agent info
   void* hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const;
 
+  //! Pin a host pointer allocated by C/C++ or OS allocator (i.e. ordinary system DRAM) and
+  //! return a new device pointer accessible by the GPU agent.
+  void* hostLock(void* hostMem, size_t size, MemorySegment memSegment) const;
+
   //! Returns transfer engine object
   const device::BlitManager& xferMgr() const { return xferQueue()->blitMgr(); }
 
@@ -501,10 +502,6 @@ class Device : public NullDevice {
 
   VirtualGPU* xferQueue() const;
 
-  hsa_amd_memory_pool_t SystemSegment() const { return system_segment_; }
-  hsa_amd_memory_pool_t SystemExtSegment() const { return system_ext_segment_; }
-  hsa_amd_memory_pool_t SystemCoarseSegment() const { return system_coarse_segment_; }
-
   //! Acquire HSA queue. This method can create a new HSA queue or
   //! share previously created
   hsa_queue_t* acquireQueue(uint32_t queue_size_hint, bool coop_queue = false,
@@ -547,6 +544,7 @@ class Device : public NullDevice {
   virtual amd::Memory* GetArenaMemObj(const void* ptr, size_t& offset, size_t size = 0);
 
   const uint32_t getPreferredNumaNode() const { return preferred_numa_node_; }
+
   const bool isFineGrainSupported() const;
 
   //! Returns True if memory pointer is known to ROCr (excludes HMM allocations)
@@ -588,8 +586,6 @@ class Device : public NullDevice {
   static bool isHsaInitialized_;
   static std::vector<hsa_agent_t> gpu_agents_;
   static std::vector<AgentInfo> cpu_agents_;
-
-  hsa_agent_t cpu_agent_;
   uint32_t preferred_numa_node_;
   std::vector<hsa_agent_t> p2p_agents_;  //!< List of P2P agents available for this device
   mutable std::mutex lock_allow_access_; //!< To serialize allow_access calls
@@ -598,10 +594,8 @@ class Device : public NullDevice {
   hsa_agent_t* p2p_agents_list_ = nullptr;
   hsa_profile_t agent_profile_;
   hsa_amd_memory_pool_t group_segment_;
-  hsa_amd_memory_pool_t system_segment_;
-  hsa_amd_memory_pool_t system_coarse_segment_;
-  hsa_amd_memory_pool_t system_kernarg_segment_;
-  hsa_amd_memory_pool_t system_ext_segment_;
+
+  AgentInfo *cpu_agent_info_;
 
   hsa_amd_memory_pool_t gpuvm_segment_;
   hsa_amd_memory_pool_t gpu_fine_grained_segment_;
@@ -649,7 +643,8 @@ class Device : public NullDevice {
 
   //! Pool of HSA queues with custom CU masks
   std::vector<std::map<hsa_queue_t*, QueueInfo, QueueCompare>> queueWithCUMaskPool_;
-
+  hsa_amd_memory_pool_t getHostMemoryPool(MemorySegment mem_seg,
+                                          const AgentInfo* agentInfo = nullptr) const;
   //! Read and Write mask for device<->host
   uint32_t maxSdmaReadMask_;
   uint32_t maxSdmaWriteMask_;
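
Design note: getHostMemoryPool is declared private with agentInfo defaulted to nullptr, which the definition resolves to cpu_agent_info_. That default is what lets hostAlloc call it with just the segment, while hostAgentAlloc passes an explicit AgentInfo.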

rocclr/device/rocm/rocmemory.cpp

Lines changed: 4 additions & 30 deletions
@@ -825,10 +825,7 @@ bool Buffer::create(bool alloc_local) {
         deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics);
       }
     } else if (memFlags & CL_MEM_FOLLOW_USER_NUMA_POLICY) {
-      deviceMemory_ = dev().hostNumaAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) == 0
-                                          ? Device::MemorySegment::kNoAtomics :
-                                          ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ?
-                                          Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics));
+      deviceMemory_ = dev().hostNumaAlloc(size(), 1, getHostMemorySegment(memFlags));
     } else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) {
       // TODO: ROCr will introduce a new attribute enum that implies a non-blocking signal,
       // replace "HSA_AMD_SIGNAL_AMD_GPU_ONLY" with this new enum when it is ready.
@@ -852,10 +849,7 @@ bool Buffer::create(bool alloc_local) {
       // Disable host access to force blit path for memeory writes.
       flags_ &= ~HostMemoryDirectAccess;
     } else {
-      deviceMemory_ = dev().hostAlloc(size(), 1, (memFlags & CL_MEM_SVM_ATOMICS) == 0
-                                      ? Device::MemorySegment::kNoAtomics :
-                                      ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ?
-                                      Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics));
+      deviceMemory_ = dev().hostAlloc(size(), 1, getHostMemorySegment(memFlags));
     }
   } else {
     assert(!isHostMemDirectAccess() && "Runtime doesn't support direct access to GPU memory!");
@@ -1012,28 +1006,8 @@ bool Buffer::create(bool alloc_local) {
       owner()->setHostMem(deviceMemory_);
     } else if (owner()->getSvmPtr() != owner()->getHostMem()) {
       if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) {
-        hsa_amd_memory_pool_t pool = dev().SystemSegment(); // Default
-        if ((memFlags & CL_MEM_SVM_ATOMICS) == 0) {
-          if (dev().SystemCoarseSegment().handle != 0) {
-            pool = dev().SystemCoarseSegment();
-          }
-        } else if ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0) {
-          if (dev().SystemExtSegment().handle != 0) {
-            pool = dev().SystemExtSegment();
-            ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
-                    "Using extended fine grained access system memory pool to lock");
-          }
-        }
-        hsa_agent_t hsa_agent = dev().getBackendDevice();
-        hsa_status_t status = hsa_amd_memory_lock_to_pool(owner()->getHostMem(),
-            owner()->getSize(), &hsa_agent, 1, pool, 0, &deviceMemory_);
-        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool %p, size 0x%zx, HostPtr = %p,"
-                " DevPtr = %p, memFlags = 0x%xh", pool, owner()->getSize(),
-                owner()->getHostMem(), deviceMemory_, memFlags);
-        if (status != HSA_STATUS_SUCCESS) {
-          DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status);
-          deviceMemory_ = nullptr;
-        }
+        deviceMemory_ = dev().hostLock(owner()->getHostMem(), owner()->getSize(),
+                                       getHostMemorySegment(memFlags));
       } else {
         deviceMemory_ = owner()->getHostMem();
       }

rocclr/device/rocm/rocmemory.hpp

Lines changed: 8 additions & 0 deletions
@@ -151,6 +151,14 @@ class Memory : public device::Memory {
 
   void* persistent_host_ptr_;  //!< Host accessible pointer for persistent memory
 
+  // Get MemorySegment type in terms of host memory allocation flags
+  Device::MemorySegment getHostMemorySegment(const unsigned int memFlags) {
+    return (memFlags & CL_MEM_SVM_ATOMICS) == 0
+           ? Device::MemorySegment::kNoAtomics :
+           ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ?
+           Device::MemorySegment::kUncachedAtomics : Device::MemorySegment::kAtomics);
+  }
+
 private:
   // Disable copy constructor
   Memory(const Memory&);
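
The ternary above encodes a three-row truth table. A hypothetical sanity check of the mapping, assuming the usual CL/ROCclr flag macros are in scope:

// memFlags                                       -> MemorySegment
//   (CL_MEM_SVM_ATOMICS not set)                 -> kNoAtomics
//   CL_MEM_SVM_ATOMICS                           -> kAtomics
//   CL_MEM_SVM_ATOMICS | ROCCLR_MEM_HSA_UNCACHED -> kUncachedAtomics
// Note: ROCCLR_MEM_HSA_UNCACHED alone still maps to kNoAtomics.
assert(getHostMemorySegment(0) == Device::MemorySegment::kNoAtomics);
assert(getHostMemorySegment(CL_MEM_SVM_ATOMICS) == Device::MemorySegment::kAtomics);
assert(getHostMemorySegment(CL_MEM_SVM_ATOMICS | ROCCLR_MEM_HSA_UNCACHED) ==
       Device::MemorySegment::kUncachedAtomics);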
