@@ -191,11 +191,8 @@ Device::Device(hsa_agent_t bkendDevice)
     , numOfVgpus_(0)
     , preferred_numa_node_(0)
     , maxSdmaReadMask_(0)
-    , maxSdmaWriteMask_(0) {
+    , maxSdmaWriteMask_(0), cpu_agent_info_(nullptr) {
   group_segment_.handle = 0;
-  system_segment_.handle = 0;
-  system_coarse_segment_.handle = 0;
-  system_kernarg_segment_.handle = 0;
   gpuvm_segment_.handle = 0;
   gpu_fine_grained_segment_.handle = 0;
   gpu_ext_fine_grained_segment_.handle = 0;
@@ -225,20 +222,20 @@ void Device::setupCpuAgent() {
   }
 
   preferred_numa_node_ = index;
-  cpu_agent_ = cpu_agents_[index].agent;
-  system_segment_ = cpu_agents_[index].fine_grain_pool;
-  system_coarse_segment_ = cpu_agents_[index].coarse_grain_pool;
-  system_kernarg_segment_ = cpu_agents_[index].kern_arg_pool;
-  system_ext_segment_ = cpu_agents_[index].ext_fine_grain_pool;
+  cpu_agent_info_ = &cpu_agents_[index];
+
   ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Numa selects cpu agent[%zu]=0x%zx(fine=0x%zx,"
-          " coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index, cpu_agent_.handle,
-          system_segment_.handle, system_coarse_segment_.handle, bkendDevice_.handle, isXgmi_);
+          " coarse=0x%zx) for gpu agent=0x%zx CPU<->GPU XGMI=%d", index,
+          cpu_agent_info_->agent.handle,
+          cpu_agent_info_->fine_grain_pool.handle,
+          cpu_agent_info_->coarse_grain_pool.handle,
+          bkendDevice_.handle, isXgmi_);
 }
 
 void Device::checkAtomicSupport() {
   std::vector<amd::Device::LinkAttrType> link_attrs;
   link_attrs.push_back(std::make_pair(LinkAttribute::kLinkAtomicSupport, 0));
-  if (findLinkInfo(system_segment_, &link_attrs)) {
+  if (findLinkInfo(cpu_agent_info_->fine_grain_pool, &link_attrs)) {
     if (link_attrs[0].second == 1) {
       info_.pcie_atomics_ = true;
     }
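Note on the refactor so far: the per-pool Device members (system_segment_, system_coarse_segment_, system_kernarg_segment_, system_ext_segment_) are replaced by a single cached pointer, cpu_agent_info_, into the cpu_agents_ table. A minimal sketch of the AgentInfo shape this code assumes, inferred only from the fields referenced in this diff (the authoritative definition lives in the ROCclr headers and may contain more members):

// Sketch only: field set inferred from the usages above, not the real declaration.
struct AgentInfo {
  hsa_agent_t agent;                          // CPU agent handle
  hsa_amd_memory_pool_t fine_grain_pool;      // default fine-grained system pool
  hsa_amd_memory_pool_t coarse_grain_pool;    // coarse-grained pool (handle may be 0)
  hsa_amd_memory_pool_t kern_arg_pool;        // kernarg pool
  hsa_amd_memory_pool_t ext_fine_grain_pool;  // extended fine-grained pool (handle may be 0)
};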
@@ -863,7 +860,7 @@ hsa_status_t Device::iterateGpuMemoryPoolCallback(hsa_amd_memory_pool_t pool, vo
       // If cpu agent cannot access this pool, the device does not support large bar.
       hsa_amd_memory_pool_access_t tmp{};
       hsa_amd_agent_memory_pool_get_info(
-          dev->cpu_agent_,
+          dev->cpu_agent_info_->agent,
           pool,
           HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS,
           &tmp);
@@ -1166,7 +1163,7 @@ bool Device::populateOCLDeviceConstants() {
 
   checkAtomicSupport();
 
-  assert(system_segment_.handle != 0);
+  assert(cpu_agent_info_->fine_grain_pool.handle != 0);
   if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
           bkendDevice_, Device::iterateGpuMemoryPoolCallback, this)) {
     return false;
@@ -1286,7 +1283,8 @@ bool Device::populateOCLDeviceConstants() {
 
     if (HSA_STATUS_SUCCESS !=
         hsa_amd_memory_pool_get_info(
-            system_segment_, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &alloc_granularity_)) {
+            cpu_agent_info_->fine_grain_pool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE,
+            &alloc_granularity_)) {
       return false;
     }
   }
@@ -2005,46 +2003,54 @@ device::Memory* Device::createMemory(size_t size, size_t alignment) const {
 }
 
 // ================================================================================================
-void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
-  void* ptr = nullptr;
-
+hsa_amd_memory_pool_t Device::getHostMemoryPool(MemorySegment mem_seg,
+                                                const AgentInfo* agentInfo) const {
+  if (agentInfo == nullptr) {
+    agentInfo = cpu_agent_info_;
+  }
   hsa_amd_memory_pool_t segment{0};
   switch (mem_seg) {
     case kKernArg: {
       if (settings().fgs_kernel_arg_) {
-        segment = system_kernarg_segment_;
+        segment = agentInfo->kern_arg_pool;
         break;
       }
       // Falls through on else case.
     }
     case kNoAtomics:
       // If runtime disables barrier, then all host allocations must have L2 disabled
-      if (system_coarse_segment_.handle != 0) {
-        segment = system_coarse_segment_;
+      if (agentInfo->coarse_grain_pool.handle != 0) {
+        segment = agentInfo->coarse_grain_pool;
         break;
       }
       // Falls through on else case.
     case kAtomics:
-      segment = system_segment_;
+      segment = agentInfo->fine_grain_pool;
       break;
     case kUncachedAtomics:
-      if (system_ext_segment_.handle != 0) {
+      if (agentInfo->ext_fine_grain_pool.handle != 0) {
         ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
                 "Using extended fine grained access system memory pool");
-        segment = system_ext_segment_;
+        segment = agentInfo->ext_fine_grain_pool;
       } else {
         ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
                 "Falling through on fine grained access system memory pool");
-        segment = system_segment_;
+        segment = agentInfo->fine_grain_pool;
       }
       break;
     default:
       guarantee(false, "Invalid Memory Segment");
       break;
   }
-
   assert(segment.handle != 0);
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
+  return segment;
+}
+
+// ================================================================================================
+void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const {
+  void* ptr = nullptr;
+  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg);
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
   ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx,"
           " numa_node = %d, mem_seg = %d", ptr, size, preferred_numa_node_, static_cast<int>(mem_seg));
   if (stat != HSA_STATUS_SUCCESS) {
@@ -2065,28 +2071,8 @@ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) co
 // ================================================================================================
 void* Device::hostAgentAlloc(size_t size, const AgentInfo& agentInfo, MemorySegment mem_seg) const {
   void* ptr = nullptr;
-  hsa_amd_memory_pool_t segment = agentInfo.fine_grain_pool;
-  switch (mem_seg) {
-    case kNoAtomics:
-      if (agentInfo.coarse_grain_pool.handle != 0) {
-        segment = agentInfo.coarse_grain_pool;
-      }
-      break;
-    case kUncachedAtomics:
-      if (agentInfo.ext_fine_grain_pool.handle != 0) {
-        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
-                "Using extended fine grained access system memory pool in hostAgentAlloc");
-        segment = agentInfo.ext_fine_grain_pool;
-      } else {
-        ClPrint(amd::LOG_DEBUG, amd::LOG_MEM,
-                "Falling through on fine grained access system memory pool in hostAgentAlloc");
-      }
-      break;
-    default:
-      break;
-  }
-  assert(segment.handle != 0);
-  hsa_status_t stat = hsa_amd_memory_pool_allocate(segment, size, 0, &ptr);
+  hsa_amd_memory_pool_t pool = getHostMemoryPool(mem_seg, &agentInfo);
+  hsa_status_t stat = hsa_amd_memory_pool_allocate(pool, size, 0, &ptr);
   ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Allocate hsa host memory %p, size 0x%zx", ptr, size);
   if (stat != HSA_STATUS_SUCCESS) {
     LogPrintfError("Fail allocation host memory with err %d", stat);
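With pool selection centralized in getHostMemoryPool(), hostAlloc() and hostAgentAlloc() both reduce to pick-pool-then-allocate. A hedged sketch of the two call shapes this enables, assuming the declaration defaults agentInfo to nullptr (implied by the nullptr check in the definition); agent_info below is an illustrative AgentInfo lvalue, not a name from this change:

// Default path: resolve against the device's cached cpu_agent_info_.
hsa_amd_memory_pool_t p0 = getHostMemoryPool(kAtomics);
// Explicit path: resolve against a specific NUMA agent, as hostAgentAlloc() now does.
hsa_amd_memory_pool_t p1 = getHostMemoryPool(kNoAtomics, &agent_info);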
@@ -2144,6 +2130,21 @@ void* Device::hostNumaAlloc(size_t size, size_t alignment, MemorySegment mem_seg
   return ptr;
 }
 
+void* Device::hostLock(void* hostMem, size_t size, const MemorySegment memSegment) const {
+  hsa_amd_memory_pool_t pool = getHostMemoryPool(memSegment);
+  void* deviceMemory = nullptr;
+  hsa_status_t status = hsa_amd_memory_lock_to_pool(hostMem, size,
+      const_cast<hsa_agent_t*>(&bkendDevice_), 1, pool, 0, &deviceMemory);
+  ClPrint(amd::LOG_DEBUG, amd::LOG_MEM, "Locking to pool %p, size 0x%zx, hostMem = %p,"
+          " deviceMemory = %p, memSegment = %d", pool, size, hostMem, deviceMemory,
+          static_cast<int>(memSegment));
+  if (status != HSA_STATUS_SUCCESS) {
+    DevLogPrintfError("Failed to lock memory to pool, failed with hsa_status: %d \n", status);
+    deviceMemory = nullptr;
+  }
+  return deviceMemory;
+}
+
 void Device::hostFree(void* ptr, size_t size) const { memFree(ptr, size); }
 
 bool Device::deviceAllowAccess(void* ptr) const {
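hostLock() returns the agent-visible pointer produced by hsa_amd_memory_lock_to_pool(), or nullptr on failure. Per the HSA AMD extension API, memory pinned this way is released with hsa_amd_memory_unlock() on the original host pointer. A hedged usage sketch (dev, hostBuf, and bufSize are illustrative names, not from this change):

// Pin a host buffer for the GPU agent, use it, then release the pinning.
void* devPtr = dev.hostLock(hostBuf, bufSize, kAtomics);
if (devPtr != nullptr) {
  // ... submit work that reads/writes devPtr ...
  hsa_amd_memory_unlock(hostBuf);  // unlock takes the original host pointer
}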
@@ -2585,11 +2586,11 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
       case amd::MemRangeAttribute::AccessedBy:
         accessed_by = attr.size();
         // Add all GPU devices into the query
-        for (const auto agent : getGpuAgents()) {
+        for (const auto agent : gpu_agents_) {
           attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent.handle});
         }
         // Add CPU devices
-        for (const auto agent_info : getCpuAgents()) {
+        for (const auto agent_info : cpu_agents_) {
           attr.push_back({HSA_AMD_SVM_ATTRIB_ACCESS_QUERY, agent_info.agent.handle});
         }
         accessed_by = attr.size() - accessed_by;
@@ -2643,7 +2644,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
         }
       }
       // Find CPU agent returned by ROCr
-      for (auto& agent_info : getCpuAgents()) {
+      for (auto& agent_info : cpu_agents_) {
        if (agent_info.agent.handle == it.value) {
          *reinterpret_cast<int32_t*>(data[idx]) = static_cast<int32_t>(amd::CpuDeviceId);
        }
@@ -2678,7 +2679,7 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
         }
       }
       // Find CPU agent returned by ROCr
-      for (auto& agent_info : getCpuAgents()) {
+      for (auto& agent_info : cpu_agents_) {
        if (agent_info.agent.handle == it.value) {
          reinterpret_cast<int32_t*>(data[idx])[entry] =
              static_cast<int32_t>(amd::CpuDeviceId);