Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions projects/clr/rocclr/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1304,6 +1304,7 @@ class VirtualDevice : public amd::ReferenceCountedObject {
virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); }

virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
virtual void ReleaseSdmaEngines() {} //!< Release SDMA engine assignments (ROCm specific)
virtual void ReleaseAllHwQueues() {}
virtual void ReleaseHwQueue() {}

Expand Down
49 changes: 24 additions & 25 deletions projects/clr/rocclr/device/rocm/rocblit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
StagingXferSize(dev().settings().stagedXferSize_),
completeOperation_(false),
context_(nullptr) {
dev().getSdmaRWMasks(&sdmaEngineReadMask_, &sdmaEngineWriteMask_);
}

inline void DmaBlitManager::synchronize() const {
Expand Down Expand Up @@ -475,11 +474,8 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
hsa_agent_t& srcAgent, size_t size,
amd::CopyMetadata& copyMetadata) const {
hsa_status_t status = HSA_STATUS_SUCCESS;

uint32_t copyMask = 0;
uint32_t freeEngineMask = 0;
uint32_t recIdMask = 0;
bool kUseRegularCopyApi = 0;
bool kUseRegularCopyApi = false;
constexpr size_t kRetainCountThreshold = 8;
bool forceSDMA =
(copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::SDMA);
Expand Down Expand Up @@ -508,31 +504,34 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());

if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
copyMask = gpu().getLastUsedSdmaEngine();
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask);
copyMask &= (engine == HwQueueEngine::SdmaRead ? sdmaEngineReadMask_ : sdmaEngineWriteMask_);
if (copyMask == 0) {
// Check SDMA engine status
status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);

if (status == HSA_STATUS_SUCCESS) {
status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &recIdMask);
}
// Check if this VirtualGPU already has an assigned engine with affinity
uint32_t assignedEngineMask = gpu().AssignedSdmaEngine();

if (assignedEngineMask != 0) {
// This VirtualGPU/stream already has an assigned engine - just use it
// Stream ordering handles any busy conditions naturally
copyMask = assignedEngineMask;

ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"Query copy engine status %x, srcAgent %p, "
"dstAgent %p, free_engine_mask 0x%x, rec_engine_mask 0x%x",
status, srcAgent.handle, dstAgent.handle, freeEngineMask, recIdMask);
"Using assigned SDMA engine for VirtualGPU %p: mask=0x%x, engine_type=%d",
&gpu(), copyMask, engine);
} else {
// No assigned engine yet - allocate one using device-level allocator
copyMask = dev().AllocateSdmaEngine(&gpu(), engine, dstAgent, srcAgent);

if (copyMask != 0) {
// Store the assigned engine in the VirtualGPU for future use
gpu().SetAssignedSdmaEngine(copyMask);

// If requested engine is valid and available, use it
if (recIdMask != 0 && (freeEngineMask & recIdMask) != 0) {
copyMask = recIdMask - (recIdMask & (recIdMask - 1));
ClPrint(amd::LOG_INFO, amd::LOG_COPY,
"Allocated new SDMA engine for VirtualGPU %p: mask=0x%x, engine_type=%d",
&gpu(), copyMask, engine);
} else {
// Otherwise use first available engine
copyMask = freeEngineMask - (freeEngineMask & (freeEngineMask - 1));
ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
"Failed to allocate SDMA engine for VirtualGPU %p, falling back to regular copy",
&gpu());
kUseRegularCopyApi = true;
}

gpu().setLastUsedSdmaEngine(copyMask);
}

if (copyMask != 0 && status == HSA_STATUS_SUCCESS) {
Expand Down
2 changes: 0 additions & 2 deletions projects/clr/rocclr/device/rocm/rocblit.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,6 @@ class DmaBlitManager : public device::HostBlitManager {

bool completeOperation_; //!< DMA blit manager must complete operation
amd::Context* context_; //!< A dummy context
uint32_t sdmaEngineReadMask_; //!< SDMA Engine Read Mask
uint32_t sdmaEngineWriteMask_; //!< SDMA Engine Write Mask

private:
//! Disable copy constructor
Expand Down
141 changes: 138 additions & 3 deletions projects/clr/rocclr/device/rocm/rocdevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ Device::Device(hsa_agent_t bkendDevice)
preferred_numa_node_(0),
maxSdmaReadMask_(0),
maxSdmaWriteMask_(0),
sdma_engine_allocator_(*this),
cpu_agent_info_(nullptr) {
group_segment_.handle = 0;
gpuvm_segment_.handle = 0;
Expand Down Expand Up @@ -3479,9 +3480,143 @@ void Device::HiddenHeapInit(const VirtualGPU& gpu) {
}

// ================================================================================================
void Device::getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const {
*readMask = maxSdmaReadMask_;
*writeMask = maxSdmaWriteMask_;
//! Assigns a single SDMA engine to a VirtualGPU (stream) and records the
//! mapping in vgpu_to_engine_. Returns a one-bit engine mask, or 0 when no
//! engine could be assigned. Thread-safe: all state is guarded by lock_.
uint32_t Device::SdmaEngineAllocator::AllocateEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
                                                     hsa_agent_t dstAgent, hsa_agent_t srcAgent) {
  amd::ScopedLock lock(lock_);

  // Get valid engine mask based on operation type (read vs write).
  // Note: only SdmaRead selects the read mask; every other type (including
  // inter-GPU) falls through to the write mask.
  uint32_t validEngineMask = (engine_type == HwQueueEngine::SdmaRead)
                                 ? device_.maxSdmaReadMask_
                                 : device_.maxSdmaWriteMask_;

  // Simple round-robin path if all engines have equal bandwidth.
  // Disabled by default - use preferred engine logic for current GPUs.
  constexpr bool kUseSimpleRR = false;

  if (kUseSimpleRR) {
    // Simple round-robin: just cycle through valid engines.
    // This will be enabled for future GPUs where engine selection doesn't matter.
    if (validEngineMask == 0) {
      ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
              "No valid SDMA engines for VirtualGPU %p", vgpu);
      return 0;
    }

    // Cycle through bit positions, find next valid engine.
    // next_rr_engine_ is atomic although we already hold lock_ (harmless).
    uint32_t start_bit = next_rr_engine_.fetch_add(1, std::memory_order_relaxed);
    uint32_t selected_mask = 0;

    // Try up to 32 positions to find a valid engine; guaranteed to hit one
    // because validEngineMask != 0 was checked above.
    for (uint32_t i = 0; i < 32; ++i) {
      uint32_t bit = (start_bit + i) % 32;
      uint32_t mask = 1u << bit;
      if (validEngineMask & mask) {
        selected_mask = mask;
        break;
      }
    }

    vgpu_to_engine_[vgpu] = selected_mask;

    ClPrint(amd::LOG_INFO, amd::LOG_COPY,
            "Assigned SDMA engine (simple RR) to VirtualGPU %p: mask=0x%x, engine_type=%d",
            vgpu, selected_mask, engine_type);

    return selected_mask;
  }

  // Current path: Query HSA for engine status and preferences.
  uint32_t freeEngineMask = 0;
  uint32_t preferredMask = 0;
  hsa_status_t status = HSA_STATUS_SUCCESS;

  // Query current engine status.
  status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
  if (status == HSA_STATUS_SUCCESS) {
    // Query preferred (high-bandwidth) engines.
    status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &preferredMask);
  }

  // Constrain to valid engines.
  // NOTE(review): freeEngineMask is only logged below; engine selection relies
  // on the exclusivity/preference masks, not on current busy state — confirm
  // this is intended (stream ordering is expected to absorb busy engines).
  freeEngineMask &= validEngineMask;
  preferredMask &= validEngineMask;

  ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
          "Engine query for VirtualGPU %p: status=%x, free_mask=0x%x, preferred_mask=0x%x, "
          "valid_mask=0x%x, engine_type=%d",
          vgpu, status, freeEngineMask, preferredMask, validEngineMask, engine_type);

  uint32_t candidate_mask = 0;
  uint32_t allocated_mask = 0;

  // For inter-GPU copies, strongly prefer the recommended engines.
  bool is_inter_gpu = (engine_type == HwQueueEngine::SdmaInter);

  if (is_inter_gpu && (preferredMask != 0)) {
    // Inter-GPU: prioritize preferredMask, even if engines are already allocated
    // (engine sharing is allowed on this path).
    candidate_mask = validEngineMask & preferredMask;

    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
            "Inter-GPU copy for VirtualGPU %p: prioritizing preferred engines, "
            "candidate_mask=0x%x",
            vgpu, candidate_mask);
  } else {
    // Regular read/write/intra: enforce exclusivity (don't share engines).
    // Build a mask of engines already allocated to other VirtualGPUs.
    for (const auto& pair : vgpu_to_engine_) {
      allocated_mask |= pair.second;
    }

    uint32_t available_mask = validEngineMask & ~allocated_mask;

    if (available_mask == 0) {
      // Every valid engine is taken; caller falls back to the regular copy API.
      ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
              "No unallocated SDMA engines available for VirtualGPU %p, engine_type=%d "
              "(valid_mask=0x%x, allocated_mask=0x%x)",
              vgpu, engine_type, validEngineMask, allocated_mask);
      return 0;
    }

    // Prefer high-bandwidth (recommended) engines if available.
    candidate_mask = available_mask & preferredMask;
    if (candidate_mask == 0) {
      candidate_mask = available_mask;
    }
  }

  if (candidate_mask == 0) {
    // Only reachable from the inter-GPU branch when the masks don't intersect.
    ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
            "No candidate SDMA engines for VirtualGPU %p, engine_type=%d",
            vgpu, engine_type);
    return 0;
  }

  // Select the lowest set bit (first available engine); (~x + 1) == -x in
  // two's complement, so x & (~x + 1) isolates the least-significant set bit.
  uint32_t selected_mask = candidate_mask & (~candidate_mask + 1);

  // Update the map (overwrites any previous assignment for this vgpu).
  vgpu_to_engine_[vgpu] = selected_mask;

  ClPrint(amd::LOG_INFO, amd::LOG_COPY,
          "Assigned SDMA engine to VirtualGPU %p: mask=0x%x, engine_type=%d, "
          "valid_mask=0x%x, preferred_mask=0x%x, allocated_mask=0x%x, is_inter_gpu=%d",
          vgpu, selected_mask, engine_type, validEngineMask, preferredMask,
          allocated_mask, is_inter_gpu);

  return selected_mask;
}

// ================================================================================================
//! Drops the SDMA engine assignment recorded for a VirtualGPU.
//! No-op when the stream never had an engine assigned.
void Device::SdmaEngineAllocator::ReleaseEngine(VirtualGPU* vgpu) {
  amd::ScopedLock lock(lock_);

  const auto entry = vgpu_to_engine_.find(vgpu);
  if (entry == vgpu_to_engine_.end()) {
    // Nothing was assigned to this stream; release is a no-op.
    return;
  }

  ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
          "Released SDMA engine for VirtualGPU %p: mask=0x%x",
          vgpu, entry->second);
  vgpu_to_engine_.erase(entry);
}

// ================================================================================================
Expand Down
32 changes: 30 additions & 2 deletions projects/clr/rocclr/device/rocm/rocdevice.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -602,9 +602,16 @@ class Device : public NullDevice {
void HiddenHeapAlloc(const VirtualGPU& gpu);
//! Init hidden heap for device memory allocations
void HiddenHeapInit(const VirtualGPU& gpu);
void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
bool isXgmi() const { return isXgmi_; }
bool isXgmi() const override { return isXgmi_; }

//! SDMA engine allocation for per-stream affinity.
//! Thin forwarding wrappers over the device-level SdmaEngineAllocator; they are
//! const because sdma_engine_allocator_ is a mutable member (allocation state
//! is not treated as logical device state).
uint32_t AllocateSdmaEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
                            hsa_agent_t dstAgent, hsa_agent_t srcAgent) const {
  return sdma_engine_allocator_.AllocateEngine(vgpu, engine_type, dstAgent, srcAgent);
}
//! Releases any SDMA engine previously assigned to vgpu (no-op if none).
void ReleaseSdmaEngine(VirtualGPU* vgpu) const {
  sdma_engine_allocator_.ReleaseEngine(vgpu);
}
//! Returns the map of code objects to kernels
const auto& KernelMap() const { return kernel_map_; }
//! Adds a kernel to the kernel map
Expand Down Expand Up @@ -702,6 +709,27 @@ class Device : public NullDevice {
uint32_t maxSdmaWriteMask_;
bool isXgmi_; //!< Flag to indicate if there is XGMI between CPU<->GPU

//! SDMA engine allocator for per-stream affinity.
//! Maps each VirtualGPU (stream) to at most one SDMA engine mask so that
//! streams normally do not share engines; inter-GPU copies may share
//! preferred engines (see AllocateEngine).
struct SdmaEngineAllocator {
  amd::Monitor lock_;  //!< Protects the allocation state (vgpu_to_engine_)
  std::unordered_map<VirtualGPU*, uint32_t> vgpu_to_engine_;  //!< VirtualGPU -> engine mask
  std::atomic<uint32_t> next_rr_engine_{0};  //!< Simple RR counter for future use
  const Device& device_;  //!< Reference to parent device for accessing masks

  SdmaEngineAllocator(const Device& device)
      : lock_(true), device_(device) {}

  //! Allocate an SDMA engine for a VirtualGPU.
  //! Queries HSA for engine status and preferred engines, then allocates.
  //! For inter-GPU copies, strongly prefers recommended engines even if already allocated.
  //! Returns a single-engine mask, or 0 when no engine is available.
  uint32_t AllocateEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
                          hsa_agent_t dstAgent, hsa_agent_t srcAgent);

  //! Release engine allocation for a VirtualGPU (no-op if none was recorded).
  void ReleaseEngine(VirtualGPU* vgpu);
};
mutable SdmaEngineAllocator sdma_engine_allocator_;

//! Code object to kernel info map (used in the crash dump analysis)
mutable std::map<uint64_t, Kernel&> kernel_map_;

Expand Down
12 changes: 12 additions & 0 deletions projects/clr/rocclr/device/rocm/rocvirtual.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1763,6 +1763,10 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,

// ================================================================================================
VirtualGPU::~VirtualGPU() {
// Release SDMA engine assignment for this VirtualGPU
dev().ReleaseSdmaEngine(this);
ClearAssignedSdmaEngine();

delete blitMgr_;

if (tracking_created_) {
Expand Down Expand Up @@ -1965,6 +1969,14 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
}
}

// ================================================================================================
void VirtualGPU::ReleaseSdmaEngines() {
  // Release SDMA engine assignment when queue is idle
  // This allows the engine to be reassigned to other active streams.
  // Drop the device-level allocation first, then clear the cached mask so the
  // next copy on this stream triggers a fresh allocation.
  dev().ReleaseSdmaEngine(this);
  ClearAssignedSdmaEngine();
}

// ================================================================================================
void VirtualGPU::ReleaseAllHwQueues() {
if (roc_device_.settings().dynamic_queues_ &&
Expand Down
15 changes: 15 additions & 0 deletions projects/clr/rocclr/device/rocm/rocvirtual.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ class VirtualGPU : public device::VirtualDevice {
virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {}

virtual address allocKernelArguments(size_t size, size_t alignment) final;
virtual void ReleaseSdmaEngines() final; //!< Release SDMA engine assignments
virtual void ReleaseAllHwQueues() final;
virtual void ReleaseHwQueue() final;

Expand Down Expand Up @@ -461,6 +462,17 @@ class VirtualGPU : public device::VirtualDevice {
//! Analyzes a crashed AQL queue to find a broken AQL packet
void AnalyzeAqlQueue() const;

//! SDMA engine affinity management.
//! Returns the engine mask cached for this VirtualGPU/stream
//! (0 means no engine currently assigned).
uint32_t AssignedSdmaEngine() const {
  return assigned_sdma_engine_;
}
//! Records the engine mask allocated for this VirtualGPU/stream.
void SetAssignedSdmaEngine(uint32_t engine_mask) {
  assigned_sdma_engine_ = engine_mask;
}
//! Forgets the cached mask only; does NOT release the device-level
//! allocation (callers pair this with Device::ReleaseSdmaEngine).
void ClearAssignedSdmaEngine() {
  assigned_sdma_engine_ = 0;
}

private:
//! Dispatches a barrier with blocking HSA signals
void dispatchBlockingWait();
Expand Down Expand Up @@ -633,6 +645,9 @@ class VirtualGPU : public device::VirtualDevice {
//!< with a completion signal
hsa_signal_t last_completion_signal_{}; //!< The last completion signal

//! SDMA engine affinity tracking for this VirtualGPU/stream
uint32_t assigned_sdma_engine_ = 0; //!< Assigned SDMA engine mask for all operations

using KernelArgImpl = device::Settings::KernelArgImpl;
};
} // namespace amd::roc
2 changes: 2 additions & 0 deletions projects/clr/rocclr/platform/commandqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,8 @@ void HostQueue::finish(bool cpu_wait) {
lastEnqueueCommand_ = nullptr;
}
}
// Release SDMA engine assignments
vdev()->ReleaseSdmaEngines();
// Release all HW queues, which are idle or nearly idle
vdev()->ReleaseAllHwQueues();

Expand Down
Loading