Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions projects/clr/rocclr/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1304,6 +1304,7 @@ class VirtualDevice : public amd::ReferenceCountedObject {
virtual void submitUserEvent(amd::UserEvent& vcmd) { ShouldNotReachHere(); }

virtual address allocKernelArguments(size_t size, size_t alignment) { return nullptr; }
virtual void ReleaseSdmaEngines() {} //!< Release SDMA engine assignments (ROCm specific)
virtual void ReleaseAllHwQueues() {}
virtual void ReleaseHwQueue() {}

Expand Down
49 changes: 24 additions & 25 deletions projects/clr/rocclr/device/rocm/rocblit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ DmaBlitManager::DmaBlitManager(VirtualGPU& gpu, Setup setup)
StagingXferSize(dev().settings().stagedXferSize_),
completeOperation_(false),
context_(nullptr) {
dev().getSdmaRWMasks(&sdmaEngineReadMask_, &sdmaEngineWriteMask_);
}

inline void DmaBlitManager::synchronize() const {
Expand Down Expand Up @@ -475,11 +474,8 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
hsa_agent_t& srcAgent, size_t size,
amd::CopyMetadata& copyMetadata) const {
hsa_status_t status = HSA_STATUS_SUCCESS;

uint32_t copyMask = 0;
uint32_t freeEngineMask = 0;
uint32_t recIdMask = 0;
bool kUseRegularCopyApi = 0;
bool kUseRegularCopyApi = false;
constexpr size_t kRetainCountThreshold = 8;
bool forceSDMA =
(copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::SDMA);
Expand Down Expand Up @@ -508,31 +504,34 @@ inline bool DmaBlitManager::rocrCopyBuffer(address dst, hsa_agent_t& dstAgent, c
hsa_signal_t active = gpu().Barriers().ActiveSignal(kInitSignalValueOne, gpu().timestamp());

if (!kUseRegularCopyApi && engine != HwQueueEngine::Unknown) {
copyMask = gpu().getLastUsedSdmaEngine();
ClPrint(amd::LOG_DETAIL_DEBUG, amd::LOG_COPY, "Last copy mask 0x%x", copyMask);
copyMask &= (engine == HwQueueEngine::SdmaRead ? sdmaEngineReadMask_ : sdmaEngineWriteMask_);
if (copyMask == 0) {
// Check SDMA engine status
status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);

if (status == HSA_STATUS_SUCCESS) {
status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &recIdMask);
}
// Check if this VirtualGPU already has an assigned engine with affinity
uint32_t assignedEngineMask = gpu().AssignedSdmaEngine();

if (assignedEngineMask != 0) {
// This VirtualGPU/stream already has an assigned engine - just use it
// Stream ordering handles any busy conditions naturally
copyMask = assignedEngineMask;

ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
"Query copy engine status %x, srcAgent %p, "
"dstAgent %p, free_engine_mask 0x%x, rec_engine_mask 0x%x",
status, srcAgent.handle, dstAgent.handle, freeEngineMask, recIdMask);
"Using assigned SDMA engine for VirtualGPU %p: mask=0x%x, engine_type=%d",
&gpu(), copyMask, engine);
} else {
// No assigned engine yet - allocate one using device-level allocator
copyMask = dev().AllocateSdmaEngine(&gpu(), engine, dstAgent, srcAgent);

if (copyMask != 0) {
// Store the assigned engine in the VirtualGPU for future use
gpu().SetAssignedSdmaEngine(copyMask);

// If requested engine is valid and available, use it
if (recIdMask != 0 && (freeEngineMask & recIdMask) != 0) {
copyMask = recIdMask - (recIdMask & (recIdMask - 1));
ClPrint(amd::LOG_INFO, amd::LOG_COPY,
"Allocated new SDMA engine for VirtualGPU %p: mask=0x%x, engine_type=%d",
&gpu(), copyMask, engine);
} else {
// Otherwise use first available engine
copyMask = freeEngineMask - (freeEngineMask & (freeEngineMask - 1));
ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
"Failed to allocate SDMA engine for VirtualGPU %p, falling back to regular copy",
&gpu());
kUseRegularCopyApi = true;
}

gpu().setLastUsedSdmaEngine(copyMask);
}

if (copyMask != 0 && status == HSA_STATUS_SUCCESS) {
Expand Down
2 changes: 0 additions & 2 deletions projects/clr/rocclr/device/rocm/rocblit.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,6 @@ class DmaBlitManager : public device::HostBlitManager {

bool completeOperation_; //!< DMA blit manager must complete operation
amd::Context* context_; //!< A dummy context
uint32_t sdmaEngineReadMask_; //!< SDMA Engine Read Mask
uint32_t sdmaEngineWriteMask_; //!< SDMA Engine Write Mask

private:
//! Disable copy constructor
Expand Down
141 changes: 138 additions & 3 deletions projects/clr/rocclr/device/rocm/rocdevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ Device::Device(hsa_agent_t bkendDevice)
preferred_numa_node_(0),
maxSdmaReadMask_(0),
maxSdmaWriteMask_(0),
sdma_engine_allocator_(*this),
cpu_agent_info_(nullptr) {
group_segment_.handle = 0;
gpuvm_segment_.handle = 0;
Expand Down Expand Up @@ -3479,9 +3480,143 @@ void Device::HiddenHeapInit(const VirtualGPU& gpu) {
}

// ================================================================================================
void Device::getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const {
*readMask = maxSdmaReadMask_;
*writeMask = maxSdmaWriteMask_;
//! Assigns a single SDMA engine to a VirtualGPU (stream) and records the
//! mapping in vgpu_to_engine_. Returns a one-bit engine mask, or 0 when no
//! engine could be assigned. Thread-safe: all state is guarded by lock_.
uint32_t Device::SdmaEngineAllocator::AllocateEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
                                                     hsa_agent_t dstAgent, hsa_agent_t srcAgent) {
  amd::ScopedLock lock(lock_);

  // Get valid engine mask based on operation type (read vs write).
  // Note: only SdmaRead selects the read mask; every other type (including
  // inter-GPU) falls through to the write mask.
  uint32_t validEngineMask = (engine_type == HwQueueEngine::SdmaRead)
                                 ? device_.maxSdmaReadMask_
                                 : device_.maxSdmaWriteMask_;

  // Simple round-robin path if all engines have equal bandwidth.
  // Disabled by default - use preferred engine logic for current GPUs.
  constexpr bool kUseSimpleRR = false;

  if (kUseSimpleRR) {
    // Simple round-robin: just cycle through valid engines.
    // This will be enabled for future GPUs where engine selection doesn't matter.
    if (validEngineMask == 0) {
      ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
              "No valid SDMA engines for VirtualGPU %p", vgpu);
      return 0;
    }

    // Cycle through bit positions, find next valid engine.
    // next_rr_engine_ is atomic although we already hold lock_ (harmless).
    uint32_t start_bit = next_rr_engine_.fetch_add(1, std::memory_order_relaxed);
    uint32_t selected_mask = 0;

    // Try up to 32 positions to find a valid engine; guaranteed to hit one
    // because validEngineMask != 0 was checked above.
    for (uint32_t i = 0; i < 32; ++i) {
      uint32_t bit = (start_bit + i) % 32;
      uint32_t mask = 1u << bit;
      if (validEngineMask & mask) {
        selected_mask = mask;
        break;
      }
    }

    vgpu_to_engine_[vgpu] = selected_mask;

    ClPrint(amd::LOG_INFO, amd::LOG_COPY,
            "Assigned SDMA engine (simple RR) to VirtualGPU %p: mask=0x%x, engine_type=%d",
            vgpu, selected_mask, engine_type);

    return selected_mask;
  }

  // Current path: Query HSA for engine status and preferences.
  uint32_t freeEngineMask = 0;
  uint32_t preferredMask = 0;
  hsa_status_t status = HSA_STATUS_SUCCESS;

  // Query current engine status.
  status = Hsa::memory_copy_engine_status(dstAgent, srcAgent, &freeEngineMask);
  if (status == HSA_STATUS_SUCCESS) {
    // Query preferred (high-bandwidth) engines.
    status = Hsa::memory_get_preferred_copy_engine(dstAgent, srcAgent, &preferredMask);
  }

  // Constrain to valid engines.
  // NOTE(review): freeEngineMask is only logged below; engine selection relies
  // on the exclusivity/preference masks, not on current busy state — confirm
  // this is intended (stream ordering is expected to absorb busy engines).
  freeEngineMask &= validEngineMask;
  preferredMask &= validEngineMask;

  ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
          "Engine query for VirtualGPU %p: status=%x, free_mask=0x%x, preferred_mask=0x%x, "
          "valid_mask=0x%x, engine_type=%d",
          vgpu, status, freeEngineMask, preferredMask, validEngineMask, engine_type);

  uint32_t candidate_mask = 0;
  uint32_t allocated_mask = 0;

  // For inter-GPU copies, strongly prefer the recommended engines.
  bool is_inter_gpu = (engine_type == HwQueueEngine::SdmaInter);

  if (is_inter_gpu && (preferredMask != 0)) {
    // Inter-GPU: prioritize preferredMask, even if engines are already allocated
    // (engine sharing is allowed on this path).
    candidate_mask = validEngineMask & preferredMask;

    ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
            "Inter-GPU copy for VirtualGPU %p: prioritizing preferred engines, "
            "candidate_mask=0x%x",
            vgpu, candidate_mask);
  } else {
    // Regular read/write/intra: enforce exclusivity (don't share engines).
    // Build a mask of engines already allocated to other VirtualGPUs.
    for (const auto& pair : vgpu_to_engine_) {
      allocated_mask |= pair.second;
    }

    uint32_t available_mask = validEngineMask & ~allocated_mask;

    if (available_mask == 0) {
      // Every valid engine is taken; caller falls back to the regular copy API.
      ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
              "No unallocated SDMA engines available for VirtualGPU %p, engine_type=%d "
              "(valid_mask=0x%x, allocated_mask=0x%x)",
              vgpu, engine_type, validEngineMask, allocated_mask);
      return 0;
    }

    // Prefer high-bandwidth (recommended) engines if available.
    candidate_mask = available_mask & preferredMask;
    if (candidate_mask == 0) {
      candidate_mask = available_mask;
    }
  }

  if (candidate_mask == 0) {
    // Only reachable from the inter-GPU branch when the masks don't intersect.
    ClPrint(amd::LOG_WARNING, amd::LOG_COPY,
            "No candidate SDMA engines for VirtualGPU %p, engine_type=%d",
            vgpu, engine_type);
    return 0;
  }

  // Select the lowest set bit (first available engine); (~x + 1) == -x in
  // two's complement, so x & (~x + 1) isolates the least-significant set bit.
  uint32_t selected_mask = candidate_mask & (~candidate_mask + 1);

  // Update the map (overwrites any previous assignment for this vgpu).
  vgpu_to_engine_[vgpu] = selected_mask;

  ClPrint(amd::LOG_INFO, amd::LOG_COPY,
          "Assigned SDMA engine to VirtualGPU %p: mask=0x%x, engine_type=%d, "
          "valid_mask=0x%x, preferred_mask=0x%x, allocated_mask=0x%x, is_inter_gpu=%d",
          vgpu, selected_mask, engine_type, validEngineMask, preferredMask,
          allocated_mask, is_inter_gpu);

  return selected_mask;
}

// ================================================================================================
//! Drops the SDMA engine assignment recorded for a VirtualGPU.
//! No-op when the stream never had an engine assigned.
void Device::SdmaEngineAllocator::ReleaseEngine(VirtualGPU* vgpu) {
  amd::ScopedLock lock(lock_);

  const auto entry = vgpu_to_engine_.find(vgpu);
  if (entry == vgpu_to_engine_.end()) {
    // Nothing was assigned to this stream; release is a no-op.
    return;
  }

  ClPrint(amd::LOG_DEBUG, amd::LOG_COPY,
          "Released SDMA engine for VirtualGPU %p: mask=0x%x",
          vgpu, entry->second);
  vgpu_to_engine_.erase(entry);
}

// ================================================================================================
Expand Down
32 changes: 30 additions & 2 deletions projects/clr/rocclr/device/rocm/rocdevice.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -602,9 +602,16 @@ class Device : public NullDevice {
void HiddenHeapAlloc(const VirtualGPU& gpu);
//! Init hidden heap for device memory allocations
void HiddenHeapInit(const VirtualGPU& gpu);
void getSdmaRWMasks(uint32_t* readMask, uint32_t* writeMask) const;
bool isXgmi() const { return isXgmi_; }
bool isXgmi() const override { return isXgmi_; }

//! SDMA engine allocation for per-stream affinity.
//! Thin forwarding wrappers over the device-level SdmaEngineAllocator; they are
//! const because sdma_engine_allocator_ is a mutable member (allocation state
//! is not treated as logical device state).
uint32_t AllocateSdmaEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
                            hsa_agent_t dstAgent, hsa_agent_t srcAgent) const {
  return sdma_engine_allocator_.AllocateEngine(vgpu, engine_type, dstAgent, srcAgent);
}
//! Releases any SDMA engine previously assigned to vgpu (no-op if none).
void ReleaseSdmaEngine(VirtualGPU* vgpu) const {
  sdma_engine_allocator_.ReleaseEngine(vgpu);
}
//! Returns the map of code objects to kernels
const auto& KernelMap() const { return kernel_map_; }
//! Adds a kernel to the kernel map
Expand Down Expand Up @@ -702,6 +709,27 @@ class Device : public NullDevice {
uint32_t maxSdmaWriteMask_;
bool isXgmi_; //!< Flag to indicate if there is XGMI between CPU<->GPU

//! SDMA engine allocator for per-stream affinity.
//! Maps each VirtualGPU (stream) to at most one SDMA engine mask so that
//! streams normally do not share engines; inter-GPU copies may share
//! preferred engines (see AllocateEngine).
struct SdmaEngineAllocator {
  amd::Monitor lock_;  //!< Protects the allocation state (vgpu_to_engine_)
  std::unordered_map<VirtualGPU*, uint32_t> vgpu_to_engine_;  //!< VirtualGPU -> engine mask
  std::atomic<uint32_t> next_rr_engine_{0};  //!< Simple RR counter for future use
  const Device& device_;  //!< Reference to parent device for accessing masks

  SdmaEngineAllocator(const Device& device)
      : lock_(true), device_(device) {}

  //! Allocate an SDMA engine for a VirtualGPU.
  //! Queries HSA for engine status and preferred engines, then allocates.
  //! For inter-GPU copies, strongly prefers recommended engines even if already allocated.
  //! Returns a single-engine mask, or 0 when no engine is available.
  uint32_t AllocateEngine(VirtualGPU* vgpu, HwQueueEngine engine_type,
                          hsa_agent_t dstAgent, hsa_agent_t srcAgent);

  //! Release engine allocation for a VirtualGPU (no-op if none was recorded).
  void ReleaseEngine(VirtualGPU* vgpu);
};
mutable SdmaEngineAllocator sdma_engine_allocator_;

//! Code object to kernel info map (used in the crash dump analysis)
mutable std::map<uint64_t, Kernel&> kernel_map_;

Expand Down
12 changes: 12 additions & 0 deletions projects/clr/rocclr/device/rocm/rocvirtual.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1763,6 +1763,10 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,

// ================================================================================================
VirtualGPU::~VirtualGPU() {
// Release SDMA engine assignment for this VirtualGPU
dev().ReleaseSdmaEngine(this);
ClearAssignedSdmaEngine();

delete blitMgr_;

if (tracking_created_) {
Expand Down Expand Up @@ -1965,6 +1969,14 @@ address VirtualGPU::allocKernelArguments(size_t size, size_t alignment) {
}
}

// ================================================================================================
void VirtualGPU::ReleaseSdmaEngines() {
  // Release SDMA engine assignment when queue is idle
  // This allows the engine to be reassigned to other active streams.
  // Drop the device-level allocation first, then clear the cached mask so the
  // next copy on this stream triggers a fresh allocation.
  dev().ReleaseSdmaEngine(this);
  ClearAssignedSdmaEngine();
}

// ================================================================================================
void VirtualGPU::ReleaseAllHwQueues() {
if (roc_device_.settings().dynamic_queues_ &&
Expand Down
15 changes: 15 additions & 0 deletions projects/clr/rocclr/device/rocm/rocvirtual.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ class VirtualGPU : public device::VirtualDevice {
virtual void submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {}

virtual address allocKernelArguments(size_t size, size_t alignment) final;
virtual void ReleaseSdmaEngines() final; //!< Release SDMA engine assignments
virtual void ReleaseAllHwQueues() final;
virtual void ReleaseHwQueue() final;

Expand Down Expand Up @@ -461,6 +462,17 @@ class VirtualGPU : public device::VirtualDevice {
//! Analyzes a crashed AQL queue to find a broken AQL packet
void AnalyzeAqlQueue() const;

//! SDMA engine affinity management.
//! Returns the engine mask cached for this VirtualGPU/stream
//! (0 means no engine currently assigned).
uint32_t AssignedSdmaEngine() const {
  return assigned_sdma_engine_;
}
//! Records the engine mask allocated for this VirtualGPU/stream.
void SetAssignedSdmaEngine(uint32_t engine_mask) {
  assigned_sdma_engine_ = engine_mask;
}
//! Forgets the cached mask only; does NOT release the device-level
//! allocation (callers pair this with Device::ReleaseSdmaEngine).
void ClearAssignedSdmaEngine() {
  assigned_sdma_engine_ = 0;
}

private:
//! Dispatches a barrier with blocking HSA signals
void dispatchBlockingWait();
Expand Down Expand Up @@ -633,6 +645,9 @@ class VirtualGPU : public device::VirtualDevice {
//!< with a completion signal
hsa_signal_t last_completion_signal_{}; //!< The last completion signal

//! SDMA engine affinity tracking for this VirtualGPU/stream
uint32_t assigned_sdma_engine_ = 0; //!< Assigned SDMA engine mask for all operations

using KernelArgImpl = device::Settings::KernelArgImpl;
};
} // namespace amd::roc
2 changes: 2 additions & 0 deletions projects/clr/rocclr/platform/commandqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,8 @@ void HostQueue::finish(bool cpu_wait) {
lastEnqueueCommand_ = nullptr;
}
}
// Release SDMA engine assignments
vdev()->ReleaseSdmaEngines();
// Release all HW queues, which are idle or nearly idle
vdev()->ReleaseAllHwQueues();

Expand Down
Loading