-
Notifications
You must be signed in to change notification settings - Fork 14.7k
[OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause #152831
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/kevinsala/omp-dyn-groupprivate-codegen-pr
Are you sure you want to change the base?
[OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause #152831
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-offload Author: Kevin Sala Penades (kevinsala) Changes: Part 2 adding offload runtime support. See #152651. Patch is 39.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152831.diff 23 Files Affected:
diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h
index 2e5d92380f040..a43b506d6879e 100644
--- a/offload/DeviceRTL/include/DeviceTypes.h
+++ b/offload/DeviceRTL/include/DeviceTypes.h
@@ -163,4 +163,8 @@ typedef enum omp_allocator_handle_t {
///}
+enum omp_access_t {
+ omp_access_cgroup = 0,
+};
+
#endif
diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h
index c4bfaaa2404b4..672afea206785 100644
--- a/offload/DeviceRTL/include/Interface.h
+++ b/offload/DeviceRTL/include/Interface.h
@@ -222,7 +222,7 @@ struct KernelEnvironmentTy;
int8_t __kmpc_is_spmd_exec_mode();
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
void __kmpc_target_deinit();
diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h
index db396dae6e445..17c3c6f2d3e42 100644
--- a/offload/DeviceRTL/include/State.h
+++ b/offload/DeviceRTL/include/State.h
@@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates;
/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment);
/// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that
diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp
index 467e44a65276c..58e9a09105a76 100644
--- a/offload/DeviceRTL/src/Kernel.cpp
+++ b/offload/DeviceRTL/src/Kernel.cpp
@@ -34,8 +34,8 @@ enum OMPTgtExecModeFlags : unsigned char {
};
static void
-inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
// Order is important here.
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
@@ -80,17 +80,17 @@ extern "C" {
/// \param Ident Source location identification, can be NULL.
///
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
if (IsSPMD) {
- inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
+ KernelLaunchEnvironment);
synchronize::threadsAligned(atomic::relaxed);
} else {
- inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
- KernelLaunchEnvironment);
+ initializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
+ KernelLaunchEnvironment);
// No need to wait since only the main threads will execute user
// code and workers will run into a barrier right away.
}
diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp
index 62b03e7bba720..9e2a9999167b4 100644
--- a/offload/DeviceRTL/src/State.cpp
+++ b/offload/DeviceRTL/src/State.cpp
@@ -158,6 +158,34 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}
+struct DynCGroupMemTy {
+ void init(KernelLaunchEnvironmentTy *KLE, void *NativeDynCGroup) {
+ Size = 0;
+ Ptr = nullptr;
+ IsFallback = false;
+ if (KLE) {
+ Size = KLE->DynCGroupMemSize;
+ if (void *Fallback = KLE->DynCGroupMemFallback) {
+ Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
+ IsFallback = true;
+ } else {
+ Ptr = static_cast<char *>(NativeDynCGroup);
+ }
+ }
+ }
+
+ char *getPtr(size_t Offset) const { return Ptr + Offset; }
+ bool isFallback() const { return IsFallback; }
+ size_t getSize() const { return Size; }
+
+private:
+ char *Ptr;
+ size_t Size;
+ bool IsFallback;
+};
+
+[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;
+
} // namespace
void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
@@ -246,13 +274,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
} // namespace
void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
- KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
+ KernelLaunchEnvironmentTy *KLE) {
SharedMemorySmartStack.init(IsSPMD);
+
+ if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
+ KLE = nullptr;
+
if (mapping::isInitialThreadInLevel0(IsSPMD)) {
+ DynCGroupMem.init(KLE, DynamicSharedBuffer);
TeamState.init(IsSPMD);
ThreadStates = nullptr;
KernelEnvironmentPtr = &KernelEnvironment;
- KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
+ KernelLaunchEnvironmentPtr = KLE;
}
}
@@ -430,6 +463,17 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
int omp_get_initial_device(void) { return -1; }
int omp_is_initial_device(void) { return 0; }
+
+void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback,
+ omp_access_t) {
+ if (IsFallback != NULL)
+ *IsFallback = DynCGroupMem.isFallback();
+ return DynCGroupMem.getPtr(Offset);
+}
+
+size_t omp_get_dyn_groupprivate_size(omp_access_t) {
+ return DynCGroupMem.getSize();
+}
}
extern "C" {
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 978b53d5d69b9..0ef2dd162292b 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -97,8 +97,10 @@ struct KernelArgsTy {
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
- uint64_t Unused : 62;
- } Flags = {0, 0, 0};
+ uint64_t AllowDynCGroupMemFallback : 1; // Allow fallback for dynamic cgroup
+ // mem fallback.
+ uint64_t Unused : 61;
+ } Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension).
diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 2a283bd6fa4ed..0670ac1090da4 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -93,9 +93,11 @@ struct KernelEnvironmentTy {
};
struct KernelLaunchEnvironmentTy {
+ void *ReductionBuffer = nullptr;
+ void *DynCGroupMemFallback = nullptr;
uint32_t ReductionCnt = 0;
uint32_t ReductionIterCnt = 0;
- void *ReductionBuffer = nullptr;
+ uint32_t DynCGroupMemSize = 0;
};
#endif // OMPTARGET_SHARED_ENVIRONMENT_H
diff --git a/offload/include/device.h b/offload/include/device.h
index f4b10abbaa3fd..0e93cf8ec1a8b 100644
--- a/offload/include/device.h
+++ b/offload/include/device.h
@@ -158,6 +158,9 @@ struct DeviceTy {
/// Indicate that there are pending images for this device or not.
void setHasPendingImages(bool V) { HasPendingImages = V; }
+ /// Get the maximum shared memory per team for any kernel.
+ uint64_t getMaxSharedTeamMemory();
+
private:
/// Deinitialize the device (and plugin).
void deinit();
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 6971780c7bdb5..45bb74ec367d6 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t {
inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr,
- 0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
+ 0, {0,0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
struct DeviceTy;
@@ -273,10 +273,15 @@ struct __tgt_target_non_contig {
extern "C" {
#endif
+typedef enum {
+ omp_access_cgroup = 0,
+} omp_access_t;
+
void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
+size_t omp_get_groupprivate_limit(int device_num, omp_access_t access_group = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
diff --git a/offload/libomptarget/OpenMP/API.cpp b/offload/libomptarget/OpenMP/API.cpp
index 4576f9bd06121..1ed4192157fc8 100644
--- a/offload/libomptarget/OpenMP/API.cpp
+++ b/offload/libomptarget/OpenMP/API.cpp
@@ -98,6 +98,20 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice;
}
+EXTERN size_t omp_get_groupprivate_limit(int DeviceNum,
+ omp_access_t AccessGroup) {
+ TIMESCOPE();
+ OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
+ if (DeviceNum == omp_get_initial_device())
+ return 0;
+
+ auto DeviceOrErr = PM->getDevice(DeviceNum);
+ if (!DeviceOrErr)
+ FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
+
+ return DeviceOrErr->getMaxSharedTeamMemory();
+}
+
EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size));
diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp
index f88e30ae9e76b..31bfc7d092424 100644
--- a/offload/libomptarget/device.cpp
+++ b/offload/libomptarget/device.cpp
@@ -281,3 +281,9 @@ bool DeviceTy::useAutoZeroCopy() {
return false;
return RTL->use_auto_zero_copy(RTLDeviceID);
}
+
+uint64_t DeviceTy::getMaxSharedTeamMemory() {
+ using DeviceQueryKind = llvm::omp::target::plugin::DeviceQueryKind;
+ return RTL->query_device_info(
+ RTLDeviceID, DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM);
+}
diff --git a/offload/libomptarget/exports b/offload/libomptarget/exports
index 2406776c1fb5f..b5a1401564d58 100644
--- a/offload/libomptarget/exports
+++ b/offload/libomptarget/exports
@@ -40,6 +40,7 @@ VERS1.0 {
omp_get_num_devices;
omp_get_device_num;
omp_get_initial_device;
+ omp_get_groupprivate_limit;
omp_target_alloc;
omp_target_free;
omp_target_is_present;
diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 3117763e35896..2cf156e576c5f 100644
--- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -52,6 +52,7 @@ typedef enum {
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
+ HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
} hsa_amd_memory_pool_info_t;
typedef enum {
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 12c7cc62905c9..fa373c2029f0c 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -273,7 +273,6 @@ struct AMDGPUMemoryPoolTy {
if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
return Err;
-
return Plugin::success();
}
@@ -543,6 +542,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err;
}
+ StaticBlockMemSize = GroupSize;
+
// Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error(ErrorCode::INVALID_BINARY,
@@ -566,8 +567,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
- uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
- KernelLaunchParamsTy LaunchParams,
+ uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
+ KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Print more elaborate kernel launch info for AMDGPU
@@ -2020,6 +2021,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = checkIfAPU())
return Err;
+ // Retrieve the size of the group memory.
+ for (const auto *Pool : AllMemoryPools) {
+ if (Pool->isGroup()) {
+ size_t Size = 0;
+ if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size))
+ return Err;
+ MaxBlockSharedMemSize = Size;
+ break;
+ }
+ }
+
+ // Supports block shared memory natively.
+ HasNativeBlockSharedMem = true;
+
return Plugin::success();
}
@@ -2856,7 +2871,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl(
- *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
+ *this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;
@@ -3357,6 +3372,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
+ uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
@@ -3374,13 +3390,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err;
- // Account for user requested dynamic shared memory.
- uint32_t GroupSize = getGroupSize();
- if (uint32_t MaxDynCGroupMem = std::max(
- KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
- GroupSize += MaxDynCGroupMem;
- }
-
uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;
@@ -3434,7 +3443,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- GroupSize, StackSize, ArgsMemoryManager);
+ getStaticBlockMemSize() + DynBlockMemSize,
+ StackSize, ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 162b149ab483e..3357ccfe0c9b5 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -226,6 +226,10 @@ struct InfoTreeNode {
}
};
+enum class DeviceQueryKind {
+ DEVICE_QUERY_MAX_SHARED_TEAM_MEM = 0,
+};
+
/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
@@ -312,13 +316,16 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
- KernelArgsTy &KernelArgs,
+ uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
/// Get the kernel name.
const char *getName() const { return Name.c_str(); }
+ /// Get the size of the static per-block memory consumed by the kernel.
+ uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; };
+
/// Get the kernel image.
DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!");
@@ -331,9 +338,9 @@ struct GenericKernelTy {
}
/// Return a device pointer to a new kernel launch environment.
- Expected<KernelLaunchEnvironmentTy *>
- getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfo) const;
+ Expected<KernelLaunchEnvironmentTy *> getKernelLaunchEnvironment(
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfo) const;
/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
@@ -425,6 +432,9 @@ struct GenericKernelTy {
/// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads;
+ /// The static memory sized per block.
+ uint32_t StaticBlockMemSize = 0;
+
/// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment;
@@ -731,6 +741,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// this id is not unique between different plugins; they may overlap.
int32_t getDeviceId() const { return DeviceId; }
+ /// Get the total shared memory per block that can be used in any kernel.
+ uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
+
+ /// Indicate whether the device has native block shared memory.
+ bool hasNativeBlockSharedMem() const { return HasNativeBlockSharedMem; }
+
/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
virtual Error setContext() = 0;
@@ -1132,6 +1148,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
std::atomic<bool> OmptInitialized;
#endif
+ /// The total per-block shared memory that a kernel may use.
+ uint32_t MaxBlockSharedMemSize = 0;
+
+ /// Whether the device has native block shared memory.
+ bool HasNativeBlockSharedMem = false;
+
private:
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
@@ -1347,6 +1369,9 @@ struct GenericPluginTy {
/// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId);
+ /// Retrieve information about the given device.
+ int64_t query_device_info(int32_t DeviceId, DeviceQueryKind Query);
+
/// Creates an event in the given plugin if supported.
int32_t create_event(int32_t DeviceId, void **EventPtr);
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index 81b9d423e13d8..2997585e1660f 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -477,20 +477,20 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
Expected<KernelLaunchEnvironmentTy *>
GenericKernelTy::getKernelLaunchEnvironment(
- GenericDeviceTy &GenericDevice, uint32_t Version,
- AsyncInfoWrapperTy &AsyncInfoWrapper) const {
+ GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
+ void *FallbackBlockMem, AsyncInfoWrapperTy &AsyncInfoWrapper) const {
// Ctor/Dtor have no arguments, replaying uses the original kernel launch
// environment. Older versions of the compiler do not generate a kernel
// launch environment.
if (GenericDevice.Plugin.getRecordReplay().isReplaying() ||
- Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
+ KernelArgs.Version < OMP_KERNEL_ARG_MIN_VERSION_WITH_DYN_PTR)
return nullptr;
- if (!KernelEnvironment....
[truncated]
|
Size = 0;
Ptr = nullptr;
IsFallback = false;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Move to field initializers?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With the `clang::loader_uninitialized` attribute on the `DynCGroupMem` variable, I can't use field initializers or a constructor.
✅ With the latest revision this PR passed the C/C++ code formatter.
20a96c6
to
f20f4ba
Compare
f20f4ba
to
f2c6f97
Compare
Part 2 adding offload runtime support. See #152651.