diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 82ebdf84019a5..d972402d03a7d 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -475,3 +475,5 @@ if(OFFLOAD_INCLUDE_TESTS)
   add_subdirectory(test)
   add_subdirectory(unittests)
 endif()
+
+add_subdirectory(utils)
diff --git a/offload/utils/CMakeLists.txt b/offload/utils/CMakeLists.txt
new file mode 100644
index 0000000000000..d6f2d6729d18c
--- /dev/null
+++ b/offload/utils/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Directory (relative to the install prefix unless absolute) that receives the
+# offload helper scripts. Cached so that it can be overridden at configure time.
+set(OPENMP_UTILS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH
+    "Path for binary subdirectory (defaults to '${CMAKE_INSTALL_BINDIR}')")
+
+# add_openmp_util(<path>)
+#   Installs the script at <path> into OPENMP_UTILS_INSTALL_DIR with executable
+#   permissions (install(PROGRAMS)); the file is copied as is.
+# A function is used instead of a macro so that `path` is a real, locally
+# scoped variable rather than text substituted into the caller.
+function(add_openmp_util path)
+  install(PROGRAMS
+     "${path}"
+     DESTINATION "${OPENMP_UTILS_INSTALL_DIR}")
+endfunction()
+
+add_subdirectory(gpurun)
diff --git a/offload/utils/gpurun/CMakeLists.txt b/offload/utils/gpurun/CMakeLists.txt
new file mode 100644
index 0000000000000..0483a5737b830
--- /dev/null
+++ b/offload/utils/gpurun/CMakeLists.txt
@@ -0,0 +1 @@
+add_openmp_util(${CMAKE_CURRENT_SOURCE_DIR}/gpurun)
diff --git a/offload/utils/gpurun/gpurun b/offload/utils/gpurun/gpurun
new file mode 100755
index 0000000000000..870bc7a8ccbcd
--- /dev/null
+++ b/offload/utils/gpurun/gpurun
@@ -0,0 +1,697 @@
+#!/bin/bash
+# Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+# of the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+#
+#  gpurun: Process launch utility for GPU applications. This is a wrapper
+#          to execute application binaries including OpenMPI GPU applications.
+#          See help message below (gpurun -h) for more information.
+#
+#  Usage Examples:
+#    gpurun true
+#    mpirun -np  4 gpurun env | grep ROCR_VISIBLE_DEVICES
+#
+
+# If set to 1, just invoke the rest of the command line without doing anything
+# else.
+GPURUN_BYPASS=${GPURUN_BYPASS:-0}
+
+# Replace gpurun with the given command. exec only returns when no command was
+# given, so exit then instead of continuing after an error was reported.
+function execOnError() { exec "$@" ; exit 1 ; }
+
+# NOTE(review): PROGVERSION is meant to be rewritten at install time, but the install(PROGRAMS) rule copies this file unchanged - confirm the substitution step.
+PROGVERSION=X.Y-Z
+function version(){   # Print "<script> version <PROGVERSION>" on stdout and exit 0
+   echo $0 version $PROGVERSION
+   exit 0
+}
+function usage(){   # Print the help text below on stdout and exit 0
+/bin/cat 2>&1 <<"EOF"
+
+   gpurun: Application process launch utility for GPUs.
+           This utility ensures the process will enable either a single
+	   GPU or the number specified with -md (multi-device) option.
+           It launches the application binary with either the 'taskset'
+           or 'numactl' utility so the process only runs on CPU cores
+           in the same NUMA domain as the selected GPUs.
+           This utility sets environment variable ROCR_VISIBLE_DEVICES
+	   to selected GPUs ONLY if it was not already set by the
+	   callers environment AND the number of GPUs is not 1.
+           This utility also sets environment variable HSA_CU_MASK
+           to control which CUs are available to the process.
+	   HSA_CU_MASK is set only when more than one OpenMPI process
+	   (rank) will utilize the same GPU and it is not preset.
+           Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the
+           number of CUs available to the process after masking.
+
+   Usage:
+      gpurun <executable> [ <executable args> ]
+      mpirun -np <num ranks>  gpurun <executable> [ <executable args> ]
+
+   Options:
+      -h   Print this help message and exit
+      -md <n>  Set number of desired devices for multi-device mode, default=1
+      -s, -q, --quiet  suppress output (same as GPURUN_VERBOSE=0), often useful in benchmarking
+      -nomask  do not generate a CU mask, same as GPURUN_MASK_POLICY=nomask
+      -v   Verbose output, same as GPURUN_VERBOSE=1
+      -vv  Verbose output, same as GPURUN_VERBOSE=2
+      -m   use numactl membind to CPUs in same NUMA domain. Note: Allocation
+           fails when not enough memory available on these nodes.
+      -l   use numactl localalloc to CPUs in same NUMA domain. Note: If
+           memory cannot be allocated, alloc falls back to other nodes.
+      -nr  use numactl ROCR_VISIBLE_DEVICES
+      -nm  use numactl OMPI_COMM_WORLD_LOCAL_RANK
+      --version Print version of gpurun and exit
+
+   Optional Input environment variables:
+      GPURUN_VERBOSE
+        0:  default for silent operation, no trace printed to stderr
+        1:  -v prints trace record including process launch cmd to stderr
+        2:  -vv prints trace and other summary diagnostics
+      ROCMINFO_BINARY  Set location of rocminfo binary
+      AOMP: location of AOMP or ROCM
+      GPURUN_DEVICE_BIAS: amount to shift device number to avoid dev 0.
+                          This only works for single device mode.
+      GPURUN_VISIBLE_DEVICE_TYPES: useful if machine has different GPU cards
+      GPURUN_MASK_POLICY : useful if machine has different GPU cards
+      ROCR_VISIBLE_DEVICES: See description above
+      OMPI_COMM_WORLD_LOCAL_SIZE Number of ranks on this node set by openmpi
+      OMPI_COMM_WORLD_LOCAL_RANK The local rank number 0-(nranks-1) from openmpi
+      This also checks for MPI_LOCALNRANKS/MPI_LOCALRANKID
+      and MPI_COMM_WORLD_LOCAL_SIZE/MPI_COMM_WORLD_LOCAL_RANK
+
+   Generated (output) Environment Variables:
+      OMPX_TARGET_TEAM_PROCS - Number of CUs available to process
+      ROCR_VISIBLE_DEVICES - list of GPU Uuids for the selected devices if not preset
+      HSA_CU_MASK - The CU mask for the device.
+      LIBOMPTARGET_NUM_MULTI_DEVICES - the value set by -md argument
+      GPU_MAX_HW_QUEUES
+      LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES
+
+   Limitations:
+   - Currently, gpurun creates masks that are mutually exclusive of each other.
+     That is, the MPI processes will not share CUs. If number of ranks is not
+     perfectly divisible by number of CUs or number of GPUs, some resources
+     would be unused.
+     Set GPURUN_VERBOSE=1 or 2 to see overall cu utilization.
+   - Works with AOMP 19.0-0 or ROCM 6.1 or greater
+   - cu masking is not available when multiple devices per process are enabled
+     with -md option (multi-device) mode.
+
+   Notes:
+     With MPI, this utility distributes GPUs and their CUs across
+     multiple ranks of an MPI job into mutually exclusive sets of CUs.
+     It uses OpenMPI environment variables OMPI_COMM_WORLD_LOCAL_SIZE
+     and OMPI_COMM_WORLD_LOCAL_RANK to set visible devices and
+     the mutually exclusive CU mask.
+
+     An rplace (rank place) is a subset of CUs for a rank.
+     gpurun calculates the number of rplaces needed to contain all
+     the specified number of ranks for this node. If number of ranks not
+     divisible by number of GPUs, then there will be more rplaces than ranks.
+     The number of CUs in an rplace is calculated by dividing the number of
+     CUs per GPU by the number of rplaces per GPU. This is also the number of
+     bits set in the CU mask. This is also the number of physical locations
+     available for an OpenMP team to execute. This utility exports that number
+     to the environment variable OMPX_TARGET_TEAM_PROCS. This value
+     could be used by the application or runtime to adjust the number
+     of desired teams in a target region. If no masking occurs, the entire
+     GPU is available for the process and OMPX_TARGET_TEAM_PROCS is set to
+     the total number of CUs on the GPU.
+
+   Copyright (c) 2024  ADVANCED MICRO DEVICES, INC.
+
+EOF
+  exit 0
+}
+
+_end_gpurun_opts=0
+_devices_per_mdset=1
+_uses_multi_device=0
+while [ "$_end_gpurun_opts" == "0"  ] ; do
+   case "$1" in
+      -s)          GPURUN_VERBOSE=0;;
+      -q)          GPURUN_VERBOSE=0;;
+      --quiet)     GPURUN_VERBOSE=0;;
+      -h)          usage ;;
+      -help)       usage ;;
+      --help)      usage ;;
+      -version)    version ;;
+      --version)   version ;;
+      -v)          GPURUN_VERBOSE=1;;
+      -vv)         GPURUN_VERBOSE=2;;
+      -m)          _use_numactl_membind=1;;
+      -md)         shift; _devices_per_mdset=$1; _uses_multi_device=1;;
+      -nr)          _use_numactl_rocr=1;;
+      -nm)          _use_numactl_ompi=1;;
+      -l)          _use_numactl_localalloc=1;;
+      -nomask)     GPURUN_MASK_POLICY="nomask";;
+      *)           _end_gpurun_opts=1; break;;
+   esac
+   if [ "$_end_gpurun_opts" == "0" ] ; then
+     shift
+   fi
+done
+
+if  [ "$GPURUN_BYPASS" = "1" ]; then
+  execOnError "$@"
+fi
+
+# Default: quiet operation
+GPURUN_VERBOSE=${GPURUN_VERBOSE:-0}
+# Default: create mutually exclusive sets of CUs when GPU is oversubscribed
+GPURUN_MASK_POLICY=${GPURUN_MASK_POLICY:-mutex}
+# switch mask policy to preset if HSA_CU_MASK was preset
+[[ ! -z "$HSA_CU_MASK" ]] && GPURUN_MASK_POLICY=preset
+# switch mask policy to nomask for multi-device
+[[ $_uses_multi_device == 1 ]] && GPURUN_MASK_POLICY=nomask
+# Offset selected device to avoid some heavily used GPUs
+GPURUN_DEVICE_BIAS=${GPURUN_DEVICE_BIAS:-0}
+
+#  Get environment variables set by OpenMPI
+_num_local_ranks=$OMPI_COMM_WORLD_LOCAL_SIZE
+_local_rank_num=$OMPI_COMM_WORLD_LOCAL_RANK
+# If not OpenMPI, check for Platform MPI, MVAPICH
+if [ -z "$_num_local_ranks" ] ; then
+   _num_local_ranks=$MPI_LOCALNRANKS
+   _local_rank_num=$MPI_LOCALRANKID
+fi
+# Also try MPI_COMM_WORLD env vars
+if [ -z "$_num_local_ranks" ] ; then
+   _num_local_ranks=$MPI_COMM_WORLD_LOCAL_SIZE
+   _local_rank_num=$MPI_COMM_WORLD_LOCAL_RANK
+fi
+# Check if SLURM was used
+if [ -z "$_num_local_ranks" ] && [ ! -z $SLURM_CPUS_ON_NODE ] ; then
+   _num_local_ranks=$SLURM_CPUS_ON_NODE
+   _local_rank_num=$SLURM_LOCALID
+fi
+
+# -nr: run the command under numactl with ROCR_VISIBLE_DEVICES as the node for
+# --cpunodebind/--membind, or unbound when numactl is not installed; then exit.
+if [ "$_use_numactl_rocr" == "1" ] ; then
+  if command -v numactl >/dev/null 2>&1 ; then
+    numactl --cpunodebind "$ROCR_VISIBLE_DEVICES" --membind "$ROCR_VISIBLE_DEVICES" "$@"
+  else
+    "$@"
+  fi
+  exit $?
+fi
+# -nm: same as -nr but the node is OMPI_COMM_WORLD_LOCAL_RANK. "$@" (not $*)
+# passes arguments that contain spaces or glob characters through unchanged.
+if [ "$_use_numactl_ompi" == "1" ] ; then
+  if command -v numactl >/dev/null 2>&1 ; then
+    numactl --cpunodebind "$OMPI_COMM_WORLD_LOCAL_RANK" --membind "$OMPI_COMM_WORLD_LOCAL_RANK" "$@"
+  else
+    "$@"
+  fi
+  exit $?
+fi
+# If none of the above MPIs, assume gpurun is wrapper for single process on single GPU
+if [ -z "$_num_local_ranks" ] ; then
+   _num_local_ranks=1
+   _local_rank_num=0
+fi
+
+# Find location of the rocminfo binary
+AOMP=${AOMP:-_AOMP_INSTALL_DIR_}
+if [ ! -d $AOMP ] ; then
+   AOMP="_AOMP_INSTALL_DIR_"
+fi
+if [ ! -d $AOMP ] ; then
+   AOMP="/opt/rocm/lib/llvm"
+fi
+if [ ! -d $AOMP ] ; then
+   AOMP="/opt/rocm/llvm"
+fi
+if [ ! -d $AOMP ] ; then
+   realpath=`realpath $0`
+   thisdir=`dirname $realpath`
+   AOMP=$thisdir/..
+fi
+if [ ! -d $AOMP ] ; then
+   >&2 echo "ERROR: AOMP not found at $AOMP"
+   >&2 echo "       Please install AOMP or correctly set env-var AOMP"
+   execOnError "$@"
+fi
+ROCMINFO_BINARY=${ROCMINFO_BINARY:-$AOMP/bin/rocminfo}
+[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../bin/rocminfo
+[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../../bin/rocminfo
+if [ ! -f $ROCMINFO_BINARY ] ; then
+   >&2 echo "ERROR: Could not find binary for rocminfo,"
+   >&2 echo "       Please correct installation of ROCM or AOMP compiler"
+   execOnError "$@"
+fi
+
+# Use rocminfo to find the number of CUs and the gfxid of each GPU.
+# mktemp creates a private, unpredictably named file instead of /tmp/rinfo_out<pid>.
+_tfile=`mktemp /tmp/rinfo_out.XXXXXX` || { >&2 echo "ERROR: Could not create a temporary file in /tmp" ; execOnError "$@" ; exit 1 ; }
+$ROCMINFO_BINARY 2>/dev/null | grep -E "    Name:| Compute Unit:| Device Type:| BDFID:| Uuid:" |grep -v generic >$_tfile
+if [ ! -s $_tfile ] ; then
+  >&2 echo "ERROR: $ROCMINFO_BINARY failed to find GPU devices"
+  rm $_tfile
+  execOnError "$@"
+fi
+# Create 3 _ri_ arrays by parsing rocminfo (ri), one array entry per device
+_ri_all_gfxids=""
+_ri_gfxids=()
+_ri_cucount=()
+_ri_bdfids=()
+_ri_dev_idx=()
+_ri_num_devices=0
+_last_cu_count=0
+_ri_uuid=()
+_last_device_type_was_gpu=0
+_device_type_preset=0
+_ri_num_all_devices=0
+[ ! -z $GPURUN_VISIBLE_DEVICE_TYPES ] && _device_type_preset=1
+while read _linepair ; do
+  _fieldvalue=`echo $_linepair | cut -d":" -f2`
+  _fieldtype=`echo $_linepair | cut -d":" -f1`
+  if [ $_fieldvalue == "CPU" ] ; then
+     _last_device_type_was_gpu=0
+  elif [ $_fieldvalue == "GPU" ] ; then
+     _last_device_type_was_gpu=1
+  elif [ "$_fieldtype" == "Uuid" ] ; then
+     _this_uuid=$_fieldvalue
+  elif [ "$_fieldtype" == "BDFID" ] ; then
+     if [[ $_last_device_type_was_gpu == 1 ]] ; then
+        # _domain="$(echo "$_fieldvalue / (2^32)" | bc)"
+        _bus="$(echo "($_fieldvalue / (2^8)) % (2^8)" | bc)"
+        _devfn="$(echo "($_fieldvalue % (2^8))" | bc)"
+        _bdfidstr="$(printf "%.2x:%.2x" "$_bus" "$_devfn")"
+     fi
+  elif [ "$_fieldtype" == "Name" ] ; then
+     #  The device name field is last in rocminfo output, so we can create new _ri_ array entry
+     if [[ $_last_device_type_was_gpu == 1 ]] ; then
+	_this_gfxid=`echo $_fieldvalue | cut -d'-' -f5`
+        ! [[ ${_ri_all_gfxids} == *"$_this_gfxid"* ]] && _ri_all_gfxids+=" $_this_gfxid"
+        _is_type_visible=1
+	if [ $_device_type_preset == 1 ] ; then
+           _is_type_visible=0
+           if [[ ${GPURUN_VISIBLE_DEVICE_TYPES} == *"$_this_gfxid"* ]] ; then
+	     _is_type_visible=1
+	   fi
+	fi
+        if [ $_is_type_visible == 1 ] ; then
+           _ri_gfxids+=( $_this_gfxid )
+           _ri_cucount+=( $_last_cu_count )
+           _ri_bdfids+=( $_bdfidstr )
+	   _ri_dev_idx+=( $_ri_num_all_devices )
+	   _ri_uuid+=( $_this_uuid )
+           _ri_num_devices=$(( $_ri_num_devices + 1 ))
+	fi
+        _ri_num_all_devices=$(( $_ri_num_all_devices + 1 ))
+     fi
+  else
+     # else the _fieldvalue was the number of CUs or GCPUs
+     if [[ $_last_device_type_was_gpu == 1 ]] ; then
+        _last_cu_count=$_fieldvalue
+     fi
+  fi
+done < $_tfile
+rm $_tfile
+
+if [ $_ri_num_devices == 0 ] ; then
+   if [ $_local_rank_num == 0 ] ; then
+      if [ $_device_type_preset == 1 ] ; then
+         >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY of type $GPURUN_VISIBLE_DEVICE_TYPES."
+         >&2 echo "       Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}"
+      else
+         >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY"
+      fi
+      if [ ! -z $ROCR_VISIBLE_DEVICES ] ; then
+         >&2 echo "       ROCR_VISIBLE_DEVICES was preset to $ROCR_VISIBLE_DEVICES"
+         >&2 echo "       Consider unset ROCR_VISIBLE_DEVICES and let gpurun set it correctly."
+      fi
+      execOnError "$@"
+   else
+      execOnError "$@"
+   fi
+fi
+
+# Scan /sys/bus/pci/devices (_ss_) for amdgpu devices and store info in 6 per
+# device arrays indexed by device num. The arrays are _ss_cpulist _ss_bdfids,
+# _ss_numanode, _ss_uuid, _ss_gfxid, and _ss_cucount. Some information
+# (cucount, gfxid, dev_idx) must be copied from the _ri_ arrays built above
+# by scanning output from rocminfo.
+_sysdevdir="/sys/bus/pci/devices"
+_ss_num_devices=0
+_ss_cpulist=()
+_ss_bdfid=()
+_ss_numanode=()
+_ss_uuid=()
+_ss_gfxid=()
+_ss_cucount=()
+for _devid in `ls $_sysdevdir` ; do
+   if [ -f $_sysdevdir/$_devid/device ] ; then
+      _driver_name=`cat $_sysdevdir/$_devid/uevent | grep DRIVER | awk '{print $1}'`
+      if [ ! -z $_driver_name ] ; then
+         if [ $_driver_name  == "DRIVER=amdgpu" ] ; then
+            _numa_node=`cat $_sysdevdir/$_devid/numa_node`
+            [ "$_numa_node" == "-1" ] && _numa_node=0
+            # Reset both values for every device: a GPU without a usable unique_id
+            # must not inherit the previous GPU's flag, or BDFID matching is skipped.
+            _this_uuid=0
+            _has_unique_id_file=0
+            if [ -f "$_sysdevdir/$_devid/unique_id" ] ; then
+               _unique_id=`cat "$_sysdevdir/$_devid/unique_id"`
+               if [ -n "$_unique_id" ] ; then
+                  _this_uuid="GPU-$_unique_id"
+                  _has_unique_id_file=1
+               fi
+            fi
+            _this_cpulist=`cat $_sysdevdir/$_devid/local_cpulist`
+	    _match_uuid_count=0
+	    for _ri_i in ${!_ri_bdfids[@]} ; do
+               _ss_value=$_this_uuid
+               _ri_value=${_ri_uuid[$_ri_i]}
+               if [ $_ss_value == $_ri_value ] ; then
+                  _match_uuid_count=$(( $_match_uuid_count + 1 ))
+	       fi
+	    done
+            # Search _ri_ arrays for matching uuids or matching bdfids.
+	    for _ri_i in ${!_ri_bdfids[@]} ; do
+	       if [ "$_has_unique_id_file" == "1" ] ; then
+                  _ss_value=$_this_uuid
+                  _ri_value=${_ri_uuid[$_ri_i]}
+               elif [ "${_ri_bdfids[$_ri_i]}" == "00:00" ]; then
+                  # Under Hyper-V, we may see a zero BDFID.  Fall back to UUID.
+                  _ss_value=$_devid
+                  _ri_value=$_devid
+	       else
+                  _ss_value=$_devid
+                  _ri_value="0000:${_ri_bdfids[$_ri_i]}.0"
+               fi
+               if [ $_ss_value == $_ri_value ] ; then
+	          if [ $_this_uuid == 0 ] || [ $_match_uuid_count -gt 1 ] ; then
+	             # Some GPUs do not have unique_id or TPX mode creates multiple
+		     # identical uuids, so use device index for RVD
+                     _ss_uuid+=( ${_ri_dev_idx[$_ri_i]} )
+		  else
+                     _ss_uuid+=( $_this_uuid )
+		  fi
+		  _ss_gfxid+=( ${_ri_gfxids[$_ri_i]} )
+		  _ss_cucount+=( ${_ri_cucount[$_ri_i]} )
+                  _ss_bdfid+=( $_devid )
+                  _ss_numanode+=( $_numa_node )
+                  _ss_cpulist+=( $_this_cpulist )
+                  _ss_num_devices=$(( $_ss_num_devices + 1 ))
+               fi
+            done
+         fi
+      fi
+   fi
+done
+
+if [[ $_ss_num_devices -lt 1  ]] ; then
+   if [ $_device_type_preset == 1 ] ; then
+      >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir of type $GPURUN_VISIBLE_DEVICE_TYPES."
+      >&2 echo "       Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}"
+   else
+      >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir."
+   fi
+   execOnError "$@"
+fi
+
+# check for taskset or numactl cmd
+if [ "$_use_numactl_membind" == "1" ] || [ "$_use_numactl_localalloc" == "1" ] ; then
+  _launch_process_cmd_binary=`which numactl`
+  if [ $? != 0 ] ; then
+    >&2 echo "ERROR: The -m (membind) or -l (localalloc) require numactl to be installed."
+    execOnError "$@"
+  fi
+else
+  _launch_process_cmd_binary=`which taskset`
+  if [ $? != 0 ] ; then
+    >&2 echo "ERROR: $0 requires the taskset command to be installed."
+    execOnError "$@"
+  fi
+fi
+if [ "$_use_numactl_membind" == "1" ] && [ "$_use_numactl_localalloc" == "1" ] ; then
+  >&2 echo "GPURUN WARNING: When -l and -m are both set, -m is ignored."
+  _use_numactl_membind=0
+fi
+
+_utilized_devices=$_ri_num_devices
+[ $_ri_num_devices -gt $_num_local_ranks ] && _utilized_devices=$_num_local_ranks
+
+# Calculate number of GPUs to use to evenly spread ranks across GPUs.
+# An rplace is a set of CUs that will be used for a rank.
+# The number of rplaces must be at least the number of ranks.
+_uncovered_ranks=$(( $_num_local_ranks % $_utilized_devices ))
+_number_of_rplaces_per_GPU=$(( $_num_local_ranks / $_utilized_devices ))
+if [ $_uncovered_ranks != 0 ] ; then
+   # If _num_local_ranks not divisible by number of GPUs,
+   # then add an extra rplace per GPU to make room for remainder.
+   _number_of_rplaces_per_GPU=$(( $_number_of_rplaces_per_GPU + 1 ))
+fi
+if [ $GPURUN_MASK_POLICY == "mutex" ] ; then
+   # For mutex policy, adjacent ranks are assigned to the same device.
+   _rplace_num=$(( $_local_rank_num / $_number_of_rplaces_per_GPU ))
+   # Some users want to avoid dev 0 etc, by setting GPURUN_DEVICE_BIAS
+   _device_num=$(( ( $_rplace_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices ))
+else
+   # for mask policies nomask or preset, adjacent ranks are assigned to
+   # different GPUs and oversubscribed ranks are assigned round robin
+   _device_num=$(( ( $_local_rank_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices ))
+fi
+
+_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} ))
+if [ $_num_local_ranks -gt $_node_cus ] ; then
+   >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks "
+   execOnError "$@"
+fi
+
+if [ $_uses_multi_device == 1 ]; then
+   # Enforce some rules on the use of -md option
+   # Note -md forces GPURUN_MASK_POLICY=nomask
+   if [[ ! -z $ROCR_VISIBLE_DEVICES ]] ; then
+      >&2 echo "ERROR: DO NOT PRESET ROCR_VISIBLE_DEVICES in gpurun multi-device (-md) mode"
+      execOnError "$@"
+   fi
+   if [ $_devices_per_mdset -gt $_ri_num_devices ] ; then
+      >&2 echo "ERROR: More devices requested ($_devices_per_mdset) than available ($_ri_num_devices)"
+      execOnError "$@"
+   fi
+   _md_total_devices=$(( $_num_local_ranks * $_devices_per_mdset ))
+   if [ $_md_total_devices -gt $_ri_num_devices ] &&  [ $_local_rank_num == 0 ] ; then
+      printf "WARNING: processes($_num_local_ranks) * md set size($_devices_per_mdset) = $_md_total_devices > than available devices ($_ri_num_devices)\n         Some multi-device sets will overlap.\n" >&2
+   fi
+   _md_device_set_start=$(( ( $_local_rank_num * $_devices_per_mdset ) % $_ri_num_devices))
+   _md_device_set_end=$(( $_md_device_set_start + $_devices_per_mdset - 1 ))
+
+   # merge entries for this mdset from per device arrays
+   _md_bdfs=""
+   _md_cpus=""
+   _md_nns=""
+   _md_uuids=""
+   _md_dev_idxs=""
+   _sep=""
+   for i in `seq $_md_device_set_start $_md_device_set_end` ; do
+      _dev_index=$i
+      # handle index wrap around number of devices
+      [ $i -ge $_ri_num_devices ] && _dev_index=$(( $i % $_ri_num_devices ))
+      _md_bdfs+=$_sep${_ss_bdfid[$_dev_index]}
+      _new_nn=${_ss_numanode[$_dev_index]}
+      SAVEIFS=$IFS
+      IFS=","
+      _found=0
+      for _existing_nn in $_md_nns ; do
+         [ $_existing_nn == $_new_nn ] && _found=1
+      done
+      IFS=$SAVEIFS
+      if [ $_found == 0 ] ; then
+	 # only add new numa node and cpulist, if not already in the md set
+         _md_nns+=$_sep$_new_nn
+         _md_cpus+=$_sep${_ss_cpulist[$_dev_index]}
+      fi
+      _md_uuids+=$_sep${_ss_uuid[$_dev_index]}
+      _md_dev_idxs+=$_sep$_dev_index
+      _sep=","
+   done
+   _device_num=$_md_device_set_start
+fi
+
+_available_CUs_per_device=${_ss_cucount[$_device_num]}
+_gfxid=${_ss_gfxid[$_device_num]}
+
+_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} ))
+if [ $_num_local_ranks -gt $_node_cus ] ; then
+   >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks "
+   execOnError "$@"
+fi
+
+_utilized_CUs_per_device=$_available_CUs_per_device
+_rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU ))
+# Lower utilized CUs till divisible by number of rplaces per GPU
+while [ $_rem2 != 0 ] ; do
+   _utilized_CUs_per_device=$(( $_utilized_CUs_per_device - 1 ))
+   _rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU ))
+done
+_CUs_per_rplace=$(( $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU ))
+
+# --- THIS BLOCK ONLY FOR VERBOSE DIAGS PRINTED FROM RANK 0
+if [ $_local_rank_num == 0 ] && [[ "$GPURUN_VERBOSE" == "2" ]]; then
+   if [ $_uses_multi_device == 0 ] ; then
+      _wasted_CUs_on_each_GPU=$(( $_available_CUs_per_device - $_utilized_CUs_per_device ))
+      _total_GPU_rplaces=$(( $_number_of_rplaces_per_GPU * $_ri_num_devices ))
+      _total_wasted_rplaces=$(( $_total_GPU_rplaces - $_num_local_ranks ))
+      _wasted_GPUs=$(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU ))
+      _used_cus=$(( $_num_local_ranks * $_CUs_per_rplace ))
+      _utilization=$(( ( $_used_cus * 100 ) / $_node_cus ))
+      if ! [ $_ri_num_devices -gt $_num_local_ranks ] ; then
+         if [ $_wasted_CUs_on_each_GPU != 0 ] || [ $_total_wasted_rplaces != 0 ] ; then
+            _extra_diags=true
+         fi
+      fi
+      >&2 echo "-  ROCMINFO LOCATION:   $ROCMINFO_BINARY"
+      >&2 echo "-  PROCESSES:           $_num_local_ranks (RANKS)"
+      >&2 echo "-  AVAILABLE GPUS:      $_ri_num_devices"
+      [ $_extra_diags ] && \
+      >&2 echo "-- USED GPUS:           $(( $_ri_num_devices - $_wasted_GPUs ))"
+      [ $_extra_diags ] && \
+      >&2 echo "-- UNUSED GPUS:         $(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU )) "
+      [ $_extra_diags ] && echo
+      >&2 echo "-  RPLACEs PER NODE:    $_total_GPU_rplaces"
+      >&2 echo "-  RPLACEs PER GPU:     $_number_of_rplaces_per_GPU"
+      [ $_extra_diags ] && \
+      >&2 echo "-- USED RPLACEs:        $_num_local_ranks (RANKS)"
+      [ $_extra_diags ] && \
+      >&2 echo "-- UNUSED RPLACEs:      $_total_wasted_rplaces" ; \
+      >&2 echo "-  gfxids               ${_ss_gfxid[@]}"
+      >&2 echo "-  CUs PER GPU:         ${_ss_cucount[@]}"
+      [ $_extra_diags ] && \
+      >&2 echo "-- USED on CUs RANK0:   $_utilized_CUs_per_device"
+      [ $_extra_diags ] && \
+      >&2 echo "-- UNUSED CUs RANK0 :   $_wasted_CUs_on_each_GPU"
+      >&2 echo "-  CUs per RPLACE RANK0:$_CUs_per_rplace (OMPX_TARGET_TEAM_PROCS)"
+      >&2 echo "-  FORMULA: OMPX_TARGET_TEAM_PROCS = $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU"
+      if [[ ! -z "$ROCR_VISIBLE_DEVICES" ]] ; then
+         >&2 echo "-  Preset ROCR_VISIBLE_DEVICES:  $ROCR_VISIBLE_DEVICES"
+      fi
+      if [[ ! -z "$HSA_CU_MASK" ]] ; then
+         # node utilizatino could be incorrect with preset cumask.
+         >&2 echo "-  Preset HSA_CU_MASK: $HSA_CU_MASK"
+      else
+         >&2 echo "-  NODE UTILIZATION:  $_utilization %"
+      fi
+   else
+      >&2 echo "-  ROCMINFO LOCATION: $ROCMINFO_BINARY"
+      >&2 echo "-  PROCESSES:         $_num_local_ranks (RANKS)"
+      >&2 echo "-  AVAILABLE GPUS:    $_ri_num_devices"
+      >&2 echo "-  DEVS PER RANK:     $_devices_per_mdset"
+      >&2 echo "-  MULTI-DEVICE GPUS: $_md_total_devices (RANKS*DEVS-PER-RANK)"
+      _md_utilization=$(( $_md_total_devices * 100 / $_ri_num_devices ))
+      >&2 echo "-  NODE UTILIZATION:  $_md_utilization %"
+   fi
+fi
+#  --- END OF DIAGNOSTIC BLOCK
+
+if [ $_CUs_per_rplace != $_available_CUs_per_device ] && [ $GPURUN_MASK_POLICY == "mutex" ] ; then
+   #  Build the CU mask for this rank, bits_to_set = _CUs_per_rplace
+   _bits_to_set=$_CUs_per_rplace
+   #  Shift by this rank's rplace slot within its GPU. The slot is derived from the rank
+   #  (not from _device_num) so the shift stays valid when GPURUN_DEVICE_BIAS moves the device.
+   _bits_to_shift=$(( ( $_local_rank_num % $_number_of_rplaces_per_GPU ) * $_bits_to_set ))
+   # use bc because these values can be very large; tr undoes bc's backslash-newline wrapping of long numbers
+   _mask=$(echo "obase=16; ((2 ^ $_bits_to_set) - 1) * (2 ^ $_bits_to_shift)" | bc | tr -d '\\\n')
+   # Calculate the number of leading zeros needed for this mask
+   _lz=$(( ( $_utilized_CUs_per_device / 4 ) - ${#_mask} + 1 ))
+   for i in `seq 1 $_lz` ; do
+      _mask="0$_mask"
+   done
+   _mask="0x$_mask"
+fi
+
+_launch_process_cmd=""
+if [ $_uses_multi_device == 0 ] ; then
+   # retrieve scanned info from per device arrays
+   _bdfidstrc=${_ss_bdfid[$_device_num]}
+   NUMANODE=${_ss_numanode[$_device_num]}
+   _list_of_cpu_cores=${_ss_cpulist[$_device_num]}
+   _this_uuid=${_ss_uuid[$_device_num]}
+else
+   # Use multi-device values
+   _bdfidstrc=$_md_bdfs
+   NUMANODE=$_md_nns
+   _list_of_cpu_cores=$_md_cpus
+   _this_uuid=$_md_uuids
+   _launch_process_cmd+="env LIBOMPTARGET_NUM_MULTI_DEVICES=$_devices_per_mdset "
+fi
+if [ "$_use_numactl_localalloc" == "1" ] ; then
+   _launch_process_cmd+="$_launch_process_cmd_binary --localalloc --cpunodebind=$NUMANODE"
+elif [ "$_use_numactl_membind" == "1" ] ; then
+   _launch_process_cmd+="$_launch_process_cmd_binary --membind=$NUMANODE --cpunodebind=$NUMANODE"
+else
+   _launch_process_cmd+="$_launch_process_cmd_binary -c $_list_of_cpu_cores"
+fi
+
+# If gpurun was not given command to execute, then dont run _launch_process_cmd
+[ "$*" == "" ] && _launch_process_cmd=""
+
+# only set ROCR_VISIBLE_DEVICES if not already set
+if [[ -z $ROCR_VISIBLE_DEVICES ]] ; then
+   export ROCR_VISIBLE_DEVICES=$_this_uuid
+   _log_word="RVD"
+else
+   _log_word="PRESET-RVD"
+fi
+
+export OMPX_TARGET_TEAM_PROCS=$_CUs_per_rplace
+
+#  - Limit HSA queues when multiple ranks per GPU
+if [ $_number_of_rplaces_per_GPU != 1 ] ; then
+   # Only set these env controls if not set by caller
+   [[ -z "$GPU_MAX_HW_QUEUES" ]] && export GPU_MAX_HW_QUEUES=1
+   [[ -z "$LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" ]] && export LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=1
+fi
+
+[[ ! -z "$HSA_CU_MASK" ]] && [[ "$GPURUN_VERBOSE" != "0"  ]] && \
+   [[ $_local_rank_num == 0 ]] && >&2 echo "WARNING: preset HSA_CU_MASK:$HSA_CU_MASK"
+
+if [ $_CUs_per_rplace == $_available_CUs_per_device ] || [ "$GPURUN_MASK_POLICY" == "nomask" ] ; then
+   # --- gpurun does not set HSA_CU_MASK in this code block. This code block covers all multi-device execution.
+   if [ "$GPURUN_VERBOSE" != "0" ] ; then
+      if [ $_uses_multi_device == 1 ] ; then
+         printf "RANK:$_local_rank_num D:$_md_dev_idxs NNs:$_md_nns GPUTYPE:$_gfxid $_log_word:$ROCR_VISIBLE_DEVICES\n     CMD:%s %s\n" "$_launch_process_cmd" "$*" >&2
+      else
+         printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d GPUTYPE:$_gfxid $_log_word:%s \n     CMD:%s %s\n" $_device_num $_bdfidstrc $NUMANODE $ROCR_VISIBLE_DEVICES "$_launch_process_cmd" "$*" >&2
+      fi
+   fi
+   # _launch_process_cmd is split into words on purpose; "$@" keeps each user argument intact.
+   $_launch_process_cmd "$@"
+else
+   # --- HSA_CU_MASK is required in this code block, assumes no multi-device
+   if [[ -z "$HSA_CU_MASK" ]] ; then
+      # Since ROCR_VISIBLE_DEVICES only enables 1 GPU, HSA_CU_MASK starts with 0:
+      export HSA_CU_MASK=0:$_mask
+   else
+      # use preset mask (only copied to _mask for the trace line below)
+      _mask=$HSA_CU_MASK
+   fi
+   if [ "$GPURUN_VERBOSE" != "0" ] ; then
+      printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d $_gfxid CUMASK:$_mask $_log_word:$ROCR_VISIBLE_DEVICES \n     CMD:%s %s\n" $_device_num $_bdfidstrc $NUMANODE "$_launch_process_cmd" "$*" >&2
+   fi
+   # The exported HSA_CU_MASK (generated above or preset by the caller) is inherited as is;
+   # it is not re-assigned here, so a preset mask does not get a second "0:" prefix.
+   $_launch_process_cmd "$@"
+fi
+exit $?