diff --git a/offload/CMakeLists.txt b/offload/CMakeLists.txt
index 82ebdf84019a5..d972402d03a7d 100644
--- a/offload/CMakeLists.txt
+++ b/offload/CMakeLists.txt
@@ -475,3 +475,5 @@ if(OFFLOAD_INCLUDE_TESTS)
   add_subdirectory(test)
   add_subdirectory(unittests)
 endif()
+
+add_subdirectory(utils)
diff --git a/offload/utils/CMakeLists.txt b/offload/utils/CMakeLists.txt
new file mode 100644
index 0000000000000..d6f2d6729d18c
--- /dev/null
+++ b/offload/utils/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Install location for the offload utility scripts; overridable from the cache.
+set(OPENMP_UTILS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH
+  "Path for binary subdirectory (defaults to '${CMAKE_INSTALL_BINDIR}')")
+
+# add_openmp_util(<path>)
+#   Install the program or script at <path> into OPENMP_UTILS_INSTALL_DIR.
+#   install(PROGRAMS) is used so the installed file gets execute permissions.
+#   A function rather than a macro, so `path` stays local to the call.
+function(add_openmp_util path)
+  install(PROGRAMS
+    "${path}"
+    DESTINATION "${OPENMP_UTILS_INSTALL_DIR}")
+endfunction()
+
+add_subdirectory(gpurun)
diff --git a/offload/utils/gpurun/CMakeLists.txt b/offload/utils/gpurun/CMakeLists.txt
new file mode 100644
index 0000000000000..0483a5737b830
--- /dev/null
+++ b/offload/utils/gpurun/CMakeLists.txt
@@ -0,0 +1 @@
+add_openmp_util(${CMAKE_CURRENT_SOURCE_DIR}/gpurun)
diff --git a/offload/utils/gpurun/gpurun b/offload/utils/gpurun/gpurun
new file mode 100755
index 0000000000000..870bc7a8ccbcd
--- /dev/null
+++ b/offload/utils/gpurun/gpurun
@@ -0,0 +1,697 @@
+#!/bin/bash
+# Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+# of the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+#
+# gpurun: Process launch utility for GPU applications. This is a wrapper
+#         to execute application binaries including OpenMPI GPU applications.
+#         See help message below (gpurun -h) for more information.
+#
+# Usage Examples:
+#    gpurun true
+#    mpirun -np 4 gpurun env | grep ROCR_VISIBLE_DEVICES
+#
+
+# If set to 1, just invoke the rest of the command line without doing anything
+# else.
+GPURUN_BYPASS=${GPURUN_BYPASS:-0}
+
+# execOnError <command> [args...]
+#   Fallback used on every error path: replace this shell with the user's
+#   command, unmodified, so a gpurun failure does not prevent the run.
+#   With no command there is nothing to fall back to, and `exec` without
+#   arguments is a no-op that would let the script continue past the error,
+#   so exit non-zero instead.
+function execOnError() {
+  [ $# -eq 0 ] && exit 1
+  exec "$@"
+}
+
+# _runOnNumaNode <node> <command> [args...]
+#   Implements the -nr / -nm shortcuts: run the command with CPUs and memory
+#   bound to NUMA node <node> when numactl is installed, otherwise run it
+#   unbound. Exits the script with the command's status.
+function _runOnNumaNode() {
+  local _node=$1
+  shift
+  if command -v numactl >/dev/null 2>&1 ; then
+    numactl --cpunodebind "$_node" --membind "$_node" "$@"
+  else
+    "$@"
+  fi
+  exit $?
+}
+
+# PROGVERSION string is updated by cmake when component is installed
+PROGVERSION=X.Y-Z
+# Print the program name and version, then exit successfully.
+function version(){
+  echo $0 version $PROGVERSION
+  exit 0
+}
+# Print the help text to stdout, then exit successfully.
+function usage(){
+/bin/cat <<"EOF"
+
+   gpurun: Application process launch utility for GPUs.
+           This utility ensures the process will enable either a single
+           GPU or the number specified with -md (multi-device) option.
+           It launches the application binary with either the 'taskset'
+           or 'numactl' utility so the process only runs on CPU cores
+           in the same NUMA domain as the selected GPUs.
+           This utility sets environment variable ROCR_VISIBLE_DEVICES
+           to selected GPUs ONLY if it was not already set by the
+           caller's environment AND the number of GPUs is not 1.
+           This utility also sets environment variable HSA_CU_MASK
+           to control which CUs are available to the process.
+           HSA_CU_MASK is set only when more than one OpenMPI process
+           (rank) will utilize the same GPU and it is not preset.
+           Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the
+           number of CUs available to the process after masking.
+
+   Usage:
+      gpurun [options] <executable> [ <executable args> ]
+      mpirun -np <num ranks> gpurun [options] <executable> [ <executable args> ]
+
+   Options:
+      -h         Print this help message and exit
+      -md <N>    Set number of desired devices for multi-device mode, default=1
+      -s         suppress output, often useful in benchmarking
+      -q         suppress output, quiet, alias of -s, same as GPURUN_VERBOSE=0
+      --quiet    alias of -q
+      -v         Verbose output, same as GPURUN_VERBOSE=1
+      -vv        Verbose output, same as GPURUN_VERBOSE=2
+      -m         use numactl membind to CPUs in same NUMA domain. Note: Allocation
+                 fails when not enough memory available on these nodes.
+      -l         use numactl localalloc to CPUs in same NUMA domain. Note: If
+                 memory cannot be allocated, alloc falls back to other nodes.
+      -nr        use numactl ROCR_VISIBLE_DEVICES
+      -nm        use numactl OMPI_COMM_WORLD_LOCAL_RANK
+      -nomask    same as GPURUN_MASK_POLICY=nomask
+      --version  Print version of gpurun and exit
+
+   Optional Input environment variables:
+      GPURUN_BYPASS
+         1: run the command line as given, without any gpurun processing
+      GPURUN_VERBOSE
+         0: default for silent operation, no trace printed to stderr
+         1: -v prints trace record including process launch cmd to stderr
+         2: -vv prints trace and other summary diagnostics
+      ROCMINFO_BINARY  Set location of rocminfo binary
+      AOMP: location of AOMP or ROCM
+      GPURUN_DEVICE_BIAS: amount to shift device number to avoid dev 0.
+             This only works for single device mode.
+      GPURUN_VISIBLE_DEVICE_TYPES: useful if machine has different GPU cards
+      GPURUN_MASK_POLICY : useful if machine has different GPU cards
+      ROCR_VISIBLE_DEVICES: See description above
+      OMPI_COMM_WORLD_LOCAL_SIZE Number of ranks on this node set by openmpi
+      OMPI_COMM_WORLD_LOCAL_RANK The local rank number 0-(nranks-1) from openmpi
+             This also checks for MPI_LOCALNRANKS/MPI_LOCALRANKID
+             and MPI_COMM_WORLD_LOCAL_SIZE/MPI_COMM_WORLD_LOCAL_RANK
+
+   Generated (output) Environment Variables:
+      OMPX_TARGET_TEAM_PROCS - Number of CUs available to process
+      ROCR_VISIBLE_DEVICES - list of GPU Uuids for the selected devices if not preset
+      HSA_CU_MASK - The CU mask for the device.
+      LIBOMPTARGET_NUM_MULTI_DEVICES - the value set by -md argument
+      GPU_MAX_HW_QUEUES
+      LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES
+
+   Limitations:
+   - Currently, gpurun creates masks that are mutually exclusive of each other.
+     That is, the MPI processes will not share CUs. If number of ranks is not
+     perfectly divisible by number of CUs or number of GPUs, some resources
+     would be unused.
+     Set GPURUN_VERBOSE=1 or 2 to see overall cu utilization.
+   - Works with AOMP 19.0-0 or ROCM 6.1 or greater
+   - cu masking is not available when multiple devices per process are enabled
+     with -md option (multi-device) mode.
+
+   Notes:
+     With MPI, this utility distributes GPUs and their CUs across
+     multiple ranks of an MPI job into mutually exclusive sets of CUs.
+     It uses OpenMPI environment variables OMPI_COMM_WORLD_LOCAL_SIZE
+     and OMPI_COMM_WORLD_LOCAL_RANK to set visible devices and
+     the mutually exclusive CU mask.
+
+     An rplace (rank place) is a subset of CUs for a rank.
+     gpurun calculates the number of rplaces needed to contain all
+     the specified number of ranks for this node. If number of ranks not
+     divisible by number of GPUs, then there will be more rplaces than ranks.
+     The number of CUs in an rplace is calculated by dividing the number of
+     CUs per GPU by the number of rplaces per GPU. This is also the number of
+     bits set in the CU mask. This is also the number of physical locations
+     available for an OpenMP team to execute. This utility exports that number
+     to the environment variable OMPX_TARGET_TEAM_PROCS. This value
+     could be used by the application or runtime to adjust the number
+     of desired teams in a target region. If no masking occurs, the entire
+     GPU is available for the process and OMPX_TARGET_TEAM_PROCS is set to
+     the total number of CUs on the GPU.
+
+   Copyright (c) 2024 ADVANCED MICRO DEVICES, INC.
+
+EOF
+  exit 0
+}
+
+# Consume gpurun's own options; the first unrecognized word starts the
+# user's command, which is left in "$@".
+_end_gpurun_opts=0
+_devices_per_mdset=1
+_uses_multi_device=0
+while [ "$_end_gpurun_opts" == "0" ] ; do
+  case "$1" in
+    -s|-q|--quiet)      GPURUN_VERBOSE=0;;
+    -h|-help|--help)    usage ;;
+    -version|--version) version ;;
+    -v)                 GPURUN_VERBOSE=1;;
+    -vv)                GPURUN_VERBOSE=2;;
+    -m)                 _use_numactl_membind=1;;
+    -md)                shift
+                        # The value feeds shell arithmetic later on, so reject
+                        # a missing or non-numeric argument here.
+                        if ! [[ "$1" =~ ^[1-9][0-9]*$ ]] ; then
+                          >&2 echo "ERROR: -md requires a positive number of devices, got '$1'"
+                          exit 1
+                        fi
+                        _devices_per_mdset=$1; _uses_multi_device=1;;
+    -nr)                _use_numactl_rocr=1;;
+    -nm)                _use_numactl_ompi=1;;
+    -l)                 _use_numactl_localalloc=1;;
+    -nomask)            GPURUN_MASK_POLICY="nomask";;
+    *)                  _end_gpurun_opts=1; break;;
+  esac
+  shift
+done
+
+if [ "$GPURUN_BYPASS" = "1" ]; then
+  # Nothing to run is not an error in bypass mode.
+  [ $# -eq 0 ] && exit 0
+  exec "$@"
+fi
+
+# Default: quiet operation
+GPURUN_VERBOSE=${GPURUN_VERBOSE:-0}
+# Default: create mutually exclusive sets of CUs when GPU is oversubscribed
+GPURUN_MASK_POLICY=${GPURUN_MASK_POLICY:-mutex}
+# switch mask policy to preset if HSA_CU_MASK was preset
+[[ -n "$HSA_CU_MASK" ]] && GPURUN_MASK_POLICY=preset
+# switch mask policy to nomask for multi-device
+[[ $_uses_multi_device == 1 ]] && GPURUN_MASK_POLICY=nomask
+# Offset selected device to avoid some heavily used GPUs
+GPURUN_DEVICE_BIAS=${GPURUN_DEVICE_BIAS:-0}
+
+# Get environment variables set by OpenMPI
+_num_local_ranks=$OMPI_COMM_WORLD_LOCAL_SIZE
+_local_rank_num=$OMPI_COMM_WORLD_LOCAL_RANK
+# If not OpenMPI, check for Platform MPI, MVAPICH
+if [ -z "$_num_local_ranks" ] ; then
+  _num_local_ranks=$MPI_LOCALNRANKS
+  _local_rank_num=$MPI_LOCALRANKID
+fi
+# Also try MPI_COMM_WORLD env vars
+if [ -z "$_num_local_ranks" ] ; then
+  _num_local_ranks=$MPI_COMM_WORLD_LOCAL_SIZE
+  _local_rank_num=$MPI_COMM_WORLD_LOCAL_RANK
+fi
+# Check if SLURM was used
+# NOTE(review): SLURM_CPUS_ON_NODE looks like a CPU count rather than a task
+# count -- confirm it is the intended source for the number of local ranks.
+if [ -z "$_num_local_ranks" ] && [ -n "$SLURM_CPUS_ON_NODE" ] ; then
+  _num_local_ranks=$SLURM_CPUS_ON_NODE
+  _local_rank_num=$SLURM_LOCALID
+fi
+
+# -nr / -nm: bind to a NUMA node and run; neither call returns.
+[ "$_use_numactl_rocr" == "1" ] && _runOnNumaNode "$ROCR_VISIBLE_DEVICES" "$@"
+[ "$_use_numactl_ompi" == "1" ] && _runOnNumaNode "$OMPI_COMM_WORLD_LOCAL_RANK" "$@"
+
+# If none of the above MPIs, assume gpurun is wrapper for single process on single GPU
+if [ -z "$_num_local_ranks" ] ; then
+  _num_local_ranks=1
+  _local_rank_num=0
+fi
+
+# Find location of the rocminfo binary. Candidates are tried in order until
+# one is an existing directory; the last one is the parent of this script's
+# own directory.
+AOMP=${AOMP:-_AOMP_INSTALL_DIR_}
+[ ! -d "$AOMP" ] && AOMP="_AOMP_INSTALL_DIR_"
+[ ! -d "$AOMP" ] && AOMP="/opt/rocm/lib/llvm"
+[ ! -d "$AOMP" ] && AOMP="/opt/rocm/llvm"
+if [ ! -d "$AOMP" ] ; then
+  AOMP="$(dirname "$(realpath "$0")")/.."
+fi
+if [ ! -d "$AOMP" ] ; then
+  >&2 echo "ERROR: AOMP not found at $AOMP"
+  >&2 echo "       Please install AOMP or correctly set env-var AOMP"
+  execOnError "$@"
+fi
+ROCMINFO_BINARY=${ROCMINFO_BINARY:-$AOMP/bin/rocminfo}
+[ ! -f "$ROCMINFO_BINARY" ] && ROCMINFO_BINARY=$AOMP/../bin/rocminfo
+[ ! -f "$ROCMINFO_BINARY" ] && ROCMINFO_BINARY=$AOMP/../../bin/rocminfo
+if [ ! -f "$ROCMINFO_BINARY" ] ; then
+  >&2 echo "ERROR: Could not find binary for rocminfo,"
+  >&2 echo "       Please correct installation of ROCM or AOMP compiler"
+  execOnError "$@"
+fi
+
+# Use rocminfo to find the number of CUs and gfxids for each GPU.
+_tfile="/tmp/rinfo_out$$" +$ROCMINFO_BINARY 2>/dev/null | grep -E " Name:| Compute Unit:| Device Type:| BDFID:| Uuid:" |grep -v generic >$_tfile +_tfile_lines=`wc -l $_tfile | cut -d" " -f1` +if [ $_tfile_lines == 0 ] ; then + >&2 echo "ERROR: $ROCMINFO_BINARY failed to find GPU devices" + rm $_tfile + execOnError "$@" +fi +# Create 3 _ri_ arrays by parsing rocminfo (ri), one array entry per device +_ri_all_gfxids="" +_ri_gfxids=() +_ri_cucount=() +_ri_bdfids=() +_ri_dev_idx=() +_ri_num_devices=0 +_last_cu_count=0 +_ri_uuid=() +_last_device_type_was_gpu=0 +_device_type_preset=0 +_ri_num_all_devices=0 +[ ! -z $GPURUN_VISIBLE_DEVICE_TYPES ] && _device_type_preset=1 +while read _linepair ; do + _fieldvalue=`echo $_linepair | cut -d":" -f2` + _fieldtype=`echo $_linepair | cut -d":" -f1` + if [ $_fieldvalue == "CPU" ] ; then + _last_device_type_was_gpu=0 + elif [ $_fieldvalue == "GPU" ] ; then + _last_device_type_was_gpu=1 + elif [ "$_fieldtype" == "Uuid" ] ; then + _this_uuid=$_fieldvalue + elif [ "$_fieldtype" == "BDFID" ] ; then + if [[ $_last_device_type_was_gpu == 1 ]] ; then + # _domain="$(echo "$_fieldvalue / (2^32)" | bc)" + _bus="$(echo "($_fieldvalue / (2^8)) % (2^8)" | bc)" + _devfn="$(echo "($_fieldvalue % (2^8))" | bc)" + _bdfidstr="$(printf "%.2x:%.2x" "$_bus" "$_devfn")" + fi + elif [ "$_fieldtype" == "Name" ] ; then + # The device name field is last in rocminfo output, so we can create new _ri_ array entry + if [[ $_last_device_type_was_gpu == 1 ]] ; then + _this_gfxid=`echo $_fieldvalue | cut -d'-' -f5` + ! 
[[ ${_ri_all_gfxids} == *"$_this_gfxid"* ]] && _ri_all_gfxids+=" $_this_gfxid" + _is_type_visible=1 + if [ $_device_type_preset == 1 ] ; then + _is_type_visible=0 + if [[ ${GPURUN_VISIBLE_DEVICE_TYPES} == *"$_this_gfxid"* ]] ; then + _is_type_visible=1 + fi + fi + if [ $_is_type_visible == 1 ] ; then + _ri_gfxids+=( $_this_gfxid ) + _ri_cucount+=( $_last_cu_count ) + _ri_bdfids+=( $_bdfidstr ) + _ri_dev_idx+=( $_ri_num_all_devices ) + _ri_uuid+=( $_this_uuid ) + _ri_num_devices=$(( $_ri_num_devices + 1 )) + fi + _ri_num_all_devices=$(( $_ri_num_all_devices + 1 )) + fi + else + # else the _fieldvalue was the number of CUs or GCPUs + if [[ $_last_device_type_was_gpu == 1 ]] ; then + _last_cu_count=$_fieldvalue + fi + fi +done < $_tfile +rm $_tfile + +if [ $_ri_num_devices == 0 ] ; then + if [ $_local_rank_num == 0 ] ; then + if [ $_device_type_preset == 1 ] ; then + >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY of type $GPURUN_VISIBLE_DEVICE_TYPES." + >&2 echo " Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}" + else + >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY" + fi + if [ ! -z $ROCR_VISIBLE_DEVICES ] ; then + >&2 echo " ROCR_VISIBLE_DEVICES was preset to $ROCR_VISIBLE_DEVICES" + >&2 echo " Consider unset ROCR_VISIBLE_DEVICES and let gpurun set it correctly." + fi + execOnError "$@" + else + execOnError "$@" + fi +fi + +# Scan /sys/bus/pci/devices (_ss_) for amdgpu devices and store info in 6 per +# device arrays indexed by device num. The arrays are _ss_cpulist _ss_bdfids, +# _ss_numanode, _ss_uuid, _ss_gfxid, and _ss_cucount. Some information +# (cucount, gfxid, dev_idx) must be copied from the _ri_ arrays built above +# by scanning output from rocminfo. 
+_sysdevdir="/sys/bus/pci/devices" +_ss_num_devices=0 +_ss_cpulist=() +_ss_bdfid=() +_ss_numanode=() +_ss_uuid=() +_ss_gfxid=() +_ss_cucount=() +for _devid in `ls $_sysdevdir` ; do + if [ -f $_sysdevdir/$_devid/device ] ; then + _driver_name=`cat $_sysdevdir/$_devid/uevent | grep DRIVER | awk '{print $1}'` + if [ ! -z $_driver_name ] ; then + if [ $_driver_name == "DRIVER=amdgpu" ] ; then + _numa_node=`cat $_sysdevdir/$_devid/numa_node` + [ "$_numa_node" == "-1" ] && _numa_node=0 + _this_uuid=0 + if [ -f $_sysdevdir/$_devid/unique_id ] ; then + _this_uuid=`cat $_sysdevdir/$_devid/unique_id` + if [ -z $_this_uuid ] ; then + _this_uuid=0 + _has_unique_id_file=0 + else + _this_uuid="GPU-$_this_uuid" + _has_unique_id_file=1 + fi + fi + _this_cpulist=`cat $_sysdevdir/$_devid/local_cpulist` + _match_uuid_count=0 + for _ri_i in ${!_ri_bdfids[@]} ; do + _ss_value=$_this_uuid + _ri_value=${_ri_uuid[$_ri_i]} + if [ $_ss_value == $_ri_value ] ; then + _match_uuid_count=$(( $_match_uuid_count + 1 )) + fi + done + # Search _ri_ arrays for matching uuids or matching bdfids. + for _ri_i in ${!_ri_bdfids[@]} ; do + if [ "$_has_unique_id_file" == "1" ] ; then + _ss_value=$_this_uuid + _ri_value=${_ri_uuid[$_ri_i]} + elif [ "${_ri_bdfids[$_ri_i]}" == "00:00" ]; then + # Under Hyper-V, we may see a zero BDFID. Fall back to UUID. 
+ _ss_value=$_devid + _ri_value=$_devid + else + _ss_value=$_devid + _ri_value="0000:${_ri_bdfids[$_ri_i]}.0" + fi + if [ $_ss_value == $_ri_value ] ; then + if [ $_this_uuid == 0 ] || [ $_match_uuid_count -gt 1 ] ; then + # Some GPUs do not have unique_id or TPX mode creates multiple + # identical uuids, so use device index for RVD + _ss_uuid+=( ${_ri_dev_idx[$_ri_i]} ) + else + _ss_uuid+=( $_this_uuid ) + fi + _ss_gfxid+=( ${_ri_gfxids[$_ri_i]} ) + _ss_cucount+=( ${_ri_cucount[$_ri_i]} ) + _ss_bdfid+=( $_devid ) + _ss_numanode+=( $_numa_node ) + _ss_cpulist+=( $_this_cpulist ) + _ss_num_devices=$(( $_ss_num_devices + 1 )) + fi + done + fi + fi + fi +done + +if [[ $_ss_num_devices -lt 1 ]] ; then + if [ $_device_type_preset == 1 ] ; then + >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir of type $GPURUN_VISIBLE_DEVICE_TYPES." + >&2 echo " Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}" + else + >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir." + fi + execOnError "$@" +fi + +# check for taskset or numactl cmd +if [ "$_use_numactl_membind" == "1" ] || [ "$_use_numactl_localalloc" == "1" ] ; then + _launch_process_cmd_binary=`which numactl` + if [ $? != 0 ] ; then + >&2 echo "ERROR: The -m (membind) or -l (localalloc) require numactl to be installed." + execOnError "$@" + fi +else + _launch_process_cmd_binary=`which taskset` + if [ $? != 0 ] ; then + >&2 echo "ERROR: $0 requires the taskset command to be installed." + execOnError "$@" + fi +fi +if [ "$_use_numactl_membind" == "1" ] && [ "$_use_numactl_localalloc" == "1" ] ; then + >&2 echo "GPURUN WARNING: When -l and -m are both set, -m is ignored." + _use_numactl_membind=0 +fi + +_utilized_devices=$_ri_num_devices +[ $_ri_num_devices -gt $_num_local_ranks ] && _utilized_devices=$_num_local_ranks + +# Calculate number of GPUs to use to evenly spread ranks across GPUs. +# An rplace is a set of CUs that will be used for a rank. 
+# The number of rplaces must be at least the number of ranks.
+_uncovered_ranks=$(( _num_local_ranks % _utilized_devices ))
+_number_of_rplaces_per_GPU=$(( _num_local_ranks / _utilized_devices ))
+if [ "$_uncovered_ranks" != 0 ] ; then
+  # If _num_local_ranks not divisible by number of GPUs,
+  # then add an extra rplace per GPU to make room for remainder.
+  _number_of_rplaces_per_GPU=$(( _number_of_rplaces_per_GPU + 1 ))
+fi
+if [ "$GPURUN_MASK_POLICY" == "mutex" ] ; then
+  # For mutex policy, adjacent ranks are assigned to the same device.
+  _rplace_num=$(( _local_rank_num / _number_of_rplaces_per_GPU ))
+  # Some users want to avoid dev 0 etc, by setting GPURUN_DEVICE_BIAS
+  _device_num=$(( ( _rplace_num + GPURUN_DEVICE_BIAS ) % _ri_num_devices ))
+else
+  # for mask policies nomask or preset, adjacent ranks are assigned to
+  # different GPUs and oversubscribed ranks are assigned round robin
+  _device_num=$(( ( _local_rank_num + GPURUN_DEVICE_BIAS ) % _ri_num_devices ))
+fi
+# NOTE(review): _device_num is reduced modulo the rocminfo device count but
+# indexes the _ss_ arrays built from sysfs -- presumably both always have the
+# same length; confirm.
+
+if [ "$_uses_multi_device" == 1 ]; then
+  # Enforce some rules on the use of -md option
+  # Note -md forces GPURUN_MASK_POLICY=nomask
+  if [[ -n "$ROCR_VISIBLE_DEVICES" ]] ; then
+    >&2 echo "ERROR: DO NOT PRESET ROCR_VISIBLE_DEVICES in gpurun multi-device (-md) mode"
+    execOnError "$@"
+  fi
+  if [ "$_devices_per_mdset" -gt "$_ri_num_devices" ] ; then
+    >&2 echo "ERROR: More devices requested ($_devices_per_mdset) than available ($_ri_num_devices)"
+    execOnError "$@"
+  fi
+  _md_total_devices=$(( _num_local_ranks * _devices_per_mdset ))
+  if [ "$_md_total_devices" -gt "$_ri_num_devices" ] && [ "$_local_rank_num" == 0 ] ; then
+    printf "WARNING: processes($_num_local_ranks) * md set size($_devices_per_mdset) = $_md_total_devices > than available devices ($_ri_num_devices)\n Some multi-device sets will overlap.\n" >&2
+  fi
+  _md_device_set_start=$(( ( _local_rank_num * _devices_per_mdset ) % _ri_num_devices ))
+  _md_device_set_end=$(( _md_device_set_start + _devices_per_mdset - 1 ))
+
+  # merge entries for this mdset from per device arrays
+  _md_bdfs=""
+  _md_cpus=""
+  _md_nns=""
+  _md_uuids=""
+  _md_dev_idxs=""
+  _sep=""
+  for (( i = _md_device_set_start; i <= _md_device_set_end; i++ )) ; do
+    # handle index wrap around number of devices
+    _dev_index=$(( i % _ri_num_devices ))
+    _md_bdfs+=$_sep${_ss_bdfid[$_dev_index]}
+    _new_nn=${_ss_numanode[$_dev_index]}
+    # only add new numa node and cpulist, if not already in the md set
+    if [[ ",$_md_nns," != *",$_new_nn,"* ]] ; then
+      _md_nns+=$_sep$_new_nn
+      _md_cpus+=$_sep${_ss_cpulist[$_dev_index]}
+    fi
+    _md_uuids+=$_sep${_ss_uuid[$_dev_index]}
+    _md_dev_idxs+=$_sep$_dev_index
+    _sep=","
+  done
+  _device_num=$_md_device_set_start
+fi
+
+_available_CUs_per_device=${_ss_cucount[$_device_num]}
+_gfxid=${_ss_gfxid[$_device_num]}
+
+# Every rank needs at least one CU. Checked once, after _device_num is final.
+_node_cus=$(( _ri_num_devices * ${_ss_cucount[$_device_num]} ))
+if [ "$_num_local_ranks" -gt "$_node_cus" ] ; then
+  >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks "
+  execOnError "$@"
+fi
+
+# Lower utilized CUs till divisible by number of rplaces per GPU
+_utilized_CUs_per_device=$(( _available_CUs_per_device - _available_CUs_per_device % _number_of_rplaces_per_GPU ))
+_CUs_per_rplace=$(( _utilized_CUs_per_device / _number_of_rplaces_per_GPU ))
+
+# --- THIS BLOCK ONLY FOR VERBOSE DIAGS PRINTED FROM RANK 0
+# All diagnostics go to stderr so the application's stdout stays untouched.
+if [ "$_local_rank_num" == 0 ] && [[ "$GPURUN_VERBOSE" == "2" ]]; then
+  if [ "$_uses_multi_device" == 0 ] ; then
+    _wasted_CUs_on_each_GPU=$(( _available_CUs_per_device - _utilized_CUs_per_device ))
+    _total_GPU_rplaces=$(( _number_of_rplaces_per_GPU * _ri_num_devices ))
+    _total_wasted_rplaces=$(( _total_GPU_rplaces - _num_local_ranks ))
+    _wasted_GPUs=$(( _total_wasted_rplaces / _number_of_rplaces_per_GPU ))
+    _used_cus=$(( _num_local_ranks * _CUs_per_rplace ))
+    _utilization=$(( ( _used_cus * 100 ) / _node_cus ))
+    # The extra "--" lines are shown only when ranks cover all GPUs and some
+    # CUs or rplaces are nevertheless left unused.
+    if ! [ "$_ri_num_devices" -gt "$_num_local_ranks" ] ; then
+      if [ "$_wasted_CUs_on_each_GPU" != 0 ] || [ "$_total_wasted_rplaces" != 0 ] ; then
+        _extra_diags=true
+      fi
+    fi
+    >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY"
+    >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)"
+    >&2 echo "- AVAILABLE GPUS: $_ri_num_devices"
+    if [ "$_extra_diags" ] ; then
+      >&2 echo "-- USED GPUS: $(( _ri_num_devices - _wasted_GPUs ))"
+      >&2 echo "-- UNUSED GPUS: $(( _total_wasted_rplaces / _number_of_rplaces_per_GPU )) "
+      >&2 echo
+    fi
+    >&2 echo "- RPLACEs PER NODE: $_total_GPU_rplaces"
+    >&2 echo "- RPLACEs PER GPU: $_number_of_rplaces_per_GPU"
+    if [ "$_extra_diags" ] ; then
+      >&2 echo "-- USED RPLACEs: $_num_local_ranks (RANKS)"
+      >&2 echo "-- UNUSED RPLACEs: $_total_wasted_rplaces"
+    fi
+    >&2 echo "- gfxids ${_ss_gfxid[@]}"
+    >&2 echo "- CUs PER GPU: ${_ss_cucount[@]}"
+    if [ "$_extra_diags" ] ; then
+      >&2 echo "-- USED on CUs RANK0: $_utilized_CUs_per_device"
+      >&2 echo "-- UNUSED CUs RANK0 : $_wasted_CUs_on_each_GPU"
+    fi
+    >&2 echo "- CUs per RPLACE RANK0:$_CUs_per_rplace (OMPX_TARGET_TEAM_PROCS)"
+    >&2 echo "- FORMULA: OMPX_TARGET_TEAM_PROCS = $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU"
+    if [[ -n "$ROCR_VISIBLE_DEVICES" ]] ; then
+      >&2 echo "- Preset ROCR_VISIBLE_DEVICES: $ROCR_VISIBLE_DEVICES"
+    fi
+    if [[ -n "$HSA_CU_MASK" ]] ; then
+      # node utilization could be incorrect with preset cumask.
+      >&2 echo "- Preset HSA_CU_MASK: $HSA_CU_MASK"
+    else
+      >&2 echo "- NODE UTILIZATION: $_utilization %"
+    fi
+  else
+    >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY"
+    >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)"
+    >&2 echo "- AVAILABLE GPUS: $_ri_num_devices"
+    >&2 echo "- DEVS PER RANK: $_devices_per_mdset"
+    >&2 echo "- MULTI-DEVICE GPUS: $_md_total_devices (RANKS*DEVS-PER-RANK)"
+    _md_utilization=$(( _md_total_devices * 100 / _ri_num_devices ))
+    >&2 echo "- NODE UTILIZATION: $_md_utilization %"
+  fi
+fi
+# --- END OF DIAGNOSTIC BLOCK
+
+if [ "$_CUs_per_rplace" != "$_available_CUs_per_device" ] && [ "$GPURUN_MASK_POLICY" == "mutex" ] ; then
+  # Build the CU mask for this rank, bits_to_set = _CUs_per_rplace
+  _bits_to_set=$_CUs_per_rplace
+  # Adjacent ranks share a GPU, so the rank's slot on its GPU is its rank
+  # number modulo the rplaces per GPU. Deriving the slot this way, rather
+  # than from _device_num, keeps the shift correct when GPURUN_DEVICE_BIAS
+  # moves the rank to a different device number.
+  _rplace_slot=$(( _local_rank_num % _number_of_rplaces_per_GPU ))
+  _bits_to_shift=$(( _rplace_slot * _bits_to_set ))
+  # use bc because these values can be very large; bc breaks long output
+  # lines with a backslash-newline, which tr removes again.
+  _mask=$(echo "obase=16; ((2 ^ $_bits_to_set) - 1) * (2 ^ $_bits_to_shift)" | bc | tr -d '\\\n')
+  # Calculate the number of leading zeros needed for this mask
+  _lz=$(( ( _utilized_CUs_per_device / 4 ) - ${#_mask} + 1 ))
+  for (( i = 0; i < _lz; i++ )) ; do
+    _mask="0$_mask"
+  done
+  _mask="0x$_mask"
+fi
+
+# The launcher is assembled as an array so that it and the user's arguments
+# reach the kernel exactly as given, including arguments containing blanks.
+_launch_cmd=()
+if [ "$_uses_multi_device" == 0 ] ; then
+  # retrieve scanned info from per device arrays
+  _bdfidstrc=${_ss_bdfid[$_device_num]}
+  NUMANODE=${_ss_numanode[$_device_num]}
+  _list_of_cpu_cores=${_ss_cpulist[$_device_num]}
+  _this_uuid=${_ss_uuid[$_device_num]}
+else
+  # Use multi-device values
+  _bdfidstrc=$_md_bdfs
+  NUMANODE=$_md_nns
+  _list_of_cpu_cores=$_md_cpus
+  _this_uuid=$_md_uuids
+  _launch_cmd+=( env "LIBOMPTARGET_NUM_MULTI_DEVICES=$_devices_per_mdset" )
+fi
+if [ "$_use_numactl_localalloc" == "1" ] ; then
+  _launch_cmd+=( "$_launch_process_cmd_binary" --localalloc "--cpunodebind=$NUMANODE" )
+elif [ "$_use_numactl_membind" == "1" ] ; then
+  _launch_cmd+=( "$_launch_process_cmd_binary" "--membind=$NUMANODE" "--cpunodebind=$NUMANODE" )
+else
+  _launch_cmd+=( "$_launch_process_cmd_binary" -c "$_list_of_cpu_cores" )
+fi
+
+# If gpurun was not given command to execute, then dont run the launcher;
+# _launch_process_cmd is the printable form used in the trace output.
+_have_command=1
+_launch_process_cmd="${_launch_cmd[*]}"
+if [ "$*" == "" ] ; then
+  _have_command=0
+  _launch_process_cmd=""
+fi
+
+# only set ROCR_VISIBLE_DEVICES if not already set
+if [[ -z $ROCR_VISIBLE_DEVICES ]] ; then
+  export ROCR_VISIBLE_DEVICES=$_this_uuid
+  _log_word="RVD"
+else
+  _log_word="PRESET-RVD"
+fi
+
+export OMPX_TARGET_TEAM_PROCS=$_CUs_per_rplace
+
+# - Limit HSA queues when multiple ranks per GPU
+if [ "$_number_of_rplaces_per_GPU" != 1 ] ; then
+  # Only set these env controls if not set by caller
+  [[ -z "$GPU_MAX_HW_QUEUES" ]] && export GPU_MAX_HW_QUEUES=1
+  [[ -z "$LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" ]] && export LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=1
+fi
+
+[[ -n "$HSA_CU_MASK" ]] && [[ "$GPURUN_VERBOSE" != "0" ]] && \
+  [[ $_local_rank_num == 0 ]] && >&2 echo "WARNING: preset HSA_CU_MASK:$HSA_CU_MASK"
+
+# Trace lines pass every value as a printf argument, so a '%' or '\' in the
+# user's command cannot be misread as part of the format.
+if [ "$_CUs_per_rplace" == "$_available_CUs_per_device" ] || [ "$GPURUN_MASK_POLICY" == "nomask" ] ; then
+  # --- HSA_CU_MASK is NOT USED in this code block, This code block covers all multi-device execution.
+  if [ "$GPURUN_VERBOSE" != "0" ] ; then
+    if [ "$_uses_multi_device" == 1 ] ; then
+      printf 'RANK:%s D:%s NNs:%s GPUTYPE:%s %s:%s\n CMD:%s %s\n' "$_local_rank_num" "$_md_dev_idxs" "$_md_nns" "$_gfxid" "$_log_word" "$ROCR_VISIBLE_DEVICES" "$_launch_process_cmd" "$*" >&2
+    else
+      printf 'RANK:%s D:%d PCI:%5s NN:%d GPUTYPE:%s %s:%s \n CMD:%s %s\n' "$_local_rank_num" "$_device_num" "$_bdfidstrc" "$NUMANODE" "$_gfxid" "$_log_word" "$ROCR_VISIBLE_DEVICES" "$_launch_process_cmd" "$*" >&2
+    fi
+  fi
+  # --- end code block
+else
+  # --- HSA_CU_MASK is required in this code block, assumes no multi-device
+  if [[ -z "$HSA_CU_MASK" ]] ; then
+    # Since ROCR_VISIBLE_DEVICES only enables 1 GPU, HSA_CU_MASK starts with 0:
+    export HSA_CU_MASK=0:$_mask
+  else
+    # use preset mask; it is already in the environment and is passed to the
+    # command unchanged (no second "0:" prefix is added).
+    _mask=$HSA_CU_MASK
+  fi
+  if [ "$GPURUN_VERBOSE" != "0" ] ; then
+    printf 'RANK:%s D:%d PCI:%5s NN:%d %s CUMASK:%s %s:%s \n CMD:%s %s\n' "$_local_rank_num" "$_device_num" "$_bdfidstrc" "$NUMANODE" "$_gfxid" "$_mask" "$_log_word" "$ROCR_VISIBLE_DEVICES" "$_launch_process_cmd" "$*" >&2
+  fi
+  # --- end code block
+fi
+
+# Run the command under the launcher and hand its exit status to the caller.
+if [ "$_have_command" == 1 ] ; then
+  "${_launch_cmd[@]}" "$@"
+  exit $?
+fi
+exit 0