#!/bin/bash
#SBATCH -A stf016
#SBATCH -J ddp_test
#SBATCH -o logs/frontier_apptainer_mltests.%j
#SBATCH -e logs/frontier_apptainer_mltests.%j
#SBATCH -t 00:30:00
#SBATCH -p batch
#SBATCH -N 2

# Only needed when submitting with: sbatch --export=NONE ... (recommended); uncomment it in that case.
# Do NOT include this line when submitting without --export=NONE.
#unset SLURM_EXPORT_ENV

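# Illustrative submission command for the recommended mode (substitute the actual
# path of this batch script; no specific filename is implied here):
#   sbatch --export=NONE <path/to/this/script.sbatch>
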
# Load modules

module load cray-mpich-abi/8.1.31
module load craype-accel-amd-gfx90a
module load rocm/5.7.1
module load miniforge3

# Expects $ips (a space-separated list of node IPs) from the environment; arr is not used below
read -ra arr <<< "${ips}"

export MASTER_ADDR=$(getent hosts $(scontrol show hostnames $SLURM_NODELIST | head -n1) | awk '{ print $1 }')
#export MASTER_ADDR=$(hostname -i)
echo "MASTER_ADDR=${MASTER_ADDR}"
export MASTER_PORT=3442
export NCCL_SOCKET_IFNAME=hsn0
export GLOO_SOCKET_IFNAME=hsn0
# Needed to bypass MIOpen disk I/O errors
export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache"
export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
rm -rf ${MIOPEN_USER_DB_PATH}
mkdir -p ${MIOPEN_USER_DB_PATH}

#export NCCL_IB_DISABLE=1
##export NCCL_DEBUG=INFO
#export TF_CPP_MIN_LOG_LEVEL=0
##export GRPC_VERBOSITY=debug
#export GRPC_TRACE=all
#export GRPC_ENABLE_FORK_SUPPORT=true
#export TF_FORCE_GPU_ALLOW_GROWTH=true
#export GRPC_ARG_ENABLE_IPV4_ONLY=true
#
hosts=$(scontrol show hostnames $SLURM_JOB_NODELIST)
hosts_array=($hosts)
#
## Set up TF_CONFIG for each worker (the export further below is left commented out)

# NOTE: $rank is not defined in this script; the unquoted heredoc expands it
# here, so it must already be set if TF_CONFIG is actually used.
TF_CONFIG=$(cat <<EOF
{
  "cluster": {
    "worker": ["${hosts_array[0]}:12345", "${hosts_array[1]}:23456"]
  },
  "task": {"type": "worker", "index": $rank}
}
EOF
)
#
#export TF_CONFIG="$TF_CONFIG"
#echo TF_CONFIG="$TF_CONFIG"
#
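# The heredoc above expands $rank once, when this batch script runs on the first
# node, but a per-task rank (e.g. SLURM_PROCID) only exists inside the srun step
# below. A minimal sketch, assuming the per-task wrapper (pyrun.sh) rebuilds
# TF_CONFIG itself from the node list and its own rank:
#   rank=${SLURM_PROCID:-0}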
export MPICH_GPU_SUPPORT_ENABLED=1
# Host paths bind-mounted into the container (Cray PE, libdrm, Slurm spool, and the working directory)
export BINDS=/usr/share/libdrm,/var/spool/slurmd,/opt/cray,${PWD}
# Make the host's ABI-compatible Cray MPICH, GTL, and ROCm libraries visible inside the container
export APPTAINERENV_LD_LIBRARY_PATH="/opt/cray/pe/mpich/8.1.31/ofi/crayclang/17.0/lib-abi-mpich:/opt/cray/pe/mpich/8.1.31/gtl/lib:/opt/rocm-5.7.1/lib:$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH:/opt/cray/pe/lib64"
# Additional host libraries (e.g. the Slingshot libcxi) needed by the injected MPI stack
export APPTAINER_CONTAINLIBS="/usr/lib64/libcxi.so.1,/usr/lib64/libjson-c.so.3,/lib64/libtinfo.so.6,/usr/lib64/libnl-3.so.200"

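# Optional sanity check (illustrative, not part of the original workflow): confirm
# the container sees the GPUs with the binds above before launching the real run:
#   srun -N1 -n1 --gpus=8 apptainer exec --rocm --bind $BINDS tensorflow_latest.sif \
#     python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"
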
set -ex
# Run script
#
srun -N2 -n2 --gpus=16 --gpu-bind=closest apptainer exec --workdir "$(pwd)" --rocm --bind $BINDS tensorflow_latest.sif ./pyrun.sh
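
# pyrun.sh is not shown in this file. A minimal, hypothetical sketch of such a
# per-task wrapper (the training script name train.py is an assumption), consistent
# with the MASTER_ADDR/MASTER_PORT exports above:
#   #!/bin/bash
#   export RANK=${SLURM_PROCID}
#   export WORLD_SIZE=${SLURM_NTASKS}
#   export LOCAL_RANK=${SLURM_LOCALID}
#   python3 train.py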