added multi-node tensorflow

Elijah MacCarthy · secondspass · commit fa2a2552cfe6 · 2025-09-05T15:49:42.000-04:00
diff --git a/frontier/sample_apps/jax/README.md b/frontier/sample_apps/jax/README.md
@@ -9,4 +9,4 @@ The MNIST test has already been downloaded and located in the examples folder he
 Proceed to submit the job with:
 ```
 sbatch submit.sbatch
-
+```
diff --git a/frontier/sample_apps/tensorflow/README.md b/frontier/sample_apps/tensorflow/README.md
@@ -8,4 +8,4 @@ apptainer pull tensorflow_latest.sif docker://rocm/tensorflow:latest
 Submit the job with:
 ```
 sbatch submit.sbatch
-
+```
diff --git a/frontier/sample_apps/tensorflow/multi-node/README.md b/frontier/sample_apps/tensorflow/multi-node/README.md
@@ -0,0 +1,12 @@
+# Multi-node TensorFlow MNIST example
+
+There are two common approaches to distributed training with data parallelism: (1) synchronous training, where training steps are synced across workers and replicas, and (2) asynchronous training, where training steps are not strictly synced.
+
+Multi-node TensorFlow training is performed using multi-worker distributed training. For this, a `TF_CONFIG` environment variable is needed on every node that takes part in training. For more on `TF_CONFIG` and distributed training, please refer to the official TensorFlow tutorial from which this example was adapted: [Tensorflow/tutorial_distributed_training](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras#multi-worker_configuration)
+
+We use the `tf.distribute.MultiWorkerMirroredStrategy` API for our multi-node distributed tensorflow example. We set the TF_CONFIG configuration environment variable in our `submit.sbatch` script with two workers.
+ 
+Submit the job with:
+```
+sbatch submit.sbatch
+```
diff --git a/frontier/sample_apps/tensorflow/multi-node/main.py b/frontier/sample_apps/tensorflow/multi-node/main.py
@@ -0,0 +1,21 @@
+import os
+import json
+
+import tensorflow as tf
+import mnist_setup
+
+per_worker_batch_size = 64
+# Worker count comes from TF_CONFIG; without it TF runs as a single worker.
+tf_config = json.loads(os.environ.get('TF_CONFIG') or '{}')
+num_workers = len(tf_config.get('cluster', {}).get('worker', [])) or 1
+
+strategy = tf.distribute.MultiWorkerMirroredStrategy()
+
+global_batch_size = per_worker_batch_size * num_workers
+multi_worker_dataset = mnist_setup.mnist_dataset(global_batch_size)
+
+with strategy.scope():
+  # Model building/compiling need to be within `strategy.scope()`.
+  multi_worker_model = mnist_setup.build_and_compile_cnn_model()
+
+multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70)
diff --git a/frontier/sample_apps/tensorflow/multi-node/mnist_setup.py b/frontier/sample_apps/tensorflow/multi-node/mnist_setup.py
@@ -0,0 +1,30 @@
+import os
+import tensorflow as tf
+import numpy as np
+
+def mnist_dataset(batch_size):
+  """Return the MNIST training set, shuffled, repeated and batched."""
+  (images, labels), _ = tf.keras.datasets.mnist.load_data()
+  # Pixels arrive as uint8 in [0, 255]; scale them to float32 in [0, 1].
+  images = images / np.float32(255)
+  labels = labels.astype(np.int64)
+  dataset = tf.data.Dataset.from_tensor_slices((images, labels))
+  dataset = dataset.shuffle(60000).repeat().batch(batch_size)
+  return dataset
+
+def build_and_compile_cnn_model():
+  """Build and compile a small CNN that maps 28x28 inputs to 10 logits."""
+  model = tf.keras.Sequential([
+      tf.keras.layers.InputLayer(input_shape=(28, 28)),
+      tf.keras.layers.Reshape(target_shape=(28, 28, 1)),
+      tf.keras.layers.Conv2D(32, 3, activation='relu'),
+      tf.keras.layers.Flatten(),
+      tf.keras.layers.Dense(128, activation='relu'),
+      tf.keras.layers.Dense(10)
+  ])
+  loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+  sgd = tf.keras.optimizers.SGD(learning_rate=0.001)
+  model.compile(loss=loss_fn, optimizer=sgd, metrics=['accuracy'])
+  return model
+
+
diff --git a/frontier/sample_apps/tensorflow/multi-node/pyrun.sh b/frontier/sample_apps/tensorflow/multi-node/pyrun.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# Per-task launcher: submit.sbatch runs this script inside the container via srun.
+# -W ignore suppresses Python warnings; -u forces unbuffered stdout/stderr.
+
+python -W ignore -u ./main.py
+
diff --git a/frontier/sample_apps/tensorflow/multi-node/submit.sbatch b/frontier/sample_apps/tensorflow/multi-node/submit.sbatch
@@ -0,0 +1,76 @@
+#!/bin/bash
+#SBATCH -A stf016
+#SBATCH -J ddp_test
+#SBATCH -o logs/frontier_apptainer_mltests.%j
+#SBATCH -e logs/frontier_apptainer_mltests.%j
+#SBATCH -t 00:30:00
+#SBATCH -p batch
+#SBATCH -N 2
+
+# Replace the account (-A) with your own project before submitting.
+# Slurm does not create the logs/ directory: run `mkdir -p logs` first,
+# otherwise the job output cannot be written.
+
+# Only necessary if submitting like: sbatch --export=NONE ... (recommended)
+# Do NOT include this line when submitting without --export=NONE
+#unset SLURM_EXPORT_ENV
+
+# Load modules
+
+module load cray-mpich-abi/8.1.31
+module load craype-accel-amd-gfx90a
+module load rocm/5.7.1
+module load miniforge3
+
+# Use the hsn0 network interface for NCCL and Gloo socket traffic.
+export NCCL_SOCKET_IFNAME=hsn0
+export GLOO_SOCKET_IFNAME=hsn0
+
+# First node's address. NOTE(review): main.py does not read MASTER_ADDR/MASTER_PORT; confirm they are still needed.
+export MASTER_ADDR=$(getent hosts $(scontrol show hostnames $SLURM_NODELIST | head -n1) | awk '{ print $1 }')
+echo "MASTER_ADDR=" $MASTER_ADDR
+export MASTER_PORT=3442
+
+# Needed to bypass MIOpen, Disk I/O Errors
+export MIOPEN_USER_DB_PATH="/tmp/my-miopen-cache"
+export MIOPEN_CUSTOM_CACHE_DIR=${MIOPEN_USER_DB_PATH}
+rm -rf "${MIOPEN_USER_DB_PATH}"
+mkdir -p "${MIOPEN_USER_DB_PATH}"
+
+# Optional debugging switches; uncomment as needed.
+#export NCCL_IB_DISABLE=1
+#export NCCL_DEBUG=INFO
+#export TF_CPP_MIN_LOG_LEVEL=0
+#export GRPC_VERBOSITY=debug
+#export GRPC_TRACE=all
+#export GRPC_ENABLE_FORK_SUPPORT=true
+#export TF_FORCE_GPU_ALLOW_GROWTH=true
+#export GRPC_ARG_ENABLE_IPV4_ONLY=true
+# Resolve the allocated node names; the first two become the TF workers.
+hosts=$(scontrol show hostnames $SLURM_JOB_NODELIST)
+hosts_array=($hosts)
+# Fail early if the allocation does not provide the two nodes used below.
+[ "${#hosts_array[@]}" -ge 2 ] || { echo "ERROR: two nodes are required" >&2; exit 1; }
+
+# Contents of the "cluster.worker" JSON array, shared by every task.
+export TF_WORKERS="\"${hosts_array[0]}:12345\", \"${hosts_array[1]}:23456\""
+echo "TF_WORKERS=${TF_WORKERS}"
+
+export MPICH_GPU_SUPPORT_ENABLED=1
+export BINDS=/usr/share/libdrm,/var/spool/slurmd,/opt/cray,${PWD}
+export APPTAINERENV_LD_LIBRARY_PATH="/opt/cray/pe/mpich/8.1.31/ofi/crayclang/17.0/lib-abi-mpich:/opt/cray/pe/mpich/8.1.31/gtl/lib:/opt/rocm-5.7.1/lib:$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH:/opt/cray/pe/lib64"
+export APPTAINER_CONTAINLIBS="/usr/lib64/libcxi.so.1,/usr/lib64/libjson-c.so.3,/lib64/libtinfo.so.6,/usr/lib64/libnl-3.so.200"
+
+set -ex
+# Run script
+#
+# TF_CONFIG must differ per task: "task.index" has to be that task's position
+# in the worker list, so it cannot be set once here in the batch script.
+# Each srun task builds it from its own SLURM_PROCID (0 or 1) and hands it to
+# the container through APPTAINERENV_TF_CONFIG. This relies on srun placing
+# task 0 on the first node of the allocation (the default distribution).
+srun -N2 -n2 --gpus=16 --gpu-bind=closest bash -c '
+  export APPTAINERENV_TF_CONFIG="{\"cluster\": {\"worker\": [${TF_WORKERS}]}, \"task\": {\"type\": \"worker\", \"index\": ${SLURM_PROCID}}}"
+  exec apptainer exec --workdir "$(pwd)" --rocm --bind "$BINDS" tensorflow_latest.sif ./pyrun.sh
+'
+