@@ -0,0 +1 @@
*.sqsh
147 changes: 147 additions & 0 deletions 3.test_cases/expert-parallelism/deepep-benchmark/README.md
@@ -0,0 +1,147 @@
# DeepEP Benchmark

Benchmarks for [DeepEP](https://github.com/deepseek-ai/DeepEP), DeepSeek's expert-parallel communication library for MoE dispatch and combine.

Updated to [NVSHMEM 3.4.5-0](https://github.com/NVIDIA/nvshmem/commit/df2814155acfba6227534dd81a8bf338da9e55f2) and the DeepEP commit of [Sep 25, 2025](https://github.com/deepseek-ai/DeepEP/tree/e02e4d2e1fbfdf09e02e870b6acc5831cbd11e39).

## Cloning NVSHMEM

The Dockerfile copies a local `./nvshmem` checkout into the image, so clone it next to the Dockerfile first.

3.4.5-0:
```bash
git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout df2814155acfba6227534dd81a8bf338da9e55f2 && cd ..
```

devel branch:
```bash
git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout devel && cd ..
```
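
To double-check that the checkout is at the pinned commit:

```bash
# Optional: print the checked-out NVSHMEM commit; it should match the pin above.
git -C ./nvshmem rev-parse HEAD
```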

## Building the DeepEP Docker image

Set the component versions and derive the image tag:

```bash
GDRCOPY_VERSION=v2.5.1
EFA_INSTALLER_VERSION=1.43.2
NCCL_VERSION=v2.27.7-1
NCCL_TESTS_VERSION=v2.16.9
NVSHMEM_VERSION=3.4.5-0
TAG="efa${EFA_INSTALLER_VERSION}-nccl${NCCL_VERSION}-tests${NCCL_TESTS_VERSION}-nvshmem${NVSHMEM_VERSION}"
DEEPEP_CONTAINER_IMAGE_NAME_TAG="deepep:${TAG}"
```
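
The tag simply concatenates the component versions:

```bash
echo "${DEEPEP_CONTAINER_IMAGE_NAME_TAG}"
# deepep:efa1.43.2-ncclv2.27.7-1-testsv2.16.9-nvshmem3.4.5-0
```

Then build the image: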

```bash
docker build --progress=plain -f ./deepep.Dockerfile \
--build-arg="GDRCOPY_VERSION=${GDRCOPY_VERSION}" \
--build-arg="EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION}" \
--build-arg="NCCL_VERSION=${NCCL_VERSION}" \
--build-arg="NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}" \
--build-arg="NVSHMEM_VERSION=${NVSHMEM_VERSION}" \
-t ${DEEPEP_CONTAINER_IMAGE_NAME_TAG} \
.
```

Convert the image into an enroot squashfs file for use with Slurm and the pyxis plugin:

```bash
enroot import -o ./deepep.sqsh dockerd://${DEEPEP_CONTAINER_IMAGE_NAME_TAG}
```
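
Optionally, sanity-check the image before benchmarking. This is a minimal sketch that assumes a GPU allocation with pyxis available:

```bash
# List GPUs and confirm the deep_ep package resolves inside the container.
srun --container-image ./deepep.sqsh \
    bash -c 'nvidia-smi -L && python3 -c "import deep_ep; print(deep_ep.__file__)"'
```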

## Running the DeepEP benchmark

### Intranode

Run the intranode test on a single node:

```bash
srun --mpi=pmix --cpu-bind=none --container-image ./deepep.sqsh python /DeepEP/tests/test_intranode.py
```
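
A hedged variant that pins the job to one node; assuming the test fans out one worker per local GPU, a single launcher task is sufficient:

```bash
# Single node, single launcher task; the test itself spawns per-GPU processes.
srun -N 1 -n 1 --mpi=pmix --cpu-bind=none --container-image ./deepep.sqsh \
    python /DeepEP/tests/test_intranode.py
```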

### P5en intranode results

Measured at DeepEP commit [e02e4d2e1fbfdf09e02e870b6acc5831cbd11e39](https://github.com/deepseek-ai/DeepEP/tree/e02e4d2e1fbfdf09e02e870b6acc5831cbd11e39):
```
[config] num_tokens=4096, hidden=7168, num_topk=8
[layout] Kernel performance: 0.041 ms

[testing] Running with BF16, without top-k (async=False, previous=False) ... passed
[testing] Running with BF16, with top-k (async=False, previous=False) ... passed
[testing] Running with BF16, without top-k (async=False, previous=False) ... passed
[testing] Running with BF16, with top-k (async=False, previous=False) ... passed
[testing] Running with FP8, without top-k (async=False, previous=False) ... passed
[testing] Running with FP8, with top-k (async=False, previous=False) ... passed
[testing] Running with BF16, without top-k (async=True, previous=False) ... passed
[testing] Running with BF16, with top-k (async=True, previous=False) ... passed
[testing] Running with BF16, without top-k (async=True, previous=False) ... passed
[testing] Running with BF16, with top-k (async=True, previous=False) ... passed
[testing] Running with FP8, without top-k (async=True, previous=False) ... passed
[testing] Running with FP8, with top-k (async=True, previous=False) ... passed
[testing] Running with BF16, without top-k (async=False, previous=True) ... passed
[testing] Running with BF16, with top-k (async=False, previous=True) ... passed
[testing] Running with BF16, without top-k (async=False, previous=True) ... passed
[testing] Running with BF16, with top-k (async=False, previous=True) ... passed
[testing] Running with FP8, without top-k (async=False, previous=True) ... passed
[testing] Running with FP8, with top-k (async=False, previous=True) ... passed
[testing] Running with BF16, without top-k (async=True, previous=True) ... passed
[testing] Running with BF16, with top-k (async=True, previous=True) ... passed
[testing] Running with BF16, without top-k (async=True, previous=True) ... passed
[testing] Running with BF16, with top-k (async=True, previous=True) ... passed
[testing] Running with FP8, without top-k (async=True, previous=True) ... passed
[testing] Running with FP8, with top-k (async=True, previous=True) ... passed

[tuning] SMs 24, NVL chunk 4: 294.24 GB/s (NVL), 544.47 us
[tuning] SMs 24, NVL chunk 6: 320.68 GB/s (NVL), 499.58 us
[tuning] SMs 24, NVL chunk 8: 317.79 GB/s (NVL), 504.13 us
[tuning] SMs 24, NVL chunk 10: 316.46 GB/s (NVL), 506.25 us
[tuning] SMs 24, NVL chunk 12: 308.37 GB/s (NVL), 519.53 us
[tuning] SMs 24, NVL chunk 14: 298.15 GB/s (NVL), 537.34 us
[tuning] SMs 24, NVL chunk 16: 292.44 GB/s (NVL), 547.83 us
[tuning] SMs 24, NVL chunk 18: 297.46 GB/s (NVL), 538.58 us
[tuning] SMs 24, NVL chunk 20: 293.29 GB/s (NVL), 546.24 us
[tuning] SMs 24, NVL chunk 22: 287.31 GB/s (NVL), 557.62 us
[tuning] SMs 24, NVL chunk 24: 287.20 GB/s (NVL), 557.83 us
[tuning] SMs 24, NVL chunk 26: 286.76 GB/s (NVL), 558.67 us
[tuning] SMs 24, NVL chunk 28: 287.96 GB/s (NVL), 556.35 us
[tuning] SMs 24, NVL chunk 30: 282.88 GB/s (NVL), 566.33 us
[tuning] SMs 24, NVL chunk 32: 281.40 GB/s (NVL), 569.32 us
[tuning] SMs 24, NVL chunk default: 319.82 GB/s (NVL), 500.93 us
[tuning] Best dispatch (FP8): SMs 24, NVL chunk 6, 320.68 GB/s (NVL), t: 499.58 us

[tuning] SMs 24, NVL chunk 4: 331.77 GB/s (NVL), 936.50 us
[tuning] SMs 24, NVL chunk 6: 304.74 GB/s (NVL), 1019.58 us
[tuning] SMs 24, NVL chunk 8: 305.57 GB/s (NVL), 1016.81 us
[tuning] SMs 24, NVL chunk 10: 305.73 GB/s (NVL), 1016.26 us
[tuning] SMs 24, NVL chunk 12: 303.80 GB/s (NVL), 1022.74 us
[tuning] SMs 24, NVL chunk 14: 300.82 GB/s (NVL), 1032.85 us
[tuning] SMs 24, NVL chunk 16: 300.27 GB/s (NVL), 1034.75 us
[tuning] SMs 24, NVL chunk 18: 301.12 GB/s (NVL), 1031.83 us
[tuning] SMs 24, NVL chunk 20: 298.67 GB/s (NVL), 1040.29 us
[tuning] SMs 24, NVL chunk 22: 296.76 GB/s (NVL), 1046.98 us
[tuning] SMs 24, NVL chunk 24: 296.46 GB/s (NVL), 1048.05 us
[tuning] SMs 24, NVL chunk 26: 294.70 GB/s (NVL), 1054.29 us
[tuning] SMs 24, NVL chunk 28: 293.73 GB/s (NVL), 1057.80 us
[tuning] SMs 24, NVL chunk 30: 292.28 GB/s (NVL), 1063.03 us
[tuning] SMs 24, NVL chunk 32: 292.16 GB/s (NVL), 1063.47 us
[tuning] SMs 24, NVL chunk default: 305.72 GB/s (NVL), 1016.31 us
[tuning] Best dispatch (BF16): SMs 24, NVL chunk 4, 331.77 GB/s (NVL), t: 936.50 us

[tuning] SMs 24, NVL chunk 1: 159.88 GB/s (NVL), 1943.39 us
[tuning] SMs 24, NVL chunk 2: 277.52 GB/s (NVL), 1119.56 us
[tuning] SMs 24, NVL chunk 3: 316.19 GB/s (NVL), 982.64 us
[tuning] SMs 24, NVL chunk 4: 321.89 GB/s (NVL), 965.24 us
[tuning] SMs 24, NVL chunk 5: 311.73 GB/s (NVL), 996.72 us
[tuning] SMs 24, NVL chunk 6: 294.88 GB/s (NVL), 1053.67 us
[tuning] SMs 24, NVL chunk 7: 304.14 GB/s (NVL), 1021.57 us
[tuning] SMs 24, NVL chunk 8: 288.61 GB/s (NVL), 1076.55 us
[tuning] SMs 24, NVL chunk 9: 284.72 GB/s (NVL), 1091.26 us
[tuning] SMs 24, NVL chunk 10: 289.42 GB/s (NVL), 1073.55 us
[tuning] SMs 24, NVL chunk 11: 284.57 GB/s (NVL), 1091.85 us
[tuning] SMs 24, NVL chunk 12: 284.85 GB/s (NVL), 1090.75 us
[tuning] SMs 24, NVL chunk 13: 288.21 GB/s (NVL), 1078.05 us
[tuning] SMs 24, NVL chunk 14: 285.78 GB/s (NVL), 1087.20 us
[tuning] SMs 24, NVL chunk 15: 283.55 GB/s (NVL), 1095.76 us
[tuning] SMs 24, NVL chunk 16: 283.94 GB/s (NVL), 1094.27 us
[tuning] SMs 24, NVL chunk default: 319.88 GB/s (NVL), 971.32 us
[tuning] Best combine: SMs 24, NVL chunk 4: 321.89 GB/s (NVL), t: 965.24 us
```
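
To pull the tuned settings out of a captured log (the `results.log` name is hypothetical):

```bash
# Extract the best dispatch/combine configurations from a saved test log.
grep -E 'Best (dispatch|combine)' results.log
```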

### Internode

The internode test drives NVSHMEM over EFA by selecting the libfabric remote transport with the `efa` provider; `MASTER_ADDR` is taken as the first hostname in the Slurm nodelist:

```bash
srun \
-l --mpi=pmix --cpu-bind=none \
--container-image ./deepep.sqsh \
-N 2 \
bash -c 'MASTER_ADDR=${SLURM_NODELIST%%,*} WORLD_SIZE=$SLURM_NNODES RANK=$SLURM_PROCID NVSHMEM_REMOTE_TRANSPORT=libfabric NVSHMEM_LIBFABRIC_PROVIDER=efa python3 -u -X faulthandler /DeepEP/tests/test_internode.py'
```
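
Scaling out only changes the node count. A sketch for four nodes, assuming `SLURM_NODELIST` expands to a plain comma-separated hostlist (the `%%,*` expansion keeps the first hostname) and that the test supports the chosen node count:

```bash
srun \
-l --mpi=pmix --cpu-bind=none \
--container-image ./deepep.sqsh \
-N 4 \
bash -c 'MASTER_ADDR=${SLURM_NODELIST%%,*} WORLD_SIZE=$SLURM_NNODES RANK=$SLURM_PROCID NVSHMEM_REMOTE_TRANSPORT=libfabric NVSHMEM_LIBFABRIC_PROVIDER=efa python3 -u -X faulthandler /DeepEP/tests/test_internode.py'
```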
191 changes: 191 additions & 0 deletions 3.test_cases/expert-parallelism/deepep-benchmark/deepep.Dockerfile
@@ -0,0 +1,191 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

################################ NCCL ########################################

ARG GDRCOPY_VERSION=v2.5.1
ARG EFA_INSTALLER_VERSION=1.43.2
ARG AWS_OFI_NCCL_VERSION=v1.16.3
ARG NCCL_VERSION=v2.27.7-1
ARG NCCL_TESTS_VERSION=v2.16.9

RUN apt-get update -y && apt-get upgrade -y
RUN apt-get remove -y --allow-change-held-packages \
ibverbs-utils \
libibverbs-dev \
libibverbs1 \
libmlx5-1 \
libnccl2 \
libnccl-dev

RUN rm -rf /opt/hpcx \
&& rm -rf /usr/local/mpi \
&& rm -f /etc/ld.so.conf.d/hpcx.conf \
&& ldconfig

ENV OPAL_PREFIX=

RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
apt-utils \
autoconf \
automake \
build-essential \
check \
cmake \
curl \
debhelper \
devscripts \
git \
gcc \
gdb \
kmod \
libsubunit-dev \
libtool \
openssh-client \
openssh-server \
pkg-config \
python3-distutils \
vim \
python3.10-dev \
python3.10-venv
RUN apt-get purge -y cuda-compat-*

RUN mkdir -p /var/run/sshd
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config

ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu:/usr/local/lib:$LD_LIBRARY_PATH
ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:/usr/bin:/usr/local/bin:$PATH

RUN curl https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
&& python3 /tmp/get-pip.py \
&& pip3 install awscli pynvml

#################################################
## Install NVIDIA GDRCopy
##
## NOTE: if `nccl-tests` or `/opt/gdrcopy/bin/sanity -v` crashes with incompatible version, ensure
## that the cuda-compat-xx-x package is the latest.
RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
&& cd /tmp/gdrcopy \
&& make prefix=/opt/gdrcopy install

ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH
ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH
ENV CPATH=/opt/gdrcopy/include:$CPATH
ENV PATH=/opt/gdrcopy/bin:$PATH

#################################################
## Install EFA installer
RUN cd $HOME \
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
&& cd aws-efa-installer \
&& ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
&& rm -rf $HOME/aws-efa-installer

###################################################
## Install NCCL
RUN git clone -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
&& cd /opt/nccl \
&& make -j $(nproc) src.build CUDA_HOME=/usr/local/cuda \
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100"

###################################################
## Install NCCL-tests
RUN git clone -b ${NCCL_TESTS_VERSION} https://github.com/NVIDIA/nccl-tests.git /opt/nccl-tests \
&& cd /opt/nccl-tests \
&& make -j $(nproc) \
MPI=1 \
MPI_HOME=/opt/amazon/openmpi/ \
CUDA_HOME=/usr/local/cuda \
NCCL_HOME=/opt/nccl/build \
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_100,code=sm_100"

RUN rm -rf /var/lib/apt/lists/*

## Configure Open MPI: disable UCX, use the TCP BTL, and exclude virtual network interfaces.
ENV OMPI_MCA_pml=^ucx \
    OMPI_MCA_btl=tcp,self \
    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent \
    OPAL_PREFIX=/opt/amazon/openmpi \
    NCCL_SOCKET_IFNAME=^docker,lo,veth

## Silence PMIx gds errors: https://github.com/open-mpi/ompi/issues/7516
ENV PMIX_MCA_gds=hash

## Set LD_PRELOAD for NCCL library
ENV LD_PRELOAD=/opt/nccl/build/lib/libnccl.so

################################ NVSHMEM ########################################

ENV NVSHMEM_DIR=/opt/nvshmem
ENV NVSHMEM_HOME=/opt/nvshmem

# 3.2.5-1: wget https://developer.nvidia.com/downloads/assets/secure/nvshmem/nvshmem_src_3.2.5-1.txz && tar -xvf nvshmem_src_3.2.5-1.txz
# 3.3.9: wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && tar -xvf nvshmem_src_cuda12-all-all-3.3.9.tar.gz
# 3.4.5-0: git clone https://github.com/NVIDIA/nvshmem.git && cd ./nvshmem && git checkout df2814155acfba6227534dd81a8bf338da9e55f2
COPY ./nvshmem /nvshmem_src

RUN cd /nvshmem_src \
&& mkdir -p build \
&& cd build \
&& cmake \
-DNVSHMEM_PREFIX=/opt/nvshmem \
-DCMAKE_INSTALL_PREFIX=/opt/nvshmem \
\
-DCUDA_HOME=/usr/local/cuda \
-DCMAKE_CUDA_ARCHITECTURES="90a;100" \
\
-DNVSHMEM_USE_GDRCOPY=1 \
-DGDRCOPY_HOME=/opt/gdrcopy \
\
-DNVSHMEM_USE_NCCL=1 \
-DNCCL_HOME=/opt/nccl/build \
-DNCCL_INCLUDE=/opt/nccl/build/include \
\
-DNVSHMEM_LIBFABRIC_SUPPORT=1 \
-DLIBFABRIC_HOME=/opt/amazon/efa \
\
-DNVSHMEM_MPI_SUPPORT=1 \
-DMPI_HOME=/opt/amazon/openmpi \
\
-DNVSHMEM_PMIX_SUPPORT=1 \
-DPMIX_HOME=/opt/amazon/pmix \
-DNVSHMEM_DEFAULT_PMIX=1 \
\
-DNVSHMEM_BUILD_TESTS=1 \
-DNVSHMEM_BUILD_EXAMPLES=1 \
-DNVSHMEM_BUILD_HYDRA_LAUNCHER=1 \
-DNVSHMEM_BUILD_TXZ_PACKAGE=1 \
\
-DNVSHMEM_IBRC_SUPPORT=1 \
-DNVSHMEM_IBGDA_SUPPORT=1 \
\
-DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
.. \
&& make -j$(nproc) \
&& make install
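
# Optional, hedged sanity check: NVSHMEM installs an nvshmem-info tool that prints
# the compiled-in transport/feature configuration (assumption: present in this build).
# Left commented out so the build does not depend on it.
# RUN /opt/nvshmem/bin/nvshmem-info -a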

ENV PATH=/opt/nvshmem/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/nvshmem/lib:$LD_LIBRARY_PATH
# ENV PATH=/opt/nvshmem/bin:$PATH LD_LIBRARY_PATH=/opt/amazon/pmix/lib:/opt/nvshmem/lib:$LD_LIBRARY_PATH NVSHMEM_REMOTE_TRANSPORT=libfabric NVSHMEM_LIBFABRIC_PROVIDER=efa

################################ PyTorch ########################################

RUN pip install torch --index-url https://download.pytorch.org/whl/cu128
RUN pip install ninja numpy cmake pytest

################################ DeepEP ########################################

ARG DEEPEP_COMMIT=e02e4d2e1fbfdf09e02e870b6acc5831cbd11e39

RUN git clone https://github.com/deepseek-ai/DeepEP.git /DeepEP \
&& cd /DeepEP \
&& git checkout ${DEEPEP_COMMIT} \
&& TORCH_CUDA_ARCH_LIST="9.0a+PTX;10.0" pip install .
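
# Hedged sanity check (assumption: importing deep_ep does not require a GPU at build
# time); left commented out to keep the build driver-independent.
# RUN python3 -c "import deep_ep"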

RUN mkdir -p /tmp/coredump
@@ -0,0 +1,5 @@
*.sqsh
*.out
*.err
pplx-garden
uccl