uvm changes for nvidia platforms #968

Draft · wants to merge 1 commit into master

32 changes: 19 additions & 13 deletions CMakeLists.txt
@@ -135,17 +135,17 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
add_compile_options(
-Wall
-Wextra
-Wextra
-fcheck=all,no-array-temps
-fbacktrace
-fimplicit-none
-fsignaling-nans
-finit-real=snan
-finit-integer=-99999999
-Wintrinsic-shadow
-Wunderflow
-Wrealloc-lhs
-Wsurprising
-Wintrinsic-shadow
-Wunderflow
-Wrealloc-lhs
-Wsurprising
)
endif()

@@ -163,7 +163,6 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
"SHELL:-h acc_model=auto_async_none"
"SHELL: -h acc_model=no_fast_addr"
"SHELL: -h list=adm"
"SHELL: -munsafe-fp-atomics" # Not unsafe for operations we do
)

add_link_options("SHELL:-hkeepfiles")
@@ -173,6 +172,7 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
"SHELL:-h acc_model=auto_async_none"
"SHELL: -h acc_model=no_fast_addr"
"SHELL: -K trap=fp" "SHELL: -G2"

)
add_link_options("SHELL: -K trap=fp" "SHELL: -G2")
endif()
@@ -486,23 +486,23 @@ function(MFC_SETUP_TARGET)
endforeach()

target_compile_options(${a_target}
PRIVATE -gpu=keep,ptxinfo,lineinfo
PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath
)

# GH-200 Unified Memory Support
if (MFC_Unified)
target_compile_options(${ARGS_TARGET}
PRIVATE -gpu=unified
PRIVATE -gpu=mem:unified -cuda
)
# "This option must appear in both the compile and link lines" -- NVHPC Docs
target_link_options(${ARGS_TARGET}
PRIVATE -gpu=unified
PRIVATE -gpu=mem:unified -cuda
)
endif()

if (CMAKE_BUILD_TYPE STREQUAL "Debug")
target_compile_options(${a_target}
PRIVATE -gpu=autocompare,debug
PRIVATE -gpu=autocompare,debug -cuda
)
endif()
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
@@ -513,9 +513,15 @@
target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
endif()

if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
find_package(CUDAToolkit REQUIRED)
target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR
CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")

if (TARGET CUDA::nvToolsExt) # CUDA <= 12.8
target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
else() # CUDA >= 12.9
target_link_libraries(${a_target} PRIVATE nvhpcwrapnvtx )
endif()
target_link_options(${a_target} PRIVATE "-cudalib=nvtx")
endif()
endforeach()

48 changes: 48 additions & 0 deletions misc/nvidia_uvm/README.md
@@ -0,0 +1,48 @@
## The Main Idea behind the implemented Out-of-Core Strategy for Grace-Hopper

To run MFC out-of-core on Grace-Hopper using Unified Memory, we implement a zero-copy strategy.

We start by setting the preferred location to CPU for all buffers, by hooking into the `ALLOCATE` macro and setting `NVIDIA_ALLOC_MODE=2`.
This disables access-counter-based migrations and keeps everything in CPU memory, freeing up as much GPU memory as possible.

Then, for the "important" buffers that are frequently accessed from the GPU, we reset the preferred location to GPU in order to place them (and directly populate them) in GPU memory.
This is done by the `PREFER_GPU` macro, which has been manually placed in the code right after the allocations of these buffers, as sketched below.
To activate these hints we export `NVIDIA_MANUAL_GPU_HINTS=1`.
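
A minimal sketch of the pattern (this is the same usage that appears in the `m_global_parameters.fpp` changes in this PR):

```fortran
! Allocate through the usual macro, then pin the buffer's preferred location to GPU memory
@:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
@:PREFER_GPU(x_cb)
```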

To allow fine-grained control and to simulate larger problem sizes, we also use the following environment variables (an example configuration follows the list):
- With `NVIDIA_IGR_TEMPS_ON_GPU` we control how many temporaries from the IGR module are to be placed in GPU memory.
- With `NVIDIA_VARS_ON_GPU` we control how many of the `q_cons_ts(1)%vf(j)%sf` arrays we place in GPU memory.
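
For example, a configuration that mirrors the defaults suggested in `misc/nvidia_uvm/run.sh` (the exact values should be tuned for the problem size):

```shell
export NVIDIA_ALLOC_MODE=2          # default all allocations to preferred location CPU
export NVIDIA_MANUAL_GPU_HINTS=1    # honor the manually placed PREFER_GPU hints
export NVIDIA_IGR_TEMPS_ON_GPU=1    # number of IGR temporaries kept in GPU memory
export NVIDIA_VARS_ON_GPU=7         # q_cons_ts(1)%vf(j)%sf for j = 1..7 kept in GPU memory
```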

It is important to note that we have rearranged the timestep updates in the 3rd-order TVD Runge-Kutta scheme so that only `q_cons_ts(1)` needs to be passed to the `compute_rhs` routines.
This way, to keep the computation of `compute_rhs` (mostly) on GPU data, we only need to store `q_cons_ts(1)` (fully or even partially) in GPU memory.
We therefore keep `q_cons_ts(2)` in CPU memory for the full lifetime of the simulation, freeing up GPU memory and allowing larger simulations without sacrificing performance.
In the timestep updates between the `compute_rhs` calls, we access both `q_cons_ts(1)` and `q_cons_ts(2)` directly from the physical location where they reside (zero-copy), simultaneously pulling data from GPU memory and CPU memory (through NVLink-C2C) and making good use of Grace-Hopper.
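
A schematic sketch of the rearranged update, assuming the standard TVD/SSP RK3 coefficients (pseudocode for illustration only, not the actual MFC time stepper; the `compute_rhs` interface and the whole-array operations are simplified):

```fortran
! q1 := q_cons_ts(1)  (kept, fully or partially, in GPU memory)
! q2 := q_cons_ts(2)  (kept in CPU memory, accessed zero-copy)
q2 = q1                                                                 ! save u^n
call compute_rhs(q1, rhs); q1 = q1 + dt*rhs                             ! stage 1
call compute_rhs(q1, rhs); q1 = 0.75d0*q2 + 0.25d0*(q1 + dt*rhs)        ! stage 2
call compute_rhs(q1, rhs); q1 = (1d0/3d0)*q2 + (2d0/3d0)*(q1 + dt*rhs)  ! stage 3
```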

Note: This rearrangement most likely "breaks" the timestepper for different physics cases, but we can easily fix it in a later step.

## Example Workflow for Out-of-Core Strategy based on Unified Memory

```shell
# Allocate a node
salloc -A g183 --partition normal -t 02:00:00 -N 1 -n 4 --cpus-per-task=71

# Start uenv
uenv start --view=modules icon/25.2:v1

# cd to root directory of MFC
cd MFC-Wilfong

# Load modules
. ./mfc.sh load -c san -m g

# Build
export MFC_CUDA_CC=90
./mfc.sh build --gpu -j 71 --single --unified --verbose

# Run pre_process and simulation binaries with case optimization (in an interactive job)
./mfc.sh run examples/3D_IGR_perf_test/case.py --case-optimization -t pre_process simulation --gpu -N 1 -n 4 -j 71 -c santis

# Run pre_process and simulation binaries with case optimization (in a batch job)
./mfc.sh run examples/3D_IGR_perf_test/case.py --case-optimization -t pre_process simulation --gpu -N 1 -n 4 -j 71 -c santis -e batch -p normal -a g183 -w 00:15:00
```
The environment variables `NVIDIA_ALLOC_MODE`, `NVIDIA_MANUAL_GPU_HINTS`, `NVIDIA_VARS_ON_GPU`, and `NVIDIA_IGR_TEMPS_ON_GPU` can be set in `toolchain/templates/santis.mako` to configure a run with all buffers in GPU memory, all buffers in CPU memory, or some buffers in GPU memory and the rest in CPU memory.
21 changes: 21 additions & 0 deletions misc/nvidia_uvm/bind.sh
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

# -------------------------------- #
# Binding for a single Santis node #
# -------------------------------- #

# Local rank
export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}"

# Bind to GPU
export CUDA_VISIBLE_DEVICES="$local_rank"

# Binding to NIC
export MPICH_OFI_NIC_POLICY=USER
export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"

# Bind each rank to a single core (the first core of its socket)
physcores=(0 72 144 216)

#echo rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY
numactl -l --all --physcpubind=${physcores[$local_rank]} "$@"
25 changes: 25 additions & 0 deletions misc/nvidia_uvm/nsys.sh
@@ -0,0 +1,25 @@
#!/bin/bash

#set -x
set -euo pipefail

rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}"

[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep
[[ -z "${NSYS+x}" ]] && NSYS=0

if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then
echo "Doing nsys"
exec nsys profile \
--cpuctxsw=none -b none -s none \
--event-sample=system-wide \
--cpu-socket-events=61,71,265,273 \
--cpu-socket-metrics=103,104 \
--event-sampling-interval=10 \
--trace=nvtx,openacc \
--force-overwrite=true \
-e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
-o "$NSYS_FILE" "$@"
else
exec "$@"
fi
27 changes: 27 additions & 0 deletions misc/nvidia_uvm/run.sh
@@ -0,0 +1,27 @@
#!/usr/bin/env bash

# TODO: Modify accordingly
PATH_TO_BINARY=${SCRATCH}/projects/cfd/mfc/MFC-Wilfong/build/install/cdcd4e8762/bin/

# NVHPC and CUDA env vars
export NV_ACC_USE_MALLOC=1 # use malloc instead of cudaMallocManaged ( compiled using -gpu=mem:unified )
export NVCOMPILER_ACC_NO_MEMHINTS=1 # disable implicit compiler hints
export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH

# Cray MPICH
export MPICH_GPU_SUPPORT_ENABLED=1 # MPICH with GPU support
export FI_CXI_RX_MATCH_MODE=software
export FI_MR_CACHE_MONITOR=disabled

# CUSTOM env vars to MFC
export NVIDIA_ALLOC_MODE=2 # default alloc to prefloc CPU
export NVIDIA_MANUAL_GPU_HINTS=1 # prefloc GPU on some
export NVIDIA_IGR_TEMPS_ON_GPU=1 # jac on GPU and jac_rhs on CPU ( NOTE: good default, tune based on size )
export NVIDIA_VARS_ON_GPU=7 # q_cons_ts(1)%vf%sf for j=1-7 on GPU ( NOTE: good default, tune based on size )

# NSYS
export NSYS=1 # enable nsys profiling
export NSYS_FILE=report_uvm_single_N-499_nGPUs-4_params-${NVIDIA_VARS_ON_GPU}-${NVIDIA_IGR_TEMPS_ON_GPU}.qdrep

# Run using --cpu-bind=none because we use our own binding script
srun --ntasks 4 --cpu-bind=none ./bind.sh ./nsys.sh ${PATH_TO_BINARY}/simulation
102 changes: 102 additions & 0 deletions src/common/include/macros.fpp
@@ -12,11 +12,113 @@
#endif
#:enddef

#:def PREFER_GPU(*args)
#ifdef MFC_SIMULATION
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
block
use cudafor
intrinsic :: minval, maxval, sum
integer :: istat
integer :: prefer_gpu_mode
character(len=10) :: prefer_gpu_mode_str

! environment variable
call get_environment_variable("NVIDIA_MANUAL_GPU_HINTS", prefer_gpu_mode_str)
if (trim(prefer_gpu_mode_str) == "0") then ! OFF
prefer_gpu_mode = 0
elseif (trim(prefer_gpu_mode_str) == "1") then ! ON
prefer_gpu_mode = 1
else ! default
prefer_gpu_mode = 0
endif

if (prefer_gpu_mode .eq. 1) then
#:for arg in args
!print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
! unset
istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseUnSetPreferredLocation, cudaCpuDeviceId )
if (istat /= cudaSuccess) then
write(*,"('Error code: ',I0, ': ')") istat
write(*,*) cudaGetErrorString(istat)
endif
! set
istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 )
if (istat /= cudaSuccess) then
write(*,"('Error code: ',I0, ': ')") istat
write(*,*) cudaGetErrorString(istat)
endif
#:endfor
end if
end block
#endif
#endif
#:enddef


#:def PARSE(s)
${s if s.rfind(')') == -1 else next((s[:i] for i in range(s.rfind(')'), -1, -1) if s[i] == '(' and s.count('(', i, s.rfind(')')+1) == s.count(')', i, s.rfind(')')+1)), s)}$
#:enddef
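#! PARSE strips a trailing parenthesised dimension spec from an allocate argument,
#! e.g. 'x_cb(-1 - buff_size:m + buff_size)' becomes 'x_cb', while earlier component
#! parentheses are left intact, so the bare variable can be passed to the CUDA
#! prefetch/advise calls in the ALLOCATE macro below.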

#:def ALLOCATE(*args)
@:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
#:set allocated_variables = ', '.join(args)
allocate (${allocated_variables}$)
$:GPU_ENTER_DATA(create=('[' + allocated_variables + ']'))


#ifdef MFC_SIMULATION
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
block
use cudafor
intrinsic :: minval, maxval, sum
integer :: istat, stream_id
integer :: alloc_mode
character(len=10) :: alloc_mode_str

! environment variable
call get_environment_variable("NVIDIA_ALLOC_MODE", alloc_mode_str)
if (trim(alloc_mode_str) == "0") then ! no CPU first touch, no preferred location CPU
alloc_mode = 0
elseif (trim(alloc_mode_str) == "1") then ! CPU first touch, no preferred location CPU
alloc_mode = 1
elseif (trim(alloc_mode_str) == "2") then ! no CPU first touch, preferred location CPU
alloc_mode = 2
elseif (trim(alloc_mode_str) == "3") then ! CPU first touch, preferred location CPU
alloc_mode = 3
else ! default
alloc_mode = 0
endif

stream_id = 0

! prefetch to CPU
if ((alloc_mode .eq. 1) .or. (alloc_mode .eq. 3)) then
#:for arg in args
istat = cudaMemPrefetchAsync( c_devloc(@{PARSE(${arg}$)}@), SIZEOF(@{PARSE(${arg}$)}@), cudaCpuDeviceId, stream_id )
!print*, "! @{PARSE(${arg}$)}@ with shape", SHAPE(@{PARSE(${arg}$)}@), "=> prefetch to CPU"
if (istat /= cudaSuccess) then
write(*,"('Error code: ',I0, ': ')") istat
write(*,*) cudaGetErrorString(istat)
endif
#:endfor
endif

! memadvise preferred location
if ((alloc_mode .eq. 2) .or. (alloc_mode .eq. 3)) then
#:for arg in args
istat = cudaMemAdvise( c_devloc(@{PARSE(${arg}$)}@), SIZEOF(@{PARSE(${arg}$)}@), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId )
!print*, "! @{PARSE(${arg}$)}@ with shape", SHAPE(@{PARSE(${arg}$)}@), "=> preferred location CPU"
if (istat /= cudaSuccess) then
write(*,"('Error code: ',I0, ': ')") istat
write(*,*) cudaGetErrorString(istat)
endif
#:endfor
endif

end block
#endif
#endif

#:enddef ALLOCATE

#:def DEALLOCATE(*args)
9 changes: 9 additions & 0 deletions src/simulation/m_global_parameters.fpp
@@ -1294,16 +1294,25 @@ contains
@:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
@:ALLOCATE(x_cc(-buff_size:m + buff_size))
@:ALLOCATE(dx(-buff_size:m + buff_size))
@:PREFER_GPU(x_cb)
@:PREFER_GPU(x_cc)
@:PREFER_GPU(dx)

if (n == 0) return;
@:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
@:ALLOCATE(y_cc(-buff_size:n + buff_size))
@:ALLOCATE(dy(-buff_size:n + buff_size))
@:PREFER_GPU(y_cb)
@:PREFER_GPU(y_cc)
@:PREFER_GPU(dy)

if (p == 0) return;
@:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
@:ALLOCATE(z_cc(-buff_size:p + buff_size))
@:ALLOCATE(dz(-buff_size:p + buff_size))
@:PREFER_GPU(z_cb)
@:PREFER_GPU(z_cc)
@:PREFER_GPU(dz)

end subroutine s_initialize_global_parameters_module
