
Strategy for running MFC out-of-core on NVIDIA Grace-Hopper using Unified Memory #972


Merged · 27 commits · Aug 14, 2025
Commits (27)
2358d29
Add scripts for santis/alps, example case, and captures for UVM comms…
ntselepidis Aug 1, 2025
37d393b
Add PREFER_GPU and rearrange update for out-of-core computation
ntselepidis Aug 1, 2025
693c7f4
Allow keeping q_cons_ts(2) on CPU using pinned allocations
ntselepidis Aug 1, 2025
7054b7b
Modify PREFER_GPU macro
ntselepidis Aug 1, 2025
ee1277d
Allow control in placement of IGR temps
ntselepidis Aug 1, 2025
4065c02
Do some clean up
ntselepidis Aug 2, 2025
cfb792c
ENV Vars to case file options and code structure changes
Aug 3, 2025
cacc6b0
Fix some comments
ntselepidis Aug 3, 2025
884a4d9
Merge remote-tracking branch 'upstream/master' into nvidia
wilfonba Aug 3, 2025
b3fdbff
test merge and add nv_uvm_out_of_core back
wilfonba Aug 3, 2025
51d7e90
Fix some allocs and deallocs in timesteppers
ntselepidis Aug 5, 2025
c553b78
Fix nv_uvm_out_of_core inconsistency and add to case file
ntselepidis Aug 5, 2025
f3b3851
Fix bug in 2nd order TVD RK introduced by merge
ntselepidis Aug 5, 2025
71b5976
Fix some comments
ntselepidis Aug 5, 2025
a4d6b38
Add note on binding script requirement for PREFER_GPU macro
ntselepidis Aug 6, 2025
acb2405
Flip nv_uvm_pref_gpu default to false
ntselepidis Aug 7, 2025
8fef22d
Be explicit with unified memory compilation to stay robust in changes…
ntselepidis Aug 7, 2025
5e369c3
Add some changes to future proof the unified memory build
ntselepidis Aug 11, 2025
52c5608
Comment out calls to cudaGetErrorString
ntselepidis Aug 11, 2025
4ec8617
prepare for merge
wilfonba Aug 11, 2025
bd0adee
Merge remote-tracking branch 'upstream/master' into nvidia
wilfonba Aug 11, 2025
37b1768
update capture
wilfonba Aug 11, 2025
e02e9f6
add fastmath flag and bug fix
wilfonba Aug 11, 2025
a6ff639
Fix typo in CMakeLists
ntselepidis Aug 12, 2025
457ae60
Replace host_pool with host in m_igr
ntselepidis Aug 12, 2025
a6116f2
Set cpus-per-task to 72 and update binding script
ntselepidis Aug 12, 2025
fb50e90
Add some more updates to the helper scripts
ntselepidis Aug 12, 2025
28 changes: 25 additions & 3 deletions CMakeLists.txt
@@ -20,6 +20,7 @@ option(MFC_MPI "Build with MPI" ON
option(MFC_OpenACC "Build with OpenACC" OFF)
option(MFC_GCov "Build with GCov" OFF)
option(MFC_Unified "Build with unified CPU & GPU memory (GH-200 only)" OFF)
option(MFC_Fastmath "Build with -gpu=fastmath on NV GPUs" OFF)
option(MFC_PRE_PROCESS "Build pre_process" OFF)
option(MFC_SIMULATION "Build simulation" OFF)
option(MFC_POST_PROCESS "Build post_process" OFF)
@@ -487,6 +488,9 @@ function(MFC_SETUP_TARGET)
"-foffload=amdgcn-amdhsa='-march=gfx90a'"
"-foffload-options=-lgfortran\ -lm"
"-fno-exceptions")
if (MFC_Fastmath)
message(WARNING "--fastmath has no effect with the GNU compiler")
endif()
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
foreach (cc ${MFC_CUDA_CC})
target_compile_options(${a_target}
@@ -498,14 +502,20 @@
PRIVATE -gpu=keep,ptxinfo,lineinfo
)

if (MFC_Fastmath)
target_compile_options(${a_target}
PRIVATE -gpu=fastmath
)
endif()

# GH-200 Unified Memory Support
if (MFC_Unified)
target_compile_options(${ARGS_TARGET}
-       PRIVATE -gpu=unified
+       PRIVATE -gpu=mem:unified:managedalloc -cuda
)
# "This option must appear in both the compile and link lines" -- NVHPC Docs
target_link_options(${ARGS_TARGET}
-       PRIVATE -gpu=unified
+       PRIVATE -gpu=mem:unified:managedalloc -cuda
)
endif()

Expand All @@ -521,16 +531,28 @@ function(MFC_SETUP_TARGET)
PRIVATE -DFRONTIER_UNIFIED)
endif()

if (MFC_Fastmath)
message(WARNING "--fastmath has no effect with the CCE")
endif()

find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn)
endif()
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
if (MFC_Fastmath)
message(WARNING "--fastmath has no effect with the CCE")
endif()
endif()

if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
find_package(CUDAToolkit REQUIRED)
-       target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
+       if (TARGET CUDA::nvToolsExt) # CUDA <= 12.8
+           target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
+       else() # CUDA >= 12.9
+           target_link_libraries(${a_target} PRIVATE nvhpcwrapnvtx)
+           target_link_options(${a_target} PRIVATE "-cudalib=nvtx3")
+       endif()
endif()
endforeach()

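For reference, a minimal configure sketch exercising the two new options. The raw CMake invocation below is an assumption for illustration (MFC is normally driven through its `./mfc.sh` wrapper), but the option names come straight from this diff:

```bash
# Sketch: NVHPC build on a GH-200 node with both new options enabled
# (assumes the NVHPC toolchain is active in the environment).
#   MFC_Unified  -> compiles and links with -gpu=mem:unified:managedalloc -cuda
#   MFC_Fastmath -> adds -gpu=fastmath (NVHPC/PGI only; GNU and CCE emit a warning)
cmake -B build -DMFC_SIMULATION=ON -DMFC_Unified=ON -DMFC_Fastmath=ON
cmake --build build -j
```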
105 changes: 105 additions & 0 deletions examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
import math
import json

N = 799
Nx = N
Ny = 2 * (N + 1) - 1
Nz = 2 * (N + 1) - 1

Re = 1600
L = 1
P0 = 101325
rho0 = 1
C0 = math.sqrt(1.4 * P0)
V0 = 0.1 * C0
mu = V0 * L / Re

cfl = 0.5
dx = 2 * math.pi * L / (Ny + 1)

dt = cfl * dx / (C0)

tC = L / V0
tEnd = 20 * tC

Nt = int(tEnd / dt)
Nt = 10


# Configuring case dictionary
print(
json.dumps(
{
"rdma_mpi": "T",
# Logistics
"run_time_info": "F",
# Computational Domain Parameters
"x_domain%beg": -math.pi * L,
"x_domain%end": math.pi * L,
"y_domain%beg": -math.pi * L,
"y_domain%end": math.pi * L,
"z_domain%beg": -math.pi * L,
"z_domain%end": math.pi * L,
"m": Nx,
"n": Ny,
"p": Nz,
"cyl_coord": "F",
"dt": dt,
"t_step_start": 0,
"t_step_stop": 10, # Nt,
"t_step_save": 10, # int(Nt / 100),
# Simulation Algorithm Parameters
"num_patches": 1,
"model_eqns": 2,
"num_fluids": 1,
"time_stepper": 3,
"bc_x%beg": -1,
"bc_x%end": -1,
"bc_y%beg": -1,
"bc_y%end": -1,
"bc_z%beg": -1,
"bc_z%end": -1,
"igr": "T",
"igr_order": 5,
"igr_iter_solver": 1,
"num_igr_iters": 3,
"num_igr_warm_start_iters": 3,
"alf_factor": 10,
"viscous": "T",
# Formatted Database Files Structure Parameters
"format": 1,
"precision": 2,
"prim_vars_wrt": "T",
"omega_wrt(1)": "T",
"omega_wrt(2)": "T",
"omega_wrt(3)": "T",
"qm_wrt": "T",
"fd_order": 4,
"parallel_io": "T",
# Patch 1: Background (AIR - 2)
"patch_icpp(1)%geometry": 9,
"patch_icpp(1)%x_centroid": 0,
"patch_icpp(1)%y_centroid": 0,
"patch_icpp(1)%z_centroid": 0,
"patch_icpp(1)%length_x": 2 * math.pi * L,
"patch_icpp(1)%length_y": 2 * math.pi * L,
"patch_icpp(1)%length_z": 2 * math.pi * L,
"patch_icpp(1)%vel(1)": 0.0,
"patch_icpp(1)%vel(2)": 0.0,
"patch_icpp(1)%vel(3)": 0,
"patch_icpp(1)%pres": 0.0,
"patch_icpp(1)%hcid": 380,
"patch_icpp(1)%alpha_rho(1)": 1,
"patch_icpp(1)%alpha(1)": 1,
# Fluids Physical Parameters
"fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1),
"fluid_pp(1)%pi_inf": 0,
"fluid_pp(1)%Re(1)": 1 / mu,
# NVIDIA UVM Options
"nv_uvm_out_of_core": "T",
"nv_uvm_igr_temps_on_gpu": 3,
"nv_uvm_pref_gpu": "T",
}
)
)
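The three `nv_uvm_*` keys at the bottom are the new case-file options added by this PR: out-of-core storage of `q_cons_ts(2)`, all three IGR temporaries (`jac`, `jac_rhs`, `jac_old`) kept on the GPU, and explicit memory-placement hints. A hedged launch sketch (the `mfc.sh run` flags are assumptions, not taken from this diff):

```bash
# Sketch: run the example with one MPI rank per GH-200 module of a node.
./mfc.sh run examples/3D_IGR_TaylorGreenVortex_nvidia/case.py -n 4
```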
24 changes: 24 additions & 0 deletions misc/nvidia_uvm/bind.sh
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

# -------------------------------- #
# Binding for a single Santis node #
# -------------------------------- #

# Local rank
export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}"

# Bind to GPU
export CUDA_VISIBLE_DEVICES="$local_rank"

# Bind to NIC
export MPICH_OFI_NIC_POLICY=USER
export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"

# Bind to cores (all cores of the local socket)
physcores=(0-71 72-143 144-215 216-287)

#echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY

#set -x
numactl -l --all --physcpubind=${physcores[$local_rank]} "$@"
#set +x
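The script gives each local rank its own GPU, NIC, and 72-core Grace socket (a Santis node carries four GH-200 modules, 288 cores in total). A usage sketch; the binary path is hypothetical:

```bash
# Sketch: one rank per GH-200 module, each pinned by bind.sh.
srun --ntasks-per-node=4 --cpus-per-task=72 \
    ./misc/nvidia_uvm/bind.sh ./build/simulation
```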
24 changes: 24 additions & 0 deletions misc/nvidia_uvm/nsys.sh
@@ -0,0 +1,24 @@
#!/bin/bash

#set -x
set -euo pipefail

rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}"

[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep
[[ -z "${NSYS+x}" ]] && NSYS=0

if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then
exec nsys profile \
--cpuctxsw=none -b none -s none \
--event-sample=system-wide \
--cpu-socket-events=61,71,265,273 \
--cpu-socket-metrics=103,104 \
--event-sampling-interval=10 \
--trace=nvtx,cuda,openacc \
--force-overwrite=true \
-e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
-o "$NSYS_FILE" "$@"
else
exec "$@"
fi
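Profiling is opt-in: rank 0 runs under `nsys profile` only when `NSYS=1`; every other rank `exec`s the program directly. A combined sketch with the binding script (paths assumed):

```bash
# Sketch: profile rank 0 with Nsight Systems while binding all ranks as before.
NSYS=1 NSYS_FILE=tgv_uvm.qdrep \
    srun --ntasks-per-node=4 --cpus-per-task=72 \
    ./misc/nvidia_uvm/nsys.sh ./misc/nvidia_uvm/bind.sh ./build/simulation
```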
49 changes: 49 additions & 0 deletions src/common/include/macros.fpp
@@ -12,6 +12,55 @@
#endif
#:enddef

! Caution:
! This macro requires a binding script that sets CUDA_VISIBLE_DEVICES so that each MPI rank sees exactly one GPU device.
! This is because both cudaMemAdvise (preferred location) and cudaMemPrefetchAsync are called with location = device_id = 0.
! For an example, see misc/nvidia_uvm/bind.sh.
#:def PREFER_GPU(*args)
#ifdef MFC_SIMULATION
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
block
! Beginning in the 25.3 release, the structure of the cudafor module has been changed slightly.
! The module now includes, or “uses” 3 submodules: cuda_runtime_api, gpu_reductions, and sort.
! The cudafor functionality has not changed. But for new users, or users who have needed to
! work-around name conflicts in the module, it may be better to use cuda_runtime_api to expose
! interfaces to the CUDA runtime calls described in Chapter 4 of this guide.
! https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#fortran-host-modules
#if __NVCOMPILER_MAJOR__ < 25 || (__NVCOMPILER_MAJOR__ == 25 && __NVCOMPILER_MINOR__ < 3)
use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
#else
use cuda_runtime_api
#endif
integer :: istat

if (nv_uvm_pref_gpu) then
#:for arg in args
!print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
! set preferred location GPU
istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0)
if (istat /= cudaSuccess) then
write (*, "('Error code: ',I0, ': ')") istat
!write(*,*) cudaGetErrorString(istat)
end if
! set accessed by CPU
istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId)
if (istat /= cudaSuccess) then
write (*, "('Error code: ',I0, ': ')") istat
!write(*,*) cudaGetErrorString(istat)
end if
! prefetch to GPU - physically populate memory pages
istat = cudaMemPrefetchAsync(c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0)
if (istat /= cudaSuccess) then
write (*, "('Error code: ',I0, ': ')") istat
!write(*,*) cudaGetErrorString(istat)
end if
#:endfor
end if
end block
#endif
#endif
#:enddef

#:def ALLOCATE(*args)
@:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
#:set allocated_variables = ', '.join(args)
8 changes: 8 additions & 0 deletions src/common/m_mpi_common.fpp
@@ -38,7 +38,9 @@ module m_mpi_common
!! average primitive variables, for a single computational domain boundary
!! at the time, from the relevant neighboring processor.

#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
$:GPU_DECLARE(create='[buff_send, buff_recv]')
#endif

integer :: halo_size
$:GPU_DECLARE(create='[halo_size]')
@@ -78,7 +80,13 @@ contains

$:GPU_UPDATE(device='[halo_size, v_size]')

#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
@:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
#else
allocate (buff_send(0:halo_size), buff_recv(0:halo_size))
$:GPU_ENTER_DATA(create='[capture:buff_send]')
$:GPU_ENTER_DATA(create='[capture:buff_recv]')
#endif
#endif

end subroutine s_initialize_mpi_common_module
10 changes: 10 additions & 0 deletions src/simulation/m_checker.fpp
@@ -30,6 +30,7 @@ contains

if (igr) then
call s_check_inputs_igr
call s_check_inputs_nvidia_uvm
else
if (recon_type == WENO_TYPE) then
call s_check_inputs_weno
@@ -411,4 +412,13 @@ contains
@:PROHIBIT(powell .and. fd_order == dflt_int, "fd_order must be set if Powell's method is enabled")
end subroutine s_check_inputs_mhd

impure subroutine s_check_inputs_nvidia_uvm
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
@:PROHIBIT(nv_uvm_igr_temps_on_gpu > 3 .or. nv_uvm_igr_temps_on_gpu < 0, &
"nv_uvm_igr_temps_on_gpu must be in the range [0, 3]")
@:PROHIBIT(nv_uvm_igr_temps_on_gpu == 3 .and. igr_iter_solver == 2, &
"nv_uvm_igr_temps_on_gpu must be in the range [0, 2] for igr_iter_solver == 2")
#endif
end subroutine s_check_inputs_nvidia_uvm

end module m_checker
24 changes: 24 additions & 0 deletions src/simulation/m_global_parameters.fpp
@@ -157,6 +157,16 @@ module m_global_parameters
logical :: viscous !< Viscous effects
#:endif

!> @name Variables for out-of-core IGR computation on NVIDIA GPUs
!> @{
logical :: nv_uvm_out_of_core ! Enable out-of-core storage of q_cons_ts(2) in timestepping (default FALSE)
integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU
! 1 => jac on GPU, jac_rhs and jac_old on CPU
! 2 => jac and jac_rhs on GPU, jac_old on CPU
! 3 => jac, jac_rhs, and jac_old on GPU (default)
logical :: nv_uvm_pref_gpu ! Enable explicit GPU memory hints (default FALSE)
!> @}

real(wp) :: weno_eps !< Binding for the WENO nonlinear weights
real(wp) :: teno_CT !< Smoothness threshold for TENO
logical :: mp_weno !< Monotonicity preserving (MP) WENO
@@ -573,6 +583,11 @@ contains
t_stop = dflt_real
t_save = dflt_real

! NVIDIA UVM options
nv_uvm_out_of_core = .false.
nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default)
nv_uvm_pref_gpu = .false.

! Simulation algorithm parameters
model_eqns = dflt_int
mpp_lim = .false.
@@ -1321,16 +1336,25 @@
@:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
@:ALLOCATE(x_cc(-buff_size:m + buff_size))
@:ALLOCATE(dx(-buff_size:m + buff_size))
@:PREFER_GPU(x_cb)
@:PREFER_GPU(x_cc)
@:PREFER_GPU(dx)

if (n == 0) return;
@:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
@:ALLOCATE(y_cc(-buff_size:n + buff_size))
@:ALLOCATE(dy(-buff_size:n + buff_size))
@:PREFER_GPU(y_cb)
@:PREFER_GPU(y_cc)
@:PREFER_GPU(dy)

if (p == 0) return;
@:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
@:ALLOCATE(z_cc(-buff_size:p + buff_size))
@:ALLOCATE(dz(-buff_size:p + buff_size))
@:PREFER_GPU(z_cb)
@:PREFER_GPU(z_cc)
@:PREFER_GPU(dz)

end subroutine s_initialize_global_parameters_module
