Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2358d29
Add scripts for santis/alps, example case, and captures for UVM comms…
ntselepidis Aug 1, 2025
37d393b
Add PREFER_GPU and rearrange update for out-of-core computation
ntselepidis Aug 1, 2025
693c7f4
Allow keeping q_cons_ts(2) on CPU using pinned allocations
ntselepidis Aug 1, 2025
7054b7b
Modify PREFER_GPU macro
ntselepidis Aug 1, 2025
ee1277d
Allow control in placement of IGR temps
ntselepidis Aug 1, 2025
4065c02
Do some clean up
ntselepidis Aug 2, 2025
cfb792c
ENV Vars to case file options and code structure changes
Aug 3, 2025
cacc6b0
Fix some comments
ntselepidis Aug 3, 2025
884a4d9
Merge remote-tracking branch 'upstream/master' into nvidia
wilfonba Aug 3, 2025
b3fdbff
test merge and add nv_uvm_out_of_core back
wilfonba Aug 3, 2025
51d7e90
Fix some allocs and deallocs in timesteppers
ntselepidis Aug 5, 2025
c553b78
Fix nv_uvm_out_of_core inconsistency and add to case file
ntselepidis Aug 5, 2025
f3b3851
Fix bug in 2nd order TVD RK introduced by merge
ntselepidis Aug 5, 2025
71b5976
Fix some comments
ntselepidis Aug 5, 2025
a4d6b38
Add note on binding script requirement for PREFER_GPU macro
ntselepidis Aug 6, 2025
acb2405
Flip nv_uvm_pref_gpu default to false
ntselepidis Aug 7, 2025
8fef22d
Be explicit with unified memory compilation to stay robust in changes…
ntselepidis Aug 7, 2025
5e369c3
Add some changes to future proof the unified memory build
ntselepidis Aug 11, 2025
52c5608
Comment out calls to cudaGetErrorString
ntselepidis Aug 11, 2025
4ec8617
prepare for merge
wilfonba Aug 11, 2025
bd0adee
Merge remote-tracking branch 'upstream/master' into nvidia
wilfonba Aug 11, 2025
37b1768
update capture
wilfonba Aug 11, 2025
e02e9f6
add fastmath flag and bug fix
wilfonba Aug 11, 2025
a6ff639
Fix typo in CMakeLists
ntselepidis Aug 12, 2025
457ae60
Replace host_pool with host in m_igr
ntselepidis Aug 12, 2025
a6116f2
Set cpus-per-task to 72 and update binding script
ntselepidis Aug 12, 2025
fb50e90
Add some more updates to the helper scripts
ntselepidis Aug 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -486,17 +486,17 @@ function(MFC_SETUP_TARGET)
endforeach()

target_compile_options(${a_target}
PRIVATE -gpu=keep,ptxinfo,lineinfo
PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath
)

# GH-200 Unified Memory Support
if (MFC_Unified)
target_compile_options(${ARGS_TARGET}
PRIVATE -gpu=unified
PRIVATE -gpu=mem:unified -cuda
)
# "This option must appear in both the compile and link lines" -- NVHPC Docs
target_link_options(${ARGS_TARGET}
PRIVATE -gpu=unified
PRIVATE -gpu=mem:unified -cuda
)
endif()

Expand Down
101 changes: 101 additions & 0 deletions examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python3
"""Generate the MFC case dictionary for a 3D IGR Taylor-Green vortex
(NVIDIA/UVM example) and print it as JSON on stdout."""
import math
import json

# Grid resolution: MFC's m/n/p are zero-based cell counts (m+1 cells in x, ...).
N = 799
Nx = N
Ny = 2 * (N + 1) - 1
Nz = 2 * (N + 1) - 1

Re = 1600  # Reynolds number of the vortex
L = 1  # characteristic length scale
P0 = 101325  # background pressure [Pa]
rho0 = 1  # background density
C0 = math.sqrt(1.4 * P0)  # speed of sound for gamma = 1.4, rho0 = 1
V0 = 0.1 * C0  # reference velocity (Mach 0.1)
mu = V0 * L / Re  # dynamic viscosity implied by Re

cfl = 0.5
dx = 2 * math.pi * L / (Ny + 1)  # uniform spacing along the finest direction

dt = cfl * dx / C0  # acoustic-CFL time step

tC = L / V0  # convective (eddy-turnover) time scale
tEnd = 20 * tC

Nt = int(tEnd / dt)
# Short-run override kept for this example; delete the next line to run
# the full 20 convective times computed above.
Nt = 10


# Configuring case dictionary (module-level name so it can be inspected).
case = {
    "rdma_mpi": "T",
    # Logistics
    "run_time_info": "F",
    # Computational Domain Parameters
    "x_domain%beg": -math.pi * L,
    "x_domain%end": math.pi * L,
    "y_domain%beg": -math.pi * L,
    "y_domain%end": math.pi * L,
    "z_domain%beg": -math.pi * L,
    "z_domain%end": math.pi * L,
    "m": Nx,
    "n": Ny,
    "p": Nz,
    "cyl_coord": "F",
    "dt": dt,
    "t_step_start": 0,
    "t_step_stop": Nt,
    # Never 0: int(Nt / 100) truncates to 0 for short runs (e.g. the
    # Nt = 10 override above), and a zero save interval is invalid.
    "t_step_save": max(int(Nt / 100), 1),
    # Simulation Algorithm Parameters
    "num_patches": 1,
    "model_eqns": 2,
    "num_fluids": 1,
    "time_stepper": 3,
    "bc_x%beg": -1,
    "bc_x%end": -1,
    "bc_y%beg": -1,
    "bc_y%end": -1,
    "bc_z%beg": -1,
    "bc_z%end": -1,
    "igr": "T",
    "igr_order": 5,
    "igr_iter_solver": 1,
    "num_igr_iters": 3,
    "num_igr_warm_start_iters": 3,
    "alf_factor": 10,
    "viscous": "T",
    # Formatted Database Files Structure Parameters
    "format": 1,
    "precision": 2,
    "prim_vars_wrt": "T",
    "omega_wrt(1)": "T",
    "omega_wrt(2)": "T",
    "omega_wrt(3)": "T",
    "qm_wrt": "T",
    "fd_order": 4,
    "parallel_io": "T",
    # Patch 1: Background (AIR - 2)
    "patch_icpp(1)%geometry": 9,
    "patch_icpp(1)%x_centroid": 0,
    "patch_icpp(1)%y_centroid": 0,
    "patch_icpp(1)%z_centroid": 0,
    "patch_icpp(1)%length_x": 2 * math.pi * L,
    "patch_icpp(1)%length_y": 2 * math.pi * L,
    "patch_icpp(1)%length_z": 2 * math.pi * L,
    "patch_icpp(1)%vel(1)": 0.0,
    "patch_icpp(1)%vel(2)": 0.0,
    "patch_icpp(1)%vel(3)": 0,
    "patch_icpp(1)%pres": 0.0,
    # Hardcoded initial condition id — presumably the Taylor-Green
    # velocity/pressure field; confirm against the patch module.
    "patch_icpp(1)%hcid": 380,
    "patch_icpp(1)%alpha_rho(1)": 1,
    "patch_icpp(1)%alpha(1)": 1,
    # Fluids Physical Parameters
    "fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1),
    "fluid_pp(1)%pi_inf": 0,
    "fluid_pp(1)%Re(1)": 1 / mu,
}

print(json.dumps(case))
24 changes: 24 additions & 0 deletions misc/nvidia_uvm/bind.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

# -------------------------------- #
# Binding for a single Santis node #
# -------------------------------- #
#
# MPI launch wrapper: pins each local rank to its own GPU, NIC, and CPU
# core, then runs the wrapped command ("$@") under numactl.
# NOTE(review): assumes exactly 4 ranks per node and 72 cores per socket
# (physcores below); a local rank > 3 would index past the array — confirm
# against the job script's --ntasks-per-node / --cpus-per-task settings.

# Local rank (OpenMPI variable first, Slurm fallback)
export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}"

# Bind to GPU: each rank sees only the device matching its local rank
export CUDA_VISIBLE_DEVICES="$local_rank"

# Binding to NIC: USER policy with an explicit index mapping so rank i
# uses NIC i (Cray MPICH OFI; mapping format is "nic:rank[,rank...]")
export MPICH_OFI_NIC_POLICY=USER
export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"

# Bind to cores ( first core per socket )
physcores=(0 72 144 216)

#echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY

# -l allocates memory on the local NUMA node; --physcpubind pins the
# process to its socket's first core.
#set -x
numactl -l --all --physcpubind=${physcores[$local_rank]} "$@"
#set +x
24 changes: 24 additions & 0 deletions misc/nvidia_uvm/nsys.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash

# Launch wrapper: run the wrapped command ("$@") under Nsight Systems on
# global rank 0 when profiling is requested; every other rank — and all
# ranks when NSYS is 0 — exec the command directly.
#
# Environment:
#   NSYS      - enable profiling when non-zero (default: 0)
#   NSYS_FILE - report output path (default: report.qdrep)

#set -x
set -euo pipefail

rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}"

# Assign defaults only when the variables are entirely unset; the '='
# expansion (unlike ':=') leaves empty-but-set values untouched.
: "${NSYS_FILE=report.qdrep}"
: "${NSYS=0}"

# Fast path: no profiling for this rank.
if [[ "$NSYS" -eq 0 || "$rank" -ne 0 ]]; then
    exec "$@"
fi

exec nsys profile \
    --cpuctxsw=none -b none -s none \
    --event-sample=system-wide \
    --cpu-socket-events=61,71,265,273 \
    --cpu-socket-metrics=103,104 \
    --event-sampling-interval=10 \
    --trace=nvtx,openacc \
    --force-overwrite=true \
    -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
    -o "$NSYS_FILE" "$@"
47 changes: 47 additions & 0 deletions src/common/include/macros.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,53 @@
#endif
#:enddef

#:def PREFER_GPU(*args)
    ! Fypp macro: hint the CUDA unified-memory driver that each listed array
    ! should reside on the GPU. For every argument it (1) sets the preferred
    ! location to device 0, (2) marks the pages as accessed-by the CPU so
    ! host touches need not migrate them back, and (3) prefetches the pages
    ! to the GPU so the memory is physically populated there.
    ! Active only in simulation builds compiled for unified memory
    ! (__NVCOMPILER_GPU_UNIFIED_MEM — presumably defined by nvfortran under
    ! -gpu=mem:unified; confirm against the NVHPC docs). Otherwise the macro
    ! expands to nothing.
    ! Runtime opt-in: set NVIDIA_MANUAL_GPU_HINTS=1; any other value,
    ! including unset or "0", leaves the hints disabled.
#ifdef MFC_SIMULATION
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
    block
        ! Rename cudafor's sum/maxval/minval so they do not shadow the
        ! Fortran intrinsics in the enclosing scope.
        use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
        integer :: istat                           ! CUDA API return code
        integer :: prefer_gpu_mode                 ! 0 = hints off, 1 = hints on
        character(len=10) :: prefer_gpu_mode_str   ! raw environment value

        ! environment variable
        call get_environment_variable("NVIDIA_MANUAL_GPU_HINTS", prefer_gpu_mode_str)
        if (trim(prefer_gpu_mode_str) == "0") then ! OFF
            prefer_gpu_mode = 0
        elseif (trim(prefer_gpu_mode_str) == "1") then ! ON
            prefer_gpu_mode = 1
        else ! default
            prefer_gpu_mode = 0
        endif

        if (prefer_gpu_mode .eq. 1) then
            #:for arg in args
                !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
                ! set preferred location GPU (device index 0 — assumes each
                ! rank sees a single GPU, e.g. via a binding script setting
                ! CUDA_VISIBLE_DEVICES; TODO confirm)
                istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 )
                if (istat /= cudaSuccess) then
                    write(*,"('Error code: ',I0, ': ')") istat
                    write(*,*) cudaGetErrorString(istat)
                endif
                ! set accessed by CPU
                istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId )
                if (istat /= cudaSuccess) then
                    write(*,"('Error code: ',I0, ': ')") istat
                    write(*,*) cudaGetErrorString(istat)
                endif
                ! prefetch to GPU - physically populate memory pages
                ! (device 0, default stream 0; errors are reported, not fatal)
                istat = cudaMemPrefetchAsync( c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0 )
                if (istat /= cudaSuccess) then
                    write(*,"('Error code: ',I0, ': ')") istat
                    write(*,*) cudaGetErrorString(istat)
                endif
            #:endfor
        end if
    end block
#endif
#endif
#:enddef

#:def ALLOCATE(*args)
@:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
#:set allocated_variables = ', '.join(args)
Expand Down
8 changes: 8 additions & 0 deletions src/common/m_mpi_common.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ module m_mpi_common
!! average primitive variables, for a single computational domain boundary
!! at the time, from the relevant neighboring processor.

#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
$:GPU_DECLARE(create='[buff_send, buff_recv]')
#endif

integer :: halo_size
$:GPU_DECLARE(create='[halo_size]')
Expand Down Expand Up @@ -78,7 +80,13 @@ contains

$:GPU_UPDATE(device='[halo_size, v_size]')

#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
@:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
#else
ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
!$acc enter data create(capture:buff_send)
!$acc enter data create(capture:buff_recv)
#endif
#endif

end subroutine s_initialize_mpi_common_module
Expand Down
9 changes: 9 additions & 0 deletions src/simulation/m_global_parameters.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -1308,16 +1308,25 @@ contains
@:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
@:ALLOCATE(x_cc(-buff_size:m + buff_size))
@:ALLOCATE(dx(-buff_size:m + buff_size))
@:PREFER_GPU(x_cb)
@:PREFER_GPU(x_cc)
@:PREFER_GPU(dx)

if (n == 0) return;
@:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
@:ALLOCATE(y_cc(-buff_size:n + buff_size))
@:ALLOCATE(dy(-buff_size:n + buff_size))
@:PREFER_GPU(y_cb)
@:PREFER_GPU(y_cc)
@:PREFER_GPU(dy)

if (p == 0) return;
@:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
@:ALLOCATE(z_cc(-buff_size:p + buff_size))
@:ALLOCATE(dz(-buff_size:p + buff_size))
@:PREFER_GPU(z_cb)
@:PREFER_GPU(z_cc)
@:PREFER_GPU(dz)

end subroutine s_initialize_global_parameters_module

Expand Down
Loading
Loading