diff --git a/CMakeLists.txt b/CMakeLists.txt index 8269c1cb4..6523f7877 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,17 +135,17 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") if (CMAKE_BUILD_TYPE STREQUAL "Debug") add_compile_options( -Wall - -Wextra + -Wextra -fcheck=all,no-array-temps -fbacktrace -fimplicit-none -fsignaling-nans -finit-real=snan -finit-integer=-99999999 - -Wintrinsic-shadow - -Wunderflow - -Wrealloc-lhs - -Wsurprising + -Wintrinsic-shadow + -Wunderflow + -Wrealloc-lhs + -Wsurprising ) endif() @@ -163,7 +163,6 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") "SHELL:-h acc_model=auto_async_none" "SHELL: -h acc_model=no_fast_addr" "SHELL: -h list=adm" - "SHELL: -munsafe-fp-atomics" # Not unsafe for operations we do ) add_link_options("SHELL:-hkeepfiles") @@ -173,6 +172,7 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") "SHELL:-h acc_model=auto_async_none" "SHELL: -h acc_model=no_fast_addr" "SHELL: -K trap=fp" "SHELL: -G2" + ) add_link_options("SHELL: -K trap=fp" "SHELL: -G2") endif() @@ -486,23 +486,23 @@ function(MFC_SETUP_TARGET) endforeach() target_compile_options(${a_target} - PRIVATE -gpu=keep,ptxinfo,lineinfo + PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath ) # GH-200 Unified Memory Support if (MFC_Unified) target_compile_options(${ARGS_TARGET} - PRIVATE -gpu=unified + PRIVATE -gpu=mem:unified -cuda ) # "This option must appear in both the compile and link lines" -- NVHPC Docs target_link_options(${ARGS_TARGET} - PRIVATE -gpu=unified + PRIVATE -gpu=mem:unified -cuda ) endif() if (CMAKE_BUILD_TYPE STREQUAL "Debug") target_compile_options(${a_target} - PRIVATE -gpu=autocompare,debug + PRIVATE -gpu=autocompare,debug -cuda ) endif() elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") @@ -513,9 +513,15 @@ function(MFC_SETUP_TARGET) target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc") endif() - if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") - 
find_package(CUDAToolkit REQUIRED) - target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt) + if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR + CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") + find_package(CUDAToolkit QUIET) # defines CUDA::nvToolsExt when the toolkit still ships it; not-found falls through to the NVHPC wrapper below + if (TARGET CUDA::nvToolsExt) # CUDA <= 12.8 + target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt) + else() # CUDA >= 12.9 + target_link_libraries(${a_target} PRIVATE nvhpcwrapnvtx ) + endif() + target_link_options(${a_target} PRIVATE "-cudalib=nvtx") endif() endforeach() diff --git a/misc/nvidia_uvm/README.md b/misc/nvidia_uvm/README.md new file mode 100644 index 000000000..2de72c489 --- /dev/null +++ b/misc/nvidia_uvm/README.md @@ -0,0 +1,48 @@ +## The Main Idea behind the implemented Out-of-Core Strategy for Grace-Hopper + +To run MFC out-of-core on Grace-Hopper using Unified Memory we implement a zero-copy strategy. + +We start by setting the preferred location of all buffers to CPU by hooking into the allocate macro and setting `NVIDIA_ALLOC_MODE=2`. +This way we disable access-counter-based migrations and keep everything in CPU memory, freeing up as much GPU memory as possible. + +Then, for the "important" buffers that are frequently accessed from the GPU, we reset the preferred location to GPU in order to place them (and directly populate them) in GPU memory. +This is done by the `PREFER_GPU` macro that has been manually placed in the code right after the allocations of the "important" buffers. +To activate these hints we export `NVIDIA_MANUAL_GPU_HINTS=1`. + +To allow fine-grained control and be able to simulate larger sizes, we also use the following environment variables: +- With `NVIDIA_IGR_TEMPS_ON_GPU` we control how many temporaries from the IGR module are to be placed in GPU memory. +- With `NVIDIA_VARS_ON_GPU` we control how many of the `q_cons_ts(1)%vf(j)%sf` arrays we place in GPU memory. 
+ +It is important to note that we have rearranged the timestep updates in the third-order TVD Runge-Kutta scheme in a way that allows us to pass only `q_cons_ts(1)` to the `compute_rhs` routines. +This way, in order to keep the computation of `compute_rhs` (mostly) on GPU data, we only need to store `q_cons_ts(1)` (fully or even partially) in GPU memory. +Thus, we choose to keep `q_cons_ts(2)` in CPU memory for the full lifetime of the simulation, freeing up space in GPU memory that allows for bumping up the size of the simulation without sacrificing performance. +In the timestep updates between the `compute_rhs` calls, we access both `q_cons_ts(1)` and `q_cons_ts(2)` directly from the physical location where they reside (zero-copy), simultaneously pulling data from GPU memory and CPU memory (through C2C), making good use of Grace-Hopper. + +Note: This rearrangement most likely "breaks" the timestepper for other physics cases, but we can easily fix it in a later step. + +## Example Workflow for Out-of-Core Strategy based on Unified Memory + +```shell +# Allocate a node +salloc -A g183 --partition normal -t 02:00:00 -N 1 -n 4 --cpus-per-task=71 + +# Start uenv +uenv start --view=modules icon/25.2:v1 + +# cd to root directory of MFC +cd MFC-Wilfong + +# Load modules +. 
./mfc.sh load -c san -m g + +# Build +export MFC_CUDA_CC=90 +./mfc.sh build --gpu -j 71 --single --unified --verbose + +# Run pre_process and simulation binaries with case optimization (in an interactive job) +./mfc.sh run examples/3D_IGR_perf_test/case.py --case-optimization -t pre_process simulation --gpu -N 1 -n 4 -j 71 -c santis + +# Run pre_process and simulation binaries with case optimization (in a batch job) +./mfc.sh run examples/3D_IGR_perf_test/case.py --case-optimization -t pre_process simulation --gpu -N 1 -n 4 -j 71 -c santis -e batch -p normal -a g183 -w 00:15:00 +``` +The environment variables `NVIDIA_ALLOC_MODE`, `NVIDIA_MANUAL_GPU_HINTS`, `NVIDIA_VARS_ON_GPU`, and `NVIDIA_IGR_TEMPS_ON_GPU` can be set appropriately in `toolchain/templates/santis.mako` to configure a run with ALL buffers either in GPU or in CPU memory, or a run with SOME buffers in GPU memory and the rest in CPU memory. diff --git a/misc/nvidia_uvm/bind.sh b/misc/nvidia_uvm/bind.sh new file mode 100755 index 000000000..239d769e8 --- /dev/null +++ b/misc/nvidia_uvm/bind.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +# -------------------------------- # +# Binding for a single Santis node # +# -------------------------------- # + +# Local rank +export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}" + +# Bind to GPU +export CUDA_VISIBLE_DEVICES="$local_rank" + +# Binding to NIC +export MPICH_OFI_NIC_POLICY=USER +export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3" + +# Bind to cores ( one core per rank, CPU ids spaced 72 apart; presumably the first core of each socket -- confirm ) +physcores=(0 72 144 216) + +#echo rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY +exec numactl -l --all --physcpubind="${physcores[$local_rank]}" "$@" diff --git a/misc/nvidia_uvm/nsys.sh b/misc/nvidia_uvm/nsys.sh new file mode 100755 index 000000000..19b3d4b44 --- /dev/null +++ b/misc/nvidia_uvm/nsys.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +#set -x +set -euo pipefail + +rank="${OMPI_COMM_WORLD_RANK:-${SLURM_PROCID:-0}}" 
+ +[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep +[[ -z "${NSYS+x}" ]] && NSYS=0 + +if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then + echo "Doing nsys" + exec nsys profile \ + --cpuctxsw=none -b none -s none \ + --event-sample=system-wide \ + --cpu-socket-events=61,71,265,273 \ + --cpu-socket-metrics=103,104 \ + --event-sampling-interval=10 \ + --trace=nvtx,openacc \ + --force-overwrite=true \ + -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \ + -o "$NSYS_FILE" "$@" +else + exec "$@" +fi diff --git a/misc/nvidia_uvm/run.sh b/misc/nvidia_uvm/run.sh new file mode 100644 index 000000000..c065ebb81 --- /dev/null +++ b/misc/nvidia_uvm/run.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# TODO: Modify accordingly +PATH_TO_BINARY=${SCRATCH}/projects/cfd/mfc/MFC-Wilfong/build/install/cdcd4e8762/bin/ + +# NVHPC and CUDA env vars +export NV_ACC_USE_MALLOC=1 # use malloc instead of cudaMallocManaged ( compiled using -gpu=mem:unified ) +export NVCOMPILER_ACC_NO_MEMHINTS=1 # disable implicit compiler hints +export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH + +# Cray MPICH +export MPICH_GPU_SUPPORT_ENABLED=1 # MPICH with GPU support +export FI_CXI_RX_MATCH_MODE=software +export FI_MR_CACHE_MONITOR=disabled + +# CUSTOM env vars to MFC +export NVIDIA_ALLOC_MODE=2 # default alloc to prefloc CPU +export NVIDIA_MANUAL_GPU_HINTS=1 # prefloc GPU on some +export NVIDIA_IGR_TEMPS_ON_GPU=1 # jac on GPU and jac_rhs on CPU ( NOTE: good default, tune based on size ) +export NVIDIA_VARS_ON_GPU=7 # q_cons_ts(1)%vf%sf for j=1-7 on GPU ( NOTE: good default, tune based on size ) + +# NSYS +export NSYS=1 # enable nsys profiling +export NSYS_FILE=report_uvm_single_N-499_nGPUs-4_params-${NVIDIA_VARS_ON_GPU}-${NVIDIA_IGR_TEMPS_ON_GPU}.qdrep + +# Run using --cpu-bind=none because we use our own binding script +srun --ntasks 4 --cpu-bind=none ./bind.sh ./nsys.sh ${PATH_TO_BINARY}/simulation diff --git a/src/common/include/macros.fpp 
b/src/common/include/macros.fpp index c1652388c..c89d8e9f7 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -12,11 +12,113 @@ #endif #:enddef +#:def PREFER_GPU(*args) +#ifdef MFC_SIMULATION +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + block + use cudafor + intrinsic :: minval, maxval, sum + integer :: istat + integer :: prefer_gpu_mode + character(len=10) :: prefer_gpu_mode_str + + ! environment variable + call get_environment_variable("NVIDIA_MANUAL_GPU_HINTS", prefer_gpu_mode_str) + if (trim(prefer_gpu_mode_str) == "0") then ! OFF + prefer_gpu_mode = 0 + elseif (trim(prefer_gpu_mode_str) == "1") then ! ON + prefer_gpu_mode = 1 + else ! default + prefer_gpu_mode = 0 + endif + + if (prefer_gpu_mode .eq. 1) then + #:for arg in args + !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$) + ! unset + istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseUnSetPreferredLocation, cudaCpuDeviceId ) + if (istat /= cudaSuccess) then + write(*,"('Error code: ',I0, ': ')") istat + write(*,*) cudaGetErrorString(istat) + endif + ! 
set + istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 ) + if (istat /= cudaSuccess) then + write(*,"('Error code: ',I0, ': ')") istat + write(*,*) cudaGetErrorString(istat) + endif + #:endfor + end if + end block +#endif +#endif +#:enddef + + +#:def PARSE(s) +${s if s.rfind(')') == -1 else next((s[:i] for i in range(s.rfind(')'), -1, -1) if s[i] == '(' and s.count('(', i, s.rfind(')')+1) == s.count(')', i, s.rfind(')')+1)), s)}$ +#:enddef + #:def ALLOCATE(*args) @:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'}) #:set allocated_variables = ', '.join(args) allocate (${allocated_variables}$) $:GPU_ENTER_DATA(create=('[' + allocated_variables + ']')) + + +#ifdef MFC_SIMULATION +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + block + use cudafor + intrinsic :: minval, maxval, sum + integer :: istat, stream_id + integer :: alloc_mode + character(len=10) :: alloc_mode_str + + ! environment variable + call get_environment_variable("NVIDIA_ALLOC_MODE", alloc_mode_str) + if (trim(alloc_mode_str) == "0") then ! no CPU first touch, no preferred location CPU + alloc_mode = 0 + elseif (trim(alloc_mode_str) == "1") then ! CPU first touch, no preferred location CPU + alloc_mode = 1 + elseif (trim(alloc_mode_str) == "2") then ! no CPU first touch, preferred location CPU + alloc_mode = 2 + elseif (trim(alloc_mode_str) == "3") then ! CPU first touch, preferred location CPU + alloc_mode = 3 + else ! default + alloc_mode = 0 + endif + + stream_id = 0 + + ! prefetch to CPU + if ((alloc_mode .eq. 1) .or. (alloc_mode .eq. 3)) then + #:for arg in args + istat = cudaMemPrefetchAsync( c_devloc(@{PARSE(${arg}$)}@), SIZEOF(@{PARSE(${arg}$)}@), cudaCpuDeviceId, stream_id ) + !print*, "! @{PARSE(${arg}$)}@ with shape", SHAPE(@{PARSE(${arg}$)}@), "=> prefetch to CPU" + if (istat /= cudaSuccess) then + write(*,"('Error code: ',I0, ': ')") istat + write(*,*) cudaGetErrorString(istat) + endif + #:endfor + endif + + ! 
memadvise preferred location + if ((alloc_mode .eq. 2) .or. (alloc_mode .eq. 3)) then + #:for arg in args + istat = cudaMemAdvise( c_devloc(@{PARSE(${arg}$)}@), SIZEOF(@{PARSE(${arg}$)}@), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId ) + !print*, "! @{PARSE(${arg}$)}@ with shape", SHAPE(@{PARSE(${arg}$)}@), "=> preferred location CPU" + if (istat /= cudaSuccess) then + write(*,"('Error code: ',I0, ': ')") istat + write(*,*) cudaGetErrorString(istat) + endif + #:endfor + endif + + end block +#endif +#endif + #:enddef ALLOCATE #:def DEALLOCATE(*args) diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index fa6185c20..047056a51 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -1294,16 +1294,25 @@ contains @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size)) @:ALLOCATE(x_cc(-buff_size:m + buff_size)) @:ALLOCATE(dx(-buff_size:m + buff_size)) + @:PREFER_GPU(x_cb) + @:PREFER_GPU(x_cc) + @:PREFER_GPU(dx) if (n == 0) return; @:ALLOCATE(y_cb(-1 - buff_size:n + buff_size)) @:ALLOCATE(y_cc(-buff_size:n + buff_size)) @:ALLOCATE(dy(-buff_size:n + buff_size)) + @:PREFER_GPU(y_cb) + @:PREFER_GPU(y_cc) + @:PREFER_GPU(dy) if (p == 0) return; @:ALLOCATE(z_cb(-1 - buff_size:p + buff_size)) @:ALLOCATE(z_cc(-buff_size:p + buff_size)) @:ALLOCATE(dz(-buff_size:p + buff_size)) + @:PREFER_GPU(z_cb) + @:PREFER_GPU(z_cc) + @:PREFER_GPU(dz) end subroutine s_initialize_global_parameters_module diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index db80bb834..1219626ef 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -24,8 +24,12 @@ module m_igr s_igr_flux_add, & s_finalize_igr_module +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + real(wp), pointer, contiguous, dimension(:, :, :) :: jac,jac_rhs,jac_old +#else real(wp), allocatable, dimension(:, :, :) :: jac, jac_rhs, jac_old $:GPU_DECLARE(create='[jac, jac_rhs, jac_old]') +#endif real(wp), allocatable, dimension(:, :) :: 
Res $:GPU_DECLARE(create='[Res]') @@ -79,10 +83,36 @@ module m_igr integer :: i, j, k, l, q, r +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + real(wp), allocatable, dimension(:, :, :, :), pinned, target :: m_igr_pool_host + real(wp), allocatable, dimension(:, :, :), pinned, target :: m_igr_pool_host2 +#endif + contains subroutine s_initialize_igr_module() + integer :: igr_temps_on_gpu = 3 + integer :: igr_temps_on_cpu = 0 + integer :: pool_idx = 1 + character(len=10) :: igr_temps_on_gpu_str + +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + call get_environment_variable("NVIDIA_IGR_TEMPS_ON_GPU", igr_temps_on_gpu_str) + + if (trim(igr_temps_on_gpu_str) == "0") then + igr_temps_on_gpu = 0 ! jac, jac_rhs and jac_old on CPU + elseif (trim(igr_temps_on_gpu_str) == "1") then + igr_temps_on_gpu = 1 ! jac on GPU, jac_rhs on CPU, jac_old on CPU + elseif (trim(igr_temps_on_gpu_str) == "2") then + igr_temps_on_gpu = 2 ! jac and jac_rhs on GPU, jac_old on CPU + elseif (trim(igr_temps_on_gpu_str) == "3") then + igr_temps_on_gpu = 3 ! jac, jac_rhs and jac_old on GPU + else ! default on GPU + igr_temps_on_gpu = 3 + endif +#endif + if (viscous) then @:ALLOCATE(Res(1:2, 1:maxval(Re_size))) do i = 1, 2 @@ -91,8 +121,73 @@ contains end do end do $:GPU_UPDATE(device='[Res, Re_idx, Re_size]') + @:PREFER_GPU(Res) + @:PREFER_GPU(Re_idx) end if +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + igr_temps_on_cpu = 3 - igr_temps_on_gpu + + if ( igr_temps_on_cpu >= 1 ) then + !allocate(m_igr_pool_host(idwbuff(1)%beg:idwbuff(1)%end, & + ! idwbuff(2)%beg:idwbuff(2)%end, & + ! idwbuff(3)%beg:idwbuff(3)%end, & + ! 
1:igr_temps_on_cpu)) + + !There was a dimensionality change for jac_rhs, using a different pineed pool + if ( igr_temps_on_cpu == 1 ) then + allocate(m_igr_pool_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end, & + 1:igr_temps_on_cpu)) + + elseif (igr_temps_on_cpu >=2 ) then + allocate(m_igr_pool_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end, & + 1:igr_temps_on_cpu-1)) + allocate(m_igr_pool_host2(-1:m,-1:n,-1:p)) + endif + + pool_idx = 1 + if ( igr_temps_on_cpu >= 1 ) then + !print*, 'jac_old on CPU' + jac_old(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => m_igr_pool_host(:,:,:,pool_idx) + pool_idx = pool_idx + 1 + end if + if ( igr_temps_on_cpu >= 2 ) then + jac_rhs(-1:m,-1:n,-1:p) => m_igr_pool_host2(:,:,:) + end if + if ( igr_temps_on_cpu >= 3 ) then + !print*, 'jac on CPU' + jac(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => m_igr_pool_host(:,:,:,pool_idx) + pool_idx = pool_idx + 1 + end if + end if + if ( igr_temps_on_gpu >= 1 ) then + !print*, 'jac on GPU' + @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(jac) + endif + if ( igr_temps_on_gpu >= 2 ) then + !print*, 'jac_rhs on GPU' + @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p)) + @:PREFER_GPU(jac_rhs) + endif + if ( igr_temps_on_gpu >= 3 ) then + !print*, 'jac_old on GPU' + @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(jac_old) + endif +#else @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) @@ -103,6 +198,7 @@ contains idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) end if +#endif $:GPU_PARALLEL_LOOP(collapse=3) do l = idwbuff(3)%beg, 
idwbuff(3)%end diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index d040650bf..e092cc0f5 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -77,6 +77,10 @@ module m_time_steppers $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts,rhs_mv,rhs_pb,max_dt]') +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + real(wp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host +#endif + contains !> The computation of parameters, the allocation of memory, @@ -86,6 +90,33 @@ contains integer :: i, j !< Generic loop iterators + integer :: vars_on_gpu = 0 + character(len=10) :: vars_on_gpu_str + +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + call get_environment_variable("NVIDIA_VARS_ON_GPU", vars_on_gpu_str) + + if (trim(vars_on_gpu_str) == "0") then + vars_on_gpu = 0 + elseif (trim(vars_on_gpu_str) == "1") then + vars_on_gpu = 1 + elseif (trim(vars_on_gpu_str) == "2") then + vars_on_gpu = 2 + elseif (trim(vars_on_gpu_str) == "3") then + vars_on_gpu = 3 + elseif (trim(vars_on_gpu_str) == "4") then + vars_on_gpu = 4 + elseif (trim(vars_on_gpu_str) == "5") then + vars_on_gpu = 5 + elseif (trim(vars_on_gpu_str) == "6") then + vars_on_gpu = 6 + elseif (trim(vars_on_gpu_str) == "7") then + vars_on_gpu = 7 + else ! default + vars_on_gpu = 0 + endif +#endif + ! Setting number of time-stages for selected time-stepping scheme if (time_stepper == 1) then num_ts = 1 @@ -95,11 +126,35 @@ contains ! Allocating the cell-average conservative variables @:ALLOCATE(q_cons_ts(1:num_ts)) + @:PREFER_GPU(q_cons_ts) do i = 1, num_ts @:ALLOCATE(q_cons_ts(i)%vf(1:sys_size)) + @:PREFER_GPU(q_cons_ts(i)%vf) end do + !! 
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end, & + 1:sys_size)) + do i = 1, num_ts + do j = 1, sys_size + if ( i == 1 ) then + @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(q_cons_ts(i)%vf(j)%sf) + else + q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:,:,:,j) + end if + end do + @:ACC_SETUP_VFs(q_cons_ts(i)) + end do +#else do i = 1, num_ts do j = 1, sys_size @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & @@ -108,6 +163,7 @@ contains end do @:ACC_SETUP_VFs(q_cons_ts(i)) end do +#endif ! Allocating the cell-average primitive ts variables if (probe_wrt) then @@ -682,6 +738,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) +#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) && !defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -694,6 +751,22 @@ contains end do end do end do +#else + !$acc parallel loop collapse(3) gang vector default(present) + do l = 0, p + do k = 0, n + do j = 0, m + do i = 1, sys_size + q_cons_ts(2)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l) + end do + end do + end do + end do +#endif !Evolve pb and mv for non-polytropic qbmm if (qbmm .and. (.not. polytropic)) then @@ -750,10 +823,15 @@ contains ! 
Stage 2 of 3 +#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) && !defined(FRONTIER_UNIFIED) call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2) +#else + call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg,2) +#endif if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2) +#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) && !defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -767,6 +845,21 @@ contains end do end do end do +#else + !$acc parallel loop collapse(3) gang vector default(present) + do l = 0, p + do k = 0, n + do j = 0, m + do i = 1, sys_size + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + (3._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) & + + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l))/4._wp + end do + end do + end do + end do +#endif if (qbmm .and. (.not. polytropic)) then $:GPU_PARALLEL_LOOP(collapse=5) @@ -823,10 +916,15 @@ contains end if ! Stage 3 of 3 +#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) && !defined(FRONTIER_UNIFIED) call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 3) +#else + call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg,3) +#endif if (bubbles_lagrange .and. .not. 
adap_dt) call s_update_lagrange_tdv_rk(stage=3) +#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) && !defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -840,6 +938,21 @@ contains end do end do end do +#else + !$acc parallel loop collapse(3) gang vector default(present) + do l = 0, p + do k = 0, n + do j = 0, m + do i = 1, sys_size + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + (q_cons_ts(2)%vf(i)%sf(j, k, l) & + + 2._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) & + + 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp + end do + end do + end do + end do +#endif if (qbmm .and. (.not. polytropic)) then $:GPU_PARALLEL_LOOP(collapse=5) @@ -1143,15 +1256,29 @@ contains ! Deallocating the cell-average conservative variables do i = 1, num_ts +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) + do j = 1, sys_size + if ( i == 1 ) then + @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf) + else + nullify(q_cons_ts(i)%vf(j)%sf) + end if + end do +#else do j = 1, sys_size @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf) end do +#endif @:DEALLOCATE(q_cons_ts(i)%vf) end do @:DEALLOCATE(q_cons_ts) +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + deallocate(q_cons_ts_pool_host) +#endif + ! 
Deallocating the cell-average primitive ts variables if (probe_wrt) then do i = 0, 3 diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 2de738986..750c9b294 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -64,6 +64,9 @@ def get_install_dirpath(self, case: Case ) -> str: # The install directory is located /build/install/ return os.sep.join([os.getcwd(), "build", "install", self.get_slug(case)]) + def get_home_dirpath(self, case: Case) -> str: + return os.sep.join([os.getcwd()]) + def get_install_binpath(self, case: Case ) -> str: # /install//bin/ return os.sep.join([self.get_install_dirpath(case), "bin", self.name]) diff --git a/toolchain/templates/default.mako b/toolchain/templates/default.mako index b1cdaf81e..df833ed4e 100644 --- a/toolchain/templates/default.mako +++ b/toolchain/templates/default.mako @@ -57,7 +57,7 @@ if engine == 'batch': "${target.get_install_binpath(case)}") elif [ "$binary" == "mpirun" ]; then (set -x; ${profiler} \ - $binary -np ${nodes*tasks_per_node} \ + $binary --allow-run-as-root -np ${nodes*tasks_per_node} \ "${target.get_install_binpath(case)}") elif [ "$binary" == "mpiexec" ]; then (set -x; ${profiler} \ diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako new file mode 100644 index 000000000..818383d35 --- /dev/null +++ b/toolchain/templates/santis.mako @@ -0,0 +1,86 @@ +#!/usr/bin/env bash + +<%namespace name="helpers" file="helpers.mako"/> + +% if engine == 'batch': +#SBATCH --uenv=icon/25.2:v1 +#SBATCH --nodes=${nodes} +#SBATCH --reservation=g183 +#SBATCH --ntasks-per-node=${tasks_per_node} +#SBATCH --job-name="${name}" +#SBATCH --output="${name}.out" +#SBATCH --error="${name}.err" +#SBATCH --time=${walltime} +% if account: +#SBATCH --account=${account} +% endif +% if partition: +#SBATCH --partition=${partition} +% endif +% if quality_of_service: +#SBATCH --qos=${quality_of_service} +% endif +% if email: +#SBATCH --mail-user=${email} +#SBATCH --mail-type="BEGIN, END, 
FAIL" +% endif +% endif + +# NVHPC and CUDA env vars +export NV_ACC_USE_MALLOC=0 # use cudaMallocManaged instead of malloc ( compiled using -gpu=mem:unified ) +export NVCOMPILER_ACC_NO_MEMHINTS=1 # disable implicit compiler hints +#export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH + +# Cray MPICH +export MPICH_GPU_SUPPORT_ENABLED=1 +export FI_CXI_RX_MATCH_MODE=software +export FI_MR_CACHE_MONITOR=disabled +export MPICH_NO_BUFFER_ALIAS_CHECK=1 + +# CUSTOM env vars to MFC +export NVIDIA_ALLOC_MODE=0 # do nothing +export NVIDIA_MANUAL_GPU_HINTS=1 # prefloc GPU on some +export NVIDIA_IGR_TEMPS_ON_GPU=3 # jac, jac_rhs, and jac_old on GPU +export NVIDIA_VARS_ON_GPU=7 # q_cons_ts(1)%vf%sf for j=1-7 on GPU + +# NSYS +export NSYS=0 # 1 = profile with nsys, 0 = off ( read by misc/nvidia_uvm/nsys.sh ) +export NSYS_FILE=myreport.qdrep + +${helpers.template_prologue()} + +ok ":) Loading modules:\n" +cd "${MFC_ROOT_DIR}" +% if engine == 'batch': +. ./mfc.sh load -c san -m ${'g' if gpu else 'c'} +% endif +cd - > /dev/null +echo + +% for target in targets: + ${helpers.run_prologue(target)} + + % if not mpi: + (set -x; ${profiler} "${target.get_install_binpath(case)}") + % else: + (set -x; srun --unbuffered \ + --ntasks=${nodes*tasks_per_node} \ + --cpus-per-task 1 \ + --cpu-bind=none \ + % if gpu: + --gpus-per-task 1 \ + % endif + --wait 200 --bcast=/tmp/${target.name} \ + "${target.get_home_dirpath(case)}/misc/nvidia_uvm/bind.sh" \ + ## % if target.name == 'simulation': + ## "${target.get_home_dirpath(case)}/misc/nvidia_uvm/nsys.sh" \ + ## % endif + "${target.get_install_binpath(case)}") + % endif + + ${helpers.run_epilogue(target)} + + echo +% endfor + +${helpers.template_epilogue()}