diff --git a/CMakeLists.txt b/CMakeLists.txt index b7f087c25b..0b349eb394 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ option(MFC_MPI "Build with MPI" ON option(MFC_OpenACC "Build with OpenACC" OFF) option(MFC_GCov "Build with GCov" OFF) option(MFC_Unified "Build with unified CPU & GPU memory (GH-200 only)" OFF) +option(MFC_Fastmath "Build with -gpu=fastmath on NV GPUs" OFF) option(MFC_PRE_PROCESS "Build pre_process" OFF) option(MFC_SIMULATION "Build simulation" OFF) option(MFC_POST_PROCESS "Build post_process" OFF) @@ -487,6 +488,9 @@ function(MFC_SETUP_TARGET) "-foffload=amdgcn-amdhsa='-march=gfx90a'" "-foffload-options=-lgfortran\ -lm" "-fno-exceptions") + if (MFC_Fastmath) + message(WARNING "--fastmath has no effect with the GNU compiler") + endif() elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") foreach (cc ${MFC_CUDA_CC}) target_compile_options(${a_target} @@ -498,14 +502,20 @@ function(MFC_SETUP_TARGET) PRIVATE -gpu=keep,ptxinfo,lineinfo ) + if (MFC_Fastmath) + target_compile_options(${a_target} + PRIVATE -gpu=fastmath + ) + endif() + # GH-200 Unified Memory Support if (MFC_Unified) target_compile_options(${ARGS_TARGET} - PRIVATE -gpu=unified + PRIVATE -gpu=mem:unified:managedalloc -cuda ) # "This option must appear in both the compile and link lines" -- NVHPC Docs target_link_options(${ARGS_TARGET} - PRIVATE -gpu=unified + PRIVATE -gpu=mem:unified:managedalloc -cuda ) endif() @@ -521,16 +531,28 @@ function(MFC_SETUP_TARGET) PRIVATE -DFRONTIER_UNIFIED) endif() + if (MFC_Fastmath) + message(WARNING "--fastmath has no effect with the CCE") + endif() + find_package(hipfort COMPONENTS hip CONFIG REQUIRED) target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn) endif() elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc") + if (MFC_Fastmath) + message(WARNING "--fastmath has no effect with the CCE") + endif() endif() if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") find_package(CUDAToolkit REQUIRED) - target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt) + if (TARGET CUDA::nvToolsExt) # CUDA <= 12.8 + target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt) + else() # CUDA >= 12.9 + target_link_libraries(${a_target} PRIVATE nvhpcwrapnvtx) + target_link_options(${a_target} PRIVATE "-cudalib=nvtx3") + endif() endif() endforeach() diff --git a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py new file mode 100644 index 0000000000..e2b22e8017 --- /dev/null +++ b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +import math +import json + +N = 799 +Nx = N +Ny = 2 * (N + 1) - 1 +Nz = 2 * (N + 1) - 1 + +Re = 1600 +L = 1 +P0 = 101325 +rho0 = 1 +C0 = math.sqrt(1.4 * P0) +V0 = 0.1 * C0 +mu = V0 * L / Re + +cfl = 0.5 +dx = 2 * math.pi * L / (Ny + 1) + +dt = cfl * dx / (C0) + +tC = L / V0 +tEnd = 20 * tC + +Nt = int(tEnd / dt) +Nt = 10 + + +# Configuring case dictionary +print( + json.dumps( + { + "rdma_mpi": "T", + # Logistics + "run_time_info": "F", + # Computational Domain Parameters + "x_domain%beg": -math.pi * L, + "x_domain%end": math.pi * L, + "y_domain%beg": -math.pi * L, + "y_domain%end": math.pi * L, + "z_domain%beg": -math.pi * L, + "z_domain%end": math.pi * L, + "m": Nx, + "n": Ny, + "p": Nz, + "cyl_coord": "F", + "dt": dt, + "t_step_start": 0, + "t_step_stop": 10, # Nt, + 
"t_step_save": 10, # int(Nt / 100), + # Simulation Algorithm Parameters + "num_patches": 1, + "model_eqns": 2, + "num_fluids": 1, + "time_stepper": 3, + "bc_x%beg": -1, + "bc_x%end": -1, + "bc_y%beg": -1, + "bc_y%end": -1, + "bc_z%beg": -1, + "bc_z%end": -1, + "igr": "T", + "igr_order": 5, + "igr_iter_solver": 1, + "num_igr_iters": 3, + "num_igr_warm_start_iters": 3, + "alf_factor": 10, + "viscous": "T", + # Formatted Database Files Structure Parameters + "format": 1, + "precision": 2, + "prim_vars_wrt": "T", + "omega_wrt(1)": "T", + "omega_wrt(2)": "T", + "omega_wrt(3)": "T", + "qm_wrt": "T", + "fd_order": 4, + "parallel_io": "T", + # Patch 1: Background (AIR - 2) + "patch_icpp(1)%geometry": 9, + "patch_icpp(1)%x_centroid": 0, + "patch_icpp(1)%y_centroid": 0, + "patch_icpp(1)%z_centroid": 0, + "patch_icpp(1)%length_x": 2 * math.pi * L, + "patch_icpp(1)%length_y": 2 * math.pi * L, + "patch_icpp(1)%length_z": 2 * math.pi * L, + "patch_icpp(1)%vel(1)": 0.0, + "patch_icpp(1)%vel(2)": 0.0, + "patch_icpp(1)%vel(3)": 0, + "patch_icpp(1)%pres": 0.0, + "patch_icpp(1)%hcid": 380, + "patch_icpp(1)%alpha_rho(1)": 1, + "patch_icpp(1)%alpha(1)": 1, + # Fluids Physical Parameters + "fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1), + "fluid_pp(1)%pi_inf": 0, + "fluid_pp(1)%Re(1)": 1 / mu, + # NVIDIA UVM Options + "nv_uvm_out_of_core": "T", + "nv_uvm_igr_temps_on_gpu": 3, + "nv_uvm_pref_gpu": "T", + } + ) +) diff --git a/misc/nvidia_uvm/bind.sh b/misc/nvidia_uvm/bind.sh new file mode 100755 index 0000000000..37f5a1a3cd --- /dev/null +++ b/misc/nvidia_uvm/bind.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# -------------------------------- # +# Binding for a single Santis node # +# -------------------------------- # + +# Local rank +export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}" + +# Bind to GPU +export CUDA_VISIBLE_DEVICES="$local_rank" + +# Binding to NIC +export MPICH_OFI_NIC_POLICY=USER +export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3" + +# Bind to cores ( all cores per socket ) +physcores=(0-71 72-143 144-215 216-287) + +#echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY + +#set -x +numactl -l --all --physcpubind=${physcores[$local_rank]} "$@" +#set +x diff --git a/misc/nvidia_uvm/nsys.sh b/misc/nvidia_uvm/nsys.sh new file mode 100755 index 0000000000..205bee8fd4 --- /dev/null +++ b/misc/nvidia_uvm/nsys.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +#set -x +set -euo pipefail + +rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}" + +[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep +[[ -z "${NSYS+x}" ]] && NSYS=0 + +if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then + exec nsys profile \ + --cpuctxsw=none -b none -s none \ + --event-sample=system-wide \ + --cpu-socket-events=61,71,265,273 \ + --cpu-socket-metrics=103,104 \ + --event-sampling-interval=10 \ + --trace=nvtx,cuda,openacc \ + --force-overwrite=true \ + -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \ + -o "$NSYS_FILE" "$@" +else + exec "$@" +fi diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp index c1652388c3..69241c99ef 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -12,6 +12,55 @@ #endif #:enddef +! Caution: +! This macro requires the use of a binding script to set CUDA_VISIBLE_DEVICES, such that we have one GPU device per MPI rank. +! That's because for both cudaMemAdvise (preferred location) and cudaMemPrefetchAsync we use location = device_id = 0. +! For an example see misc/nvidia_uvm/bind.sh. 
+#:def PREFER_GPU(*args) +#ifdef MFC_SIMULATION +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + block +! Beginning in the 25.3 release, the structure of the cudafor module has been changed slightly. +! The module now includes, or “uses” 3 submodules: cuda_runtime_api, gpu_reductions, and sort. +! The cudafor functionality has not changed. But for new users, or users who have needed to +! work-around name conflicts in the module, it may be better to use cuda_runtime_api to expose +! interfaces to the CUDA runtime calls described in Chapter 4 of this guide. +! https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#fortran-host-modules +#if __NVCOMPILER_MAJOR__ < 25 || (__NVCOMPILER_MAJOR__ == 25 && __NVCOMPILER_MINOR__ < 3) + use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval +#else + use cuda_runtime_api +#endif + integer :: istat + + if (nv_uvm_pref_gpu) then + #:for arg in args + !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$) + ! set preferred location GPU + istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0) + if (istat /= cudaSuccess) then + write (*, "('Error code: ',I0, ': ')") istat + !write(*,*) cudaGetErrorString(istat) + end if + ! set accessed by CPU + istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId) + if (istat /= cudaSuccess) then + write (*, "('Error code: ',I0, ': ')") istat + !write(*,*) cudaGetErrorString(istat) + end if + ! prefetch to GPU - physically populate memory pages + istat = cudaMemPrefetchAsync(c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0) + if (istat /= cudaSuccess) then + write (*, "('Error code: ',I0, ': ')") istat + !write(*,*) cudaGetErrorString(istat) + end if + #:endfor + end if + end block +#endif +#endif +#:enddef + #:def ALLOCATE(*args) @:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'}) #:set allocated_variables = ', '.join(args) diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index fdfcab8d25..4332681f11 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -38,7 +38,9 @@ module m_mpi_common !! average primitive variables, for a single computational domain boundary !! at the time, from the relevant neighboring processor. +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM $:GPU_DECLARE(create='[buff_send, buff_recv]') +#endif integer :: halo_size $:GPU_DECLARE(create='[halo_size]') @@ -78,7 +80,13 @@ contains $:GPU_UPDATE(device='[halo_size, v_size]') +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size)) +#else + allocate (buff_send(0:halo_size), buff_recv(0:halo_size)) + $:GPU_ENTER_DATA(create='[capture:buff_send]') + $:GPU_ENTER_DATA(create='[capture:buff_recv]') +#endif #endif end subroutine s_initialize_mpi_common_module diff --git a/src/simulation/m_checker.fpp b/src/simulation/m_checker.fpp index f0196af0e2..8917b0be46 100644 --- a/src/simulation/m_checker.fpp +++ b/src/simulation/m_checker.fpp @@ -30,6 +30,7 @@ contains if (igr) then call s_check_inputs_igr + call s_check_inputs_nvidia_uvm else if (recon_type == WENO_TYPE) then call s_check_inputs_weno @@ -411,4 +412,13 @@ contains @:PROHIBIT(powell .and. fd_order == dflt_int, "fd_order must be set if Powell's method is enabled") end subroutine s_check_inputs_mhd + impure subroutine s_check_inputs_nvidia_uvm +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + @:PROHIBIT(nv_uvm_igr_temps_on_gpu > 3 .or. 
nv_uvm_igr_temps_on_gpu < 0, & + "nv_uvm_igr_temps_on_gpu must be in the range [0, 3]") + @:PROHIBIT(nv_uvm_igr_temps_on_gpu == 3 .and. igr_iter_solver == 2, & + "nv_uvm_igr_temps_on_gpu must be in the range [0, 2] for igr_iter_solver == 2") +#endif + end subroutine s_check_inputs_nvidia_uvm + end module m_checker diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 2a904a1e97..34c122f1e7 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -157,6 +157,16 @@ module m_global_parameters logical :: viscous !< Viscous effects #:endif + !> @name Variables for our of core IGR computation on NVIDIA + !> @{ + logical :: nv_uvm_out_of_core ! Enable out-of-core storage of q_cons_ts(2) in timestepping (default FALSE) + integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU + ! 1 => jac on GPU, jac_rhs and jac_old on CPU + ! 2 => jac and jac_rhs on GPU, jac_old on CPU + ! 3 => jac, jac_rhs, and jac_old on GPU (default) + logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default FALSE) + !> @} + real(wp) :: weno_eps !< Binding for the WENO nonlinear weights real(wp) :: teno_CT !< Smoothness threshold for TENO logical :: mp_weno !< Monotonicity preserving (MP) WENO @@ -573,6 +583,11 @@ contains t_stop = dflt_real t_save = dflt_real + ! NVIDIA UVM options + nv_uvm_out_of_core = .false. + nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default) + nv_uvm_pref_gpu = .false. + ! Simulation algorithm parameters model_eqns = dflt_int mpp_lim = .false. @@ -1321,16 +1336,25 @@ contains @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size)) @:ALLOCATE(x_cc(-buff_size:m + buff_size)) @:ALLOCATE(dx(-buff_size:m + buff_size)) + @:PREFER_GPU(x_cb) + @:PREFER_GPU(x_cc) + @:PREFER_GPU(dx) if (n == 0) return; @:ALLOCATE(y_cb(-1 - buff_size:n + buff_size)) @:ALLOCATE(y_cc(-buff_size:n + buff_size)) @:ALLOCATE(dy(-buff_size:n + buff_size)) + @:PREFER_GPU(y_cb) + @:PREFER_GPU(y_cc) + @:PREFER_GPU(dy) if (p == 0) return; @:ALLOCATE(z_cb(-1 - buff_size:p + buff_size)) @:ALLOCATE(z_cc(-buff_size:p + buff_size)) @:ALLOCATE(dz(-buff_size:p + buff_size)) + @:PREFER_GPU(z_cb) + @:PREFER_GPU(z_cc) + @:PREFER_GPU(dz) end subroutine s_initialize_global_parameters_module diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index db80bb8346..0fbc76346f 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -24,8 +24,16 @@ module m_igr s_igr_flux_add, & s_finalize_igr_module +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + integer, dimension(3) :: nv_uvm_temp_on_gpu + real(wp), pointer, contiguous, dimension(:, :, :) :: jac, jac_rhs, jac_old + real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_host + real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_rhs_host + real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_old_host +#else real(wp), allocatable, dimension(:, :, :) :: jac, jac_rhs, jac_old $:GPU_DECLARE(create='[jac, jac_rhs, jac_old]') +#endif real(wp), allocatable, dimension(:, :) :: Res $:GPU_DECLARE(create='[Res]') @@ -73,7 +81,6 @@ module m_igr 5._wp/6._wp, & ! Index 0 2._wp/6._wp & ! 
Index 1 ] - #:endif #:endif @@ -91,8 +98,11 @@ contains end do end do $:GPU_UPDATE(device='[Res, Re_idx, Re_size]') + @:PREFER_GPU(Res) + @:PREFER_GPU(Re_idx) end if +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) @@ -103,6 +113,51 @@ contains idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) end if +#else + ! create map + nv_uvm_temp_on_gpu(1:3) = 0 + nv_uvm_temp_on_gpu(1:nv_uvm_igr_temps_on_gpu) = 1 + + if (nv_uvm_temp_on_gpu(1) == 1) then + @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(jac) + else + allocate (jac_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + + jac(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => jac_host(:, :, :) + end if + + if (nv_uvm_temp_on_gpu(2) == 1) then + @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p)) + @:PREFER_GPU(jac_rhs) + else + allocate (jac_rhs_host(-1:m, -1:n, -1:p)) + jac_rhs(-1:m, -1:n, -1:p) => jac_rhs_host(:, :, :) + end if + + if (igr_iter_solver == 1) then ! Jacobi iteration + if (nv_uvm_temp_on_gpu(3) == 1) then + @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(jac_old) + else + allocate (jac_old_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + + jac_old(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host(:, :, :) + end if + end if +#endif $:GPU_PARALLEL_LOOP(collapse=3) do l = idwbuff(3)%beg, idwbuff(3)%end @@ -2612,11 +2667,36 @@ contains @:DEALLOCATE(Res) end if +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM @:DEALLOCATE(jac, jac_rhs) if (igr_iter_solver == 1) then ! Jacobi iteration @:DEALLOCATE(jac_old) end if +#else + if (nv_uvm_temp_on_gpu(1) == 1) then + @:DEALLOCATE(jac) + else + nullify (jac) + deallocate (jac_host) + end if + + if (nv_uvm_temp_on_gpu(2) == 1) then + @:DEALLOCATE(jac_rhs) + else + nullify (jac_rhs) + deallocate (jac_rhs_host) + end if + + if (igr_iter_solver == 1) then ! Jacobi iteration + if (nv_uvm_temp_on_gpu(3) == 1) then + @:DEALLOCATE(jac_old) + else + nullify (jac_old) + deallocate (jac_old_host) + end if + end if +#endif #:if not MFC_CASE_OPTIMIZATION @:DEALLOCATE(coeff_L, coeff_R) diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index 06ea632b7b..755f762166 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -237,6 +237,11 @@ contains #:endfor end do + ! 
NVIDIA UVM variables + call MPI_BCAST(nv_uvm_out_of_core, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(nv_uvm_pref_gpu, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + #endif end subroutine s_mpi_bcast_user_inputs diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index 95489bedf6..313ef48f2b 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -187,9 +187,9 @@ contains surface_tension, bubbles_lagrange, lag_params, & hyperelasticity, R0ref, num_bc_patches, Bx0, powell, & cont_damage, tau_star, cont_damage_s, alpha_bar, & - alf_factor, num_igr_iters, down_sample, & - num_igr_warm_start_iters, & - int_comp, ic_eps, ic_beta + alf_factor, num_igr_iters, num_igr_warm_start_iters, & + int_comp, ic_eps, ic_beta, nv_uvm_out_of_core, & + nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample ! Checking that an input file has been provided by the user. If it ! has, then the input file is read in, otherwise, simulation exits. diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index 4aaf0878fc..e7d4ba6017 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -77,7 +77,9 @@ module m_time_steppers $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts,rhs_mv,rhs_pb,max_dt]') -#if defined(FRONTIER_UNIFIED) +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) + real(wp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host +#elif defined(FRONTIER_UNIFIED) real(wp), pointer, contiguous, dimension(:, :, :, :) :: q_cons_ts_pool_host, q_cons_ts_pool_device integer(kind=8) :: pool_dims(4), pool_starts(4) #endif @@ -105,12 +107,47 @@ contains ! Allocating the cell-average conservative variables @:ALLOCATE(q_cons_ts(1:num_ts)) + @:PREFER_GPU(q_cons_ts) do i = 1, num_ts @:ALLOCATE(q_cons_ts(i)%vf(1:sys_size)) + @:PREFER_GPU(q_cons_ts(i)%vf) end do -#ifdef FRONTIER_UNIFIED +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) + if (num_ts == 2 .and. nv_uvm_out_of_core) then + ! host allocation for q_cons_ts(2)%vf(j)%sf for all j + allocate (q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end, & + 1:sys_size)) + end if + + do j = 1, sys_size + ! q_cons_ts(1) lives on the device + @:ALLOCATE(q_cons_ts(1)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(q_cons_ts(1)%vf(j)%sf) + if (num_ts == 2) then + if (nv_uvm_out_of_core) then + ! q_cons_ts(2) lives on the host + q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:, :, :, j) + else + @:ALLOCATE(q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(q_cons_ts(2)%vf(j)%sf) + end if + end if + end do + + do i = 1, num_ts + @:ACC_SETUP_VFs(q_cons_ts(i)) + end do +#elif defined(FRONTIER_UNIFIED) ! Allocate to memory regions using hip calls ! that we will attach pointers to do i = 1, 3 @@ -357,11 +394,13 @@ contains ! 
Allocating the cell-average RHS variables @:ALLOCATE(rhs_vf(1:sys_size)) + @:PREFER_GPU(rhs_vf) if (igr) then do i = 1, sys_size @:ALLOCATE(rhs_vf(i)%sf(-1:m+1,-1:n+1,-1:p+1)) @:ACC_SETUP_SFs(rhs_vf(i)) + @:PREFER_GPU(rhs_vf(i)%sf) end do else do i = 1, sys_size @@ -536,8 +575,8 @@ contains real(wp), intent(inout) :: time_avg integer :: i, j, k, l, q!< Generic loop iterator - integer :: dest real(wp) :: start, finish + integer :: dest ! Stage 1 of 2 @@ -567,7 +606,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) -#ifdef FRONTIER_UNIFIED +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -659,7 +698,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2) -#ifdef FRONTIER_UNIFIED +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -668,7 +707,7 @@ contains q_cons_ts(1)%vf(i)%sf(j, k, l) = & (q_cons_ts(2)%vf(i)%sf(j, k, l) & + q_cons_ts(1)%vf(i)%sf(j, k, l) & - + dt*rhs_vf(i)%sf(j, k, l))/4._wp + + dt*rhs_vf(i)%sf(j, k, l))/2._wp end do end do end do @@ -771,9 +810,8 @@ contains real(wp), intent(INOUT) :: time_avg integer :: i, j, k, l, q !< Generic loop iterator - integer :: dest - real(wp) :: start, finish + integer :: dest ! Stage 1 of 3 @@ -804,7 +842,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) -#ifdef FRONTIER_UNIFIED +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -896,7 +934,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2) -#if defined(FRONTIER_UNIFIED) +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -989,7 +1027,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3) -#ifdef FRONTIER_UNIFIED +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -1094,6 +1132,7 @@ contains end if end if + end subroutine s_3rd_order_tvd_rk !> Strang splitting scheme with 3rd order TVD RK time-stepping algorithm for @@ -1338,30 +1377,45 @@ contains use hipfort_hipmalloc use hipfort_check #endif - integer :: i, j !< Generic loop iterators ! Deallocating the cell-average conservative variables +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) + do j = 1, sys_size + @:DEALLOCATE(q_cons_ts(1)%vf(j)%sf) + if (num_ts == 2) then + if (nv_uvm_out_of_core) then + nullify (q_cons_ts(2)%vf(j)%sf) + else + @:DEALLOCATE(q_cons_ts(2)%vf(j)%sf) + end if + end if + end do + if (num_ts == 2 .and. nv_uvm_out_of_core) then + deallocate (q_cons_ts_pool_host) + end if +#elif defined(FRONTIER_UNIFIED) do i = 1, num_ts -#ifdef FRONTIER_UNIFIED do j = 1, sys_size nullify (q_cons_ts(i)%vf(j)%sf) end do + end do + + call hipCheck(hipHostFree(q_cons_ts_pool_host)) + call hipCheck(hipFree(q_cons_ts_pool_device)) #else + do i = 1, num_ts do j = 1, sys_size @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf) end do + end do #endif + do i = 1, num_ts @:DEALLOCATE(q_cons_ts(i)%vf) end do @:DEALLOCATE(q_cons_ts) -#ifdef FRONTIER_UNIFIED - call hipCheck(hipHostFree(q_cons_ts_pool_host)) - call hipCheck(hipFree(q_cons_ts_pool_device)) -#endif - ! 
Deallocating the cell-average primitive ts variables if (probe_wrt) then do i = 0, 3 diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp index 56beaea979..a9846124ba 100644 --- a/src/simulation/m_weno.fpp +++ b/src/simulation/m_weno.fpp @@ -98,7 +98,9 @@ module m_weno !> @name Indical bounds in the s1-, s2- and s3-directions !> @{ type(int_bounds_info) :: is1_weno, is2_weno, is3_weno +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM $:GPU_DECLARE(create='[is1_weno,is2_weno,is3_weno]') +#endif ! !> @} diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 7c7648ae18..846763b233 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -64,6 +64,9 @@ def get_install_dirpath(self, case: Case ) -> str: # The install directory is located /build/install/ return os.sep.join([os.getcwd(), "build", "install", self.get_slug(case)]) + def get_home_dirpath(self) -> str: + return os.sep.join([os.getcwd()]) + def get_install_binpath(self, case: Case ) -> str: # /install//bin/ return os.sep.join([self.get_install_dirpath(case), "bin", self.name]) @@ -144,6 +147,7 @@ def configure(self, case: Case): flags.append(f"-DMFC_OpenACC={'ON' if ARG('gpu') else 'OFF'}") flags.append(f"-DMFC_GCov={ 'ON' if ARG('gcov') else 'OFF'}") flags.append(f"-DMFC_Unified={'ON' if ARG('unified') else 'OFF'}") + flags.append(f"-DMFC_Fastmath={'ON' if ARG('fastmath') else 'OFF'}") command = ["cmake"] + flags + ["-S", cmake_dirpath, "-B", build_dirpath] diff --git a/toolchain/mfc/lock.py b/toolchain/mfc/lock.py index aa91cc9675..eb20bd73fa 100644 --- a/toolchain/mfc/lock.py +++ b/toolchain/mfc/lock.py @@ -5,7 +5,7 @@ from .printer import cons -MFC_LOCK_CURRENT_VERSION: int = 5 +MFC_LOCK_CURRENT_VERSION: int = 6 @dataclasses.dataclass diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py index e509d7c6b4..8378d3044d 100644 --- a/toolchain/mfc/run/case_dicts.py +++ b/toolchain/mfc/run/case_dicts.py @@ -313,6 +313,9 @@ def analytic(self): 'int_comp': ParamType.LOG, 'ic_eps': ParamType.REAL, 'ic_beta': ParamType.REAL, + 'nv_uvm_out_of_core': ParamType.LOG, + 'nv_uvm_igr_temps_on_gpu': ParamType.INT, + 'nv_uvm_pref_gpu': ParamType.LOG, }) for var in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector', diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py index fa7d438e77..ba545c5680 100644 --- a/toolchain/mfc/state.py +++ b/toolchain/mfc/state.py @@ -3,12 +3,13 @@ @dataclasses.dataclass class MFCConfig: - mpi: bool = True - gpu: bool = False - debug: bool = False - gcov: bool = False - unified: bool = False - single: bool = False + mpi: bool = True + gpu: bool = False + debug: bool = False + gcov: bool = False + unified: bool = False + single: bool = False + fastmath : bool = False @staticmethod def from_dict(d: dict): diff --git a/toolchain/mfc/test/cases.py b/toolchain/mfc/test/cases.py index 387540fb5b..7329c6e14b 100644 --- a/toolchain/mfc/test/cases.py +++ b/toolchain/mfc/test/cases.py @@ -693,17 +693,17 @@ def alter_mixlayer_perturb(dimInfo): 'patch_icpp(1)%vel(1)': 1.0, 'patch_icpp(1)%vel(2)': 0.0, 'patch_icpp(1)%vel(3)': 0.0, 'patch_icpp(1)%pres': 17.8571428571, 'patch_icpp(1)%alpha_rho(1)': 1.0, 'patch_icpp(1)%alpha(1)': 1.0, 'patch_icpp(1)%r0': -1e6, 'patch_icpp(1)%v0': -1e6, - 'patch_icpp(2)%geometry': -100, + 'patch_icpp(2)%geometry': -100, 'patch_icpp(2)%x_centroid': -1e6, 'patch_icpp(2)%length_x': -1e6, - 'patch_icpp(2)%y_centroid': -1e6, 'patch_icpp(2)%length_y': -1e6, - 'patch_icpp(2)%z_centroid': -1e6, 'patch_icpp(2)%length_z': -1e6, - 
'patch_icpp(2)%vel(1)': -1e6, 'patch_icpp(2)%vel(2)': -1e6, 'patch_icpp(2)%vel(3)': -1e6, + 'patch_icpp(2)%y_centroid': -1e6, 'patch_icpp(2)%length_y': -1e6, + 'patch_icpp(2)%z_centroid': -1e6, 'patch_icpp(2)%length_z': -1e6, + 'patch_icpp(2)%vel(1)': -1e6, 'patch_icpp(2)%vel(2)': -1e6, 'patch_icpp(2)%vel(3)': -1e6, 'patch_icpp(2)%r0': -1e6, 'patch_icpp(2)%v0': -1e6, - 'patch_icpp(3)%geometry': -100, + 'patch_icpp(3)%geometry': -100, 'patch_icpp(3)%x_centroid': -1e6, 'patch_icpp(3)%length_x': -1e6, - 'patch_icpp(3)%y_centroid': -1e6, 'patch_icpp(3)%length_y': -1e6, - 'patch_icpp(3)%z_centroid': -1e6, 'patch_icpp(3)%length_z': -1e6, - 'patch_icpp(3)%vel(1)': -1e6, 'patch_icpp(3)%vel(2)': -1e6, 'patch_icpp(3)%vel(3)': -1e6, + 'patch_icpp(3)%y_centroid': -1e6, 'patch_icpp(3)%length_y': -1e6, + 'patch_icpp(3)%z_centroid': -1e6, 'patch_icpp(3)%length_z': -1e6, + 'patch_icpp(3)%vel(1)': -1e6, 'patch_icpp(3)%vel(2)': -1e6, 'patch_icpp(3)%vel(3)': -1e6, 'patch_icpp(3)%r0': -1e6, 'patch_icpp(3)%v0': -1e6 })) @@ -993,11 +993,12 @@ def foreach_example(): "2D_lagrange_bubblescreen", "3D_lagrange_bubblescreen", "2D_triple_point", "1D_shuosher_analytical", - "1D_titarevtorro_analytical", + "1D_titarevtorro_analytical", "2D_acoustic_pulse_analytical", "2D_isentropicvortex_analytical", "2D_zero_circ_vortex_analytical", "3D_TaylorGreenVortex_analytical", + "3D_IGR_TaylorGreenVortex_nvidia", "2D_backward_facing_step", "2D_forward_facing_step"] if path in casesToSkip: diff --git a/toolchain/modules b/toolchain/modules index a124a275f1..27783e9407 100644 --- a/toolchain/modules +++ b/toolchain/modules @@ -85,3 +85,6 @@ n-cpu penguin/openmpi/4.1.5/gcc-8.5.0 n-gpu penguin/openmpi/4.1.5/nvhpc-22.3 nvidia/nvhpc/22.3 cuda/cuda-11.6 n-gpu CC=nvc CXX=nvc++ FC=nvfortran +san CSCS Santis +san-all cmake python +san-gpu nvhpc cuda cray-mpich diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako new file mode 100644 index 0000000000..cb4b330625 --- /dev/null +++ b/toolchain/templates/santis.mako @@ -0,0 +1,93 @@ +#!/usr/bin/env bash + +<%namespace name="helpers" file="helpers.mako"/> + +% if engine == 'batch': +#SBATCH --uenv=icon/25.2:v1@santis +#SBATCH --nodes=${nodes} +#SBATCH --ntasks-per-node=${tasks_per_node} +#SBATCH --cpus-per-task=72 +#SBATCH --job-name="${name}" +#SBATCH --output="${name}.out" +#SBATCH --error="${name}.err" +#SBATCH --time=${walltime} +% if account: +#SBATCH --account=${account} +% endif +% if partition: +#SBATCH --partition=${partition} +% endif +% if quality_of_service: +#SBATCH --qos=${quality_of_service} +% endif +% if email: +#SBATCH --mail-user=${email} +#SBATCH --mail-type="BEGIN, END, FAIL" +% endif +% endif + +# We compiled the code using -gpu=unified:managedalloc, hence we use cudaMallocManaged for the dynamic allocations. +# Using NV_ACC_USE_MALLOC we could change to malloc at runtime. We choose to not do that here and stick with cudaMallocManaged and 2MB page sizes. +# https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#memory-model +# https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#command-line-options-selecting-compiler-memory-modes +export NV_ACC_USE_MALLOC=0 + +# For NVIDIA CUDA devices, controls the use of automatic memory hints at data constructs in the managed and unified memory modes. +# Below is a breakdown of the permitted values (case insensitive): +# - DEFAULT: Use the default settings. 
On NVIDIA Grace Hopper systems, the default is currently ENABLE_ALL; on all other systems, the default is DISABLE. +# - DISABLE: Memory hints are disabled for all data constructs. +# - ENABLE_EXPLICIT: Memory hints are enabled for explicit data constructs only. +# - ENABLE_ALL: Memory hints are enabled for explicit and implicit data constructs. +# https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#environment-variables-controlling-device-memory-management +# Here we disable the implicit compiler hints. +# Using NVCOMPILER_ACC_NO_MEMHINTS is the legacy way and is still supported, but users should prefer NVCOMPILER_ACC_MEMHINTS when using newer nvhpc compilers. +export NVCOMPILER_ACC_NO_MEMHINTS=1 # disable implicit compiler hints - legacy way +export NVCOMPILER_ACC_MEMHINTS=DISABLE # disable implicit compiler hints - new way + +# Cray MPICH +export MPICH_GPU_SUPPORT_ENABLED=1 +export FI_CXI_RX_MATCH_MODE=software +export FI_MR_CACHE_MONITOR=disabled +export MPICH_NO_BUFFER_ALIAS_CHECK=1 + +# NSYS +export NSYS=0 # enable nsys profiling +export NSYS_FILE=myreport.qdrep + +${helpers.template_prologue()} + +ok ":) Loading modules:\n" +cd "${MFC_ROOT_DIR}" +% if engine == 'batch': +. ./mfc.sh load -c san -m ${'g' if gpu else 'c'} +% endif +cd - > /dev/null +echo + +% for target in targets: + ${helpers.run_prologue(target)} + + % if not mpi: + (set -x; ${profiler} "${target.get_install_binpath(case)}") + % else: + (set -x; srun --unbuffered \ + --ntasks=${nodes*tasks_per_node} \ + --cpus-per-task 72 \ + --cpu-bind=none \ + % if gpu: + --gpus-per-task 1 \ + % endif + --wait 200 --bcast=/tmp/${target.name} \ + "${target.get_home_dirpath()}/misc/nvidia_uvm/bind.sh" \ + % if target.name == 'simulation': + "${target.get_home_dirpath()}/misc/nvidia_uvm/nsys.sh" \ + % endif + "${target.get_install_binpath(case)}") + % endif + + ${helpers.run_epilogue(target)} + + echo +% endfor + +${helpers.template_epilogue()}
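
For reference, a minimal usage sketch of the new toggles, assuming the toolchain exposes the MFC_Fastmath CMake option as a `--fastmath` flag (as the warning text above suggests); exact flag spellings may differ.

# Build the GPU targets with the new fast-math toggle (only NVHPC honors it;
# GNU and CCE builds emit the CMake warning added above).
./mfc.sh build -t pre_process simulation --gpu --fastmath

# Run the new out-of-core IGR Taylor-Green example. The case file already sets
# nv_uvm_out_of_core, nv_uvm_igr_temps_on_gpu, and nv_uvm_pref_gpu; when submitted
# through the Santis template, the binary is wrapped by misc/nvidia_uvm/bind.sh
# (and by misc/nvidia_uvm/nsys.sh when NSYS=1 is set in the template).
./mfc.sh run examples/3D_IGR_TaylorGreenVortex_nvidia/case.py --gpu --fastmath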