From 2358d298f81336a0a26ea068b698305e8157d2fa Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Fri, 1 Aug 2025 18:04:49 +0200 Subject: [PATCH 01/25] Add scripts for santis/alps, example case, and captures for UVM comms via RDMA --- CMakeLists.txt | 6 +- .../3D_IGR_TaylorGreenVortex_nvidia/case.py | 101 ++++++++++++++++++ misc/nvidia_uvm/bind.sh | 24 +++++ misc/nvidia_uvm/nsys.sh | 24 +++++ src/common/m_mpi_common.fpp | 8 ++ toolchain/mfc/build.py | 3 + toolchain/modules | 3 + toolchain/templates/santis.mako | 86 +++++++++++++++ 8 files changed, 252 insertions(+), 3 deletions(-) create mode 100644 examples/3D_IGR_TaylorGreenVortex_nvidia/case.py create mode 100755 misc/nvidia_uvm/bind.sh create mode 100755 misc/nvidia_uvm/nsys.sh create mode 100644 toolchain/templates/santis.mako diff --git a/CMakeLists.txt b/CMakeLists.txt index 8269c1cb48..c0acb3dbe3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -486,17 +486,17 @@ function(MFC_SETUP_TARGET) endforeach() target_compile_options(${a_target} - PRIVATE -gpu=keep,ptxinfo,lineinfo + PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath ) # GH-200 Unified Memory Support if (MFC_Unified) target_compile_options(${ARGS_TARGET} - PRIVATE -gpu=unified + PRIVATE -gpu=mem:unified -cuda ) # "This option must appear in both the compile and link lines" -- NVHPC Docs target_link_options(${ARGS_TARGET} - PRIVATE -gpu=unified + PRIVATE -gpu=mem:unified -cuda ) endif() diff --git a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py new file mode 100644 index 0000000000..74faa7aa22 --- /dev/null +++ b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +import math +import json + +N = 799 +Nx = N +Ny = 2*(N+1)-1 +Nz = 2*(N+1)-1 + +Re = 1600 +L = 1 +P0 = 101325 +rho0 = 1 +C0 = math.sqrt(1.4 * P0) +V0 = 0.1 * C0 +mu = V0 * L / Re + +cfl = 0.5 +dx = 2 * math.pi * L / (Ny + 1) + +dt = cfl * dx / (C0) + +tC = L / V0 +tEnd = 20 * tC + +Nt = int(tEnd / dt) +Nt = 10 + + +# Configuring case dictionary +print( + json.dumps( + { + "rdma_mpi": "T", + # Logistics + "run_time_info": "F", + # Computational Domain Parameters + "x_domain%beg": -math.pi * L, + "x_domain%end": math.pi * L, + "y_domain%beg": -math.pi * L, + "y_domain%end": math.pi * L, + "z_domain%beg": -math.pi * L, + "z_domain%end": math.pi * L, + "m": Nx, + "n": Ny, + "p": Nz, + "cyl_coord": "F", + "dt": dt, + "t_step_start": 0, + "t_step_stop": Nt, + "t_step_save": int(Nt / 100), + # Simulation Algorithm Parameters + "num_patches": 1, + "model_eqns": 2, + "num_fluids": 1, + "time_stepper": 3, + "bc_x%beg": -1, + "bc_x%end": -1, + "bc_y%beg": -1, + "bc_y%end": -1, + "bc_z%beg": -1, + "bc_z%end": -1, + "igr": "T", + "igr_order": 5, + "igr_iter_solver": 1, + "num_igr_iters": 3, + "num_igr_warm_start_iters": 3, + "alf_factor": 10, + "viscous": "T", + # Formatted Database Files Structure Parameters + "format": 1, + "precision": 2, + "prim_vars_wrt": "T", + "omega_wrt(1)": "T", + "omega_wrt(2)": "T", + "omega_wrt(3)": "T", + "qm_wrt": "T", + "fd_order": 4, + "parallel_io": "T", + # Patch 1: Background (AIR - 2) + "patch_icpp(1)%geometry": 9, + "patch_icpp(1)%x_centroid": 0, + "patch_icpp(1)%y_centroid": 0, + "patch_icpp(1)%z_centroid": 0, + "patch_icpp(1)%length_x": 2 * math.pi * L, + "patch_icpp(1)%length_y": 2 * math.pi * L, + "patch_icpp(1)%length_z": 2 * math.pi * L, + "patch_icpp(1)%vel(1)": 0.0, + "patch_icpp(1)%vel(2)": 0.0, + "patch_icpp(1)%vel(3)": 0, + "patch_icpp(1)%pres": 0.0, + "patch_icpp(1)%hcid": 380, + "patch_icpp(1)%alpha_rho(1)": 1, + "patch_icpp(1)%alpha(1)": 1, + # Fluids Physical Parameters + "fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1), + "fluid_pp(1)%pi_inf": 0, + "fluid_pp(1)%Re(1)": 1 / mu, + } + ) +) diff --git a/misc/nvidia_uvm/bind.sh b/misc/nvidia_uvm/bind.sh new file mode 100755 index 0000000000..0b7bf91e96 --- /dev/null +++ b/misc/nvidia_uvm/bind.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# -------------------------------- # +# Binding for a single Santis node # +# -------------------------------- # + +# Local rank +export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}" + +# Bind to GPU +export CUDA_VISIBLE_DEVICES="$local_rank" + +# Binding to NIC +export MPICH_OFI_NIC_POLICY=USER +export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3" + +# Bind to cores ( first core per socket ) +physcores=(0 72 144 216) + +#echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY + +#set -x +numactl -l --all --physcpubind=${physcores[$local_rank]} "$@" +#set +x diff --git a/misc/nvidia_uvm/nsys.sh b/misc/nvidia_uvm/nsys.sh new file mode 100755 index 0000000000..172bcb2f69 --- /dev/null +++ b/misc/nvidia_uvm/nsys.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +#set -x +set -euo pipefail + +rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}" + +[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep +[[ -z "${NSYS+x}" ]] && NSYS=0 + +if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then + exec nsys profile \ + --cpuctxsw=none -b none -s none \ + --event-sample=system-wide \ + --cpu-socket-events=61,71,265,273 \ + --cpu-socket-metrics=103,104 \ + --event-sampling-interval=10 \ + --trace=nvtx,openacc \ + --force-overwrite=true \ + -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \ + -o "$NSYS_FILE" "$@" +else + exec "$@" +fi diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index 100c055d8d..2bdd241344 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -38,7 +38,9 @@ module m_mpi_common !! average primitive variables, for a single computational domain boundary !! at the time, from the relevant neighboring processor. +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM $:GPU_DECLARE(create='[buff_send, buff_recv]') +#endif integer :: halo_size $:GPU_DECLARE(create='[halo_size]') @@ -78,7 +80,13 @@ contains $:GPU_UPDATE(device='[halo_size, v_size]') +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size)) +#else + ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size)) + !$acc enter data create(capture:buff_send) + !$acc enter data create(capture:buff_recv) +#endif #endif end subroutine s_initialize_mpi_common_module diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 2de738986d..750c9b294c 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -64,6 +64,9 @@ def get_install_dirpath(self, case: Case ) -> str: # The install directory is located /build/install/ return os.sep.join([os.getcwd(), "build", "install", self.get_slug(case)]) + def get_home_dirpath(self, case: Case) -> str: + return os.sep.join([os.getcwd()]) + def get_install_binpath(self, case: Case ) -> str: # /install//bin/ return os.sep.join([self.get_install_dirpath(case), "bin", self.name]) diff --git a/toolchain/modules b/toolchain/modules index 1e7ebe97f3..19e4e4d8df 100644 --- a/toolchain/modules +++ b/toolchain/modules @@ -85,3 +85,6 @@ n-cpu penguin/openmpi/4.1.5/gcc-8.5.0 n-gpu penguin/openmpi/4.1.5/nvhpc-22.3 nvidia/nvhpc/22.3 cuda/cuda-11.6 n-gpu CC=nvc CXX=nvc++ FC=nvfortran +san CSCS Santis +san-all cmake python +san-gpu nvhpc cuda cray-mpich diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako new file mode 100644 index 0000000000..926c682039 --- /dev/null +++ b/toolchain/templates/santis.mako @@ -0,0 +1,86 @@ +#!/usr/bin/env bash + +<%namespace name="helpers" file="helpers.mako"/> + +% if engine == 'batch': +#SBATCH --uenv=icon/25.2:v1 +#SBATCH --nodes=${nodes} +#SBATCH --reservation=g183 +#SBATCH --ntasks-per-node=${tasks_per_node} +#SBATCH --job-name="${name}" +#SBATCH --output="${name}.out" +#SBATCH --error="${name}.err" +#SBATCH --time=${walltime} +% if account: +#SBATCH --account=${account} +% endif +% if partition: +#SBATCH --partition=${partition} +% endif +% if quality_of_service: +#SBATCH --qos=${quality_of_service} +% endif +% if email: +#SBATCH --mail-user=${email} +#SBATCH --mail-type="BEGIN, END, FAIL" +% endif +% endif + +# NVHPC and CUDA env vars +export NV_ACC_USE_MALLOC=0 # use cudaMallocManaged instead of malloc ( compiled using -gpu=mem:unified ) +export NVCOMPILER_ACC_NO_MEMHINTS=1 # disable implicit compiler hints +#export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH + +# Cray MPICH +export MPICH_GPU_SUPPORT_ENABLED=1 +export FI_CXI_RX_MATCH_MODE=software +export FI_MR_CACHE_MONITOR=disabled +export MPICH_NO_BUFFER_ALIAS_CHECK=1 + +# CUSTOM env vars to MFC +export NVIDIA_ALLOC_MODE=0 # do nothing +export NVIDIA_MANUAL_GPU_HINTS=1 # prefloc GPU on some +export NVIDIA_IGR_TEMPS_ON_GPU=3 # jac, jac_rhs, and jac_old on GPU +export NVIDIA_VARS_ON_GPU=7 # q_cons_ts(1)%vf%sf for j=1-7 on GPU + +# NSYS +export NSYS=1 # enable nsys profiling +export NSYS_FILE=myreport.qdrep + +${helpers.template_prologue()} + +ok ":) Loading modules:\n" +cd "${MFC_ROOT_DIR}" +% if engine == 'batch': +. ./mfc.sh load -c san -m ${'g' if gpu else 'c'} +% endif +cd - > /dev/null +echo + +% for target in targets: + ${helpers.run_prologue(target)} + + % if not mpi: + (set -x; ${profiler} "${target.get_install_binpath(case)}") + % else: + (set -x; srun --unbuffered \ + --ntasks=${nodes*tasks_per_node} \ + --cpus-per-task 1 \ + --cpu-bind=none \ + % if gpu: + --gpus-per-task 1 \ + % endif + --wait 200 --bcast=/tmp/${target.name} \ + "${target.get_home_dirpath(case)}/misc/nvidia_uvm/bind.sh" \ + % if target.name == 'simulation': + "${target.get_home_dirpath(case)}/misc/nvidia_uvm/nsys.sh" \ + % endif + "${target.get_install_binpath(case)}") + % endif + + ${helpers.run_epilogue(target)} + + echo +% endfor + +${helpers.template_epilogue()} From 37d393b02f5a5d01a27a1ffeca10d96231600d22 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Fri, 1 Aug 2025 20:34:32 +0200 Subject: [PATCH 02/25] Add PREFER_GPU and rearrange update for out-of-core computation --- src/common/include/macros.fpp | 41 ++++++++++++++++ src/simulation/m_global_parameters.fpp | 9 ++++ src/simulation/m_igr.fpp | 6 +++ src/simulation/m_time_steppers.fpp | 65 +++++++++++++++++++++++++- 4 files changed, 119 insertions(+), 2 deletions(-) diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp index c1652388c3..b0d87d31b0 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -12,6 +12,47 @@ #endif #:enddef +#:def PREFER_GPU(*args) +#ifdef MFC_SIMULATION +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + block + use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval + integer :: istat + integer :: prefer_gpu_mode + character(len=10) :: prefer_gpu_mode_str + + ! environment variable + call get_environment_variable("NVIDIA_MANUAL_GPU_HINTS", prefer_gpu_mode_str) + if (trim(prefer_gpu_mode_str) == "0") then ! OFF + prefer_gpu_mode = 0 + elseif (trim(prefer_gpu_mode_str) == "1") then ! ON + prefer_gpu_mode = 1 + else ! default + prefer_gpu_mode = 0 + endif + + if (prefer_gpu_mode .eq. 1) then + #:for arg in args + !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$) + ! unset + istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseUnSetPreferredLocation, cudaCpuDeviceId ) + if (istat /= cudaSuccess) then + write(*,"('Error code: ',I0, ': ')") istat + write(*,*) cudaGetErrorString(istat) + endif + ! set + istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 ) + if (istat /= cudaSuccess) then + write(*,"('Error code: ',I0, ': ')") istat + write(*,*) cudaGetErrorString(istat) + endif + #:endfor + end if + end block +#endif +#endif +#:enddef + #:def ALLOCATE(*args) @:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'}) #:set allocated_variables = ', '.join(args) diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 5be11129a2..2c2d0af646 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -1308,16 +1308,25 @@ contains @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size)) @:ALLOCATE(x_cc(-buff_size:m + buff_size)) @:ALLOCATE(dx(-buff_size:m + buff_size)) + @:PREFER_GPU(x_cb) + @:PREFER_GPU(x_cc) + @:PREFER_GPU(dx) if (n == 0) return; @:ALLOCATE(y_cb(-1 - buff_size:n + buff_size)) @:ALLOCATE(y_cc(-buff_size:n + buff_size)) @:ALLOCATE(dy(-buff_size:n + buff_size)) + @:PREFER_GPU(y_cb) + @:PREFER_GPU(y_cc) + @:PREFER_GPU(dy) if (p == 0) return; @:ALLOCATE(z_cb(-1 - buff_size:p + buff_size)) @:ALLOCATE(z_cc(-buff_size:p + buff_size)) @:ALLOCATE(dz(-buff_size:p + buff_size)) + @:PREFER_GPU(z_cb) + @:PREFER_GPU(z_cc) + @:PREFER_GPU(dz) end subroutine s_initialize_global_parameters_module diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index db80bb8346..76069928f2 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -91,17 +91,23 @@ contains end do end do $:GPU_UPDATE(device='[Res, Re_idx, Re_size]') + @:PREFER_GPU(Res) + @:PREFER_GPU(Re_idx) end if @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(jac) + @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p)) + @:PREFER_GPU(jac_rhs) if (igr_iter_solver == 1) then ! Jacobi iteration @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(jac_old) end if $:GPU_PARALLEL_LOOP(collapse=3) diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index d040650bfa..c87bcad464 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -95,9 +95,11 @@ contains ! Allocating the cell-average conservative variables @:ALLOCATE(q_cons_ts(1:num_ts)) + @:PREFER_GPU(q_cons_ts) do i = 1, num_ts @:ALLOCATE(q_cons_ts(i)%vf(1:sys_size)) + @:PREFER_GPU(q_cons_ts(i)%vf) end do do i = 1, num_ts @@ -105,6 +107,7 @@ contains @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(q_cons_ts(i)%vf(j)%sf) end do @:ACC_SETUP_VFs(q_cons_ts(i)) end do @@ -304,11 +307,13 @@ contains ! Allocating the cell-average RHS variables @:ALLOCATE(rhs_vf(1:sys_size)) + @:PREFER_GPU(rhs_vf) if (igr) then do i = 1, sys_size @:ALLOCATE(rhs_vf(i)%sf(-1:m+1,-1:n+1,-1:p+1)) @:ACC_SETUP_SFs(rhs_vf(i)) + @:PREFER_GPU(rhs_vf(i)%sf) end do else do i = 1, sys_size @@ -650,6 +655,7 @@ contains real(wp), intent(INOUT) :: time_avg integer :: i, j, k, l, q !< Generic loop iterator + integer :: dest real(wp) :: start, finish @@ -682,6 +688,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) +#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -694,6 +701,24 @@ contains end do end do end do + dest = 2 ! result in q_cons_ts(2)%vf +#else + $:GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(2)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l) + end do + end do + end do + end do + dest = 1 ! result in q_cons_ts(1)%vf +#endif !Evolve pb and mv for non-polytropic qbmm if (qbmm .and. (.not. polytropic)) then @@ -750,10 +775,11 @@ contains ! Stage 2 of 3 - call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2) + call s_compute_rhs(q_cons_ts(dest)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2) if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2) +#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -767,6 +793,23 @@ contains end do end do end do + dest = 2 ! result in q_cons_ts(2)%vf +#else + $:GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + (3._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) & + + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l))/4._wp + end do + end do + end do + end do + dest = 1 ! result in q_cons_ts(1)%vf +#endif if (qbmm .and. (.not. polytropic)) then $:GPU_PARALLEL_LOOP(collapse=5) @@ -823,10 +866,11 @@ contains end if ! Stage 3 of 3 - call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 3) + call s_compute_rhs(q_cons_ts(dest)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 3) if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3) +#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -840,6 +884,23 @@ contains end do end do end do + dest = 1 ! result in q_cons_ts(1)%vf +#else + $:GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + (q_cons_ts(2)%vf(i)%sf(j, k, l) & + + 2._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) & + + 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp + end do + end do + end do + end do + dest = 1 ! result in q_cons_ts(1)%vf +#endif if (qbmm .and. (.not. polytropic)) then $:GPU_PARALLEL_LOOP(collapse=5) From 693c7f46e562d5039fe30d23ccfcca3753721ff6 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Fri, 1 Aug 2025 20:35:21 +0200 Subject: [PATCH 03/25] Allow keeping q_cons_ts(2) on CPU using pinned allocations --- src/simulation/m_time_steppers.fpp | 68 +++++++++++++++++++++++++++--- toolchain/templates/santis.mako | 3 +- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index c87bcad464..0d9ddfd885 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -75,8 +75,14 @@ module m_time_steppers integer, private :: num_ts !< !! Number of time stages in the time-stepping scheme + integer, private :: out_of_core + $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts,rhs_mv,rhs_pb,max_dt]') +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + real(wp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host +#endif + contains !> The computation of parameters, the allocation of memory, @@ -86,6 +92,21 @@ contains integer :: i, j !< Generic loop iterators + character(len=10) :: out_of_core_str + out_of_core = 0 + +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + call get_environment_variable("MFC_OUT_OF_CORE", out_of_core_str) + + if (trim(out_of_core_str) == "0") then + out_of_core = 0 + elseif (trim(out_of_core_str) == "1") then + out_of_core = 1 + else ! default + out_of_core = 0 + endif +#endif + ! Setting number of time-stages for selected time-stepping scheme if (time_stepper == 1) then num_ts = 1 @@ -102,12 +123,33 @@ contains @:PREFER_GPU(q_cons_ts(i)%vf) end do +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + if ( out_of_core == 1 ) then + allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end, & + 1:sys_size)) + end if +#endif + do i = 1, num_ts do j = 1, sys_size - @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & - idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end)) - @:PREFER_GPU(q_cons_ts(i)%vf(j)%sf) +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + if ( i <= (num_ts - out_of_core) ) then + !print*, "q_cons_ts", i, j, "on GPU" +#endif + @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(q_cons_ts(i)%vf(j)%sf) +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + else + !print*, "q_cons_ts", i, j, "on CPU" + q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:,:,:,j) + end if +#endif end do @:ACC_SETUP_VFs(q_cons_ts(i)) end do @@ -1205,7 +1247,17 @@ contains ! Deallocating the cell-average conservative variables do i = 1, num_ts do j = 1, sys_size - @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf) +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + if ( i <= (num_ts - out_of_core) ) then + !print*, "q_cons_ts", i, j, "dealloc" +#endif + @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf) +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + else + !print*, "q_cons_ts", i, j, "nullify" + nullify(q_cons_ts(i)%vf(j)%sf) + end if +#endif end do @:DEALLOCATE(q_cons_ts(i)%vf) @@ -1213,6 +1265,12 @@ contains @:DEALLOCATE(q_cons_ts) +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + if ( out_of_core == 1 ) then + deallocate(q_cons_ts_pool_host) + end if +#endif + ! Deallocating the cell-average primitive ts variables if (probe_wrt) then do i = 0, 3 diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako index 926c682039..27b4d6b425 100644 --- a/toolchain/templates/santis.mako +++ b/toolchain/templates/santis.mako @@ -38,10 +38,9 @@ export FI_MR_CACHE_MONITOR=disabled export MPICH_NO_BUFFER_ALIAS_CHECK=1 # CUSTOM env vars to MFC -export NVIDIA_ALLOC_MODE=0 # do nothing +export MFC_OUT_OF_CORE=1 # out of core export NVIDIA_MANUAL_GPU_HINTS=1 # prefloc GPU on some export NVIDIA_IGR_TEMPS_ON_GPU=3 # jac, jac_rhs, and jac_old on GPU -export NVIDIA_VARS_ON_GPU=7 # q_cons_ts(1)%vf%sf for j=1-7 on GPU # NSYS export NSYS=1 # enable nsys profiling From 7054b7b666226cbe91d5a078353ae0389eefad43 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Fri, 1 Aug 2025 21:06:39 +0200 Subject: [PATCH 04/25] Modify PREFER_GPU macro --- src/common/include/macros.fpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp index b0d87d31b0..0ffad2e06b 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -34,14 +34,20 @@ if (prefer_gpu_mode .eq. 1) then #:for arg in args !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$) - ! unset - istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseUnSetPreferredLocation, cudaCpuDeviceId ) + ! set preferred location GPU + istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 ) if (istat /= cudaSuccess) then write(*,"('Error code: ',I0, ': ')") istat write(*,*) cudaGetErrorString(istat) endif - ! set - istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 ) + ! set accessed by CPU + istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId ) + if (istat /= cudaSuccess) then + write(*,"('Error code: ',I0, ': ')") istat + write(*,*) cudaGetErrorString(istat) + endif + ! prefetch to GPU - physically populate memory pages + istat = cudaMemPrefetchAsync( c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0 ) if (istat /= cudaSuccess) then write(*,"('Error code: ',I0, ': ')") istat write(*,*) cudaGetErrorString(istat) From ee1277d8dae3a02e9e66377f9ce2b28c977bf70f Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Sat, 2 Aug 2025 00:53:53 +0200 Subject: [PATCH 05/25] Allow control in placement of IGR temps --- src/simulation/m_igr.fpp | 124 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index 76069928f2..94fb6cd857 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -24,8 +24,16 @@ module m_igr s_igr_flux_add, & s_finalize_igr_module +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + integer, dimension(3) :: temp_on_gpu + real(wp), pointer, contiguous, dimension(:, :, :) :: jac,jac_rhs,jac_old + real(wp), allocatable, dimension(:, :, :), pinned, target :: pool_host1 + real(wp), allocatable, dimension(:, :, :), pinned, target :: pool_host2 + real(wp), allocatable, dimension(:, :, :), pinned, target :: pool_host3 +#else real(wp), allocatable, dimension(:, :, :) :: jac, jac_rhs, jac_old $:GPU_DECLARE(create='[jac, jac_rhs, jac_old]') +#endif real(wp), allocatable, dimension(:, :) :: Res $:GPU_DECLARE(create='[Res]') @@ -82,6 +90,47 @@ module m_igr contains subroutine s_initialize_igr_module() +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + integer :: igr_temps_total + integer :: igr_temps_on_gpu + integer :: igr_temps_on_cpu + character(len=10) :: igr_temps_on_gpu_str + + ! initialize + if (igr_iter_solver == 1) then ! Jacobi iteration + igr_temps_total = 3 + else + igr_temps_total = 2 + end if + igr_temps_on_gpu = igr_temps_total + igr_temps_on_cpu = 0 + + call get_environment_variable("NVIDIA_IGR_TEMPS_ON_GPU", igr_temps_on_gpu_str) + + if (trim(igr_temps_on_gpu_str) == "0") then + igr_temps_on_gpu = 0 ! jac, jac_rhs and jac_old on CPU + else if (trim(igr_temps_on_gpu_str) == "1") then + igr_temps_on_gpu = 1 ! jac on GPU, jac_rhs on CPU, jac_old on CPU + else if (trim(igr_temps_on_gpu_str) == "2") then + igr_temps_on_gpu = 2 ! jac and jac_rhs on GPU, jac_old on CPU + else if (trim(igr_temps_on_gpu_str) == "3") then + igr_temps_on_gpu = 3 ! jac, jac_rhs and jac_old on GPU + else ! default on GPU + igr_temps_on_gpu = 3 + end if + + ! trim if needed + if ( igr_temps_on_gpu > igr_temps_total ) then + igr_temps_on_gpu = igr_temps_total + end if + igr_temps_on_cpu = igr_temps_total - igr_temps_on_gpu + + ! create map + temp_on_gpu(1:3) = -1 + temp_on_gpu(1:igr_temps_total) = 0 + temp_on_gpu(1:igr_temps_on_gpu) = 1 + print*, temp_on_gpu(1:3) +#endif if (viscous) then @:ALLOCATE(Res(1:2, 1:maxval(Re_size))) @@ -95,6 +144,7 @@ contains @:PREFER_GPU(Re_idx) end if +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) @@ -109,6 +159,55 @@ contains idwbuff(3)%beg:idwbuff(3)%end)) @:PREFER_GPU(jac_old) end if +#else + + if ( temp_on_gpu(1) == 1 ) then + @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(jac) + else + print*, 'jac on CPU' + + allocate(pool_host1(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + + jac(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => pool_host1(:,:,:) + end if + + if ( temp_on_gpu(2) == 1 ) then + @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p)) + @:PREFER_GPU(jac_rhs) + else + print*, 'jac_rhs on CPU' + + allocate(pool_host2(-1:m,-1:n,-1:p)) + + jac_rhs(-1:m,-1:n,-1:p) => pool_host2(:,:,:) + end if + + if (igr_iter_solver == 1) then ! Jacobi iteration + if ( temp_on_gpu(3) == 1 ) then + @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(jac_old) + else + print*, 'jac_old on CPU' + + allocate(pool_host3(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + + jac_old(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => pool_host3(:,:,:) + end if + end if +#endif $:GPU_PARALLEL_LOOP(collapse=3) do l = idwbuff(3)%beg, idwbuff(3)%end @@ -2618,11 +2717,36 @@ contains @:DEALLOCATE(Res) end if +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM @:DEALLOCATE(jac, jac_rhs) if (igr_iter_solver == 1) then ! Jacobi iteration @:DEALLOCATE(jac_old) end if +#else + if (temp_on_gpu(1) == 1) then + @:DEALLOCATE(jac) + else + nullify(jac) + deallocate(pool_host1) + end if + + if (temp_on_gpu(2) == 1) then + @:DEALLOCATE(jac_rhs) + else + nullify(jac_rhs) + deallocate(pool_host2) + end if + + if (igr_iter_solver == 1) then ! Jacobi iteration + if (temp_on_gpu(3) == 1) then + @:DEALLOCATE(jac_old) + else + nullify(jac_old) + deallocate(pool_host3) + end if + end if +#endif #:if not MFC_CASE_OPTIMIZATION @:DEALLOCATE(coeff_L, coeff_R) From 4065c024734978f95498f1be2504781c86da434b Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Sat, 2 Aug 2025 09:26:23 +0200 Subject: [PATCH 06/25] Do some clean up --- src/simulation/m_igr.fpp | 37 +++++------------------------- src/simulation/m_time_steppers.fpp | 5 ++-- 2 files changed, 8 insertions(+), 34 deletions(-) diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index 94fb6cd857..9be4b514c3 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -91,20 +91,9 @@ contains subroutine s_initialize_igr_module() #ifdef __NVCOMPILER_GPU_UNIFIED_MEM - integer :: igr_temps_total - integer :: igr_temps_on_gpu - integer :: igr_temps_on_cpu + integer :: igr_temps_on_gpu = 3 character(len=10) :: igr_temps_on_gpu_str - ! initialize - if (igr_iter_solver == 1) then ! Jacobi iteration - igr_temps_total = 3 - else - igr_temps_total = 2 - end if - igr_temps_on_gpu = igr_temps_total - igr_temps_on_cpu = 0 - call get_environment_variable("NVIDIA_IGR_TEMPS_ON_GPU", igr_temps_on_gpu_str) if (trim(igr_temps_on_gpu_str) == "0") then @@ -119,17 +108,10 @@ contains igr_temps_on_gpu = 3 end if - ! trim if needed - if ( igr_temps_on_gpu > igr_temps_total ) then - igr_temps_on_gpu = igr_temps_total - end if - igr_temps_on_cpu = igr_temps_total - igr_temps_on_gpu - ! create map - temp_on_gpu(1:3) = -1 - temp_on_gpu(1:igr_temps_total) = 0 + temp_on_gpu(1:3) = 0 temp_on_gpu(1:igr_temps_on_gpu) = 1 - print*, temp_on_gpu(1:3) + !print*, temp_on_gpu(1:3) #endif if (viscous) then @@ -148,16 +130,12 @@ contains @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) - @:PREFER_GPU(jac) - @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p)) - @:PREFER_GPU(jac_rhs) if (igr_iter_solver == 1) then ! Jacobi iteration @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) - @:PREFER_GPU(jac_old) end if #else @@ -167,8 +145,7 @@ contains idwbuff(3)%beg:idwbuff(3)%end)) @:PREFER_GPU(jac) else - print*, 'jac on CPU' - + !print*, 'jac on CPU' allocate(pool_host1(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) @@ -182,8 +159,7 @@ contains @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p)) @:PREFER_GPU(jac_rhs) else - print*, 'jac_rhs on CPU' - + !print*, 'jac_rhs on CPU' allocate(pool_host2(-1:m,-1:n,-1:p)) jac_rhs(-1:m,-1:n,-1:p) => pool_host2(:,:,:) @@ -196,8 +172,7 @@ contains idwbuff(3)%beg:idwbuff(3)%end)) @:PREFER_GPU(jac_old) else - print*, 'jac_old on CPU' - + !print*, 'jac_old on CPU' allocate(pool_host3(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index 0d9ddfd885..8f86202bbe 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -75,12 +75,11 @@ module m_time_steppers integer, private :: num_ts !< !! Number of time stages in the time-stepping scheme - integer, private :: out_of_core - $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts,rhs_mv,rhs_pb,max_dt]') #ifdef __NVCOMPILER_GPU_UNIFIED_MEM real(wp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host + integer, private :: out_of_core #endif contains @@ -92,10 +91,10 @@ contains integer :: i, j !< Generic loop iterators +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM character(len=10) :: out_of_core_str out_of_core = 0 -#ifdef __NVCOMPILER_GPU_UNIFIED_MEM call get_environment_variable("MFC_OUT_OF_CORE", out_of_core_str) if (trim(out_of_core_str) == "0") then From cfb792c991710dd45f64d415215b34ba23b01cec Mon Sep 17 00:00:00 2001 From: Benjamin Wilfong Date: Sun, 3 Aug 2025 06:10:07 +0200 Subject: [PATCH 07/25] ENV Vars to case file options and code structure changes --- CMakeLists.txt | 2 +- .../3D_IGR_TaylorGreenVortex_nvidia/case.py | 7 +- src/common/include/macros.fpp | 16 +- src/simulation/m_checker.fpp | 10 + src/simulation/m_global_parameters.fpp | 13 + src/simulation/m_igr.fpp | 73 ++--- src/simulation/m_mpi_proxy.fpp | 4 + src/simulation/m_start_up.fpp | 7 +- src/simulation/m_time_steppers.fpp | 251 ++++++++++-------- toolchain/mfc/run/case_dicts.py | 2 + toolchain/mfc/test/cases.py | 21 +- toolchain/templates/santis.mako | 7 +- 12 files changed, 215 insertions(+), 198 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c0acb3dbe3..8901ec0b16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -234,7 +234,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release") message(STATUS "LTO/IPO is not supported in NVHPC Version < 23.11. Use a newer version of NVHPC for best performance.") else() message(STATUS "Performing IPO using -Mextract followed by -Minline") - set(NVHPC_USE_TWO_PASS_IPO TRUE) + set(NVHPC_USE_TWO_PASS_IPO FALSE) endif() else() CHECK_IPO_SUPPORTED(RESULT SUPPORTS_IPO OUTPUT IPO_ERROR) diff --git a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py index 74faa7aa22..17ad1ceb43 100644 --- a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py +++ b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py @@ -47,8 +47,8 @@ "cyl_coord": "F", "dt": dt, "t_step_start": 0, - "t_step_stop": Nt, - "t_step_save": int(Nt / 100), + "t_step_stop": 10, #Nt, + "t_step_save": 10, #int(Nt / 100), # Simulation Algorithm Parameters "num_patches": 1, "model_eqns": 2, @@ -96,6 +96,9 @@ "fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1), "fluid_pp(1)%pi_inf": 0, "fluid_pp(1)%Re(1)": 1 / mu, + # NVIDIA UVM Options + "nv_uvm_igr_temps_on_gpu": 3, + "nv_uvm_pref_gpu": "T", } ) ) diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp index 0ffad2e06b..7177efa32d 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -18,20 +18,8 @@ block use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval integer :: istat - integer :: prefer_gpu_mode - character(len=10) :: prefer_gpu_mode_str - - ! environment variable - call get_environment_variable("NVIDIA_MANUAL_GPU_HINTS", prefer_gpu_mode_str) - if (trim(prefer_gpu_mode_str) == "0") then ! OFF - prefer_gpu_mode = 0 - elseif (trim(prefer_gpu_mode_str) == "1") then ! ON - prefer_gpu_mode = 1 - else ! default - prefer_gpu_mode = 0 - endif - - if (prefer_gpu_mode .eq. 1) then + + if (nv_uvm_pref_gpu) then #:for arg in args !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$) ! set preferred location GPU diff --git a/src/simulation/m_checker.fpp b/src/simulation/m_checker.fpp index f0196af0e2..8917b0be46 100644 --- a/src/simulation/m_checker.fpp +++ b/src/simulation/m_checker.fpp @@ -30,6 +30,7 @@ contains if (igr) then call s_check_inputs_igr + call s_check_inputs_nvidia_uvm else if (recon_type == WENO_TYPE) then call s_check_inputs_weno @@ -411,4 +412,13 @@ contains @:PROHIBIT(powell .and. fd_order == dflt_int, "fd_order must be set if Powell's method is enabled") end subroutine s_check_inputs_mhd + impure subroutine s_check_inputs_nvidia_uvm +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + @:PROHIBIT(nv_uvm_igr_temps_on_gpu > 3 .or. nv_uvm_igr_temps_on_gpu < 0, & + "nv_uvm_igr_temps_on_gpu must be in the range [0, 3]") + @:PROHIBIT(nv_uvm_igr_temps_on_gpu == 3 .and. igr_iter_solver == 2, & + "nv_uvm_igr_temps_on_gpu must be in the range [0, 2] for igr_iter_solver == 2") +#endif + end subroutine s_check_inputs_nvidia_uvm + end module m_checker diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 2c2d0af646..401fa5412d 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -156,6 +156,15 @@ module m_global_parameters logical :: viscous !< Viscous effects #:endif + !> @name Variables for our of core IGR computation on NVIDIA + !> @{ + integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU + ! 1 => jac on GPU, jac_rhs and jac_old on CPU + ! 2 => jac and jac_rhs on GPU, jac_old on CPU + ! 4 => jac, jac_rhs, and jac_old on GPU (default) + logical :: nv_uvm_pref_gpu ! Enable pinned gpu memory (default TRUE) + !> @} + real(wp) :: weno_eps !< Binding for the WENO nonlinear weights real(wp) :: teno_CT !< Smoothness threshold for TENO logical :: mp_weno !< Monotonicity preserving (MP) WENO @@ -570,6 +579,10 @@ contains t_stop = dflt_real t_save = dflt_real + ! NVIDIA UVM options + nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default) + nv_uvm_pref_gpu = .true. + ! Simulation algorithm parameters model_eqns = dflt_int mpp_lim = .false. diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index 9be4b514c3..0d1edad478 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -25,11 +25,11 @@ module m_igr s_finalize_igr_module #ifdef __NVCOMPILER_GPU_UNIFIED_MEM - integer, dimension(3) :: temp_on_gpu + integer, dimension(3) :: nv_uvm_temp_on_gpu real(wp), pointer, contiguous, dimension(:, :, :) :: jac,jac_rhs,jac_old - real(wp), allocatable, dimension(:, :, :), pinned, target :: pool_host1 - real(wp), allocatable, dimension(:, :, :), pinned, target :: pool_host2 - real(wp), allocatable, dimension(:, :, :), pinned, target :: pool_host3 + real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_host_pool + real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_rhs_host_pool + real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_old_host_pool #else real(wp), allocatable, dimension(:, :, :) :: jac, jac_rhs, jac_old $:GPU_DECLARE(create='[jac, jac_rhs, jac_old]') @@ -81,7 +81,6 @@ module m_igr 5._wp/6._wp, & ! Index 0 2._wp/6._wp & ! Index 1 ] - #:endif #:endif @@ -90,29 +89,6 @@ module m_igr contains subroutine s_initialize_igr_module() -#ifdef __NVCOMPILER_GPU_UNIFIED_MEM - integer :: igr_temps_on_gpu = 3 - character(len=10) :: igr_temps_on_gpu_str - - call get_environment_variable("NVIDIA_IGR_TEMPS_ON_GPU", igr_temps_on_gpu_str) - - if (trim(igr_temps_on_gpu_str) == "0") then - igr_temps_on_gpu = 0 ! jac, jac_rhs and jac_old on CPU - else if (trim(igr_temps_on_gpu_str) == "1") then - igr_temps_on_gpu = 1 ! jac on GPU, jac_rhs on CPU, jac_old on CPU - else if (trim(igr_temps_on_gpu_str) == "2") then - igr_temps_on_gpu = 2 ! jac and jac_rhs on GPU, jac_old on CPU - else if (trim(igr_temps_on_gpu_str) == "3") then - igr_temps_on_gpu = 3 ! jac, jac_rhs and jac_old on GPU - else ! default on GPU - igr_temps_on_gpu = 3 - end if - - ! create map - temp_on_gpu(1:3) = 0 - temp_on_gpu(1:igr_temps_on_gpu) = 1 - !print*, temp_on_gpu(1:3) -#endif if (viscous) then @:ALLOCATE(Res(1:2, 1:maxval(Re_size))) @@ -138,48 +114,47 @@ contains idwbuff(3)%beg:idwbuff(3)%end)) end if #else + ! create map + nv_uvm_temp_on_gpu(1:3) = 0 + nv_uvm_temp_on_gpu(1:nv_uvm_igr_temps_on_gpu) = 1 - if ( temp_on_gpu(1) == 1 ) then + if (nv_uvm_temp_on_gpu(1) == 1) then @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) @:PREFER_GPU(jac) else - !print*, 'jac on CPU' - allocate(pool_host1(idwbuff(1)%beg:idwbuff(1)%end, & + allocate(jac_host_pool(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) jac(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end) => pool_host1(:,:,:) + idwbuff(3)%beg:idwbuff(3)%end) => jac_host_pool(:,:,:) end if - if ( temp_on_gpu(2) == 1 ) then + if (nv_uvm_temp_on_gpu(2) == 1) then @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p)) @:PREFER_GPU(jac_rhs) else - !print*, 'jac_rhs on CPU' - allocate(pool_host2(-1:m,-1:n,-1:p)) - - jac_rhs(-1:m,-1:n,-1:p) => pool_host2(:,:,:) + allocate(jac_rhs_host_pool(-1:m,-1:n,-1:p)) + jac_rhs(-1:m,-1:n,-1:p) => jac_rhs_host_pool(:,:,:) end if if (igr_iter_solver == 1) then ! Jacobi iteration - if ( temp_on_gpu(3) == 1 ) then + if (nv_uvm_temp_on_gpu(3) == 1) then @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) @:PREFER_GPU(jac_old) else - !print*, 'jac_old on CPU' - allocate(pool_host3(idwbuff(1)%beg:idwbuff(1)%end, & + allocate(jac_old_host_pool(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) jac_old(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end) => pool_host3(:,:,:) + idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host_pool(:,:,:) end if end if #endif @@ -203,7 +178,7 @@ contains #:if not MFC_CASE_OPTIMIZATION if (igr_order == 3) then - vidxb = -1; vidxe = 2; + vidxb = -1; vidxe = 2; $:GPU_UPDATE(device='[vidxb, vidxe]') @:ALLOCATE(coeff_L(0:2)) @@ -219,7 +194,7 @@ contains $:GPU_UPDATE(device='[coeff_R]') elseif (igr_order == 5) then - vidxb = -2; vidxe = 3; + vidxb = -2; vidxe = 3; $:GPU_UPDATE(device='[vidxb, vidxe]') @:ALLOCATE(coeff_L(-1:3)) @@ -2699,26 +2674,26 @@ contains @:DEALLOCATE(jac_old) end if #else - if (temp_on_gpu(1) == 1) then + if (nv_uvm_temp_on_gpu(1) == 1) then @:DEALLOCATE(jac) else nullify(jac) - deallocate(pool_host1) + deallocate(jac_host_pool) end if - if (temp_on_gpu(2) == 1) then + if (nv_uvm_temp_on_gpu(2) == 1) then @:DEALLOCATE(jac_rhs) else nullify(jac_rhs) - deallocate(pool_host2) + deallocate(jac_rhs_host_pool) end if if (igr_iter_solver == 1) then ! Jacobi iteration - if (temp_on_gpu(3) == 1) then + if (nv_uvm_temp_on_gpu(3) == 1) then @:DEALLOCATE(jac_old) else nullify(jac_old) - deallocate(pool_host3) + deallocate(jac_old_host_pool) end if end if #endif diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index f5cc89b4c5..3564c1e2e3 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -237,6 +237,10 @@ contains #:endfor end do + ! NVIDIA UVM variables + call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(nv_uvm_pref_gpu, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + #endif end subroutine s_mpi_bcast_user_inputs diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index e004252060..2bda3c8413 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -185,9 +185,10 @@ contains surface_tension, bubbles_lagrange, lag_params, & hyperelasticity, R0ref, num_bc_patches, Bx0, powell, & cont_damage, tau_star, cont_damage_s, alpha_bar, & - alf_factor, num_igr_iters, & - num_igr_warm_start_iters, & - int_comp, ic_eps, ic_beta + alf_factor, num_igr_iters, num_igr_warm_start_iters, & + int_comp, ic_eps, ic_beta, nv_uvm_igr_temps_on_gpu, & + nv_uvm_pref_gpu + ! Checking that an input file has been provided by the user. If it ! has, then the input file is read in, otherwise, simulation exits. inquire (FILE=trim(file_path), EXIST=file_exist) diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index 8f86202bbe..540f08f547 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -79,7 +79,6 @@ module m_time_steppers #ifdef __NVCOMPILER_GPU_UNIFIED_MEM real(wp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host - integer, private :: out_of_core #endif contains @@ -91,21 +90,6 @@ contains integer :: i, j !< Generic loop iterators -#ifdef __NVCOMPILER_GPU_UNIFIED_MEM - character(len=10) :: out_of_core_str - out_of_core = 0 - - call get_environment_variable("MFC_OUT_OF_CORE", out_of_core_str) - - if (trim(out_of_core_str) == "0") then - out_of_core = 0 - elseif (trim(out_of_core_str) == "1") then - out_of_core = 1 - else ! default - out_of_core = 0 - endif -#endif - ! Setting number of time-stages for selected time-stepping scheme if (time_stepper == 1) then num_ts = 1 @@ -123,35 +107,38 @@ contains end do #ifdef __NVCOMPILER_GPU_UNIFIED_MEM - if ( out_of_core == 1 ) then - allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, & - idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end, & - 1:sys_size)) - end if -#endif + allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end, & + 1:sys_size)) + + do j = 1, sys_size + ! q_cons_ts(1) lives on the device + @:ALLOCATE(q_cons_ts(1)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(q_cons_ts(1)%vf(j)%sf) + if (num_ts == 2) then + ! q_cons_ts(2) lives on the host + q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:,:,:,j) + end if + end do + do i = 1, num_ts + @:ACC_SETUP_VFs(q_cons_ts(i)) + end do +#else do i = 1, num_ts do j = 1, sys_size -#ifdef __NVCOMPILER_GPU_UNIFIED_MEM - if ( i <= (num_ts - out_of_core) ) then - !print*, "q_cons_ts", i, j, "on GPU" -#endif - @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & - idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end)) - @:PREFER_GPU(q_cons_ts(i)%vf(j)%sf) -#ifdef __NVCOMPILER_GPU_UNIFIED_MEM - else - !print*, "q_cons_ts", i, j, "on CPU" - q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & - idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:,:,:,j) - end if -#endif + @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) end do @:ACC_SETUP_VFs(q_cons_ts(i)) end do +#endif ! Allocating the cell-average primitive ts variables if (probe_wrt) then @@ -513,6 +500,7 @@ contains integer :: i, j, k, l, q!< Generic loop iterator real(wp) :: start, finish + integer :: dest ! Stage 1 of 2 @@ -542,12 +530,15 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p do k = 0, n do j = 0, m q_cons_ts(2)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) + q_cons_ts(1)%vf(i)%sf(j, k, l) = & q_cons_ts(1)%vf(i)%sf(j, k, l) & + dt*rhs_vf(i)%sf(j, k, l) end do @@ -555,6 +546,24 @@ contains end do end do + dest = 1 ! Result in q_cons_ts(1)%vf +#else + $:GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(2)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l) + end do + end do + end do + end do + + dest = 2 ! Result in q_cons_ts(2)%vf +#endif + !Evolve pb and mv for non-polytropic qbmm if (qbmm .and. (.not. polytropic)) then $:GPU_PARALLEL_LOOP(collapse=5) @@ -590,30 +599,46 @@ contains end do end if - if (bodyForces) call s_apply_bodyforces(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, dt) + if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, dt) - if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(2)%vf) + if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(dest)%vf) if (model_eqns == 3 .and. (.not. relax)) then - call s_pressure_relaxation_procedure(q_cons_ts(2)%vf) + call s_pressure_relaxation_procedure(q_cons_ts(dest)%vf) end if - if (adv_n) call s_comp_alpha_from_n(q_cons_ts(2)%vf) + if (adv_n) call s_comp_alpha_from_n(q_cons_ts(dest)%vf) if (ib) then if (qbmm .and. .not. polytropic) then - call s_ibm_correct_state(q_cons_ts(2)%vf, q_prim_vf, pb_ts(2)%sf, mv_ts(2)%sf) + call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf, pb_ts(2)%sf, mv_ts(2)%sf) else - call s_ibm_correct_state(q_cons_ts(2)%vf, q_prim_vf) + call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf) end if end if ! Stage 2 of 2 - call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2) + call s_compute_rhs(q_cons_ts(dest)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2) if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2) +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) + $:GPU_PARALLEL_LOOP(collapse=4) + do i = 1, sys_size + do l = 0, p + do k = 0, n + do j = 0, m + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + (q_cons_ts(2)%vf(i)%sf(j, k, l) & + + q_cons_ts(1)%vf(i)%sf(j, k, l) & + + dt*rhs_vf(i)%sf(j, k, l))/4._wp + end do + end do + end do + end do + dest = 1 ! Result in q_cons_ts(1)%vf +#else $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -628,6 +653,9 @@ contains end do end do + dest = 1 ! Result in q_cons_ts(1)%vf +#endif + if (qbmm .and. (.not. polytropic)) then $:GPU_PARALLEL_LOOP(collapse=5) do i = 1, nb @@ -664,21 +692,21 @@ contains end do end if - if (bodyForces) call s_apply_bodyforces(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, 2._wp*dt/3._wp) + if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, 2._wp*dt/3._wp) - if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(1)%vf) + if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(dest)%vf) if (model_eqns == 3 .and. (.not. relax)) then - call s_pressure_relaxation_procedure(q_cons_ts(1)%vf) + call s_pressure_relaxation_procedure(q_cons_ts(dest)%vf) end if - if (adv_n) call s_comp_alpha_from_n(q_cons_ts(1)%vf) + if (adv_n) call s_comp_alpha_from_n(q_cons_ts(dest)%vf) if (ib) then if (qbmm .and. .not. polytropic) then - call s_ibm_correct_state(q_cons_ts(1)%vf, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf) + call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf) else - call s_ibm_correct_state(q_cons_ts(1)%vf, q_prim_vf) + call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf) end if end if @@ -729,20 +757,23 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) -#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p do k = 0, n do j = 0, m q_cons_ts(2)%vf(i)%sf(j, k, l) = & + q_cons_ts(1)%vf(i)%sf(j, k, l) + q_cons_ts(1)%vf(i)%sf(j, k, l) = & q_cons_ts(1)%vf(i)%sf(j, k, l) & + dt*rhs_vf(i)%sf(j, k, l) end do end do end do end do - dest = 2 ! result in q_cons_ts(2)%vf + + dest = 1 ! result in q_cons_ts(1)%vf #else $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size @@ -750,15 +781,14 @@ contains do k = 0, n do j = 0, m q_cons_ts(2)%vf(i)%sf(j, k, l) = & - q_cons_ts(1)%vf(i)%sf(j, k, l) - q_cons_ts(1)%vf(i)%sf(j, k, l) = & q_cons_ts(1)%vf(i)%sf(j, k, l) & + dt*rhs_vf(i)%sf(j, k, l) end do end do end do end do - dest = 1 ! result in q_cons_ts(1)%vf + + dest = 2 ! result in q_cons_ts(2)%vf #endif !Evolve pb and mv for non-polytropic qbmm @@ -796,21 +826,21 @@ contains end do end if - if (bodyForces) call s_apply_bodyforces(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, dt) + if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, dt) - if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(2)%vf) + if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(dest)%vf) if (model_eqns == 3 .and. (.not. relax)) then - call s_pressure_relaxation_procedure(q_cons_ts(2)%vf) + call s_pressure_relaxation_procedure(q_cons_ts(dest)%vf) end if - if (adv_n) call s_comp_alpha_from_n(q_cons_ts(2)%vf) + if (adv_n) call s_comp_alpha_from_n(q_cons_ts(dest)%vf) if (ib) then if (qbmm .and. .not. polytropic) then - call s_ibm_correct_state(q_cons_ts(2)%vf, q_prim_vf, pb_ts(2)%sf, mv_ts(2)%sf) + call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf, pb_ts(2)%sf, mv_ts(2)%sf) else - call s_ibm_correct_state(q_cons_ts(2)%vf, q_prim_vf) + call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf) end if end if @@ -820,36 +850,38 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2) -#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) - $:GPU_PARALLEL_LOOP(collapse=4) +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) + $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p do k = 0, n do j = 0, m - q_cons_ts(2)%vf(i)%sf(j, k, l) = & - (3._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) & - + q_cons_ts(2)%vf(i)%sf(j, k, l) & + q_cons_ts(1)%vf(i)%sf(j, k, l) = & + (3._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) & + + q_cons_ts(1)%vf(i)%sf(j, k, l) & + dt*rhs_vf(i)%sf(j, k, l))/4._wp end do end do end do end do - dest = 2 ! result in q_cons_ts(2)%vf + + dest = 1 ! Result in q_cons_ts(1)%vf #else $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p do k = 0, n do j = 0, m - q_cons_ts(1)%vf(i)%sf(j, k, l) = & - (3._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) & - + q_cons_ts(1)%vf(i)%sf(j, k, l) & + q_cons_ts(2)%vf(i)%sf(j, k, l) = & + (3._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) & + + q_cons_ts(2)%vf(i)%sf(j, k, l) & + dt*rhs_vf(i)%sf(j, k, l))/4._wp end do end do end do end do - dest = 1 ! result in q_cons_ts(1)%vf + + dest = 2 ! Result in q_cons_ts(2)%vf #endif if (qbmm .and. (.not. polytropic)) then @@ -888,21 +920,21 @@ contains end do end if - if (bodyForces) call s_apply_bodyforces(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, dt/4._wp) + if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, dt/4._wp) - if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(2)%vf) + if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(dest)%vf) if (model_eqns == 3 .and. (.not. relax)) then - call s_pressure_relaxation_procedure(q_cons_ts(2)%vf) + call s_pressure_relaxation_procedure(q_cons_ts(dest)%vf) end if - if (adv_n) call s_comp_alpha_from_n(q_cons_ts(2)%vf) + if (adv_n) call s_comp_alpha_from_n(q_cons_ts(dest)%vf) if (ib) then if (qbmm .and. .not. polytropic) then - call s_ibm_correct_state(q_cons_ts(2)%vf, q_prim_vf, pb_ts(2)%sf, mv_ts(2)%sf) + call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf, pb_ts(2)%sf, mv_ts(2)%sf) else - call s_ibm_correct_state(q_cons_ts(2)%vf, q_prim_vf) + call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf) end if end if @@ -911,21 +943,22 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3) -#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p do k = 0, n do j = 0, m q_cons_ts(1)%vf(i)%sf(j, k, l) = & - (q_cons_ts(1)%vf(i)%sf(j, k, l) & - + 2._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) & + (q_cons_ts(2)%vf(i)%sf(j, k, l) & + + 2._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) & + 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp end do end do end do end do - dest = 1 ! result in q_cons_ts(1)%vf + + dest = 1 ! Result in q_cons_ts(1)%vf #else $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size @@ -933,14 +966,15 @@ contains do k = 0, n do j = 0, m q_cons_ts(1)%vf(i)%sf(j, k, l) = & - (q_cons_ts(2)%vf(i)%sf(j, k, l) & - + 2._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) & + (q_cons_ts(1)%vf(i)%sf(j, k, l) & + + 2._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) & + 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp end do end do end do end do - dest = 1 ! result in q_cons_ts(1)%vf + + dest = 1 ! Result in q_cons_ts(2)%vf #endif if (qbmm .and. (.not. polytropic)) then @@ -979,25 +1013,25 @@ contains end do end if - if (bodyForces) call s_apply_bodyforces(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, 2._wp*dt/3._wp) + if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, 2._wp*dt/3._wp) - if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(1)%vf) + if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(dest)%vf) if (model_eqns == 3 .and. (.not. relax)) then - call s_pressure_relaxation_procedure(q_cons_ts(1)%vf) + call s_pressure_relaxation_procedure(q_cons_ts(dest)%vf) end if call nvtxStartRange("RHS-ELASTIC") - if (hyperelasticity) call s_hyperelastic_rmt_stress_update(q_cons_ts(1)%vf, q_prim_vf) + if (hyperelasticity) call s_hyperelastic_rmt_stress_update(q_cons_ts(dest)%vf, q_prim_vf) call nvtxEndRange - if (adv_n) call s_comp_alpha_from_n(q_cons_ts(1)%vf) + if (adv_n) call s_comp_alpha_from_n(q_cons_ts(dest)%vf) if (ib) then if (qbmm .and. .not. polytropic) then - call s_ibm_correct_state(q_cons_ts(1)%vf, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf) + call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf) else - call s_ibm_correct_state(q_cons_ts(1)%vf, q_prim_vf) + call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf) end if end if @@ -1007,6 +1041,7 @@ contains time = time + (finish - start) end if + end subroutine s_3rd_order_tvd_rk !> Strang splitting scheme with 3rd order TVD RK time-stepping algorithm for @@ -1244,30 +1279,20 @@ contains integer :: i, j !< Generic loop iterators ! Deallocating the cell-average conservative variables +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + do j = 1, sys_size + @:DEALLOCATE(q_cons_ts(1)%vf(j)%sf) + if (num_ts == 2) then + nullify(q_cons_ts(2)%vf(j)%sf) + end if + end do + deallocate(q_cons_ts_pool_host) +#else do i = 1, num_ts do j = 1, sys_size -#ifdef __NVCOMPILER_GPU_UNIFIED_MEM - if ( i <= (num_ts - out_of_core) ) then - !print*, "q_cons_ts", i, j, "dealloc" -#endif - @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf) -#ifdef __NVCOMPILER_GPU_UNIFIED_MEM - else - !print*, "q_cons_ts", i, j, "nullify" - nullify(q_cons_ts(i)%vf(j)%sf) - end if -#endif + @:ALLOCATE(q_cons_ts(i)%vf(j)%sf) end do - - @:DEALLOCATE(q_cons_ts(i)%vf) end do - - @:DEALLOCATE(q_cons_ts) - -#ifdef __NVCOMPILER_GPU_UNIFIED_MEM - if ( out_of_core == 1 ) then - deallocate(q_cons_ts_pool_host) - end if #endif ! Deallocating the cell-average primitive ts variables diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py index 6bcf0964cc..bd9dcec11a 100644 --- a/toolchain/mfc/run/case_dicts.py +++ b/toolchain/mfc/run/case_dicts.py @@ -312,6 +312,8 @@ def analytic(self): 'int_comp': ParamType.LOG, 'ic_eps': ParamType.REAL, 'ic_beta': ParamType.REAL, + 'nv_uvm_igr_temps_on_gpu': ParamType.INT, + 'nv_uvm_pref_gpu': ParamType.LOG, }) for var in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector', diff --git a/toolchain/mfc/test/cases.py b/toolchain/mfc/test/cases.py index fac3dc4eba..95927bb04b 100644 --- a/toolchain/mfc/test/cases.py +++ b/toolchain/mfc/test/cases.py @@ -693,17 +693,17 @@ def alter_mixlayer_perturb(dimInfo): 'patch_icpp(1)%vel(1)': 1.0, 'patch_icpp(1)%vel(2)': 0.0, 'patch_icpp(1)%vel(3)': 0.0, 'patch_icpp(1)%pres': 17.8571428571, 'patch_icpp(1)%alpha_rho(1)': 1.0, 'patch_icpp(1)%alpha(1)': 1.0, 'patch_icpp(1)%r0': -1e6, 'patch_icpp(1)%v0': -1e6, - 'patch_icpp(2)%geometry': -100, + 'patch_icpp(2)%geometry': -100, 'patch_icpp(2)%x_centroid': -1e6, 'patch_icpp(2)%length_x': -1e6, - 'patch_icpp(2)%y_centroid': -1e6, 'patch_icpp(2)%length_y': -1e6, - 'patch_icpp(2)%z_centroid': -1e6, 'patch_icpp(2)%length_z': -1e6, - 'patch_icpp(2)%vel(1)': -1e6, 'patch_icpp(2)%vel(2)': -1e6, 'patch_icpp(2)%vel(3)': -1e6, + 'patch_icpp(2)%y_centroid': -1e6, 'patch_icpp(2)%length_y': -1e6, + 'patch_icpp(2)%z_centroid': -1e6, 'patch_icpp(2)%length_z': -1e6, + 'patch_icpp(2)%vel(1)': -1e6, 'patch_icpp(2)%vel(2)': -1e6, 'patch_icpp(2)%vel(3)': -1e6, 'patch_icpp(2)%r0': -1e6, 'patch_icpp(2)%v0': -1e6, - 'patch_icpp(3)%geometry': -100, + 'patch_icpp(3)%geometry': -100, 'patch_icpp(3)%x_centroid': -1e6, 'patch_icpp(3)%length_x': -1e6, - 'patch_icpp(3)%y_centroid': -1e6, 'patch_icpp(3)%length_y': -1e6, - 'patch_icpp(3)%z_centroid': -1e6, 'patch_icpp(3)%length_z': -1e6, - 'patch_icpp(3)%vel(1)': -1e6, 'patch_icpp(3)%vel(2)': -1e6, 'patch_icpp(3)%vel(3)': -1e6, + 'patch_icpp(3)%y_centroid': -1e6, 'patch_icpp(3)%length_y': -1e6, + 'patch_icpp(3)%z_centroid': -1e6, 'patch_icpp(3)%length_z': -1e6, + 'patch_icpp(3)%vel(1)': -1e6, 'patch_icpp(3)%vel(2)': -1e6, 'patch_icpp(3)%vel(3)': -1e6, 'patch_icpp(3)%r0': -1e6, 'patch_icpp(3)%v0': -1e6 })) @@ -993,11 +993,12 @@ def foreach_example(): "2D_lagrange_bubblescreen", "3D_lagrange_bubblescreen", "2D_triple_point", "1D_shuosher_analytical", - "1D_titarevtorro_analytical", + "1D_titarevtorro_analytical", "2D_acoustic_pulse_analytical", "2D_isentropicvortex_analytical", "2D_zero_circ_vortex_analytical", - "3D_TaylorGreenVortex_analytical"] + "3D_TaylorGreenVortex_analytical", + "3D_IGR_TaylorGreenVortex_nvidia"] if path in casesToSkip: continue name = f"{path.split('_')[0]} -> Example -> {'_'.join(path.split('_')[1:])}" diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako index 27b4d6b425..e798b677df 100644 --- a/toolchain/templates/santis.mako +++ b/toolchain/templates/santis.mako @@ -37,13 +37,8 @@ export FI_CXI_RX_MATCH_MODE=software export FI_MR_CACHE_MONITOR=disabled export MPICH_NO_BUFFER_ALIAS_CHECK=1 -# CUSTOM env vars to MFC -export MFC_OUT_OF_CORE=1 # out of core -export NVIDIA_MANUAL_GPU_HINTS=1 # prefloc GPU on some -export NVIDIA_IGR_TEMPS_ON_GPU=3 # jac, jac_rhs, and jac_old on GPU - # NSYS -export NSYS=1 # enable nsys profiling +export NSYS=0 # enable nsys profiling export NSYS_FILE=myreport.qdrep ${helpers.template_prologue()} From cacc6b041560667d319ce0f8938f694469566747 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Sun, 3 Aug 2025 10:53:35 +0200 Subject: [PATCH 08/25] Fix some comments --- src/simulation/m_global_parameters.fpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 401fa5412d..a7539327f5 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -161,8 +161,8 @@ module m_global_parameters integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU ! 1 => jac on GPU, jac_rhs and jac_old on CPU ! 2 => jac and jac_rhs on GPU, jac_old on CPU - ! 4 => jac, jac_rhs, and jac_old on GPU (default) - logical :: nv_uvm_pref_gpu ! Enable pinned gpu memory (default TRUE) + ! 3 => jac, jac_rhs, and jac_old on GPU (default) + logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default TRUE) !> @} real(wp) :: weno_eps !< Binding for the WENO nonlinear weights From b3fdbff5f8df0ad7426db16195d10f9e66626533 Mon Sep 17 00:00:00 2001 From: Ben Wilfong <48168887+wilfonba@users.noreply.github.com> Date: Sun, 3 Aug 2025 14:15:05 -0400 Subject: [PATCH 09/25] test merge and add nv_uvm_out_of_core back --- src/simulation/m_global_parameters.fpp | 2 ++ src/simulation/m_mpi_proxy.fpp | 1 + src/simulation/m_time_steppers.fpp | 10 ++++++---- toolchain/mfc/run/case_dicts.py | 1 + 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 52ad4aec3e..24f23ed4a5 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -163,6 +163,7 @@ module m_global_parameters ! 1 => jac on GPU, jac_rhs and jac_old on CPU ! 2 => jac and jac_rhs on GPU, jac_old on CPU ! 3 => jac, jac_rhs, and jac_old on GPU (default) + logical :: nv_uvm_out_of_core ! Enable out-or-core storage of q_cons_ts(2) in timestepping logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default TRUE) !> @} @@ -584,6 +585,7 @@ contains ! NVIDIA UVM options nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default) + nv_uvm_out_of_core = .false. nv_uvm_pref_gpu = .true. ! Simulation algorithm parameters diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index d97fdb64c8..f2293b0ffd 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -239,6 +239,7 @@ contains ! NVIDIA UVM variables call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(nv_uvm_out_of_core, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(nv_uvm_pref_gpu, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #endif diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index 9d562c6689..48fddcd8cd 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -574,7 +574,6 @@ contains real(wp), intent(inout) :: time_avg integer :: i, j, k, l, q!< Generic loop iterator - integer :: dest real(wp) :: start, finish integer :: dest @@ -810,9 +809,8 @@ contains real(wp), intent(INOUT) :: time_avg integer :: i, j, k, l, q !< Generic loop iterator - integer :: dest - real(wp) :: start, finish + integer :: dest ! Stage 1 of 3 @@ -1385,7 +1383,11 @@ contains do j = 1, sys_size @:DEALLOCATE(q_cons_ts(1)%vf(j)%sf) if (num_ts == 2) then - nullify(q_cons_ts(2)%vf(j)%sf) + if (nv_uvm_out_of_core) then + nullify(q_cons_ts(2)%vf(j)%sf) + else + @:DEALLOCATE(q_cons_ts(2)%vf(j)%sf) + end if end if end do deallocate(q_cons_ts_pool_host) diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py index 704450a48b..54d7d00b99 100644 --- a/toolchain/mfc/run/case_dicts.py +++ b/toolchain/mfc/run/case_dicts.py @@ -314,6 +314,7 @@ def analytic(self): 'ic_eps': ParamType.REAL, 'ic_beta': ParamType.REAL, 'nv_uvm_igr_temps_on_gpu': ParamType.INT, + 'nv_uvm_igr_out_of_core': ParamType.LOG, 'nv_uvm_pref_gpu': ParamType.LOG, }) From 51d7e90db68084df48cd4154e8c9295e1da5a506 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Tue, 5 Aug 2025 20:09:34 +0200 Subject: [PATCH 10/25] Fix some allocs and deallocs in timesteppers --- src/simulation/m_time_steppers.fpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index 48fddcd8cd..b2c6e72bcd 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -115,7 +115,8 @@ contains end do #if defined(__NVCOMPILER_GPU_UNIFIED_MEM) - if (nv_uvm_out_of_core) then + if (num_ts == 2 .and. nv_uvm_out_of_core) then + ! host allocation for q_cons_ts(2)%vf(j)%sf for all j allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end, & @@ -1390,7 +1391,9 @@ contains end if end if end do - deallocate(q_cons_ts_pool_host) + if (num_ts == 2 .and. nv_uvm_out_of_core) then + deallocate(q_cons_ts_pool_host) + end if #elif defined(FRONTIER_UNIFIED) do i = 1, num_ts do j = 1, sys_size @@ -1407,6 +1410,9 @@ contains end do end do #endif + do i = 1, num_ts + @:DEALLOCATE(q_cons_ts(i)%vf) + end do @:DEALLOCATE(q_cons_ts) From c553b78cfcb114d32f3a9065ba6f07cbec2c25bd Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Tue, 5 Aug 2025 20:18:29 +0200 Subject: [PATCH 11/25] Fix nv_uvm_out_of_core inconsistency and add to case file --- examples/3D_IGR_TaylorGreenVortex_nvidia/case.py | 1 + src/simulation/m_global_parameters.fpp | 4 ++-- src/simulation/m_mpi_proxy.fpp | 2 +- src/simulation/m_start_up.fpp | 4 ++-- toolchain/mfc/run/case_dicts.py | 2 +- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py index 17ad1ceb43..ccc7413d03 100644 --- a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py +++ b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py @@ -97,6 +97,7 @@ "fluid_pp(1)%pi_inf": 0, "fluid_pp(1)%Re(1)": 1 / mu, # NVIDIA UVM Options + "nv_uvm_out_of_core": "T", "nv_uvm_igr_temps_on_gpu": 3, "nv_uvm_pref_gpu": "T", } diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 24f23ed4a5..3ab256c548 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -159,11 +159,11 @@ module m_global_parameters !> @name Variables for our of core IGR computation on NVIDIA !> @{ + logical :: nv_uvm_out_of_core ! Enable out-or-core storage of q_cons_ts(2) in timestepping integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU ! 1 => jac on GPU, jac_rhs and jac_old on CPU ! 2 => jac and jac_rhs on GPU, jac_old on CPU ! 3 => jac, jac_rhs, and jac_old on GPU (default) - logical :: nv_uvm_out_of_core ! Enable out-or-core storage of q_cons_ts(2) in timestepping logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default TRUE) !> @} @@ -584,8 +584,8 @@ contains t_save = dflt_real ! NVIDIA UVM options - nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default) nv_uvm_out_of_core = .false. + nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default) nv_uvm_pref_gpu = .true. ! Simulation algorithm parameters diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index f2293b0ffd..755f762166 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -238,8 +238,8 @@ contains end do ! NVIDIA UVM variables - call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(nv_uvm_out_of_core, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(nv_uvm_pref_gpu, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) #endif diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index 610920b8e2..313ef48f2b 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -188,8 +188,8 @@ contains hyperelasticity, R0ref, num_bc_patches, Bx0, powell, & cont_damage, tau_star, cont_damage_s, alpha_bar, & alf_factor, num_igr_iters, num_igr_warm_start_iters, & - int_comp, ic_eps, ic_beta, nv_uvm_igr_temps_on_gpu, & - nv_uvm_pref_gpu, down_sample + int_comp, ic_eps, ic_beta, nv_uvm_out_of_core, & + nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample ! Checking that an input file has been provided by the user. If it ! has, then the input file is read in, otherwise, simulation exits. diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py index 54d7d00b99..8378d3044d 100644 --- a/toolchain/mfc/run/case_dicts.py +++ b/toolchain/mfc/run/case_dicts.py @@ -313,8 +313,8 @@ def analytic(self): 'int_comp': ParamType.LOG, 'ic_eps': ParamType.REAL, 'ic_beta': ParamType.REAL, + 'nv_uvm_out_of_core': ParamType.LOG, 'nv_uvm_igr_temps_on_gpu': ParamType.INT, - 'nv_uvm_igr_out_of_core': ParamType.LOG, 'nv_uvm_pref_gpu': ParamType.LOG, }) From f3b3851006bf95b22a6fb24a1c3ae5d6910e96ed Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Tue, 5 Aug 2025 22:50:19 +0200 Subject: [PATCH 12/25] Fix bug in 2nd order TVD RK introduced by merge --- src/simulation/m_time_steppers.fpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index b2c6e72bcd..cd5b087997 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -707,7 +707,7 @@ contains q_cons_ts(1)%vf(i)%sf(j, k, l) = & (q_cons_ts(2)%vf(i)%sf(j, k, l) & + q_cons_ts(1)%vf(i)%sf(j, k, l) & - + dt*rhs_vf(i)%sf(j, k, l))/4._wp + + dt*rhs_vf(i)%sf(j, k, l))/2._wp end do end do end do From 71b59766dab3b5bd172bc0d2f4f8d2e121533916 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Tue, 5 Aug 2025 23:35:07 +0200 Subject: [PATCH 13/25] Fix some comments --- src/simulation/m_global_parameters.fpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 3ab256c548..546bd95f4d 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -159,7 +159,7 @@ module m_global_parameters !> @name Variables for our of core IGR computation on NVIDIA !> @{ - logical :: nv_uvm_out_of_core ! Enable out-or-core storage of q_cons_ts(2) in timestepping + logical :: nv_uvm_out_of_core ! Enable out-of-core storage of q_cons_ts(2) in timestepping (default FALSE) integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU ! 1 => jac on GPU, jac_rhs and jac_old on CPU ! 2 => jac and jac_rhs on GPU, jac_old on CPU From a4d6b38b61e6e1ae6ac98c2f76385d1fa53a5981 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Wed, 6 Aug 2025 08:46:31 +0200 Subject: [PATCH 14/25] Add note on binding script requirement for PREFER_GPU macro --- src/common/include/macros.fpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp index 7177efa32d..f4e4f280b1 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -12,6 +12,10 @@ #endif #:enddef +! Caution: +! This macro requires the use of a binding script to set CUDA_VISIBLE_DEVICES, such that we have one GPU device per MPI rank. +! That's because for both cudaMemAdvise (preferred location) and cudaMemPrefetchAsync we use location = device_id = 0. +! For an example see misc/nvidia_uvm/bind.sh. #:def PREFER_GPU(*args) #ifdef MFC_SIMULATION #ifdef __NVCOMPILER_GPU_UNIFIED_MEM From acb24057b021bbcfaba33e320a4dbb1d15e5165b Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Thu, 7 Aug 2025 08:49:53 +0200 Subject: [PATCH 15/25] Flip nv_uvm_pref_gpu default to false --- src/simulation/m_global_parameters.fpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 546bd95f4d..47c7a3a276 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -164,7 +164,7 @@ module m_global_parameters ! 1 => jac on GPU, jac_rhs and jac_old on CPU ! 2 => jac and jac_rhs on GPU, jac_old on CPU ! 3 => jac, jac_rhs, and jac_old on GPU (default) - logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default TRUE) + logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default FALSE) !> @} real(wp) :: weno_eps !< Binding for the WENO nonlinear weights @@ -586,7 +586,7 @@ contains ! NVIDIA UVM options nv_uvm_out_of_core = .false. nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default) - nv_uvm_pref_gpu = .true. + nv_uvm_pref_gpu = .false. ! Simulation algorithm parameters model_eqns = dflt_int From 8fef22d567dbbd7698f7272bfb6c90fb4776fa19 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Thu, 7 Aug 2025 18:26:18 +0200 Subject: [PATCH 16/25] Be explicit with unified memory compilation to stay robust in changes of defaults --- CMakeLists.txt | 4 ++-- toolchain/templates/santis.mako | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e3d2498a4..0d48fab5c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -498,11 +498,11 @@ function(MFC_SETUP_TARGET) # GH-200 Unified Memory Support if (MFC_Unified) target_compile_options(${ARGS_TARGET} - PRIVATE -gpu=mem:unified -cuda + PRIVATE -gpu=mem:unified:managedalloc -cuda ) # "This option must appear in both the compile and link lines" -- NVHPC Docs target_link_options(${ARGS_TARGET} - PRIVATE -gpu=mem:unified -cuda + PRIVATE -gpu=mem:unified:managedalloc -cuda ) endif() diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako index e798b677df..30ebdecf2b 100644 --- a/toolchain/templates/santis.mako +++ b/toolchain/templates/santis.mako @@ -27,7 +27,7 @@ % endif # NVHPC and CUDA env vars -export NV_ACC_USE_MALLOC=0 # use cudaMallocManaged instead of malloc ( compiled using -gpu=mem:unified ) +export NV_ACC_USE_MALLOC=0 # use cudaMallocManaged instead of malloc ( compiled using -gpu=mem:unified:managedalloc ) export NVCOMPILER_ACC_NO_MEMHINTS=1 # disable implicit compiler hints #export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH From 5e369c364003186910bf49cd9f4a373a9e6d8d79 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Mon, 11 Aug 2025 00:41:02 -0700 Subject: [PATCH 17/25] Add some changes to future proof the unified memory build --- CMakeLists.txt | 7 ++++++- src/common/include/macros.fpp | 10 ++++++++++ src/simulation/m_weno.fpp | 2 ++ toolchain/templates/santis.mako | 21 +++++++++++++++++---- 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d48fab5c9..97b31ec7f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -527,7 +527,12 @@ function(MFC_SETUP_TARGET) if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") find_package(CUDAToolkit REQUIRED) - target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt) + if (TARGET CUDA::nvToolsExt) # CUDA <= 12.8 + target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt) + else() # CUDA >= 12.9 + target_link_libraries(${a_target} PRIVATE nvhpcwrapnvtx) + target_link_options(${a_target} PRIVATE "-cudalib=nvtx3") + endif() endif() endforeach() diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp index f4e4f280b1..58b8b07190 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -20,7 +20,17 @@ #ifdef MFC_SIMULATION #ifdef __NVCOMPILER_GPU_UNIFIED_MEM block +! Beginning in the 25.3 release, the structure of the cudafor module has been changed slightly. +! The module now includes, or “uses” 3 submodules: cuda_runtime_api, gpu_reductions, and sort. +! The cudafor functionality has not changed. But for new users, or users who have needed to +! work-around name conflicts in the module, it may be better to use cuda_runtime_api to expose +! interfaces to the CUDA runtime calls described in Chapter 4 of this guide. +! https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#fortran-host-modules +#if __NVCOMPILER_MAJOR__ < 25 || (__NVCOMPILER_MAJOR__ == 25 && __NVCOMPILER_MINOR__ < 3) use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval +#else + use cuda_runtime_api +#endif integer :: istat if (nv_uvm_pref_gpu) then diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp index 56beaea979..a9846124ba 100644 --- a/src/simulation/m_weno.fpp +++ b/src/simulation/m_weno.fpp @@ -98,7 +98,9 @@ module m_weno !> @name Indical bounds in the s1-, s2- and s3-directions !> @{ type(int_bounds_info) :: is1_weno, is2_weno, is3_weno +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM $:GPU_DECLARE(create='[is1_weno,is2_weno,is3_weno]') +#endif ! !> @} diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako index 30ebdecf2b..b0e47dc154 100644 --- a/toolchain/templates/santis.mako +++ b/toolchain/templates/santis.mako @@ -26,10 +26,23 @@ % endif % endif -# NVHPC and CUDA env vars -export NV_ACC_USE_MALLOC=0 # use cudaMallocManaged instead of malloc ( compiled using -gpu=mem:unified:managedalloc ) -export NVCOMPILER_ACC_NO_MEMHINTS=1 # disable implicit compiler hints -#export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH +# We compiled the code using -gpu=unified:managedalloc, hence we use cudaMallocManaged for the dynamic allocations. +# Using NV_ACC_USE_MALLOC we could change to malloc at runtime. We choose to not do that here and stick with cudaMallocManaged and 2MB page sizes. +# https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#memory-model +# https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#command-line-options-selecting-compiler-memory-modes +export NV_ACC_USE_MALLOC=0 + +# For NVIDIA CUDA devices, controls the use of automatic memory hints at data constructs in the managed and unified memory modes. +# Below is a breakdown of the permitted values (case insensitive): +# - DEFAULT: Use the default settings. On NVIDIA Grace Hopper systems, the default is currently ENABLE_ALL; on all other systems, the default is DISABLE. +# - DISABLE: Memory hints are disabled for all data constructs. +# - ENABLE_EXPLICIT: Memory hints are enabled for explicit data constructs only. +# - ENABLE_ALL: Memory hints are enabled for explicit and implicit data constructs. +# https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#environment-variables-controlling-device-memory-management +# Here we disable the implicit compiler hints. +# Using NVCOMPILER_ACC_NO_MEMHINTS is the legacy way and is still supported, but users should prefer NVCOMPILER_ACC_MEMHINTS when using newer nvhpc compilers. +export NVCOMPILER_ACC_NO_MEMHINTS=1 # disable implicit compiler hints - legacy way +export NVCOMPILER_ACC_MEMHINTS=DISABLE # disable implicit compiler hints - new way # Cray MPICH export MPICH_GPU_SUPPORT_ENABLED=1 From 52c56087ef43dd2f4032d785daebb5e74b568863 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Mon, 11 Aug 2025 11:13:00 +0200 Subject: [PATCH 18/25] Comment out calls to cudaGetErrorString --- src/common/include/macros.fpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp index 58b8b07190..1378f9e864 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -40,19 +40,19 @@ istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 ) if (istat /= cudaSuccess) then write(*,"('Error code: ',I0, ': ')") istat - write(*,*) cudaGetErrorString(istat) + !write(*,*) cudaGetErrorString(istat) endif ! set accessed by CPU istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId ) if (istat /= cudaSuccess) then write(*,"('Error code: ',I0, ': ')") istat - write(*,*) cudaGetErrorString(istat) + !write(*,*) cudaGetErrorString(istat) endif ! prefetch to GPU - physically populate memory pages istat = cudaMemPrefetchAsync( c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0 ) if (istat /= cudaSuccess) then write(*,"('Error code: ',I0, ': ')") istat - write(*,*) cudaGetErrorString(istat) + !write(*,*) cudaGetErrorString(istat) endif #:endfor end if From 4ec8617e5dc104c4133e303542a4c21052cd5255 Mon Sep 17 00:00:00 2001 From: Ben Wilfong <48168887+wilfonba@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:18:00 -0400 Subject: [PATCH 19/25] prepare for merge --- CMakeLists.txt | 4 +- .../3D_IGR_TaylorGreenVortex_nvidia/case.py | 8 +-- src/common/include/macros.fpp | 54 +++++++++---------- src/common/m_mpi_common.fpp | 5 +- src/simulation/m_global_parameters.fpp | 6 +-- src/simulation/m_igr.fpp | 40 +++++++------- src/simulation/m_time_steppers.fpp | 16 +++--- toolchain/mfc/build.py | 2 +- 8 files changed, 67 insertions(+), 68 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97b31ec7f3..a581a2b769 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -234,7 +234,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release") message(STATUS "LTO/IPO is not supported in NVHPC Version < 23.11. Use a newer version of NVHPC for best performance.") else() message(STATUS "Performing IPO using -Mextract followed by -Minline") - set(NVHPC_USE_TWO_PASS_IPO FALSE) + set(NVHPC_USE_TWO_PASS_IPO TRUE) endif() else() CHECK_IPO_SUPPORTED(RESULT SUPPORTS_IPO OUTPUT IPO_ERROR) @@ -492,7 +492,7 @@ function(MFC_SETUP_TARGET) endforeach() target_compile_options(${a_target} - PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath + PRIVATE -gpu=keep,ptxinfo,lineinfo ) # GH-200 Unified Memory Support diff --git a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py index ccc7413d03..e2b22e8017 100644 --- a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py +++ b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py @@ -4,8 +4,8 @@ N = 799 Nx = N -Ny = 2*(N+1)-1 -Nz = 2*(N+1)-1 +Ny = 2 * (N + 1) - 1 +Nz = 2 * (N + 1) - 1 Re = 1600 L = 1 @@ -47,8 +47,8 @@ "cyl_coord": "F", "dt": dt, "t_step_start": 0, - "t_step_stop": 10, #Nt, - "t_step_save": 10, #int(Nt / 100), + "t_step_stop": 10, # Nt, + "t_step_save": 10, # int(Nt / 100), # Simulation Algorithm Parameters "num_patches": 1, "model_eqns": 2, diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp index 1378f9e864..69241c99ef 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -27,35 +27,35 @@ ! interfaces to the CUDA runtime calls described in Chapter 4 of this guide. ! https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#fortran-host-modules #if __NVCOMPILER_MAJOR__ < 25 || (__NVCOMPILER_MAJOR__ == 25 && __NVCOMPILER_MINOR__ < 3) - use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval + use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval #else - use cuda_runtime_api + use cuda_runtime_api #endif - integer :: istat - - if (nv_uvm_pref_gpu) then - #:for arg in args - !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$) - ! set preferred location GPU - istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 ) - if (istat /= cudaSuccess) then - write(*,"('Error code: ',I0, ': ')") istat - !write(*,*) cudaGetErrorString(istat) - endif - ! set accessed by CPU - istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId ) - if (istat /= cudaSuccess) then - write(*,"('Error code: ',I0, ': ')") istat - !write(*,*) cudaGetErrorString(istat) - endif - ! prefetch to GPU - physically populate memory pages - istat = cudaMemPrefetchAsync( c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0 ) - if (istat /= cudaSuccess) then - write(*,"('Error code: ',I0, ': ')") istat - !write(*,*) cudaGetErrorString(istat) - endif - #:endfor - end if + integer :: istat + + if (nv_uvm_pref_gpu) then + #:for arg in args + !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$) + ! set preferred location GPU + istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0) + if (istat /= cudaSuccess) then + write (*, "('Error code: ',I0, ': ')") istat + !write(*,*) cudaGetErrorString(istat) + end if + ! set accessed by CPU + istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId) + if (istat /= cudaSuccess) then + write (*, "('Error code: ',I0, ': ')") istat + !write(*,*) cudaGetErrorString(istat) + end if + ! prefetch to GPU - physically populate memory pages + istat = cudaMemPrefetchAsync(c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0) + if (istat /= cudaSuccess) then + write (*, "('Error code: ',I0, ': ')") istat + !write(*,*) cudaGetErrorString(istat) + end if + #:endfor + end if end block #endif #endif diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index 8633c84ceb..dbe61317b8 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -83,9 +83,8 @@ contains #ifndef __NVCOMPILER_GPU_UNIFIED_MEM @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size)) #else - ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size)) - !$acc enter data create(capture:buff_send) - !$acc enter data create(capture:buff_recv) + allocate (buff_send(0:halo_size), buff_recv(0:halo_size)) + $:GPU_ENTER_DATA(create='[capture:buff_send, capture:buff_recv]') #endif #endif diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 47c7a3a276..34c122f1e7 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -161,9 +161,9 @@ module m_global_parameters !> @{ logical :: nv_uvm_out_of_core ! Enable out-of-core storage of q_cons_ts(2) in timestepping (default FALSE) integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU - ! 1 => jac on GPU, jac_rhs and jac_old on CPU - ! 2 => jac and jac_rhs on GPU, jac_old on CPU - ! 3 => jac, jac_rhs, and jac_old on GPU (default) + ! 1 => jac on GPU, jac_rhs and jac_old on CPU + ! 2 => jac and jac_rhs on GPU, jac_old on CPU + ! 3 => jac, jac_rhs, and jac_old on GPU (default) logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default FALSE) !> @} diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index 0d1edad478..01fd90e7a8 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -26,7 +26,7 @@ module m_igr #ifdef __NVCOMPILER_GPU_UNIFIED_MEM integer, dimension(3) :: nv_uvm_temp_on_gpu - real(wp), pointer, contiguous, dimension(:, :, :) :: jac,jac_rhs,jac_old + real(wp), pointer, contiguous, dimension(:, :, :) :: jac, jac_rhs, jac_old real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_host_pool real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_rhs_host_pool real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_old_host_pool @@ -124,21 +124,21 @@ contains idwbuff(3)%beg:idwbuff(3)%end)) @:PREFER_GPU(jac) else - allocate(jac_host_pool(idwbuff(1)%beg:idwbuff(1)%end, & - idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end)) + allocate (jac_host_pool(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) jac(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end) => jac_host_pool(:,:,:) + idwbuff(3)%beg:idwbuff(3)%end) => jac_host_pool(:, :, :) end if if (nv_uvm_temp_on_gpu(2) == 1) then @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p)) @:PREFER_GPU(jac_rhs) else - allocate(jac_rhs_host_pool(-1:m,-1:n,-1:p)) - jac_rhs(-1:m,-1:n,-1:p) => jac_rhs_host_pool(:,:,:) + allocate (jac_rhs_host_pool(-1:m, -1:n, -1:p)) + jac_rhs(-1:m, -1:n, -1:p) => jac_rhs_host_pool(:, :, :) end if if (igr_iter_solver == 1) then ! Jacobi iteration @@ -148,13 +148,13 @@ contains idwbuff(3)%beg:idwbuff(3)%end)) @:PREFER_GPU(jac_old) else - allocate(jac_old_host_pool(idwbuff(1)%beg:idwbuff(1)%end, & - idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end)) + allocate (jac_old_host_pool(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) jac_old(idwbuff(1)%beg:idwbuff(1)%end, & - idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host_pool(:,:,:) + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host_pool(:, :, :) end if end if #endif @@ -178,7 +178,7 @@ contains #:if not MFC_CASE_OPTIMIZATION if (igr_order == 3) then - vidxb = -1; vidxe = 2; + vidxb = -1; vidxe = 2; $:GPU_UPDATE(device='[vidxb, vidxe]') @:ALLOCATE(coeff_L(0:2)) @@ -194,7 +194,7 @@ contains $:GPU_UPDATE(device='[coeff_R]') elseif (igr_order == 5) then - vidxb = -2; vidxe = 3; + vidxb = -2; vidxe = 3; $:GPU_UPDATE(device='[vidxb, vidxe]') @:ALLOCATE(coeff_L(-1:3)) @@ -2677,23 +2677,23 @@ contains if (nv_uvm_temp_on_gpu(1) == 1) then @:DEALLOCATE(jac) else - nullify(jac) - deallocate(jac_host_pool) + nullify (jac) + deallocate (jac_host_pool) end if if (nv_uvm_temp_on_gpu(2) == 1) then @:DEALLOCATE(jac_rhs) else - nullify(jac_rhs) - deallocate(jac_rhs_host_pool) + nullify (jac_rhs) + deallocate (jac_rhs_host_pool) end if if (igr_iter_solver == 1) then ! Jacobi iteration if (nv_uvm_temp_on_gpu(3) == 1) then @:DEALLOCATE(jac_old) else - nullify(jac_old) - deallocate(jac_old_host_pool) + nullify (jac_old) + deallocate (jac_old_host_pool) end if end if #endif diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index cd5b087997..e7d4ba6017 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -117,10 +117,10 @@ contains #if defined(__NVCOMPILER_GPU_UNIFIED_MEM) if (num_ts == 2 .and. nv_uvm_out_of_core) then ! host allocation for q_cons_ts(2)%vf(j)%sf for all j - allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, & - idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end, & - 1:sys_size)) + allocate (q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end, & + 1:sys_size)) end if do j = 1, sys_size @@ -133,8 +133,8 @@ contains if (nv_uvm_out_of_core) then ! q_cons_ts(2) lives on the host q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & - idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:,:,:,j) + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:, :, :, j) else @:ALLOCATE(q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & @@ -1385,14 +1385,14 @@ contains @:DEALLOCATE(q_cons_ts(1)%vf(j)%sf) if (num_ts == 2) then if (nv_uvm_out_of_core) then - nullify(q_cons_ts(2)%vf(j)%sf) + nullify (q_cons_ts(2)%vf(j)%sf) else @:DEALLOCATE(q_cons_ts(2)%vf(j)%sf) end if end if end do if (num_ts == 2 .and. nv_uvm_out_of_core) then - deallocate(q_cons_ts_pool_host) + deallocate (q_cons_ts_pool_host) end if #elif defined(FRONTIER_UNIFIED) do i = 1, num_ts diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 8edaeec990..70b2f6950b 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -64,7 +64,7 @@ def get_install_dirpath(self, case: Case ) -> str: # The install directory is located /build/install/ return os.sep.join([os.getcwd(), "build", "install", self.get_slug(case)]) - def get_home_dirpath(self, case: Case) -> str: + def get_home_dirpath(self) -> str: return os.sep.join([os.getcwd()]) def get_install_binpath(self, case: Case ) -> str: From 37b17682377e37bd9df65b339efa0c6a70c3b049 Mon Sep 17 00:00:00 2001 From: Ben Wilfong <48168887+wilfonba@users.noreply.github.com> Date: Mon, 11 Aug 2025 12:24:09 -0400 Subject: [PATCH 20/25] update capture --- src/common/m_mpi_common.fpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index dbe61317b8..4332681f11 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -84,7 +84,8 @@ contains @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size)) #else allocate (buff_send(0:halo_size), buff_recv(0:halo_size)) - $:GPU_ENTER_DATA(create='[capture:buff_send, capture:buff_recv]') + $:GPU_ENTER_DATA(create='[capture:buff_send]') + $:GPU_ENTER_DATA(create='[capture:buff_recv]') #endif #endif From e02e9f654acc3c7401603218be68ff12312f3c59 Mon Sep 17 00:00:00 2001 From: Ben Wilfong Date: Mon, 11 Aug 2025 20:17:50 +0200 Subject: [PATCH 21/25] add fastmath flag and bug fix --- CMakeLists.txt | 17 +++++++++++++++++ toolchain/mfc/build.py | 1 + toolchain/mfc/lock.py | 2 +- toolchain/mfc/state.py | 13 +++++++------ toolchain/templates/santis.mako | 7 +++---- 5 files changed, 29 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5452590712..3fce77f001 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ option(MFC_MPI "Build with MPI" ON option(MFC_OpenACC "Build with OpenACC" OFF) option(MFC_GCov "Build with GCov" OFF) option(MFC_Unified "Build with unified CPU & GPU memory (GH-200 only)" OFF) +option(MFC_Fastmath "Build with -fastmath on NV GPUs" OFF) option(MFC_PRE_PROCESS "Build pre_process" OFF) option(MFC_SIMULATION "Build simulation" OFF) option(MFC_POST_PROCESS "Build post_process" OFF) @@ -487,6 +488,9 @@ function(MFC_SETUP_TARGET) "-foffload=amdgcn-amdhsa='-march=gfx90a'" "-foffload-options=-lgfortran\ -lm" "-fno-exceptions") + if (MFC_Fastmath) + message(WARNING "--fastmath has no effect with the GNU compiler") + endif() elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") foreach (cc ${MFC_CUDA_CC}) target_compile_options(${a_target} @@ -498,6 +502,12 @@ function(MFC_SETUP_TARGET) PRIVATE -gpu=keep,ptxinfo,lineinfo ) + if (MFC_Fastmath) + target_compile_options(${a_target} + PRIVATE -gpu=fastmath + ) + endif() + # GH-200 Unified Memory Support if (MFC_Unified) target_compile_options(${ARGS_TARGET} @@ -521,11 +531,18 @@ function(MFC_SETUP_TARGET) PRIVATE -DFRONTIER_UNIFIED) endif() + if (MFC_Fastmath) + message(WARNING "--fastmath has no effect with the CCE") + endif() + find_package(hipfort COMPONENTS hip CONFIG REQUIRED) target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn) endif() elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc") + if (MFC_Fastmath) + message(WARNING "--fastmath has no effect with the CCE") + endif() endif() if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 70b2f6950b..846763b233 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -147,6 +147,7 @@ def configure(self, case: Case): flags.append(f"-DMFC_OpenACC={'ON' if ARG('gpu') else 'OFF'}") flags.append(f"-DMFC_GCov={ 'ON' if ARG('gcov') else 'OFF'}") flags.append(f"-DMFC_Unified={'ON' if ARG('unified') else 'OFF'}") + flags.append(f"-DMFC_Fastmath={'ON' if ARG('fastmath') else 'OFF'}") command = ["cmake"] + flags + ["-S", cmake_dirpath, "-B", build_dirpath] diff --git a/toolchain/mfc/lock.py b/toolchain/mfc/lock.py index aa91cc9675..eb20bd73fa 100644 --- a/toolchain/mfc/lock.py +++ b/toolchain/mfc/lock.py @@ -5,7 +5,7 @@ from .printer import cons -MFC_LOCK_CURRENT_VERSION: int = 5 +MFC_LOCK_CURRENT_VERSION: int = 6 @dataclasses.dataclass diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py index fa7d438e77..ba545c5680 100644 --- a/toolchain/mfc/state.py +++ b/toolchain/mfc/state.py @@ -3,12 +3,13 @@ @dataclasses.dataclass class MFCConfig: - mpi: bool = True - gpu: bool = False - debug: bool = False - gcov: bool = False - unified: bool = False - single: bool = False + mpi: bool = True + gpu: bool = False + debug: bool = False + gcov: bool = False + unified: bool = False + single: bool = False + fastmath : bool = False @staticmethod def from_dict(d: dict): diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako index b0e47dc154..1671a8f254 100644 --- a/toolchain/templates/santis.mako +++ b/toolchain/templates/santis.mako @@ -3,9 +3,8 @@ <%namespace name="helpers" file="helpers.mako"/> % if engine == 'batch': -#SBATCH --uenv=icon/25.2:v1 +#SBATCH --uenv=icon/25.2:v1@santis #SBATCH --nodes=${nodes} -#SBATCH --reservation=g183 #SBATCH --ntasks-per-node=${tasks_per_node} #SBATCH --job-name="${name}" #SBATCH --output="${name}.out" @@ -78,9 +77,9 @@ echo --gpus-per-task 1 \ % endif --wait 200 --bcast=/tmp/${target.name} \ - "${target.get_home_dirpath(case)}/misc/nvidia_uvm/bind.sh" \ + "${target.get_home_dirpath()}/misc/nvidia_uvm/bind.sh" \ % if target.name == 'simulation': - "${target.get_home_dirpath(case)}/misc/nvidia_uvm/nsys.sh" \ + "${target.get_home_dirpath()}/misc/nvidia_uvm/nsys.sh" \ % endif "${target.get_install_binpath(case)}") % endif From a6ff639d793542f87f0f99a8ca25710e58dc5100 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Tue, 12 Aug 2025 07:52:45 +0200 Subject: [PATCH 22/25] Fix typo in CMakeLists --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3fce77f001..0b349eb394 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ option(MFC_MPI "Build with MPI" ON option(MFC_OpenACC "Build with OpenACC" OFF) option(MFC_GCov "Build with GCov" OFF) option(MFC_Unified "Build with unified CPU & GPU memory (GH-200 only)" OFF) -option(MFC_Fastmath "Build with -fastmath on NV GPUs" OFF) +option(MFC_Fastmath "Build with -gpu=fastmath on NV GPUs" OFF) option(MFC_PRE_PROCESS "Build pre_process" OFF) option(MFC_SIMULATION "Build simulation" OFF) option(MFC_POST_PROCESS "Build post_process" OFF) From 457ae607ef730495c38f6408a2772c29fc64533b Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Tue, 12 Aug 2025 14:01:22 +0200 Subject: [PATCH 23/25] Replace host_pool with host in m_igr --- src/simulation/m_igr.fpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index 01fd90e7a8..0fbc76346f 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -27,9 +27,9 @@ module m_igr #ifdef __NVCOMPILER_GPU_UNIFIED_MEM integer, dimension(3) :: nv_uvm_temp_on_gpu real(wp), pointer, contiguous, dimension(:, :, :) :: jac, jac_rhs, jac_old - real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_host_pool - real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_rhs_host_pool - real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_old_host_pool + real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_host + real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_rhs_host + real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_old_host #else real(wp), allocatable, dimension(:, :, :) :: jac, jac_rhs, jac_old $:GPU_DECLARE(create='[jac, jac_rhs, jac_old]') @@ -124,21 +124,21 @@ contains idwbuff(3)%beg:idwbuff(3)%end)) @:PREFER_GPU(jac) else - allocate (jac_host_pool(idwbuff(1)%beg:idwbuff(1)%end, & - idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end)) + allocate (jac_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) jac(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end) => jac_host_pool(:, :, :) + idwbuff(3)%beg:idwbuff(3)%end) => jac_host(:, :, :) end if if (nv_uvm_temp_on_gpu(2) == 1) then @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p)) @:PREFER_GPU(jac_rhs) else - allocate (jac_rhs_host_pool(-1:m, -1:n, -1:p)) - jac_rhs(-1:m, -1:n, -1:p) => jac_rhs_host_pool(:, :, :) + allocate (jac_rhs_host(-1:m, -1:n, -1:p)) + jac_rhs(-1:m, -1:n, -1:p) => jac_rhs_host(:, :, :) end if if (igr_iter_solver == 1) then ! Jacobi iteration @@ -148,13 +148,13 @@ contains idwbuff(3)%beg:idwbuff(3)%end)) @:PREFER_GPU(jac_old) else - allocate (jac_old_host_pool(idwbuff(1)%beg:idwbuff(1)%end, & - idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end)) + allocate (jac_old_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) jac_old(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & - idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host_pool(:, :, :) + idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host(:, :, :) end if end if #endif @@ -2678,14 +2678,14 @@ contains @:DEALLOCATE(jac) else nullify (jac) - deallocate (jac_host_pool) + deallocate (jac_host) end if if (nv_uvm_temp_on_gpu(2) == 1) then @:DEALLOCATE(jac_rhs) else nullify (jac_rhs) - deallocate (jac_rhs_host_pool) + deallocate (jac_rhs_host) end if if (igr_iter_solver == 1) then ! Jacobi iteration @@ -2693,7 +2693,7 @@ contains @:DEALLOCATE(jac_old) else nullify (jac_old) - deallocate (jac_old_host_pool) + deallocate (jac_old_host) end if end if #endif From a6116f24def35029ed6abe270777934add8f2719 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Tue, 12 Aug 2025 16:01:08 +0200 Subject: [PATCH 24/25] Set cpus-per-task to 72 and update binding script --- misc/nvidia_uvm/bind.sh | 2 +- toolchain/templates/santis.mako | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/misc/nvidia_uvm/bind.sh b/misc/nvidia_uvm/bind.sh index 0b7bf91e96..b5b4bbb945 100755 --- a/misc/nvidia_uvm/bind.sh +++ b/misc/nvidia_uvm/bind.sh @@ -15,7 +15,7 @@ export MPICH_OFI_NIC_POLICY=USER export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3" # Bind to cores ( first core per socket ) -physcores=(0 72 144 216) +physcores=(0-71 72-143 144-215 216-287) #echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako index 1671a8f254..23abce3508 100644 --- a/toolchain/templates/santis.mako +++ b/toolchain/templates/santis.mako @@ -71,7 +71,7 @@ echo % else: (set -x; srun --unbuffered \ --ntasks=${nodes*tasks_per_node} \ - --cpus-per-task 1 \ + --cpus-per-task 72 \ --cpu-bind=none \ % if gpu: --gpus-per-task 1 \ From fb50e908a0cd98d0cc5dfb1a24ceefba3d1749e9 Mon Sep 17 00:00:00 2001 From: Nikolaos Tselepidis Date: Tue, 12 Aug 2025 16:11:33 +0200 Subject: [PATCH 25/25] Add some more updates to the helper scripts --- misc/nvidia_uvm/bind.sh | 2 +- misc/nvidia_uvm/nsys.sh | 2 +- toolchain/templates/santis.mako | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/misc/nvidia_uvm/bind.sh b/misc/nvidia_uvm/bind.sh index b5b4bbb945..37f5a1a3cd 100755 --- a/misc/nvidia_uvm/bind.sh +++ b/misc/nvidia_uvm/bind.sh @@ -14,7 +14,7 @@ export CUDA_VISIBLE_DEVICES="$local_rank" export MPICH_OFI_NIC_POLICY=USER export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3" -# Bind to cores ( first core per socket ) +# Bind to cores ( all cores per socket ) physcores=(0-71 72-143 144-215 216-287) #echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY diff --git a/misc/nvidia_uvm/nsys.sh b/misc/nvidia_uvm/nsys.sh index 172bcb2f69..205bee8fd4 100755 --- a/misc/nvidia_uvm/nsys.sh +++ b/misc/nvidia_uvm/nsys.sh @@ -15,7 +15,7 @@ if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then --cpu-socket-events=61,71,265,273 \ --cpu-socket-metrics=103,104 \ --event-sampling-interval=10 \ - --trace=nvtx,openacc \ + --trace=nvtx,cuda,openacc \ --force-overwrite=true \ -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \ -o "$NSYS_FILE" "$@" diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako index 23abce3508..cb4b330625 100644 --- a/toolchain/templates/santis.mako +++ b/toolchain/templates/santis.mako @@ -6,6 +6,7 @@ #SBATCH --uenv=icon/25.2:v1@santis #SBATCH --nodes=${nodes} #SBATCH --ntasks-per-node=${tasks_per_node} +#SBATCH --cpus-per-task=72 #SBATCH --job-name="${name}" #SBATCH --output="${name}.out" #SBATCH --error="${name}.err"