
Strategy for running MFC out-of-core on NVIDIA Grace-Hopper using Unified Memory #972


Merged · 27 commits · Aug 14, 2025
Commits (27)
2358d29
Add scripts for santis/alps, example case, and captures for UVM comms…
ntselepidis Aug 1, 2025
37d393b
Add PREFER_GPU and rearrange update for out-of-core computation
ntselepidis Aug 1, 2025
693c7f4
Allow keeping q_cons_ts(2) on CPU using pinned allocations
ntselepidis Aug 1, 2025
7054b7b
Modify PREFER_GPU macro
ntselepidis Aug 1, 2025
ee1277d
Allow control in placement of IGR temps
ntselepidis Aug 1, 2025
4065c02
Do some clean up
ntselepidis Aug 2, 2025
cfb792c
ENV Vars to case file options and code structure changes
Aug 3, 2025
cacc6b0
Fix some comments
ntselepidis Aug 3, 2025
884a4d9
Merge remote-tracking branch 'upstream/master' into nvidia
wilfonba Aug 3, 2025
b3fdbff
test merge and add nv_uvm_out_of_core back
wilfonba Aug 3, 2025
51d7e90
Fix some allocs and deallocs in timesteppers
ntselepidis Aug 5, 2025
c553b78
Fix nv_uvm_out_of_core inconsistency and add to case file
ntselepidis Aug 5, 2025
f3b3851
Fix bug in 2nd order TVD RK introduced by merge
ntselepidis Aug 5, 2025
71b5976
Fix some comments
ntselepidis Aug 5, 2025
a4d6b38
Add note on binding script requirement for PREFER_GPU macro
ntselepidis Aug 6, 2025
acb2405
Flip nv_uvm_pref_gpu default to false
ntselepidis Aug 7, 2025
8fef22d
Be explicit with unified memory compilation to stay robust in changes…
ntselepidis Aug 7, 2025
5e369c3
Add some changes to future proof the unified memory build
ntselepidis Aug 11, 2025
52c5608
Comment out calls to cudaGetErrorString
ntselepidis Aug 11, 2025
4ec8617
prepare for merge
wilfonba Aug 11, 2025
bd0adee
Merge remote-tracking branch 'upstream/master' into nvidia
wilfonba Aug 11, 2025
37b1768
update capture
wilfonba Aug 11, 2025
e02e9f6
add fastmath flag and bug fix
wilfonba Aug 11, 2025
a6ff639
Fix typo in CMakeLists
ntselepidis Aug 12, 2025
457ae60
Replace host_pool with host in m_igr
ntselepidis Aug 12, 2025
a6116f2
Set cpus-per-task to 72 and update binding script
ntselepidis Aug 12, 2025
fb50e90
Add some more updates to the helper scripts
ntselepidis Aug 12, 2025
28 changes: 25 additions & 3 deletions CMakeLists.txt
@@ -20,6 +20,7 @@ option(MFC_MPI "Build with MPI" ON
option(MFC_OpenACC "Build with OpenACC" OFF)
option(MFC_GCov "Build with GCov" OFF)
option(MFC_Unified "Build with unified CPU & GPU memory (GH-200 only)" OFF)
option(MFC_Fastmath "Build with -gpu=fastmath on NV GPUs" OFF)
option(MFC_PRE_PROCESS "Build pre_process" OFF)
option(MFC_SIMULATION "Build simulation" OFF)
option(MFC_POST_PROCESS "Build post_process" OFF)
@@ -487,6 +488,9 @@ function(MFC_SETUP_TARGET)
"-foffload=amdgcn-amdhsa='-march=gfx90a'"
"-foffload-options=-lgfortran\ -lm"
"-fno-exceptions")
if (MFC_Fastmath)
message(WARNING "--fastmath has no effect with the GNU compiler")
endif()
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
foreach (cc ${MFC_CUDA_CC})
target_compile_options(${a_target}
@@ -498,14 +502,20 @@
PRIVATE -gpu=keep,ptxinfo,lineinfo
)

if (MFC_Fastmath)
target_compile_options(${a_target}
PRIVATE -gpu=fastmath
)
endif()

# GH-200 Unified Memory Support
if (MFC_Unified)
target_compile_options(${ARGS_TARGET}
-       PRIVATE -gpu=unified
+       PRIVATE -gpu=mem:unified:managedalloc -cuda
)
# "This option must appear in both the compile and link lines" -- NVHPC Docs
target_link_options(${ARGS_TARGET}
-       PRIVATE -gpu=unified
+       PRIVATE -gpu=mem:unified:managedalloc -cuda
)
endif()

Expand All @@ -521,16 +531,28 @@ function(MFC_SETUP_TARGET)
PRIVATE -DFRONTIER_UNIFIED)
endif()

if (MFC_Fastmath)
message(WARNING "--fastmath has no effect with the CCE")
endif()

find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn)
endif()
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
if (MFC_Fastmath)
message(WARNING "--fastmath has no effect with the CCE")
endif()
endif()

if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
find_package(CUDAToolkit REQUIRED)
-       target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
+       if (TARGET CUDA::nvToolsExt) # CUDA <= 12.8
+           target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
+       else() # CUDA >= 12.9
+           target_link_libraries(${a_target} PRIVATE nvhpcwrapnvtx)
+           target_link_options(${a_target} PRIVATE "-cudalib=nvtx3")
+       endif()
endif()
endforeach()

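For reference, a minimal configure sketch exercising the two new options. The raw CMake invocation below is an assumption for illustration (MFC is normally driven through its `./mfc.sh` wrapper), but the option names come straight from this diff:

```bash
# Sketch: NVHPC build on a GH-200 node with both new options enabled
# (assumes the NVHPC toolchain is active in the environment).
#   MFC_Unified  -> compiles and links with -gpu=mem:unified:managedalloc -cuda
#   MFC_Fastmath -> adds -gpu=fastmath (NVHPC/PGI only; GNU and CCE emit a warning)
cmake -B build -DMFC_SIMULATION=ON -DMFC_Unified=ON -DMFC_Fastmath=ON
cmake --build build -j
```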
105 changes: 105 additions & 0 deletions examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
import math
import json

N = 799
Nx = N
Ny = 2 * (N + 1) - 1
Nz = 2 * (N + 1) - 1

Re = 1600
L = 1
P0 = 101325
rho0 = 1
C0 = math.sqrt(1.4 * P0)
V0 = 0.1 * C0
mu = V0 * L / Re

cfl = 0.5
dx = 2 * math.pi * L / (Ny + 1)

dt = cfl * dx / (C0)

tC = L / V0
tEnd = 20 * tC

Nt = int(tEnd / dt)
Nt = 10


# Configuring case dictionary
print(
json.dumps(
{
"rdma_mpi": "T",
# Logistics
"run_time_info": "F",
# Computational Domain Parameters
"x_domain%beg": -math.pi * L,
"x_domain%end": math.pi * L,
"y_domain%beg": -math.pi * L,
"y_domain%end": math.pi * L,
"z_domain%beg": -math.pi * L,
"z_domain%end": math.pi * L,
"m": Nx,
"n": Ny,
"p": Nz,
"cyl_coord": "F",
"dt": dt,
"t_step_start": 0,
"t_step_stop": 10, # Nt,
"t_step_save": 10, # int(Nt / 100),
# Simulation Algorithm Parameters
"num_patches": 1,
"model_eqns": 2,
"num_fluids": 1,
"time_stepper": 3,
"bc_x%beg": -1,
"bc_x%end": -1,
"bc_y%beg": -1,
"bc_y%end": -1,
"bc_z%beg": -1,
"bc_z%end": -1,
"igr": "T",
"igr_order": 5,
"igr_iter_solver": 1,
"num_igr_iters": 3,
"num_igr_warm_start_iters": 3,
"alf_factor": 10,
"viscous": "T",
# Formatted Database Files Structure Parameters
"format": 1,
"precision": 2,
"prim_vars_wrt": "T",
"omega_wrt(1)": "T",
"omega_wrt(2)": "T",
"omega_wrt(3)": "T",
"qm_wrt": "T",
"fd_order": 4,
"parallel_io": "T",
# Patch 1: Background (AIR - 2)
"patch_icpp(1)%geometry": 9,
"patch_icpp(1)%x_centroid": 0,
"patch_icpp(1)%y_centroid": 0,
"patch_icpp(1)%z_centroid": 0,
"patch_icpp(1)%length_x": 2 * math.pi * L,
"patch_icpp(1)%length_y": 2 * math.pi * L,
"patch_icpp(1)%length_z": 2 * math.pi * L,
"patch_icpp(1)%vel(1)": 0.0,
"patch_icpp(1)%vel(2)": 0.0,
"patch_icpp(1)%vel(3)": 0,
"patch_icpp(1)%pres": 0.0,
"patch_icpp(1)%hcid": 380,
"patch_icpp(1)%alpha_rho(1)": 1,
"patch_icpp(1)%alpha(1)": 1,
# Fluids Physical Parameters
"fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1),
"fluid_pp(1)%pi_inf": 0,
"fluid_pp(1)%Re(1)": 1 / mu,
# NVIDIA UVM Options
"nv_uvm_out_of_core": "T",
"nv_uvm_igr_temps_on_gpu": 3,
"nv_uvm_pref_gpu": "T",
}
)
)
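The three `nv_uvm_*` keys at the bottom are the new case-file options added by this PR: out-of-core storage of `q_cons_ts(2)`, all three IGR temporaries (`jac`, `jac_rhs`, `jac_old`) kept on the GPU, and explicit memory-placement hints. A hedged launch sketch (the `mfc.sh run` flags are assumptions, not taken from this diff):

```bash
# Sketch: run the example with one MPI rank per GH-200 module of a node.
./mfc.sh run examples/3D_IGR_TaylorGreenVortex_nvidia/case.py -n 4
```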
24 changes: 24 additions & 0 deletions misc/nvidia_uvm/bind.sh
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

# -------------------------------- #
# Binding for a single Santis node #
# -------------------------------- #

# Local rank
export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}"

# Bind to GPU
export CUDA_VISIBLE_DEVICES="$local_rank"

# Bind to NIC
export MPICH_OFI_NIC_POLICY=USER
export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"

# Bind to cores (all cores of the local socket)
physcores=(0-71 72-143 144-215 216-287)

#echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY

#set -x
numactl -l --all --physcpubind=${physcores[$local_rank]} "$@"
#set +x
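The script gives each local rank its own GPU, NIC, and 72-core Grace socket (a Santis node carries four GH-200 modules, 288 cores in total). A usage sketch; the binary path is hypothetical:

```bash
# Sketch: one rank per GH-200 module, each pinned by bind.sh.
srun --ntasks-per-node=4 --cpus-per-task=72 \
    ./misc/nvidia_uvm/bind.sh ./build/simulation
```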
24 changes: 24 additions & 0 deletions misc/nvidia_uvm/nsys.sh
@@ -0,0 +1,24 @@
#!/bin/bash

#set -x
set -euo pipefail

rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}"

[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep
[[ -z "${NSYS+x}" ]] && NSYS=0

if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then
exec nsys profile \
--cpuctxsw=none -b none -s none \
--event-sample=system-wide \
--cpu-socket-events=61,71,265,273 \
--cpu-socket-metrics=103,104 \
--event-sampling-interval=10 \
--trace=nvtx,cuda,openacc \
--force-overwrite=true \
-e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
-o "$NSYS_FILE" "$@"
else
exec "$@"
fi
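Profiling is opt-in: rank 0 runs under `nsys profile` only when `NSYS=1`; every other rank `exec`s the program directly. A combined sketch with the binding script (paths assumed):

```bash
# Sketch: profile rank 0 with Nsight Systems while binding all ranks as before.
NSYS=1 NSYS_FILE=tgv_uvm.qdrep \
    srun --ntasks-per-node=4 --cpus-per-task=72 \
    ./misc/nvidia_uvm/nsys.sh ./misc/nvidia_uvm/bind.sh ./build/simulation
```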
49 changes: 49 additions & 0 deletions src/common/include/macros.fpp
@@ -12,6 +12,55 @@
#endif
#:enddef

! Caution:
! This macro requires a binding script that sets CUDA_VISIBLE_DEVICES so that each MPI rank sees exactly one GPU device.
! This is because both cudaMemAdvise (preferred location) and cudaMemPrefetchAsync are called with location = device_id = 0.
! For an example, see misc/nvidia_uvm/bind.sh.
#:def PREFER_GPU(*args)
#ifdef MFC_SIMULATION
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
block
! Beginning in the 25.3 release, the structure of the cudafor module has been changed slightly.
! The module now includes, or “uses” 3 submodules: cuda_runtime_api, gpu_reductions, and sort.
! The cudafor functionality has not changed. But for new users, or users who have needed to
! work-around name conflicts in the module, it may be better to use cuda_runtime_api to expose
! interfaces to the CUDA runtime calls described in Chapter 4 of this guide.
! https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#fortran-host-modules
#if __NVCOMPILER_MAJOR__ < 25 || (__NVCOMPILER_MAJOR__ == 25 && __NVCOMPILER_MINOR__ < 3)
use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
#else
use cuda_runtime_api
#endif
integer :: istat

if (nv_uvm_pref_gpu) then
#:for arg in args
!print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
! set preferred location GPU
istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0)
if (istat /= cudaSuccess) then
write (*, "('Error code: ',I0, ': ')") istat
!write(*,*) cudaGetErrorString(istat)
end if
! set accessed by CPU
istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId)
if (istat /= cudaSuccess) then
write (*, "('Error code: ',I0, ': ')") istat
!write(*,*) cudaGetErrorString(istat)
end if
! prefetch to GPU - physically populate memory pages
istat = cudaMemPrefetchAsync(c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0)
if (istat /= cudaSuccess) then
write (*, "('Error code: ',I0, ': ')") istat
!write(*,*) cudaGetErrorString(istat)
end if
#:endfor
end if
end block
#endif
#endif
#:enddef

#:def ALLOCATE(*args)
@:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
#:set allocated_variables = ', '.join(args)
8 changes: 8 additions & 0 deletions src/common/m_mpi_common.fpp
@@ -38,7 +38,9 @@ module m_mpi_common
!! average primitive variables, for a single computational domain boundary
!! at the time, from the relevant neighboring processor.

#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
$:GPU_DECLARE(create='[buff_send, buff_recv]')
#endif

integer :: halo_size
$:GPU_DECLARE(create='[halo_size]')
@@ -78,7 +80,13 @@ contains

$:GPU_UPDATE(device='[halo_size, v_size]')

#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
@:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
#else
allocate (buff_send(0:halo_size), buff_recv(0:halo_size))
$:GPU_ENTER_DATA(create='[capture:buff_send]')
$:GPU_ENTER_DATA(create='[capture:buff_recv]')
#endif
#endif

end subroutine s_initialize_mpi_common_module
10 changes: 10 additions & 0 deletions src/simulation/m_checker.fpp
@@ -30,6 +30,7 @@ contains

if (igr) then
call s_check_inputs_igr
call s_check_inputs_nvidia_uvm
else
if (recon_type == WENO_TYPE) then
call s_check_inputs_weno
@@ -411,4 +412,13 @@ contains
@:PROHIBIT(powell .and. fd_order == dflt_int, "fd_order must be set if Powell's method is enabled")
end subroutine s_check_inputs_mhd

impure subroutine s_check_inputs_nvidia_uvm
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
@:PROHIBIT(nv_uvm_igr_temps_on_gpu > 3 .or. nv_uvm_igr_temps_on_gpu < 0, &
"nv_uvm_igr_temps_on_gpu must be in the range [0, 3]")
@:PROHIBIT(nv_uvm_igr_temps_on_gpu == 3 .and. igr_iter_solver == 2, &
"nv_uvm_igr_temps_on_gpu must be in the range [0, 2] for igr_iter_solver == 2")
#endif
end subroutine s_check_inputs_nvidia_uvm

end module m_checker
24 changes: 24 additions & 0 deletions src/simulation/m_global_parameters.fpp
@@ -157,6 +157,16 @@ module m_global_parameters
logical :: viscous !< Viscous effects
#:endif

!> @name Variables for out-of-core IGR computation on NVIDIA GPUs
!> @{
logical :: nv_uvm_out_of_core ! Enable out-of-core storage of q_cons_ts(2) in timestepping (default FALSE)
integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU
! 1 => jac on GPU, jac_rhs and jac_old on CPU
! 2 => jac and jac_rhs on GPU, jac_old on CPU
! 3 => jac, jac_rhs, and jac_old on GPU (default)
logical :: nv_uvm_pref_gpu ! Enable explicit GPU memory hints (default FALSE)
!> @}

real(wp) :: weno_eps !< Binding for the WENO nonlinear weights
real(wp) :: teno_CT !< Smoothness threshold for TENO
logical :: mp_weno !< Monotonicity preserving (MP) WENO
@@ -573,6 +583,11 @@ contains
t_stop = dflt_real
t_save = dflt_real

! NVIDIA UVM options
nv_uvm_out_of_core = .false.
nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default)
nv_uvm_pref_gpu = .false.

! Simulation algorithm parameters
model_eqns = dflt_int
mpp_lim = .false.
@@ -1321,16 +1336,25 @@
@:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
@:ALLOCATE(x_cc(-buff_size:m + buff_size))
@:ALLOCATE(dx(-buff_size:m + buff_size))
@:PREFER_GPU(x_cb)
@:PREFER_GPU(x_cc)
@:PREFER_GPU(dx)

if (n == 0) return;
@:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
@:ALLOCATE(y_cc(-buff_size:n + buff_size))
@:ALLOCATE(dy(-buff_size:n + buff_size))
@:PREFER_GPU(y_cb)
@:PREFER_GPU(y_cc)
@:PREFER_GPU(dy)

if (p == 0) return;
@:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
@:ALLOCATE(z_cc(-buff_size:p + buff_size))
@:ALLOCATE(dz(-buff_size:p + buff_size))
@:PREFER_GPU(z_cb)
@:PREFER_GPU(z_cc)
@:PREFER_GPU(dz)

end subroutine s_initialize_global_parameters_module
