uvm changes for nvidia platforms #968

Draft · wants to merge 1 commit into master

32 changes: 19 additions & 13 deletions CMakeLists.txt
@@ -135,17 +135,17 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
add_compile_options(
-Wall
-Wextra
-Wextra
-fcheck=all,no-array-temps
-fbacktrace
-fimplicit-none
-fsignaling-nans
-finit-real=snan
-finit-integer=-99999999
-Wintrinsic-shadow
-Wunderflow
-Wrealloc-lhs
-Wsurprising
-Wintrinsic-shadow
-Wunderflow
-Wrealloc-lhs
-Wsurprising
)
endif()

@@ -163,7 +163,6 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
"SHELL:-h acc_model=auto_async_none"
"SHELL: -h acc_model=no_fast_addr"
"SHELL: -h list=adm"
"SHELL: -munsafe-fp-atomics" # Not unsafe for operations we do
)

add_link_options("SHELL:-hkeepfiles")
@@ -173,6 +172,7 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
"SHELL:-h acc_model=auto_async_none"
"SHELL: -h acc_model=no_fast_addr"
"SHELL: -K trap=fp" "SHELL: -G2"

)
add_link_options("SHELL: -K trap=fp" "SHELL: -G2")
endif()
@@ -486,23 +486,23 @@ function(MFC_SETUP_TARGET)
endforeach()

target_compile_options(${a_target}
PRIVATE -gpu=keep,ptxinfo,lineinfo
PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath
)

# GH-200 Unified Memory Support
if (MFC_Unified)
target_compile_options(${ARGS_TARGET}
PRIVATE -gpu=unified
PRIVATE -gpu=mem:unified -cuda
)
# "This option must appear in both the compile and link lines" -- NVHPC Docs
target_link_options(${ARGS_TARGET}
PRIVATE -gpu=unified
PRIVATE -gpu=mem:unified -cuda
)
endif()

if (CMAKE_BUILD_TYPE STREQUAL "Debug")
target_compile_options(${a_target}
PRIVATE -gpu=autocompare,debug
PRIVATE -gpu=autocompare,debug -cuda
)
endif()
elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
@@ -513,9 +513,15 @@
target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
endif()

if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
find_package(CUDAToolkit REQUIRED)
target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR
CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")

if (TARGET CUDA::nvToolsExt) # CUDA <= 12.8
target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
else() # CUDA >= 12.9
target_link_libraries(${a_target} PRIVATE nvhpcwrapnvtx )
endif()
target_link_options(${a_target} PRIVATE "-cudalib=nvtx")
endif()
endforeach()

48 changes: 48 additions & 0 deletions misc/nvidia_uvm/README.md
@@ -0,0 +1,48 @@
## The Main Idea behind the implemented Out-of-Core Strategy for Grace-Hopper

To run MFC out-of-core on Grace-Hopper using Unified Memory, we implement a zero-copy strategy.

We start by setting the preferred location to CPU for all buffers, by hooking into the `ALLOCATE` macro and setting `NVIDIA_ALLOC_MODE=2`.
This disables access-counter-based migrations and keeps everything in CPU memory, freeing up as much GPU memory as possible.

Then, for the "important" buffers that are frequently accessed from the GPU, we reset the preferred location to GPU in order to place them (and directly populate them) in GPU memory.
This is done by the `PREFER_GPU` macro, which has been manually placed in the code right after the allocations of these buffers, as sketched below.
To activate these hints we export `NVIDIA_MANUAL_GPU_HINTS=1`.
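
A minimal sketch of the pattern (this is the same usage that appears in the `m_global_parameters.fpp` changes in this PR):

```fortran
! Allocate through the usual macro, then pin the buffer's preferred location to GPU memory
@:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
@:PREFER_GPU(x_cb)
```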

To allow fine-grained control and to simulate larger problem sizes, we also use the following environment variables (an example configuration follows the list):
- With `NVIDIA_IGR_TEMPS_ON_GPU` we control how many temporaries from the IGR module are to be placed in GPU memory.
- With `NVIDIA_VARS_ON_GPU` we control how many of the `q_cons_ts(1)%vf(j)%sf` arrays we place in GPU memory.
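
For example, a configuration that mirrors the defaults suggested in `misc/nvidia_uvm/run.sh` (the exact values should be tuned for the problem size):

```shell
export NVIDIA_ALLOC_MODE=2          # default all allocations to preferred location CPU
export NVIDIA_MANUAL_GPU_HINTS=1    # honor the manually placed PREFER_GPU hints
export NVIDIA_IGR_TEMPS_ON_GPU=1    # number of IGR temporaries kept in GPU memory
export NVIDIA_VARS_ON_GPU=7         # q_cons_ts(1)%vf(j)%sf for j = 1..7 kept in GPU memory
```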

It is important to note that we have rearranged the timestep updates in the 3rd-order TVD Runge-Kutta scheme so that only `q_cons_ts(1)` needs to be passed to the `compute_rhs` routines.
This way, to keep the computation of `compute_rhs` (mostly) on GPU data, we only need to store `q_cons_ts(1)` (fully or even partially) in GPU memory.
We therefore keep `q_cons_ts(2)` in CPU memory for the full lifetime of the simulation, freeing up GPU memory and allowing larger simulations without sacrificing performance.
In the timestep updates between the `compute_rhs` calls, we access both `q_cons_ts(1)` and `q_cons_ts(2)` directly from the physical location where they reside (zero-copy), simultaneously pulling data from GPU memory and CPU memory (through NVLink-C2C) and making good use of Grace-Hopper.
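
A schematic sketch of the rearranged update, assuming the standard TVD/SSP RK3 coefficients (pseudocode for illustration only, not the actual MFC time stepper; the `compute_rhs` interface and the whole-array operations are simplified):

```fortran
! q1 := q_cons_ts(1)  (kept, fully or partially, in GPU memory)
! q2 := q_cons_ts(2)  (kept in CPU memory, accessed zero-copy)
q2 = q1                                                                 ! save u^n
call compute_rhs(q1, rhs); q1 = q1 + dt*rhs                             ! stage 1
call compute_rhs(q1, rhs); q1 = 0.75d0*q2 + 0.25d0*(q1 + dt*rhs)        ! stage 2
call compute_rhs(q1, rhs); q1 = (1d0/3d0)*q2 + (2d0/3d0)*(q1 + dt*rhs)  ! stage 3
```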

Note: This rearrangement most likely "breaks" the timestepper for different physics cases, but we can easily fix it in a later step.

## Example Workflow for Out-of-Core Strategy based on Unified Memory

```shell
# Allocate a node
salloc -A g183 --partition normal -t 02:00:00 -N 1 -n 4 --cpus-per-task=71

# Start uenv
uenv start --view=modules icon/25.2:v1

# cd to root directory of MFC
cd MFC-Wilfong

# Load modules
. ./mfc.sh load -c san -m g

# Build
export MFC_CUDA_CC=90
./mfc.sh build --gpu -j 71 --single --unified --verbose

# Run pre_process and simulation binaries with case optimization (in an interactive job)
./mfc.sh run examples/3D_IGR_perf_test/case.py --case-optimization -t pre_process simulation --gpu -N 1 -n 4 -j 71 -c santis

# Run pre_process and simulation binaries with case optimization (in a batch job)
./mfc.sh run examples/3D_IGR_perf_test/case.py --case-optimization -t pre_process simulation --gpu -N 1 -n 4 -j 71 -c santis -e batch -p normal -a g183 -w 00:15:00
```
The environment variables `NVIDIA_ALLOC_MODE`, `NVIDIA_MANUAL_GPU_HINTS`, `NVIDIA_VARS_ON_GPU`, and `NVIDIA_IGR_TEMPS_ON_GPU` can be set in `toolchain/templates/santis.mako` to configure a run with all buffers in GPU memory, all buffers in CPU memory, or some buffers in GPU memory and the rest in CPU memory.
21 changes: 21 additions & 0 deletions misc/nvidia_uvm/bind.sh
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

# -------------------------------- #
# Binding for a single Santis node #
# -------------------------------- #

# Local rank
export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}"

# Bind to GPU
export CUDA_VISIBLE_DEVICES="$local_rank"

# Binding to NIC
export MPICH_OFI_NIC_POLICY=USER
export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"

# Bind each rank to a single core (the first core of its socket)
physcores=(0 72 144 216)

#echo rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY
numactl -l --all --physcpubind=${physcores[$local_rank]} "$@"
25 changes: 25 additions & 0 deletions misc/nvidia_uvm/nsys.sh
@@ -0,0 +1,25 @@
#!/bin/bash

#set -x
set -euo pipefail

rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}"

[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep
[[ -z "${NSYS+x}" ]] && NSYS=0

if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then
echo "Doing nsys"
exec nsys profile \
--cpuctxsw=none -b none -s none \
--event-sample=system-wide \
--cpu-socket-events=61,71,265,273 \
--cpu-socket-metrics=103,104 \
--event-sampling-interval=10 \
--trace=nvtx,openacc \
--force-overwrite=true \
-e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
-o "$NSYS_FILE" "$@"
else
exec "$@"
fi
27 changes: 27 additions & 0 deletions misc/nvidia_uvm/run.sh
@@ -0,0 +1,27 @@
#!/usr/bin/env bash

# TODO: Modify accordingly
PATH_TO_BINARY=${SCRATCH}/projects/cfd/mfc/MFC-Wilfong/build/install/cdcd4e8762/bin/

# NVHPC and CUDA env vars
export NV_ACC_USE_MALLOC=1 # use malloc instead of cudaMallocManaged ( compiled using -gpu=mem:unified )
export NVCOMPILER_ACC_NO_MEMHINTS=1 # disable implicit compiler hints
export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH

# Cray MPICH
export MPICH_GPU_SUPPORT_ENABLED=1 # MPICH with GPU support
export FI_CXI_RX_MATCH_MODE=software
export FI_MR_CACHE_MONITOR=disabled

# CUSTOM env vars to MFC
export NVIDIA_ALLOC_MODE=2 # default alloc to prefloc CPU
export NVIDIA_MANUAL_GPU_HINTS=1 # prefloc GPU on some
export NVIDIA_IGR_TEMPS_ON_GPU=1 # jac on GPU and jac_rhs on CPU ( NOTE: good default, tune based on size )
export NVIDIA_VARS_ON_GPU=7 # q_cons_ts(1)%vf%sf for j=1-7 on GPU ( NOTE: good default, tune based on size )

# NSYS
export NSYS=1 # enable nsys profiling
export NSYS_FILE=report_uvm_single_N-499_nGPUs-4_params-${NVIDIA_VARS_ON_GPU}-${NVIDIA_IGR_TEMPS_ON_GPU}.qdrep

# Run using --cpu-bind=none because we use our own binding script
srun --ntasks 4 --cpu-bind=none ./bind.sh ./nsys.sh ${PATH_TO_BINARY}/simulation
102 changes: 102 additions & 0 deletions src/common/include/macros.fpp
@@ -12,11 +12,113 @@
#endif
#:enddef

#:def PREFER_GPU(*args)
#ifdef MFC_SIMULATION
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
block
use cudafor
intrinsic :: minval, maxval, sum
integer :: istat
integer :: prefer_gpu_mode
character(len=10) :: prefer_gpu_mode_str

! environment variable
call get_environment_variable("NVIDIA_MANUAL_GPU_HINTS", prefer_gpu_mode_str)
if (trim(prefer_gpu_mode_str) == "0") then ! OFF
prefer_gpu_mode = 0
elseif (trim(prefer_gpu_mode_str) == "1") then ! ON
prefer_gpu_mode = 1
else ! default
prefer_gpu_mode = 0
endif

if (prefer_gpu_mode .eq. 1) then
#:for arg in args
!print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
! unset
istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseUnSetPreferredLocation, cudaCpuDeviceId )
if (istat /= cudaSuccess) then
write(*,"('Error code: ',I0, ': ')") istat
write(*,*) cudaGetErrorString(istat)
endif
! set
istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 )
if (istat /= cudaSuccess) then
write(*,"('Error code: ',I0, ': ')") istat
write(*,*) cudaGetErrorString(istat)
endif
#:endfor
end if
end block
#endif
#endif
#:enddef


#:def PARSE(s)
${s if s.rfind(')') == -1 else next((s[:i] for i in range(s.rfind(')'), -1, -1) if s[i] == '(' and s.count('(', i, s.rfind(')')+1) == s.count(')', i, s.rfind(')')+1)), s)}$
#:enddef
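#! PARSE strips a trailing parenthesised dimension spec from an allocate argument,
#! e.g. 'x_cb(-1 - buff_size:m + buff_size)' becomes 'x_cb', while earlier component
#! parentheses are left intact, so the bare variable can be passed to the CUDA
#! prefetch/advise calls in the ALLOCATE macro below.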

#:def ALLOCATE(*args)
@:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
#:set allocated_variables = ', '.join(args)
allocate (${allocated_variables}$)
$:GPU_ENTER_DATA(create=('[' + allocated_variables + ']'))


#ifdef MFC_SIMULATION
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
block
use cudafor
intrinsic :: minval, maxval, sum
integer :: istat, stream_id
integer :: alloc_mode
character(len=10) :: alloc_mode_str

! environment variable
call get_environment_variable("NVIDIA_ALLOC_MODE", alloc_mode_str)
if (trim(alloc_mode_str) == "0") then ! no CPU first touch, no preferred location CPU
alloc_mode = 0
elseif (trim(alloc_mode_str) == "1") then ! CPU first touch, no preferred location CPU
alloc_mode = 1
elseif (trim(alloc_mode_str) == "2") then ! no CPU first touch, preferred location CPU
alloc_mode = 2
elseif (trim(alloc_mode_str) == "3") then ! CPU first touch, preferred location CPU
alloc_mode = 3
else ! default
alloc_mode = 0
endif

stream_id = 0

! prefetch to CPU
if ((alloc_mode .eq. 1) .or. (alloc_mode .eq. 3)) then
#:for arg in args
istat = cudaMemPrefetchAsync( c_devloc(@{PARSE(${arg}$)}@), SIZEOF(@{PARSE(${arg}$)}@), cudaCpuDeviceId, stream_id )
!print*, "! @{PARSE(${arg}$)}@ with shape", SHAPE(@{PARSE(${arg}$)}@), "=> prefetch to CPU"
if (istat /= cudaSuccess) then
write(*,"('Error code: ',I0, ': ')") istat
write(*,*) cudaGetErrorString(istat)
endif
#:endfor
endif

! memadvise preferred location
if ((alloc_mode .eq. 2) .or. (alloc_mode .eq. 3)) then
#:for arg in args
istat = cudaMemAdvise( c_devloc(@{PARSE(${arg}$)}@), SIZEOF(@{PARSE(${arg}$)}@), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId )
!print*, "! @{PARSE(${arg}$)}@ with shape", SHAPE(@{PARSE(${arg}$)}@), "=> preferred location CPU"
if (istat /= cudaSuccess) then
write(*,"('Error code: ',I0, ': ')") istat
write(*,*) cudaGetErrorString(istat)
endif
#:endfor
endif

end block
#endif
#endif

#:enddef ALLOCATE

#:def DEALLOCATE(*args)
9 changes: 9 additions & 0 deletions src/simulation/m_global_parameters.fpp
@@ -1294,16 +1294,25 @@ contains
@:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
@:ALLOCATE(x_cc(-buff_size:m + buff_size))
@:ALLOCATE(dx(-buff_size:m + buff_size))
@:PREFER_GPU(x_cb)
@:PREFER_GPU(x_cc)
@:PREFER_GPU(dx)

if (n == 0) return;
@:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
@:ALLOCATE(y_cc(-buff_size:n + buff_size))
@:ALLOCATE(dy(-buff_size:n + buff_size))
@:PREFER_GPU(y_cb)
@:PREFER_GPU(y_cc)
@:PREFER_GPU(dy)

if (p == 0) return;
@:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
@:ALLOCATE(z_cc(-buff_size:p + buff_size))
@:ALLOCATE(dz(-buff_size:p + buff_size))
@:PREFER_GPU(z_cb)
@:PREFER_GPU(z_cc)
@:PREFER_GPU(dz)

end subroutine s_initialize_global_parameters_module
