From 2358d298f81336a0a26ea068b698305e8157d2fa Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Fri, 1 Aug 2025 18:04:49 +0200
Subject: [PATCH 01/25] Add scripts for santis/alps, example case, and captures
 for UVM comms via RDMA

---
 CMakeLists.txt                                |   6 +-
 .../3D_IGR_TaylorGreenVortex_nvidia/case.py   | 101 ++++++++++++++++++
 misc/nvidia_uvm/bind.sh                       |  24 +++++
 misc/nvidia_uvm/nsys.sh                       |  24 +++++
 src/common/m_mpi_common.fpp                   |   8 ++
 toolchain/mfc/build.py                        |   3 +
 toolchain/modules                             |   3 +
 toolchain/templates/santis.mako               |  86 +++++++++++++++
 8 files changed, 252 insertions(+), 3 deletions(-)
 create mode 100644 examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
 create mode 100755 misc/nvidia_uvm/bind.sh
 create mode 100755 misc/nvidia_uvm/nsys.sh
 create mode 100644 toolchain/templates/santis.mako

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8269c1cb48..c0acb3dbe3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -486,17 +486,17 @@ function(MFC_SETUP_TARGET)
                 endforeach()
 
                 target_compile_options(${a_target}
-                    PRIVATE -gpu=keep,ptxinfo,lineinfo
+                    PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath
                 )
 
                 # GH-200 Unified Memory Support
                 if (MFC_Unified)
                     target_compile_options(${ARGS_TARGET}
-                        PRIVATE -gpu=unified
+                        PRIVATE -gpu=mem:unified -cuda
                     )
                     # "This option must appear in both the compile and link lines" -- NVHPC Docs
                     target_link_options(${ARGS_TARGET}
-                        PRIVATE -gpu=unified
+                        PRIVATE -gpu=mem:unified -cuda
                     )
                 endif()
 
diff --git a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
new file mode 100644
index 0000000000..74faa7aa22
--- /dev/null
+++ b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+import math
+import json
+
+N = 799
+Nx = N
+Ny = 2*(N+1)-1
+Nz = 2*(N+1)-1
+
+Re = 1600
+L = 1
+P0 = 101325
+rho0 = 1
+C0 = math.sqrt(1.4 * P0)
+V0 = 0.1 * C0
+mu = V0 * L / Re
+
+cfl = 0.5
+dx = 2 * math.pi * L / (Ny + 1)
+
+dt = cfl * dx / (C0)
+
+tC = L / V0
+tEnd = 20 * tC
+
+Nt = int(tEnd / dt)
+Nt = 10
+
+
+# Configuring case dictionary
+print(
+    json.dumps(
+        {
+            "rdma_mpi": "T",
+            # Logistics
+            "run_time_info": "F",
+            # Computational Domain Parameters
+            "x_domain%beg": -math.pi * L,
+            "x_domain%end": math.pi * L,
+            "y_domain%beg": -math.pi * L,
+            "y_domain%end": math.pi * L,
+            "z_domain%beg": -math.pi * L,
+            "z_domain%end": math.pi * L,
+            "m": Nx,
+            "n": Ny,
+            "p": Nz,
+            "cyl_coord": "F",
+            "dt": dt,
+            "t_step_start": 0,
+            "t_step_stop": Nt,
+            "t_step_save": int(Nt / 100) or Nt,  # fall back to Nt so the save interval is never 0 (Nt < 100)
+            # Simulation Algorithm Parameters
+            "num_patches": 1,
+            "model_eqns": 2,
+            "num_fluids": 1,
+            "time_stepper": 3,
+            "bc_x%beg": -1,
+            "bc_x%end": -1,
+            "bc_y%beg": -1,
+            "bc_y%end": -1,
+            "bc_z%beg": -1,
+            "bc_z%end": -1,
+            "igr": "T",
+            "igr_order": 5,
+            "igr_iter_solver": 1,
+            "num_igr_iters": 3,
+            "num_igr_warm_start_iters": 3,
+            "alf_factor": 10,
+            "viscous": "T",
+            # Formatted Database Files Structure Parameters
+            "format": 1,
+            "precision": 2,
+            "prim_vars_wrt": "T",
+            "omega_wrt(1)": "T",
+            "omega_wrt(2)": "T",
+            "omega_wrt(3)": "T",
+            "qm_wrt": "T",
+            "fd_order": 4,
+            "parallel_io": "T",
+            # Patch 1: Background (AIR - 2)
+            "patch_icpp(1)%geometry": 9,
+            "patch_icpp(1)%x_centroid": 0,
+            "patch_icpp(1)%y_centroid": 0,
+            "patch_icpp(1)%z_centroid": 0,
+            "patch_icpp(1)%length_x": 2 * math.pi * L,
+            "patch_icpp(1)%length_y": 2 * math.pi * L,
+            "patch_icpp(1)%length_z": 2 * math.pi * L,
+            "patch_icpp(1)%vel(1)": 0.0,
+            "patch_icpp(1)%vel(2)": 0.0,
+            "patch_icpp(1)%vel(3)": 0,
+            "patch_icpp(1)%pres": 0.0,
+            "patch_icpp(1)%hcid": 380,
+            "patch_icpp(1)%alpha_rho(1)": 1,
+            "patch_icpp(1)%alpha(1)": 1,
+            # Fluids Physical Parameters
+            "fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1),
+            "fluid_pp(1)%pi_inf": 0,
+            "fluid_pp(1)%Re(1)": 1 / mu,
+        }
+    )
+)
diff --git a/misc/nvidia_uvm/bind.sh b/misc/nvidia_uvm/bind.sh
new file mode 100755
index 0000000000..0b7bf91e96
--- /dev/null
+++ b/misc/nvidia_uvm/bind.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# -------------------------------- #
+# Binding for a single Santis node #
+# -------------------------------- #
+
+# Local rank: OpenMPI value first, then Slurm, else 0 when started outside both launchers
+export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-${SLURM_LOCALID:-0}}"
+
+# Bind to GPU
+export CUDA_VISIBLE_DEVICES="$local_rank"
+
+# Binding to NIC
+export MPICH_OFI_NIC_POLICY=USER
+export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"
+
+# Bind to cores ( first core per socket )
+physcores=(0 72 144 216)
+
+#echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY
+
+# Run the wrapped command pinned to this rank's core; exec leaves no wrapper shell in between
+exec numactl -l --all --physcpubind="${physcores[$local_rank]}" "$@"
+#set +x
diff --git a/misc/nvidia_uvm/nsys.sh b/misc/nvidia_uvm/nsys.sh
new file mode 100755
index 0000000000..172bcb2f69
--- /dev/null
+++ b/misc/nvidia_uvm/nsys.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Wrap "$@" in `nsys profile` on rank 0 when NSYS is non-zero; every other case just execs "$@".
+set -euo pipefail
+
+rank="${OMPI_COMM_WORLD_RANK:-${SLURM_PROCID:-0}}"
+
+[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep
+[[ -z "${NSYS+x}" ]] && NSYS=0
+
+if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then
+  exec nsys profile \
+       --cpuctxsw=none -b none -s none \
+      --event-sample=system-wide \
+      --cpu-socket-events=61,71,265,273 \
+      --cpu-socket-metrics=103,104 \
+      --event-sampling-interval=10 \
+      --trace=nvtx,openacc \
+      --force-overwrite=true \
+      -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
+      -o "$NSYS_FILE" "$@"
+else
+  exec "$@"
+fi
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 100c055d8d..2bdd241344 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -38,7 +38,9 @@ module m_mpi_common
     !! average primitive variables, for a single computational domain boundary
     !! at the time, from the relevant neighboring processor.
 
+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
     $:GPU_DECLARE(create='[buff_send, buff_recv]')
+#endif
 
     integer :: halo_size
     $:GPU_DECLARE(create='[halo_size]')
@@ -78,7 +80,13 @@ contains
 
         $:GPU_UPDATE(device='[halo_size, v_size]')
 
+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
         @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
+#else
+        ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
+        !$acc enter data create(capture:buff_send)
+        !$acc enter data create(capture:buff_recv)
+#endif
 #endif
 
     end subroutine s_initialize_mpi_common_module
diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index 2de738986d..750c9b294c 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -64,6 +64,9 @@ def get_install_dirpath(self, case: Case ) -> str:
         # The install directory is located <root>/build/install/<slug>
         return os.sep.join([os.getcwd(), "build", "install", self.get_slug(case)])
 
+    def get_home_dirpath(self, case: Case) -> str:
+        return os.getcwd()  # <root>: the same base directory get_install_dirpath() builds on
+
     def get_install_binpath(self, case: Case ) -> str:
         # <root>/install/<slug>/bin/<target>
         return os.sep.join([self.get_install_dirpath(case), "bin", self.name])
diff --git a/toolchain/modules b/toolchain/modules
index 1e7ebe97f3..19e4e4d8df 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -85,3 +85,6 @@ n-cpu penguin/openmpi/4.1.5/gcc-8.5.0
 n-gpu penguin/openmpi/4.1.5/nvhpc-22.3 nvidia/nvhpc/22.3 cuda/cuda-11.6
 n-gpu CC=nvc CXX=nvc++ FC=nvfortran
 
+san   CSCS Santis
+san-all cmake python
+san-gpu nvhpc cuda cray-mpich
diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako
new file mode 100644
index 0000000000..926c682039
--- /dev/null
+++ b/toolchain/templates/santis.mako
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+## Slurm job template for CSCS Santis. NOTE(review): --uenv and --reservation below are hard-coded; confirm they fit your allocation.
+<%namespace name="helpers" file="helpers.mako"/>
+
+% if engine == 'batch':
+#SBATCH --uenv=icon/25.2:v1
+#SBATCH --nodes=${nodes}
+#SBATCH --reservation=g183
+#SBATCH --ntasks-per-node=${tasks_per_node}
+#SBATCH --job-name="${name}"
+#SBATCH --output="${name}.out"
+#SBATCH --error="${name}.err"
+#SBATCH --time=${walltime}
+% if account:
+#SBATCH --account=${account}
+% endif
+% if partition:
+#SBATCH --partition=${partition}
+% endif
+% if quality_of_service:
+#SBATCH --qos=${quality_of_service}
+% endif
+% if email:
+#SBATCH --mail-user=${email}
+#SBATCH --mail-type="BEGIN, END, FAIL"
+% endif
+% endif
+
+# NVHPC and CUDA env vars
+export NV_ACC_USE_MALLOC=0                    # use cudaMallocManaged instead of malloc ( compiled using -gpu=mem:unified )
+export NVCOMPILER_ACC_NO_MEMHINTS=1           # disable implicit compiler hints
+#export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH
+
+# Cray MPICH
+export MPICH_GPU_SUPPORT_ENABLED=1
+export FI_CXI_RX_MATCH_MODE=software
+export FI_MR_CACHE_MONITOR=disabled
+export MPICH_NO_BUFFER_ALIAS_CHECK=1
+
+# CUSTOM env vars to MFC
+export NVIDIA_ALLOC_MODE=0                    # do nothing
+export NVIDIA_MANUAL_GPU_HINTS=1              # prefloc GPU on some
+export NVIDIA_IGR_TEMPS_ON_GPU=3              # jac, jac_rhs, and jac_old on GPU
+export NVIDIA_VARS_ON_GPU=7                   # q_cons_ts(1)%vf%sf for j=1-7 on GPU
+
+# NSYS
+export NSYS=1                                 # enable nsys profiling
+export NSYS_FILE=myreport.qdrep
+
+${helpers.template_prologue()}
+
+ok ":) Loading modules:\n"
+cd "${MFC_ROOT_DIR}"
+% if engine == 'batch':
+. ./mfc.sh load -c san -m ${'g' if gpu else 'c'}
+% endif
+cd - > /dev/null
+echo
+
+% for target in targets:
+    ${helpers.run_prologue(target)}
+
+    % if not mpi:
+        (set -x; ${profiler} "${target.get_install_binpath(case)}")
+    % else:
+        (set -x; srun --unbuffered \
+                --ntasks=${nodes*tasks_per_node}                     \
+                --cpus-per-task 1                                    \
+                --cpu-bind=none                                      \
+            % if gpu:
+                --gpus-per-task 1                                    \
+            % endif
+                --wait 200 --bcast=/tmp/${target.name}               \
+                "${target.get_home_dirpath(case)}/misc/nvidia_uvm/bind.sh" \
+            % if target.name == 'simulation':
+                "${target.get_home_dirpath(case)}/misc/nvidia_uvm/nsys.sh" \
+            % endif
+                "${target.get_install_binpath(case)}")
+    % endif
+
+    ${helpers.run_epilogue(target)}
+
+    echo
+% endfor
+
+${helpers.template_epilogue()}

From 37d393b02f5a5d01a27a1ffeca10d96231600d22 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Fri, 1 Aug 2025 20:34:32 +0200
Subject: [PATCH 02/25] Add PREFER_GPU and rearrange update for out-of-core
 computation

---
 src/common/include/macros.fpp          | 41 ++++++++++++++++
 src/simulation/m_global_parameters.fpp |  9 ++++
 src/simulation/m_igr.fpp               |  6 +++
 src/simulation/m_time_steppers.fpp     | 65 +++++++++++++++++++++++++-
 4 files changed, 119 insertions(+), 2 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index c1652388c3..b0d87d31b0 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -12,6 +12,47 @@
 #endif
 #:enddef
 
+#:def PREFER_GPU(*args)
+#ifdef MFC_SIMULATION
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+    block
+    use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
+    integer :: istat
+    integer :: prefer_gpu_mode
+    character(len=10) :: prefer_gpu_mode_str
+
+    ! NVIDIA_MANUAL_GPU_HINTS=1 enables the memory advice below; 0, unset, or any other value disables it
+    call get_environment_variable("NVIDIA_MANUAL_GPU_HINTS", prefer_gpu_mode_str)
+    if (trim(prefer_gpu_mode_str) == "0") then ! OFF
+        prefer_gpu_mode = 0
+    elseif (trim(prefer_gpu_mode_str) == "1") then ! ON
+        prefer_gpu_mode = 1
+    else ! default (variable unset or unrecognized value): hints stay off
+        prefer_gpu_mode = 0
+    endif
+
+    if (prefer_gpu_mode .eq. 1) then
+    #:for arg in args
+        !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
+        ! unset
+        istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseUnSetPreferredLocation, cudaCpuDeviceId )
+        if (istat /= cudaSuccess) then
+            write(*,"('Error code: ',I0, ': ')") istat
+            write(*,*) cudaGetErrorString(istat)
+        endif
+        ! set
+        istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 )
+        if (istat /= cudaSuccess) then
+            write(*,"('Error code: ',I0, ': ')") istat
+            write(*,*) cudaGetErrorString(istat)
+        endif
+    #:endfor
+    end if
+    end block
+#endif
+#endif
+#:enddef
+
 #:def ALLOCATE(*args)
     @:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
     #:set allocated_variables = ', '.join(args)
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 5be11129a2..2c2d0af646 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -1308,16 +1308,25 @@ contains
         @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
         @:ALLOCATE(x_cc(-buff_size:m + buff_size))
         @:ALLOCATE(dx(-buff_size:m + buff_size))
+        @:PREFER_GPU(x_cb)
+        @:PREFER_GPU(x_cc)
+        @:PREFER_GPU(dx)
 
         if (n == 0) return; 
         @:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
         @:ALLOCATE(y_cc(-buff_size:n + buff_size))
         @:ALLOCATE(dy(-buff_size:n + buff_size))
+        @:PREFER_GPU(y_cb)
+        @:PREFER_GPU(y_cc)
+        @:PREFER_GPU(dy)
 
         if (p == 0) return; 
         @:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
         @:ALLOCATE(z_cc(-buff_size:p + buff_size))
         @:ALLOCATE(dz(-buff_size:p + buff_size))
+        @:PREFER_GPU(z_cb)
+        @:PREFER_GPU(z_cc)
+        @:PREFER_GPU(dz)
 
     end subroutine s_initialize_global_parameters_module
 
diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index db80bb8346..76069928f2 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -91,17 +91,23 @@ contains
                 end do
             end do
             $:GPU_UPDATE(device='[Res, Re_idx, Re_size]')
+            @:PREFER_GPU(Res)
+            @:PREFER_GPU(Re_idx)
         end if
 
         @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, &
             idwbuff(2)%beg:idwbuff(2)%end, &
             idwbuff(3)%beg:idwbuff(3)%end))
+        @:PREFER_GPU(jac)
+
         @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))
+        @:PREFER_GPU(jac_rhs)
 
         if (igr_iter_solver == 1) then ! Jacobi iteration
             @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, &
                 idwbuff(2)%beg:idwbuff(2)%end, &
                 idwbuff(3)%beg:idwbuff(3)%end))
+            @:PREFER_GPU(jac_old)
         end if
 
         $:GPU_PARALLEL_LOOP(collapse=3)
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index d040650bfa..c87bcad464 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -95,9 +95,11 @@ contains
 
         ! Allocating the cell-average conservative variables
         @:ALLOCATE(q_cons_ts(1:num_ts))
+        @:PREFER_GPU(q_cons_ts)
 
         do i = 1, num_ts
             @:ALLOCATE(q_cons_ts(i)%vf(1:sys_size))
+            @:PREFER_GPU(q_cons_ts(i)%vf)
         end do
 
         do i = 1, num_ts
@@ -105,6 +107,7 @@ contains
                 @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
                     idwbuff(2)%beg:idwbuff(2)%end, &
                     idwbuff(3)%beg:idwbuff(3)%end))
+                @:PREFER_GPU(q_cons_ts(i)%vf(j)%sf)
             end do
             @:ACC_SETUP_VFs(q_cons_ts(i))
         end do
@@ -304,11 +307,13 @@ contains
 
         ! Allocating the cell-average RHS variables
         @:ALLOCATE(rhs_vf(1:sys_size))
+        @:PREFER_GPU(rhs_vf)
 
         if (igr) then
             do i = 1, sys_size
                 @:ALLOCATE(rhs_vf(i)%sf(-1:m+1,-1:n+1,-1:p+1))
                 @:ACC_SETUP_SFs(rhs_vf(i))
+                @:PREFER_GPU(rhs_vf(i)%sf)
             end do
         else
             do i = 1, sys_size
@@ -650,6 +655,7 @@ contains
         real(wp), intent(INOUT) :: time_avg
 
         integer :: i, j, k, l, q !< Generic loop iterator
+        integer :: dest
 
         real(wp) :: start, finish
 
@@ -682,6 +688,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
+#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM)
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
@@ -694,6 +701,24 @@ contains
                 end do
             end do
         end do
+        dest = 2 ! result in q_cons_ts(2)%vf
+#else
+        $:GPU_PARALLEL_LOOP(collapse=4)
+        do i = 1, sys_size
+            do l = 0, p
+                do k = 0, n
+                    do j = 0, m
+                        q_cons_ts(2)%vf(i)%sf(j, k, l) = &
+                            q_cons_ts(1)%vf(i)%sf(j, k, l)
+                        q_cons_ts(1)%vf(i)%sf(j, k, l) = &
+                            q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                            + dt*rhs_vf(i)%sf(j, k, l)
+                    end do
+                end do
+            end do
+        end do
+        dest = 1 ! result in q_cons_ts(1)%vf
+#endif
 
         !Evolve pb and mv for non-polytropic qbmm
         if (qbmm .and. (.not. polytropic)) then
@@ -750,10 +775,11 @@ contains
 
         ! Stage 2 of 3
 
-        call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
+        call s_compute_rhs(q_cons_ts(dest)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
 
+#if  !defined(__NVCOMPILER_GPU_UNIFIED_MEM)
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
@@ -767,6 +793,23 @@ contains
                 end do
             end do
         end do
+        dest = 2 ! result in q_cons_ts(2)%vf
+#else
+        $:GPU_PARALLEL_LOOP(collapse=4)
+        do i = 1, sys_size
+            do l = 0, p
+                do k = 0, n
+                    do j = 0, m
+                        q_cons_ts(1)%vf(i)%sf(j, k, l) = &
+                            (3._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) &
+                             + q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                             + dt*rhs_vf(i)%sf(j, k, l))/4._wp
+                    end do
+                end do
+            end do
+        end do
+        dest = 1 ! result in q_cons_ts(1)%vf
+#endif
 
         if (qbmm .and. (.not. polytropic)) then
             $:GPU_PARALLEL_LOOP(collapse=5)
@@ -823,10 +866,11 @@ contains
         end if
 
         ! Stage 3 of 3
-        call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 3)
+        call s_compute_rhs(q_cons_ts(dest)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 3)
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3)
 
+#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM)
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
@@ -840,6 +884,23 @@ contains
                 end do
             end do
         end do
+        dest = 1 ! result in q_cons_ts(1)%vf
+#else
+        $:GPU_PARALLEL_LOOP(collapse=4)
+        do i = 1, sys_size
+            do l = 0, p
+                do k = 0, n
+                    do j = 0, m
+                        q_cons_ts(1)%vf(i)%sf(j, k, l) = &
+                            (q_cons_ts(2)%vf(i)%sf(j, k, l) &
+                             + 2._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                             + 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp
+                    end do
+                end do
+            end do
+        end do
+        dest = 1 ! result in q_cons_ts(1)%vf
+#endif
 
         if (qbmm .and. (.not. polytropic)) then
             $:GPU_PARALLEL_LOOP(collapse=5)

From 693c7f46e562d5039fe30d23ccfcca3753721ff6 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Fri, 1 Aug 2025 20:35:21 +0200
Subject: [PATCH 03/25] Allow keeping q_cons_ts(2) on CPU using pinned
 allocations

---
 src/simulation/m_time_steppers.fpp | 68 +++++++++++++++++++++++++++---
 toolchain/templates/santis.mako    |  3 +-
 2 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index c87bcad464..0d9ddfd885 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -75,8 +75,14 @@ module m_time_steppers
     integer, private :: num_ts !<
     !! Number of time stages in the time-stepping scheme
 
+    integer, private :: out_of_core
+
     $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts,rhs_mv,rhs_pb,max_dt]')
 
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+    real(wp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host
+#endif
+
 contains
 
     !> The computation of parameters, the allocation of memory,
@@ -86,6 +92,21 @@ contains
 
         integer :: i, j !< Generic loop iterators
 
+        character(len=10) :: out_of_core_str
+        out_of_core = 0
+
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+        call get_environment_variable("MFC_OUT_OF_CORE", out_of_core_str)
+
+        if (trim(out_of_core_str) == "0") then
+            out_of_core = 0
+        elseif (trim(out_of_core_str) == "1") then
+            out_of_core = 1
+        else ! default
+            out_of_core = 0
+        endif
+#endif
+
         ! Setting number of time-stages for selected time-stepping scheme
         if (time_stepper == 1) then
             num_ts = 1
@@ -102,12 +123,33 @@ contains
             @:PREFER_GPU(q_cons_ts(i)%vf)
         end do
 
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+        if ( out_of_core == 1 ) then
+           allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
+                                        idwbuff(2)%beg:idwbuff(2)%end, &
+                                        idwbuff(3)%beg:idwbuff(3)%end, &
+                                        1:sys_size))
+        end if
+#endif
+
         do i = 1, num_ts
             do j = 1, sys_size
-                @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                    idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end))
-                @:PREFER_GPU(q_cons_ts(i)%vf(j)%sf)
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+                if ( i <= (num_ts - out_of_core) ) then
+                    !print*, "q_cons_ts", i, j, "on GPU"
+#endif
+                    @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end))
+                    @:PREFER_GPU(q_cons_ts(i)%vf(j)%sf)
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+                else
+                    !print*, "q_cons_ts", i, j, "on CPU"
+                    q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:,:,:,j)
+                end if
+#endif
             end do
             @:ACC_SETUP_VFs(q_cons_ts(i))
         end do
@@ -1205,7 +1247,17 @@ contains
         ! Deallocating the cell-average conservative variables
         do i = 1, num_ts
             do j = 1, sys_size
-                @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf)
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+                if ( i <= (num_ts - out_of_core) ) then
+                    !print*, "q_cons_ts", i, j, "dealloc"
+#endif
+                    @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf)
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+                else
+                    !print*, "q_cons_ts", i, j, "nullify"
+                    nullify(q_cons_ts(i)%vf(j)%sf)
+                end if
+#endif
             end do
 
             @:DEALLOCATE(q_cons_ts(i)%vf)
@@ -1213,6 +1265,12 @@ contains
 
         @:DEALLOCATE(q_cons_ts)
 
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+        if ( out_of_core == 1 ) then
+            deallocate(q_cons_ts_pool_host)
+        end if
+#endif
+
         ! Deallocating the cell-average primitive ts variables
         if (probe_wrt) then
             do i = 0, 3
diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako
index 926c682039..27b4d6b425 100644
--- a/toolchain/templates/santis.mako
+++ b/toolchain/templates/santis.mako
@@ -38,10 +38,9 @@ export FI_MR_CACHE_MONITOR=disabled
 export MPICH_NO_BUFFER_ALIAS_CHECK=1
 
 # CUSTOM env vars to MFC
-export NVIDIA_ALLOC_MODE=0                    # do nothing
+export MFC_OUT_OF_CORE=1                      # out of core
 export NVIDIA_MANUAL_GPU_HINTS=1              # prefloc GPU on some
 export NVIDIA_IGR_TEMPS_ON_GPU=3              # jac, jac_rhs, and jac_old on GPU
-export NVIDIA_VARS_ON_GPU=7                   # q_cons_ts(1)%vf%sf for j=1-7 on GPU
 
 # NSYS
 export NSYS=1                                 # enable nsys profiling

From 7054b7b666226cbe91d5a078353ae0389eefad43 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Fri, 1 Aug 2025 21:06:39 +0200
Subject: [PATCH 04/25] Modify PREFER_GPU macro

---
 src/common/include/macros.fpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index b0d87d31b0..0ffad2e06b 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -34,14 +34,20 @@
     if (prefer_gpu_mode .eq. 1) then
     #:for arg in args
         !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
-        ! unset
-        istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseUnSetPreferredLocation, cudaCpuDeviceId )
+        ! set preferred location GPU
+        istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 )
         if (istat /= cudaSuccess) then
             write(*,"('Error code: ',I0, ': ')") istat
             write(*,*) cudaGetErrorString(istat)
         endif
-        ! set
-        istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 )
+        ! set accessed by CPU
+        istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId )
+        if (istat /= cudaSuccess) then
+            write(*,"('Error code: ',I0, ': ')") istat
+            write(*,*) cudaGetErrorString(istat)
+        endif
+        ! prefetch to GPU - physically populate memory pages
+        istat = cudaMemPrefetchAsync( c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0 )
         if (istat /= cudaSuccess) then
             write(*,"('Error code: ',I0, ': ')") istat
             write(*,*) cudaGetErrorString(istat)

From ee1277d8dae3a02e9e66377f9ce2b28c977bf70f Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Sat, 2 Aug 2025 00:53:53 +0200
Subject: [PATCH 05/25] Allow control in placement of IGR temps

---
 src/simulation/m_igr.fpp | 124 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)

diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index 76069928f2..94fb6cd857 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -24,8 +24,16 @@ module m_igr
  s_igr_flux_add, &
  s_finalize_igr_module
 
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+    integer, dimension(3) :: temp_on_gpu
+    real(wp), pointer, contiguous, dimension(:, :, :) :: jac,jac_rhs,jac_old
+    real(wp), allocatable, dimension(:, :, :), pinned, target :: pool_host1
+    real(wp), allocatable, dimension(:, :, :), pinned, target :: pool_host2
+    real(wp), allocatable, dimension(:, :, :), pinned, target :: pool_host3
+#else
     real(wp), allocatable, dimension(:, :, :) :: jac, jac_rhs, jac_old
     $:GPU_DECLARE(create='[jac, jac_rhs, jac_old]')
+#endif
 
     real(wp), allocatable, dimension(:, :) :: Res
     $:GPU_DECLARE(create='[Res]')
@@ -82,6 +90,47 @@ module m_igr
 contains
 
     subroutine s_initialize_igr_module()
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+        integer :: igr_temps_total
+        integer :: igr_temps_on_gpu
+        integer :: igr_temps_on_cpu
+        character(len=10) :: igr_temps_on_gpu_str
+
+        ! initialize
+        if (igr_iter_solver == 1) then ! Jacobi iteration
+            igr_temps_total = 3
+        else
+            igr_temps_total = 2
+        end if
+        igr_temps_on_gpu = igr_temps_total
+        igr_temps_on_cpu = 0
+
+        call get_environment_variable("NVIDIA_IGR_TEMPS_ON_GPU", igr_temps_on_gpu_str)
+
+        if (trim(igr_temps_on_gpu_str) == "0") then
+            igr_temps_on_gpu = 0 ! jac, jac_rhs and jac_old on CPU
+        else if (trim(igr_temps_on_gpu_str) == "1") then
+            igr_temps_on_gpu = 1 ! jac on GPU, jac_rhs on CPU, jac_old on CPU
+        else if (trim(igr_temps_on_gpu_str) == "2") then
+            igr_temps_on_gpu = 2 ! jac and jac_rhs on GPU, jac_old on CPU
+        else if (trim(igr_temps_on_gpu_str) == "3") then
+            igr_temps_on_gpu = 3 ! jac, jac_rhs and jac_old on GPU
+        else ! default on GPU
+            igr_temps_on_gpu = 3
+        end if
+
+        ! trim if needed
+        if ( igr_temps_on_gpu > igr_temps_total ) then
+            igr_temps_on_gpu = igr_temps_total
+        end if
+        igr_temps_on_cpu = igr_temps_total - igr_temps_on_gpu
+
+        ! create map
+        temp_on_gpu(1:3) = -1
+        temp_on_gpu(1:igr_temps_total) = 0
+        temp_on_gpu(1:igr_temps_on_gpu) = 1
+        print*, temp_on_gpu(1:3)
+#endif
 
         if (viscous) then
             @:ALLOCATE(Res(1:2, 1:maxval(Re_size)))
@@ -95,6 +144,7 @@ contains
             @:PREFER_GPU(Re_idx)
         end if
 
+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
         @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, &
             idwbuff(2)%beg:idwbuff(2)%end, &
             idwbuff(3)%beg:idwbuff(3)%end))
@@ -109,6 +159,55 @@ contains
                 idwbuff(3)%beg:idwbuff(3)%end))
             @:PREFER_GPU(jac_old)
         end if
+#else
+
+        if ( temp_on_gpu(1) == 1 ) then
+            @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end))
+            @:PREFER_GPU(jac)
+        else
+            print*, 'jac on CPU'
+
+            allocate(pool_host1(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end))
+
+            jac(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end) => pool_host1(:,:,:)
+        end if
+
+        if ( temp_on_gpu(2) == 1 ) then
+            @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))
+            @:PREFER_GPU(jac_rhs)
+        else
+            print*, 'jac_rhs on CPU'
+
+            allocate(pool_host2(-1:m,-1:n,-1:p))
+
+            jac_rhs(-1:m,-1:n,-1:p) => pool_host2(:,:,:)
+        end if
+
+        if (igr_iter_solver == 1) then ! Jacobi iteration
+            if ( temp_on_gpu(3) == 1 ) then
+                @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+                @:PREFER_GPU(jac_old)
+            else
+                print*, 'jac_old on CPU'
+
+                allocate(pool_host3(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
+
+                jac_old(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end) => pool_host3(:,:,:)
+            end if
+        end if
+#endif
 
         $:GPU_PARALLEL_LOOP(collapse=3)
         do l = idwbuff(3)%beg, idwbuff(3)%end
@@ -2618,11 +2717,36 @@ contains
             @:DEALLOCATE(Res)
         end if
 
+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
         @:DEALLOCATE(jac, jac_rhs)
 
         if (igr_iter_solver == 1) then ! Jacobi iteration
             @:DEALLOCATE(jac_old)
         end if
+#else
+        if (temp_on_gpu(1) == 1) then
+            @:DEALLOCATE(jac)
+        else
+            nullify(jac)
+            deallocate(pool_host1)
+        end if
+
+        if (temp_on_gpu(2) == 1) then
+            @:DEALLOCATE(jac_rhs)
+        else
+            nullify(jac_rhs)
+            deallocate(pool_host2)
+        end if
+
+        if (igr_iter_solver == 1) then ! Jacobi iteration
+            if (temp_on_gpu(3) == 1) then
+                @:DEALLOCATE(jac_old)
+            else
+                nullify(jac_old)
+                deallocate(pool_host3)
+            end if
+        end if
+#endif
 
         #:if not MFC_CASE_OPTIMIZATION
             @:DEALLOCATE(coeff_L, coeff_R)

From 4065c024734978f95498f1be2504781c86da434b Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Sat, 2 Aug 2025 09:26:23 +0200
Subject: [PATCH 06/25] Do some clean up

---
 src/simulation/m_igr.fpp           | 37 +++++-------------------------
 src/simulation/m_time_steppers.fpp |  5 ++--
 2 files changed, 8 insertions(+), 34 deletions(-)

diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index 94fb6cd857..9be4b514c3 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -91,20 +91,9 @@ contains
 
     subroutine s_initialize_igr_module()
 #ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-        integer :: igr_temps_total
-        integer :: igr_temps_on_gpu
-        integer :: igr_temps_on_cpu
+        integer :: igr_temps_on_gpu = 3
         character(len=10) :: igr_temps_on_gpu_str
 
-        ! initialize
-        if (igr_iter_solver == 1) then ! Jacobi iteration
-            igr_temps_total = 3
-        else
-            igr_temps_total = 2
-        end if
-        igr_temps_on_gpu = igr_temps_total
-        igr_temps_on_cpu = 0
-
         call get_environment_variable("NVIDIA_IGR_TEMPS_ON_GPU", igr_temps_on_gpu_str)
 
         if (trim(igr_temps_on_gpu_str) == "0") then
@@ -119,17 +108,10 @@ contains
             igr_temps_on_gpu = 3
         end if
 
-        ! trim if needed
-        if ( igr_temps_on_gpu > igr_temps_total ) then
-            igr_temps_on_gpu = igr_temps_total
-        end if
-        igr_temps_on_cpu = igr_temps_total - igr_temps_on_gpu
-
         ! create map
-        temp_on_gpu(1:3) = -1
-        temp_on_gpu(1:igr_temps_total) = 0
+        temp_on_gpu(1:3) = 0
         temp_on_gpu(1:igr_temps_on_gpu) = 1
-        print*, temp_on_gpu(1:3)
+        !print*, temp_on_gpu(1:3)
 #endif
 
         if (viscous) then
@@ -148,16 +130,12 @@ contains
         @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, &
             idwbuff(2)%beg:idwbuff(2)%end, &
             idwbuff(3)%beg:idwbuff(3)%end))
-        @:PREFER_GPU(jac)
-
         @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))
-        @:PREFER_GPU(jac_rhs)
 
         if (igr_iter_solver == 1) then ! Jacobi iteration
             @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, &
                 idwbuff(2)%beg:idwbuff(2)%end, &
                 idwbuff(3)%beg:idwbuff(3)%end))
-            @:PREFER_GPU(jac_old)
         end if
 #else
 
@@ -167,8 +145,7 @@ contains
                 idwbuff(3)%beg:idwbuff(3)%end))
             @:PREFER_GPU(jac)
         else
-            print*, 'jac on CPU'
-
+            !print*, 'jac on CPU'
             allocate(pool_host1(idwbuff(1)%beg:idwbuff(1)%end, &
                 idwbuff(2)%beg:idwbuff(2)%end, &
                 idwbuff(3)%beg:idwbuff(3)%end))
@@ -182,8 +159,7 @@ contains
             @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))
             @:PREFER_GPU(jac_rhs)
         else
-            print*, 'jac_rhs on CPU'
-
+            !print*, 'jac_rhs on CPU'
             allocate(pool_host2(-1:m,-1:n,-1:p))
 
             jac_rhs(-1:m,-1:n,-1:p) => pool_host2(:,:,:)
@@ -196,8 +172,7 @@ contains
                     idwbuff(3)%beg:idwbuff(3)%end))
                 @:PREFER_GPU(jac_old)
             else
-                print*, 'jac_old on CPU'
-
+                !print*, 'jac_old on CPU'
                 allocate(pool_host3(idwbuff(1)%beg:idwbuff(1)%end, &
                     idwbuff(2)%beg:idwbuff(2)%end, &
                     idwbuff(3)%beg:idwbuff(3)%end))
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 0d9ddfd885..8f86202bbe 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -75,12 +75,11 @@ module m_time_steppers
     integer, private :: num_ts !<
     !! Number of time stages in the time-stepping scheme
 
-    integer, private :: out_of_core
-
     $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts,rhs_mv,rhs_pb,max_dt]')
 
 #ifdef __NVCOMPILER_GPU_UNIFIED_MEM
     real(wp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host
+    integer, private :: out_of_core
 #endif
 
 contains
@@ -92,10 +91,10 @@ contains
 
         integer :: i, j !< Generic loop iterators
 
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
         character(len=10) :: out_of_core_str
         out_of_core = 0
 
-#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
         call get_environment_variable("MFC_OUT_OF_CORE", out_of_core_str)
 
         if (trim(out_of_core_str) == "0") then

From cfb792c991710dd45f64d415215b34ba23b01cec Mon Sep 17 00:00:00 2001
From: Benjamin Wilfong <bwilfong@daint-ln002.cscs.ch>
Date: Sun, 3 Aug 2025 06:10:07 +0200
Subject: [PATCH 07/25] ENV Vars to case file options and code structure
 changes

---
 CMakeLists.txt                                |   2 +-
 .../3D_IGR_TaylorGreenVortex_nvidia/case.py   |   7 +-
 src/common/include/macros.fpp                 |  16 +-
 src/simulation/m_checker.fpp                  |  10 +
 src/simulation/m_global_parameters.fpp        |  13 +
 src/simulation/m_igr.fpp                      |  73 ++---
 src/simulation/m_mpi_proxy.fpp                |   4 +
 src/simulation/m_start_up.fpp                 |   7 +-
 src/simulation/m_time_steppers.fpp            | 251 ++++++++++--------
 toolchain/mfc/run/case_dicts.py               |   2 +
 toolchain/mfc/test/cases.py                   |  21 +-
 toolchain/templates/santis.mako               |   7 +-
 12 files changed, 215 insertions(+), 198 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c0acb3dbe3..8901ec0b16 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -234,7 +234,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
             message(STATUS "LTO/IPO is not supported in NVHPC Version < 23.11. Use a newer version of NVHPC for best performance.")
         else()
             message(STATUS "Performing IPO using -Mextract followed by -Minline")
-            set(NVHPC_USE_TWO_PASS_IPO TRUE)
+            set(NVHPC_USE_TWO_PASS_IPO FALSE)
         endif()
     else()
         CHECK_IPO_SUPPORTED(RESULT SUPPORTS_IPO OUTPUT IPO_ERROR)
diff --git a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
index 74faa7aa22..17ad1ceb43 100644
--- a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
+++ b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
@@ -47,8 +47,8 @@
             "cyl_coord": "F",
             "dt": dt,
             "t_step_start": 0,
-            "t_step_stop": Nt,
-            "t_step_save": int(Nt / 100),
+            "t_step_stop": 10, #Nt,
+            "t_step_save": 10, #int(Nt / 100),
             # Simulation Algorithm Parameters
             "num_patches": 1,
             "model_eqns": 2,
@@ -96,6 +96,9 @@
             "fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1),
             "fluid_pp(1)%pi_inf": 0,
             "fluid_pp(1)%Re(1)": 1 / mu,
+            # NVIDIA UVM Options
+            "nv_uvm_igr_temps_on_gpu": 3,
+            "nv_uvm_pref_gpu": "T",
         }
     )
 )
diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index 0ffad2e06b..7177efa32d 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -18,20 +18,8 @@
     block
     use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
     integer :: istat
-    integer :: prefer_gpu_mode
-    character(len=10) :: prefer_gpu_mode_str
-
-    ! environment variable
-    call get_environment_variable("NVIDIA_MANUAL_GPU_HINTS", prefer_gpu_mode_str)
-    if (trim(prefer_gpu_mode_str) == "0") then ! OFF
-        prefer_gpu_mode = 0
-    elseif (trim(prefer_gpu_mode_str) == "1") then ! ON
-        prefer_gpu_mode = 1
-    else ! default
-        prefer_gpu_mode = 0
-    endif
-
-    if (prefer_gpu_mode .eq. 1) then
+
+    if (nv_uvm_pref_gpu) then
     #:for arg in args
         !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
         ! set preferred location GPU
diff --git a/src/simulation/m_checker.fpp b/src/simulation/m_checker.fpp
index f0196af0e2..8917b0be46 100644
--- a/src/simulation/m_checker.fpp
+++ b/src/simulation/m_checker.fpp
@@ -30,6 +30,7 @@ contains
 
         if (igr) then
             call s_check_inputs_igr
+            call s_check_inputs_nvidia_uvm
         else
             if (recon_type == WENO_TYPE) then
                 call s_check_inputs_weno
@@ -411,4 +412,13 @@ contains
         @:PROHIBIT(powell .and. fd_order == dflt_int, "fd_order must be set if Powell's method is enabled")
     end subroutine s_check_inputs_mhd
 
+    impure subroutine s_check_inputs_nvidia_uvm
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+        @:PROHIBIT(nv_uvm_igr_temps_on_gpu > 3 .or. nv_uvm_igr_temps_on_gpu < 0, &
+            "nv_uvm_igr_temps_on_gpu must be in the range [0, 3]")
+        @:PROHIBIT(nv_uvm_igr_temps_on_gpu == 3 .and. igr_iter_solver == 2, &
+            "nv_uvm_igr_temps_on_gpu must be in the range [0, 2] for igr_iter_solver == 2")
+#endif
+    end subroutine s_check_inputs_nvidia_uvm
+
 end module m_checker
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 2c2d0af646..401fa5412d 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -156,6 +156,15 @@ module m_global_parameters
         logical :: viscous        !< Viscous effects
     #:endif
 
+    !> @name Variables for out-of-core IGR computation on NVIDIA
+    !> @{
+    integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU
+                                       ! 1 => jac on GPU, jac_rhs and jac_old on CPU
+                                       ! 2 => jac and jac_rhs on GPU, jac_old on CPU
+                                       ! 3 => jac, jac_rhs, and jac_old on GPU (default)
+    logical :: nv_uvm_pref_gpu ! Enable pinned gpu memory (default TRUE)
+    !> @}
+
     real(wp) :: weno_eps       !< Binding for the WENO nonlinear weights
     real(wp) :: teno_CT        !< Smoothness threshold for TENO
     logical :: mp_weno        !< Monotonicity preserving (MP) WENO
@@ -570,6 +579,10 @@ contains
         t_stop = dflt_real
         t_save = dflt_real
 
+        ! NVIDIA UVM options
+        nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default)
+        nv_uvm_pref_gpu = .true.
+
         ! Simulation algorithm parameters
         model_eqns = dflt_int
         mpp_lim = .false.
diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index 9be4b514c3..0d1edad478 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -25,11 +25,11 @@ module m_igr
  s_finalize_igr_module
 
 #ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-    integer, dimension(3) :: temp_on_gpu
+    integer, dimension(3) :: nv_uvm_temp_on_gpu
     real(wp), pointer, contiguous, dimension(:, :, :) :: jac,jac_rhs,jac_old
-    real(wp), allocatable, dimension(:, :, :), pinned, target :: pool_host1
-    real(wp), allocatable, dimension(:, :, :), pinned, target :: pool_host2
-    real(wp), allocatable, dimension(:, :, :), pinned, target :: pool_host3
+    real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_host_pool
+    real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_rhs_host_pool
+    real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_old_host_pool
 #else
     real(wp), allocatable, dimension(:, :, :) :: jac, jac_rhs, jac_old
     $:GPU_DECLARE(create='[jac, jac_rhs, jac_old]')
@@ -81,7 +81,6 @@ module m_igr
                                    5._wp/6._wp, & ! Index 0
                                    2._wp/6._wp & ! Index 1
                                    ]
-
         #:endif
     #:endif
 
@@ -90,29 +89,6 @@ module m_igr
 contains
 
     subroutine s_initialize_igr_module()
-#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-        integer :: igr_temps_on_gpu = 3
-        character(len=10) :: igr_temps_on_gpu_str
-
-        call get_environment_variable("NVIDIA_IGR_TEMPS_ON_GPU", igr_temps_on_gpu_str)
-
-        if (trim(igr_temps_on_gpu_str) == "0") then
-            igr_temps_on_gpu = 0 ! jac, jac_rhs and jac_old on CPU
-        else if (trim(igr_temps_on_gpu_str) == "1") then
-            igr_temps_on_gpu = 1 ! jac on GPU, jac_rhs on CPU, jac_old on CPU
-        else if (trim(igr_temps_on_gpu_str) == "2") then
-            igr_temps_on_gpu = 2 ! jac and jac_rhs on GPU, jac_old on CPU
-        else if (trim(igr_temps_on_gpu_str) == "3") then
-            igr_temps_on_gpu = 3 ! jac, jac_rhs and jac_old on GPU
-        else ! default on GPU
-            igr_temps_on_gpu = 3
-        end if
-
-        ! create map
-        temp_on_gpu(1:3) = 0
-        temp_on_gpu(1:igr_temps_on_gpu) = 1
-        !print*, temp_on_gpu(1:3)
-#endif
 
         if (viscous) then
             @:ALLOCATE(Res(1:2, 1:maxval(Re_size)))
@@ -138,48 +114,47 @@ contains
                 idwbuff(3)%beg:idwbuff(3)%end))
         end if
 #else
+        ! create map
+        nv_uvm_temp_on_gpu(1:3) = 0
+        nv_uvm_temp_on_gpu(1:nv_uvm_igr_temps_on_gpu) = 1
 
-        if ( temp_on_gpu(1) == 1 ) then
+        if (nv_uvm_temp_on_gpu(1) == 1) then
             @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, &
                 idwbuff(2)%beg:idwbuff(2)%end, &
                 idwbuff(3)%beg:idwbuff(3)%end))
             @:PREFER_GPU(jac)
         else
-            !print*, 'jac on CPU'
-            allocate(pool_host1(idwbuff(1)%beg:idwbuff(1)%end, &
+            allocate(jac_host_pool(idwbuff(1)%beg:idwbuff(1)%end, &
                 idwbuff(2)%beg:idwbuff(2)%end, &
                 idwbuff(3)%beg:idwbuff(3)%end))
 
             jac(idwbuff(1)%beg:idwbuff(1)%end, &
                 idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end) => pool_host1(:,:,:)
+                idwbuff(3)%beg:idwbuff(3)%end) => jac_host_pool(:,:,:)
         end if
 
-        if ( temp_on_gpu(2) == 1 ) then
+        if (nv_uvm_temp_on_gpu(2) == 1) then
             @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))
             @:PREFER_GPU(jac_rhs)
         else
-            !print*, 'jac_rhs on CPU'
-            allocate(pool_host2(-1:m,-1:n,-1:p))
-
-            jac_rhs(-1:m,-1:n,-1:p) => pool_host2(:,:,:)
+            allocate(jac_rhs_host_pool(-1:m,-1:n,-1:p))
+            jac_rhs(-1:m,-1:n,-1:p) => jac_rhs_host_pool(:,:,:)
         end if
 
         if (igr_iter_solver == 1) then ! Jacobi iteration
-            if ( temp_on_gpu(3) == 1 ) then
+            if (nv_uvm_temp_on_gpu(3) == 1) then
                 @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, &
                     idwbuff(2)%beg:idwbuff(2)%end, &
                     idwbuff(3)%beg:idwbuff(3)%end))
                 @:PREFER_GPU(jac_old)
             else
-                !print*, 'jac_old on CPU'
-                allocate(pool_host3(idwbuff(1)%beg:idwbuff(1)%end, &
+                allocate(jac_old_host_pool(idwbuff(1)%beg:idwbuff(1)%end, &
                     idwbuff(2)%beg:idwbuff(2)%end, &
                     idwbuff(3)%beg:idwbuff(3)%end))
 
                 jac_old(idwbuff(1)%beg:idwbuff(1)%end, &
                     idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end) => pool_host3(:,:,:)
+                    idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host_pool(:,:,:)
             end if
         end if
 #endif
@@ -203,7 +178,7 @@ contains
 
         #:if not MFC_CASE_OPTIMIZATION
             if (igr_order == 3) then
-                vidxb = -1; vidxe = 2; 
+                vidxb = -1; vidxe = 2;
                 $:GPU_UPDATE(device='[vidxb, vidxe]')
 
                 @:ALLOCATE(coeff_L(0:2))
@@ -219,7 +194,7 @@ contains
                 $:GPU_UPDATE(device='[coeff_R]')
 
             elseif (igr_order == 5) then
-                vidxb = -2; vidxe = 3; 
+                vidxb = -2; vidxe = 3;
                 $:GPU_UPDATE(device='[vidxb, vidxe]')
 
                 @:ALLOCATE(coeff_L(-1:3))
@@ -2699,26 +2674,26 @@ contains
             @:DEALLOCATE(jac_old)
         end if
 #else
-        if (temp_on_gpu(1) == 1) then
+        if (nv_uvm_temp_on_gpu(1) == 1) then
             @:DEALLOCATE(jac)
         else
             nullify(jac)
-            deallocate(pool_host1)
+            deallocate(jac_host_pool)
         end if
 
-        if (temp_on_gpu(2) == 1) then
+        if (nv_uvm_temp_on_gpu(2) == 1) then
             @:DEALLOCATE(jac_rhs)
         else
             nullify(jac_rhs)
-            deallocate(pool_host2)
+            deallocate(jac_rhs_host_pool)
         end if
 
         if (igr_iter_solver == 1) then ! Jacobi iteration
-            if (temp_on_gpu(3) == 1) then
+            if (nv_uvm_temp_on_gpu(3) == 1) then
                 @:DEALLOCATE(jac_old)
             else
                 nullify(jac_old)
-                deallocate(pool_host3)
+                deallocate(jac_old_host_pool)
             end if
         end if
 #endif
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index f5cc89b4c5..3564c1e2e3 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -237,6 +237,10 @@ contains
             #:endfor
         end do
 
+        ! NVIDIA UVM variables
+        call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+        call MPI_BCAST(nv_uvm_pref_gpu, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
+
 #endif
 
     end subroutine s_mpi_bcast_user_inputs
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index e004252060..2bda3c8413 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -185,9 +185,10 @@ contains
             surface_tension, bubbles_lagrange, lag_params, &
             hyperelasticity, R0ref, num_bc_patches, Bx0, powell, &
             cont_damage, tau_star, cont_damage_s, alpha_bar, &
-            alf_factor, num_igr_iters, &
-            num_igr_warm_start_iters, &
-            int_comp, ic_eps, ic_beta 
+            alf_factor, num_igr_iters, num_igr_warm_start_iters, &
+            int_comp, ic_eps, ic_beta, nv_uvm_igr_temps_on_gpu, &
+            nv_uvm_pref_gpu
+
         ! Checking that an input file has been provided by the user. If it
         ! has, then the input file is read in, otherwise, simulation exits.
         inquire (FILE=trim(file_path), EXIST=file_exist)
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 8f86202bbe..540f08f547 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -79,7 +79,6 @@ module m_time_steppers
 
 #ifdef __NVCOMPILER_GPU_UNIFIED_MEM
     real(wp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host
-    integer, private :: out_of_core
 #endif
 
 contains
@@ -91,21 +90,6 @@ contains
 
         integer :: i, j !< Generic loop iterators
 
-#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-        character(len=10) :: out_of_core_str
-        out_of_core = 0
-
-        call get_environment_variable("MFC_OUT_OF_CORE", out_of_core_str)
-
-        if (trim(out_of_core_str) == "0") then
-            out_of_core = 0
-        elseif (trim(out_of_core_str) == "1") then
-            out_of_core = 1
-        else ! default
-            out_of_core = 0
-        endif
-#endif
-
         ! Setting number of time-stages for selected time-stepping scheme
         if (time_stepper == 1) then
             num_ts = 1
@@ -123,35 +107,38 @@ contains
         end do
 
 #ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-        if ( out_of_core == 1 ) then
-           allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
-                                        idwbuff(2)%beg:idwbuff(2)%end, &
-                                        idwbuff(3)%beg:idwbuff(3)%end, &
-                                        1:sys_size))
-        end if
-#endif
+       allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
+                                    idwbuff(2)%beg:idwbuff(2)%end, &
+                                    idwbuff(3)%beg:idwbuff(3)%end, &
+                                    1:sys_size))
+
+        do j = 1, sys_size
+            ! q_cons_ts(1) lives on the device
+            @:ALLOCATE(q_cons_ts(1)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                idwbuff(2)%beg:idwbuff(2)%end, &
+                idwbuff(3)%beg:idwbuff(3)%end))
+            @:PREFER_GPU(q_cons_ts(1)%vf(j)%sf)
+            if (num_ts == 2) then
+                ! q_cons_ts(2) lives on the host
+                q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:,:,:,j)
+            end if
+        end do
 
+        do i = 1, num_ts
+            @:ACC_SETUP_VFs(q_cons_ts(i))
+        end do
+#else
         do i = 1, num_ts
             do j = 1, sys_size
-#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-                if ( i <= (num_ts - out_of_core) ) then
-                    !print*, "q_cons_ts", i, j, "on GPU"
-#endif
-                    @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end))
-                    @:PREFER_GPU(q_cons_ts(i)%vf(j)%sf)
-#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-                else
-                    !print*, "q_cons_ts", i, j, "on CPU"
-                    q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:,:,:,j)
-                end if
-#endif
+                @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                    idwbuff(2)%beg:idwbuff(2)%end, &
+                    idwbuff(3)%beg:idwbuff(3)%end))
             end do
             @:ACC_SETUP_VFs(q_cons_ts(i))
         end do
+#endif
 
         ! Allocating the cell-average primitive ts variables
         if (probe_wrt) then
@@ -513,6 +500,7 @@ contains
 
         integer :: i, j, k, l, q!< Generic loop iterator
         real(wp) :: start, finish
+        integer :: dest
 
         ! Stage 1 of 2
 
@@ -542,12 +530,15 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
+#if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
                         q_cons_ts(2)%vf(i)%sf(j, k, l) = &
+                            q_cons_ts(1)%vf(i)%sf(j, k, l)
+                        q_cons_ts(1)%vf(i)%sf(j, k, l) = &
                             q_cons_ts(1)%vf(i)%sf(j, k, l) &
                             + dt*rhs_vf(i)%sf(j, k, l)
                     end do
@@ -555,6 +546,24 @@ contains
             end do
         end do
 
+        dest = 1 ! Result in q_cons_ts(1)%vf
+#else
+        $:GPU_PARALLEL_LOOP(collapse=4)
+        do i = 1, sys_size
+            do l = 0, p
+                do k = 0, n
+                    do j = 0, m
+                        q_cons_ts(2)%vf(i)%sf(j, k, l) = &
+                            q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                            + dt*rhs_vf(i)%sf(j, k, l)
+                    end do
+                end do
+            end do
+        end do
+
+        dest = 2 ! Result in q_cons_ts(2)%vf
+#endif
+
         !Evolve pb and mv for non-polytropic qbmm
         if (qbmm .and. (.not. polytropic)) then
             $:GPU_PARALLEL_LOOP(collapse=5)
@@ -590,30 +599,46 @@ contains
             end do
         end if
 
-        if (bodyForces) call s_apply_bodyforces(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, dt)
+        if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, dt)
 
-        if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(2)%vf)
+        if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(dest)%vf)
 
         if (model_eqns == 3 .and. (.not. relax)) then
-            call s_pressure_relaxation_procedure(q_cons_ts(2)%vf)
+            call s_pressure_relaxation_procedure(q_cons_ts(dest)%vf)
         end if
 
-        if (adv_n) call s_comp_alpha_from_n(q_cons_ts(2)%vf)
+        if (adv_n) call s_comp_alpha_from_n(q_cons_ts(dest)%vf)
 
         if (ib) then
             if (qbmm .and. .not. polytropic) then
-                call s_ibm_correct_state(q_cons_ts(2)%vf, q_prim_vf, pb_ts(2)%sf, mv_ts(2)%sf)
+                call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf, pb_ts(2)%sf, mv_ts(2)%sf)
             else
-                call s_ibm_correct_state(q_cons_ts(2)%vf, q_prim_vf)
+                call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf)
             end if
         end if
 
         ! Stage 2 of 2
 
-        call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
+        call s_compute_rhs(q_cons_ts(dest)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
+#if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
+        $:GPU_PARALLEL_LOOP(collapse=4)
+        do i = 1, sys_size ! RK2 stage 2: q(1) <- (q_old + q_stage1 + dt*rhs)/2
+            do l = 0, p
+                do k = 0, n
+                    do j = 0, m
+                        q_cons_ts(1)%vf(i)%sf(j, k, l) = &
+                            (q_cons_ts(2)%vf(i)%sf(j, k, l) &
+                             + q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                             + dt*rhs_vf(i)%sf(j, k, l))/2._wp
+                    end do
+                end do
+            end do
+        end do
 
+        dest = 1 ! Result in q_cons_ts(1)%vf
+#else
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
@@ -628,6 +653,9 @@ contains
             end do
         end do
 
+        dest = 1 ! Result in q_cons_ts(1)%vf
+#endif
+
         if (qbmm .and. (.not. polytropic)) then
             $:GPU_PARALLEL_LOOP(collapse=5)
             do i = 1, nb
@@ -664,21 +692,21 @@ contains
             end do
         end if
 
-        if (bodyForces) call s_apply_bodyforces(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, 2._wp*dt/3._wp)
+        if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, 2._wp*dt/3._wp)
 
-        if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(1)%vf)
+        if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(dest)%vf)
 
         if (model_eqns == 3 .and. (.not. relax)) then
-            call s_pressure_relaxation_procedure(q_cons_ts(1)%vf)
+            call s_pressure_relaxation_procedure(q_cons_ts(dest)%vf)
         end if
 
-        if (adv_n) call s_comp_alpha_from_n(q_cons_ts(1)%vf)
+        if (adv_n) call s_comp_alpha_from_n(q_cons_ts(dest)%vf)
 
         if (ib) then
             if (qbmm .and. .not. polytropic) then
-                call s_ibm_correct_state(q_cons_ts(1)%vf, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf)
+                call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf)
             else
-                call s_ibm_correct_state(q_cons_ts(1)%vf, q_prim_vf)
+                call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf)
             end if
         end if
 
@@ -729,20 +757,23 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
-#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM)
+#if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
                         q_cons_ts(2)%vf(i)%sf(j, k, l) = &
+                            q_cons_ts(1)%vf(i)%sf(j, k, l)
+                        q_cons_ts(1)%vf(i)%sf(j, k, l) = &
                             q_cons_ts(1)%vf(i)%sf(j, k, l) &
                             + dt*rhs_vf(i)%sf(j, k, l)
                     end do
                 end do
             end do
         end do
-        dest = 2 ! result in q_cons_ts(2)%vf
+
+        dest = 1 ! result in q_cons_ts(1)%vf
 #else
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
@@ -750,15 +781,14 @@ contains
                 do k = 0, n
                     do j = 0, m
                         q_cons_ts(2)%vf(i)%sf(j, k, l) = &
-                            q_cons_ts(1)%vf(i)%sf(j, k, l)
-                        q_cons_ts(1)%vf(i)%sf(j, k, l) = &
                             q_cons_ts(1)%vf(i)%sf(j, k, l) &
                             + dt*rhs_vf(i)%sf(j, k, l)
                     end do
                 end do
             end do
         end do
-        dest = 1 ! result in q_cons_ts(1)%vf
+
+        dest = 2 ! result in q_cons_ts(2)%vf
 #endif
 
         !Evolve pb and mv for non-polytropic qbmm
@@ -796,21 +826,21 @@ contains
             end do
         end if
 
-        if (bodyForces) call s_apply_bodyforces(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, dt)
+        if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, dt)
 
-        if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(2)%vf)
+        if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(dest)%vf)
 
         if (model_eqns == 3 .and. (.not. relax)) then
-            call s_pressure_relaxation_procedure(q_cons_ts(2)%vf)
+            call s_pressure_relaxation_procedure(q_cons_ts(dest)%vf)
         end if
 
-        if (adv_n) call s_comp_alpha_from_n(q_cons_ts(2)%vf)
+        if (adv_n) call s_comp_alpha_from_n(q_cons_ts(dest)%vf)
 
         if (ib) then
             if (qbmm .and. .not. polytropic) then
-                call s_ibm_correct_state(q_cons_ts(2)%vf, q_prim_vf, pb_ts(2)%sf, mv_ts(2)%sf)
+                call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf, pb_ts(2)%sf, mv_ts(2)%sf)
             else
-                call s_ibm_correct_state(q_cons_ts(2)%vf, q_prim_vf)
+                call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf)
             end if
         end if
 
@@ -820,36 +850,38 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
 
-#if  !defined(__NVCOMPILER_GPU_UNIFIED_MEM)
-        $:GPU_PARALLEL_LOOP(collapse=4)
+#if  defined(__NVCOMPILER_GPU_UNIFIED_MEM)
+         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
-                        q_cons_ts(2)%vf(i)%sf(j, k, l) = &
-                            (3._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) &
-                             + q_cons_ts(2)%vf(i)%sf(j, k, l) &
+                        q_cons_ts(1)%vf(i)%sf(j, k, l) = &
+                            (3._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) &
+                             + q_cons_ts(1)%vf(i)%sf(j, k, l) &
                              + dt*rhs_vf(i)%sf(j, k, l))/4._wp
                     end do
                 end do
             end do
         end do
-        dest = 2 ! result in q_cons_ts(2)%vf
+
+        dest = 1 ! Result in q_cons_ts(1)%vf
 #else
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
-                        q_cons_ts(1)%vf(i)%sf(j, k, l) = &
-                            (3._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) &
-                             + q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                        q_cons_ts(2)%vf(i)%sf(j, k, l) = &
+                            (3._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                             + q_cons_ts(2)%vf(i)%sf(j, k, l) &
                              + dt*rhs_vf(i)%sf(j, k, l))/4._wp
                     end do
                 end do
             end do
         end do
-        dest = 1 ! result in q_cons_ts(1)%vf
+
+        dest = 2 ! Result in q_cons_ts(2)%vf
 #endif
 
         if (qbmm .and. (.not. polytropic)) then
@@ -888,21 +920,21 @@ contains
             end do
         end if
 
-        if (bodyForces) call s_apply_bodyforces(q_cons_ts(2)%vf, q_prim_vf, rhs_vf, dt/4._wp)
+        if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, dt/4._wp)
 
-        if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(2)%vf)
+        if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(dest)%vf)
 
         if (model_eqns == 3 .and. (.not. relax)) then
-            call s_pressure_relaxation_procedure(q_cons_ts(2)%vf)
+            call s_pressure_relaxation_procedure(q_cons_ts(dest)%vf)
         end if
 
-        if (adv_n) call s_comp_alpha_from_n(q_cons_ts(2)%vf)
+        if (adv_n) call s_comp_alpha_from_n(q_cons_ts(dest)%vf)
 
         if (ib) then
             if (qbmm .and. .not. polytropic) then
-                call s_ibm_correct_state(q_cons_ts(2)%vf, q_prim_vf, pb_ts(2)%sf, mv_ts(2)%sf)
+                call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf, pb_ts(2)%sf, mv_ts(2)%sf)
             else
-                call s_ibm_correct_state(q_cons_ts(2)%vf, q_prim_vf)
+                call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf)
             end if
         end if
 
@@ -911,21 +943,22 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3)
 
-#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM)
+#if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
                 do k = 0, n
                     do j = 0, m
                         q_cons_ts(1)%vf(i)%sf(j, k, l) = &
-                            (q_cons_ts(1)%vf(i)%sf(j, k, l) &
-                             + 2._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) &
+                            (q_cons_ts(2)%vf(i)%sf(j, k, l) &
+                             + 2._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) &
                              + 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp
                     end do
                 end do
             end do
         end do
-        dest = 1 ! result in q_cons_ts(1)%vf
+
+        dest = 1 ! Result in q_cons_ts(1)%vf
 #else
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
@@ -933,14 +966,15 @@ contains
                 do k = 0, n
                     do j = 0, m
                         q_cons_ts(1)%vf(i)%sf(j, k, l) = &
-                            (q_cons_ts(2)%vf(i)%sf(j, k, l) &
-                             + 2._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                            (q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                             + 2._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) &
                              + 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp
                     end do
                 end do
             end do
         end do
-        dest = 1 ! result in q_cons_ts(1)%vf
+
+        dest = 1 ! Result in q_cons_ts(1)%vf
 #endif
 
         if (qbmm .and. (.not. polytropic)) then
@@ -979,25 +1013,25 @@ contains
             end do
         end if
 
-        if (bodyForces) call s_apply_bodyforces(q_cons_ts(1)%vf, q_prim_vf, rhs_vf, 2._wp*dt/3._wp)
+        if (bodyForces) call s_apply_bodyforces(q_cons_ts(dest)%vf, q_prim_vf, rhs_vf, 2._wp*dt/3._wp)
 
-        if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(1)%vf)
+        if (grid_geometry == 3) call s_apply_fourier_filter(q_cons_ts(dest)%vf)
 
         if (model_eqns == 3 .and. (.not. relax)) then
-            call s_pressure_relaxation_procedure(q_cons_ts(1)%vf)
+            call s_pressure_relaxation_procedure(q_cons_ts(dest)%vf)
         end if
 
         call nvtxStartRange("RHS-ELASTIC")
-        if (hyperelasticity) call s_hyperelastic_rmt_stress_update(q_cons_ts(1)%vf, q_prim_vf)
+        if (hyperelasticity) call s_hyperelastic_rmt_stress_update(q_cons_ts(dest)%vf, q_prim_vf)
         call nvtxEndRange
 
-        if (adv_n) call s_comp_alpha_from_n(q_cons_ts(1)%vf)
+        if (adv_n) call s_comp_alpha_from_n(q_cons_ts(dest)%vf)
 
         if (ib) then
             if (qbmm .and. .not. polytropic) then
-                call s_ibm_correct_state(q_cons_ts(1)%vf, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf)
+                call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf, pb_ts(1)%sf, mv_ts(1)%sf)
             else
-                call s_ibm_correct_state(q_cons_ts(1)%vf, q_prim_vf)
+                call s_ibm_correct_state(q_cons_ts(dest)%vf, q_prim_vf)
             end if
         end if
 
@@ -1007,6 +1041,7 @@ contains
 
             time = time + (finish - start)
         end if
+
     end subroutine s_3rd_order_tvd_rk
 
     !> Strang splitting scheme with 3rd order TVD RK time-stepping algorithm for
@@ -1244,30 +1279,20 @@ contains
         integer :: i, j !< Generic loop iterators
 
         ! Deallocating the cell-average conservative variables
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+        do j = 1, sys_size
+            @:DEALLOCATE(q_cons_ts(1)%vf(j)%sf)
+            if (num_ts == 2) then
+                nullify(q_cons_ts(2)%vf(j)%sf)
+            end if
+        end do
+        deallocate(q_cons_ts_pool_host)
+#else
         do i = 1, num_ts
             do j = 1, sys_size
-#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-                if ( i <= (num_ts - out_of_core) ) then
-                    !print*, "q_cons_ts", i, j, "dealloc"
-#endif
-                    @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf)
-#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-                else
-                    !print*, "q_cons_ts", i, j, "nullify"
-                    nullify(q_cons_ts(i)%vf(j)%sf)
-                end if
-#endif
+                @:ALLOCATE(q_cons_ts(i)%vf(j)%sf)
             end do
-
-            @:DEALLOCATE(q_cons_ts(i)%vf)
         end do
-
-        @:DEALLOCATE(q_cons_ts)
-
-#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
-        if ( out_of_core == 1 ) then
-            deallocate(q_cons_ts_pool_host)
-        end if
 #endif
 
         ! Deallocating the cell-average primitive ts variables
diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py
index 6bcf0964cc..bd9dcec11a 100644
--- a/toolchain/mfc/run/case_dicts.py
+++ b/toolchain/mfc/run/case_dicts.py
@@ -312,6 +312,8 @@ def analytic(self):
     'int_comp': ParamType.LOG,
     'ic_eps': ParamType.REAL,
     'ic_beta': ParamType.REAL,
+    'nv_uvm_igr_temps_on_gpu': ParamType.INT,
+    'nv_uvm_pref_gpu': ParamType.LOG,
 })
 
 for var in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector',
diff --git a/toolchain/mfc/test/cases.py b/toolchain/mfc/test/cases.py
index fac3dc4eba..95927bb04b 100644
--- a/toolchain/mfc/test/cases.py
+++ b/toolchain/mfc/test/cases.py
@@ -693,17 +693,17 @@ def alter_mixlayer_perturb(dimInfo):
                 'patch_icpp(1)%vel(1)': 1.0, 'patch_icpp(1)%vel(2)': 0.0, 'patch_icpp(1)%vel(3)': 0.0,
                 'patch_icpp(1)%pres': 17.8571428571, 'patch_icpp(1)%alpha_rho(1)': 1.0, 'patch_icpp(1)%alpha(1)': 1.0,
                 'patch_icpp(1)%r0': -1e6, 'patch_icpp(1)%v0': -1e6,
-                'patch_icpp(2)%geometry': -100, 
+                'patch_icpp(2)%geometry': -100,
                 'patch_icpp(2)%x_centroid': -1e6, 'patch_icpp(2)%length_x': -1e6,
-                'patch_icpp(2)%y_centroid': -1e6, 'patch_icpp(2)%length_y': -1e6, 
-                'patch_icpp(2)%z_centroid': -1e6, 'patch_icpp(2)%length_z': -1e6, 
-                'patch_icpp(2)%vel(1)': -1e6, 'patch_icpp(2)%vel(2)': -1e6, 'patch_icpp(2)%vel(3)': -1e6, 
+                'patch_icpp(2)%y_centroid': -1e6, 'patch_icpp(2)%length_y': -1e6,
+                'patch_icpp(2)%z_centroid': -1e6, 'patch_icpp(2)%length_z': -1e6,
+                'patch_icpp(2)%vel(1)': -1e6, 'patch_icpp(2)%vel(2)': -1e6, 'patch_icpp(2)%vel(3)': -1e6,
                 'patch_icpp(2)%r0': -1e6, 'patch_icpp(2)%v0': -1e6,
-                'patch_icpp(3)%geometry': -100, 
+                'patch_icpp(3)%geometry': -100,
                 'patch_icpp(3)%x_centroid': -1e6, 'patch_icpp(3)%length_x': -1e6,
-                'patch_icpp(3)%y_centroid': -1e6, 'patch_icpp(3)%length_y': -1e6, 
-                'patch_icpp(3)%z_centroid': -1e6, 'patch_icpp(3)%length_z': -1e6, 
-                'patch_icpp(3)%vel(1)': -1e6, 'patch_icpp(3)%vel(2)': -1e6, 'patch_icpp(3)%vel(3)': -1e6, 
+                'patch_icpp(3)%y_centroid': -1e6, 'patch_icpp(3)%length_y': -1e6,
+                'patch_icpp(3)%z_centroid': -1e6, 'patch_icpp(3)%length_z': -1e6,
+                'patch_icpp(3)%vel(1)': -1e6, 'patch_icpp(3)%vel(2)': -1e6, 'patch_icpp(3)%vel(3)': -1e6,
                 'patch_icpp(3)%r0': -1e6, 'patch_icpp(3)%v0': -1e6
             }))
 
@@ -993,11 +993,12 @@ def foreach_example():
                            "2D_lagrange_bubblescreen",
                            "3D_lagrange_bubblescreen", "2D_triple_point",
                            "1D_shuosher_analytical",
-                           "1D_titarevtorro_analytical", 
+                           "1D_titarevtorro_analytical",
                            "2D_acoustic_pulse_analytical",
                            "2D_isentropicvortex_analytical",
                            "2D_zero_circ_vortex_analytical",
-                           "3D_TaylorGreenVortex_analytical"]
+                           "3D_TaylorGreenVortex_analytical",
+                           "3D_IGR_TaylorGreenVortex_nvidia"]
             if path in casesToSkip:
                 continue
             name = f"{path.split('_')[0]} -> Example -> {'_'.join(path.split('_')[1:])}"
diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako
index 27b4d6b425..e798b677df 100644
--- a/toolchain/templates/santis.mako
+++ b/toolchain/templates/santis.mako
@@ -37,13 +37,8 @@ export FI_CXI_RX_MATCH_MODE=software
 export FI_MR_CACHE_MONITOR=disabled
 export MPICH_NO_BUFFER_ALIAS_CHECK=1
 
-# CUSTOM env vars to MFC
-export MFC_OUT_OF_CORE=1                      # out of core
-export NVIDIA_MANUAL_GPU_HINTS=1              # prefloc GPU on some
-export NVIDIA_IGR_TEMPS_ON_GPU=3              # jac, jac_rhs, and jac_old on GPU
-
 # NSYS
-export NSYS=1                                 # enable nsys profiling
+export NSYS=0                                 # nsys profiling (1 = enabled, 0 = disabled)
 export NSYS_FILE=myreport.qdrep
 
 ${helpers.template_prologue()}

From cacc6b041560667d319ce0f8938f694469566747 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Sun, 3 Aug 2025 10:53:35 +0200
Subject: [PATCH 08/25] Fix some comments

---
 src/simulation/m_global_parameters.fpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 401fa5412d..a7539327f5 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -161,8 +161,8 @@ module m_global_parameters
     integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU
                                        ! 1 => jac on GPU, jac_rhs and jac_old on CPU
                                        ! 2 => jac and jac_rhs on GPU, jac_old on CPU
-                                       ! 4 => jac, jac_rhs, and jac_old on GPU (default)
-    logical :: nv_uvm_pref_gpu ! Enable pinned gpu memory (default TRUE)
+                                       ! 3 => jac, jac_rhs, and jac_old on GPU (default)
+    logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default TRUE)
     !> @}
 
     real(wp) :: weno_eps       !< Binding for the WENO nonlinear weights

From b3fdbff5f8df0ad7426db16195d10f9e66626533 Mon Sep 17 00:00:00 2001
From: Ben Wilfong <48168887+wilfonba@users.noreply.github.com>
Date: Sun, 3 Aug 2025 14:15:05 -0400
Subject: [PATCH 09/25] test merge and add nv_uvm_out_of_core back

---
 src/simulation/m_global_parameters.fpp |  2 ++
 src/simulation/m_mpi_proxy.fpp         |  1 +
 src/simulation/m_time_steppers.fpp     | 10 ++++++----
 toolchain/mfc/run/case_dicts.py        |  1 +
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 52ad4aec3e..24f23ed4a5 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -163,6 +163,7 @@ module m_global_parameters
                                        ! 1 => jac on GPU, jac_rhs and jac_old on CPU
                                        ! 2 => jac and jac_rhs on GPU, jac_old on CPU
                                        ! 3 => jac, jac_rhs, and jac_old on GPU (default)
+    logical :: nv_uvm_out_of_core ! Enable out-or-core storage of q_cons_ts(2) in timestepping
     logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default TRUE)
     !> @}
 
@@ -584,6 +585,7 @@ contains
 
         ! NVIDIA UVM options
         nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default)
+        nv_uvm_out_of_core = .false.
         nv_uvm_pref_gpu = .true.
 
         ! Simulation algorithm parameters
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index d97fdb64c8..f2293b0ffd 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -239,6 +239,7 @@ contains
 
         ! NVIDIA UVM variables
         call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+        call MPI_BCAST(nv_uvm_out_of_core, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
         call MPI_BCAST(nv_uvm_pref_gpu, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
 
 #endif
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 9d562c6689..48fddcd8cd 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -574,7 +574,6 @@ contains
         real(wp), intent(inout) :: time_avg
 
         integer :: i, j, k, l, q!< Generic loop iterator
-        integer :: dest
         real(wp) :: start, finish
         integer :: dest
 
@@ -810,9 +809,8 @@ contains
         real(wp), intent(INOUT) :: time_avg
 
         integer :: i, j, k, l, q !< Generic loop iterator
-        integer :: dest
-
         real(wp) :: start, finish
+        integer :: dest
 
         ! Stage 1 of 3
 
@@ -1385,7 +1383,11 @@ contains
         do j = 1, sys_size
             @:DEALLOCATE(q_cons_ts(1)%vf(j)%sf)
             if (num_ts == 2) then
-                nullify(q_cons_ts(2)%vf(j)%sf)
+                if (nv_uvm_out_of_core) then
+                    nullify(q_cons_ts(2)%vf(j)%sf)
+                else
+                    @:DEALLOCATE(q_cons_ts(2)%vf(j)%sf)
+                end if
             end if
         end do
         deallocate(q_cons_ts_pool_host)
diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py
index 704450a48b..54d7d00b99 100644
--- a/toolchain/mfc/run/case_dicts.py
+++ b/toolchain/mfc/run/case_dicts.py
@@ -314,6 +314,7 @@ def analytic(self):
     'ic_eps': ParamType.REAL,
     'ic_beta': ParamType.REAL,
     'nv_uvm_igr_temps_on_gpu': ParamType.INT,
+    'nv_uvm_igr_out_of_core': ParamType.LOG,
     'nv_uvm_pref_gpu': ParamType.LOG,
 })
 

From 51d7e90db68084df48cd4154e8c9295e1da5a506 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Tue, 5 Aug 2025 20:09:34 +0200
Subject: [PATCH 10/25] Fix some allocs and deallocs in timesteppers

---
 src/simulation/m_time_steppers.fpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 48fddcd8cd..b2c6e72bcd 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -115,7 +115,8 @@ contains
         end do
 
 #if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
-        if (nv_uvm_out_of_core) then
+        if (num_ts == 2 .and. nv_uvm_out_of_core) then
+            ! host allocation for q_cons_ts(2)%vf(j)%sf for all j
             allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
                                         idwbuff(2)%beg:idwbuff(2)%end, &
                                         idwbuff(3)%beg:idwbuff(3)%end, &
@@ -1390,7 +1391,9 @@ contains
                 end if
             end if
         end do
-        deallocate(q_cons_ts_pool_host)
+        if (num_ts == 2 .and. nv_uvm_out_of_core) then
+            deallocate(q_cons_ts_pool_host)
+        end if
 #elif defined(FRONTIER_UNIFIED)
         do i = 1, num_ts
             do j = 1, sys_size
@@ -1407,6 +1410,9 @@ contains
             end do
         end do
 #endif
+        do i = 1, num_ts
+            @:DEALLOCATE(q_cons_ts(i)%vf)
+        end do
 
         @:DEALLOCATE(q_cons_ts)
 

From c553b78cfcb114d32f3a9065ba6f07cbec2c25bd Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Tue, 5 Aug 2025 20:18:29 +0200
Subject: [PATCH 11/25] Fix nv_uvm_out_of_core inconsistency and add to case
 file

---
 examples/3D_IGR_TaylorGreenVortex_nvidia/case.py | 1 +
 src/simulation/m_global_parameters.fpp           | 4 ++--
 src/simulation/m_mpi_proxy.fpp                   | 2 +-
 src/simulation/m_start_up.fpp                    | 4 ++--
 toolchain/mfc/run/case_dicts.py                  | 2 +-
 5 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
index 17ad1ceb43..ccc7413d03 100644
--- a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
+++ b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
@@ -97,6 +97,7 @@
             "fluid_pp(1)%pi_inf": 0,
             "fluid_pp(1)%Re(1)": 1 / mu,
             # NVIDIA UVM Options
+            "nv_uvm_out_of_core": "T",
             "nv_uvm_igr_temps_on_gpu": 3,
             "nv_uvm_pref_gpu": "T",
         }
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 24f23ed4a5..3ab256c548 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -159,11 +159,11 @@ module m_global_parameters
 
     !> @name Variables for our of core IGR computation on NVIDIA
     !> @{
+    logical :: nv_uvm_out_of_core ! Enable out-or-core storage of q_cons_ts(2) in timestepping
     integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU
                                        ! 1 => jac on GPU, jac_rhs and jac_old on CPU
                                        ! 2 => jac and jac_rhs on GPU, jac_old on CPU
                                        ! 3 => jac, jac_rhs, and jac_old on GPU (default)
-    logical :: nv_uvm_out_of_core ! Enable out-or-core storage of q_cons_ts(2) in timestepping
     logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default TRUE)
     !> @}
 
@@ -584,8 +584,8 @@ contains
         t_save = dflt_real
 
         ! NVIDIA UVM options
-        nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default)
         nv_uvm_out_of_core = .false.
+        nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default)
         nv_uvm_pref_gpu = .true.
 
         ! Simulation algorithm parameters
diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp
index f2293b0ffd..755f762166 100644
--- a/src/simulation/m_mpi_proxy.fpp
+++ b/src/simulation/m_mpi_proxy.fpp
@@ -238,8 +238,8 @@ contains
         end do
 
         ! NVIDIA UVM variables
-        call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
         call MPI_BCAST(nv_uvm_out_of_core, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
+        call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
         call MPI_BCAST(nv_uvm_pref_gpu, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
 
 #endif
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 610920b8e2..313ef48f2b 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -188,8 +188,8 @@ contains
             hyperelasticity, R0ref, num_bc_patches, Bx0, powell, &
             cont_damage, tau_star, cont_damage_s, alpha_bar, &
             alf_factor, num_igr_iters, num_igr_warm_start_iters, &
-            int_comp, ic_eps, ic_beta, nv_uvm_igr_temps_on_gpu, &
-            nv_uvm_pref_gpu, down_sample
+            int_comp, ic_eps, ic_beta, nv_uvm_out_of_core, &
+            nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample
 
         ! Checking that an input file has been provided by the user. If it
         ! has, then the input file is read in, otherwise, simulation exits.
diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py
index 54d7d00b99..8378d3044d 100644
--- a/toolchain/mfc/run/case_dicts.py
+++ b/toolchain/mfc/run/case_dicts.py
@@ -313,8 +313,8 @@ def analytic(self):
     'int_comp': ParamType.LOG,
     'ic_eps': ParamType.REAL,
     'ic_beta': ParamType.REAL,
+    'nv_uvm_out_of_core': ParamType.LOG,
     'nv_uvm_igr_temps_on_gpu': ParamType.INT,
-    'nv_uvm_igr_out_of_core': ParamType.LOG,
     'nv_uvm_pref_gpu': ParamType.LOG,
 })
 

From f3b3851006bf95b22a6fb24a1c3ae5d6910e96ed Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Tue, 5 Aug 2025 22:50:19 +0200
Subject: [PATCH 12/25] Fix bug in 2nd order TVD RK introduced by merge

---
 src/simulation/m_time_steppers.fpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index b2c6e72bcd..cd5b087997 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -707,7 +707,7 @@ contains
                         q_cons_ts(1)%vf(i)%sf(j, k, l) = &
                             (q_cons_ts(2)%vf(i)%sf(j, k, l) &
                              + q_cons_ts(1)%vf(i)%sf(j, k, l) &
-                             + dt*rhs_vf(i)%sf(j, k, l))/4._wp
+                             + dt*rhs_vf(i)%sf(j, k, l))/2._wp
                     end do
                 end do
             end do

From 71b59766dab3b5bd172bc0d2f4f8d2e121533916 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Tue, 5 Aug 2025 23:35:07 +0200
Subject: [PATCH 13/25] Fix some comments

---
 src/simulation/m_global_parameters.fpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 3ab256c548..546bd95f4d 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -159,7 +159,7 @@ module m_global_parameters
 
     !> @name Variables for our of core IGR computation on NVIDIA
     !> @{
-    logical :: nv_uvm_out_of_core ! Enable out-or-core storage of q_cons_ts(2) in timestepping
+    logical :: nv_uvm_out_of_core ! Enable out-of-core storage of q_cons_ts(2) in timestepping (default FALSE)
     integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU
                                        ! 1 => jac on GPU, jac_rhs and jac_old on CPU
                                        ! 2 => jac and jac_rhs on GPU, jac_old on CPU

From a4d6b38b61e6e1ae6ac98c2f76385d1fa53a5981 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Wed, 6 Aug 2025 08:46:31 +0200
Subject: [PATCH 14/25] Add note on binding script requirement for PREFER_GPU
 macro

---
 src/common/include/macros.fpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index 7177efa32d..f4e4f280b1 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -12,6 +12,10 @@
 #endif
 #:enddef
 
+! Caution:
+! This macro requires the use of a binding script to set CUDA_VISIBLE_DEVICES, such that we have one GPU device per MPI rank.
+! That's because for both cudaMemAdvise (preferred location) and cudaMemPrefetchAsync we use location = device_id = 0.
+! For an example see misc/nvidia_uvm/bind.sh.
 #:def PREFER_GPU(*args)
 #ifdef MFC_SIMULATION
 #ifdef __NVCOMPILER_GPU_UNIFIED_MEM

From acb24057b021bbcfaba33e320a4dbb1d15e5165b Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Thu, 7 Aug 2025 08:49:53 +0200
Subject: [PATCH 15/25] Flip nv_uvm_pref_gpu default to false

---
 src/simulation/m_global_parameters.fpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 546bd95f4d..47c7a3a276 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -164,7 +164,7 @@ module m_global_parameters
                                        ! 1 => jac on GPU, jac_rhs and jac_old on CPU
                                        ! 2 => jac and jac_rhs on GPU, jac_old on CPU
                                        ! 3 => jac, jac_rhs, and jac_old on GPU (default)
-    logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default TRUE)
+    logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default FALSE)
     !> @}
 
     real(wp) :: weno_eps       !< Binding for the WENO nonlinear weights
@@ -586,7 +586,7 @@ contains
         ! NVIDIA UVM options
         nv_uvm_out_of_core = .false.
         nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default)
-        nv_uvm_pref_gpu = .true.
+        nv_uvm_pref_gpu = .false.
 
         ! Simulation algorithm parameters
         model_eqns = dflt_int

From 8fef22d567dbbd7698f7272bfb6c90fb4776fa19 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Thu, 7 Aug 2025 18:26:18 +0200
Subject: [PATCH 16/25] Be explicit with unified memory compilation to stay
 robust in changes of defaults

---
 CMakeLists.txt                  | 4 ++--
 toolchain/templates/santis.mako | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e3d2498a4..0d48fab5c9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -498,11 +498,11 @@ function(MFC_SETUP_TARGET)
                 # GH-200 Unified Memory Support
                 if (MFC_Unified)
                     target_compile_options(${ARGS_TARGET}
-                        PRIVATE -gpu=mem:unified -cuda
+                        PRIVATE -gpu=mem:unified:managedalloc -cuda
                     )
                     # "This option must appear in both the compile and link lines" -- NVHPC Docs
                     target_link_options(${ARGS_TARGET}
-                        PRIVATE -gpu=mem:unified -cuda
+                        PRIVATE -gpu=mem:unified:managedalloc -cuda
                     )
                 endif()
 
diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako
index e798b677df..30ebdecf2b 100644
--- a/toolchain/templates/santis.mako
+++ b/toolchain/templates/santis.mako
@@ -27,7 +27,7 @@
 % endif
 
 # NVHPC and CUDA env vars
-export NV_ACC_USE_MALLOC=0                    # use cudaMallocManaged instead of malloc ( compiled using -gpu=mem:unified )
+export NV_ACC_USE_MALLOC=0                    # use cudaMallocManaged instead of malloc ( compiled using -gpu=mem:unified:managedalloc )
 export NVCOMPILER_ACC_NO_MEMHINTS=1           # disable implicit compiler hints
 #export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH
 

From 5e369c364003186910bf49cd9f4a373a9e6d8d79 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Mon, 11 Aug 2025 00:41:02 -0700
Subject: [PATCH 17/25] Add some changes to future proof the unified memory
 build

---
 CMakeLists.txt                  |  7 ++++++-
 src/common/include/macros.fpp   | 10 ++++++++++
 src/simulation/m_weno.fpp       |  2 ++
 toolchain/templates/santis.mako | 21 +++++++++++++++++----
 4 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0d48fab5c9..97b31ec7f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -527,7 +527,12 @@ function(MFC_SETUP_TARGET)
 
         if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
             find_package(CUDAToolkit REQUIRED)
-            target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
+            if (TARGET CUDA::nvToolsExt) # CUDA <= 12.8
+                target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
+            else() # CUDA >= 12.9
+                target_link_libraries(${a_target} PRIVATE nvhpcwrapnvtx)
+                target_link_options(${a_target} PRIVATE "-cudalib=nvtx3")
+            endif()
         endif()
     endforeach()
 
diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index f4e4f280b1..58b8b07190 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -20,7 +20,17 @@
 #ifdef MFC_SIMULATION
 #ifdef __NVCOMPILER_GPU_UNIFIED_MEM
     block
+! Beginning in the 25.3 release, the structure of the cudafor module has been changed slightly.
+! The module now includes, or “uses” 3 submodules: cuda_runtime_api, gpu_reductions, and sort.
+! The cudafor functionality has not changed. But for new users, or users who have needed to
+! work-around name conflicts in the module, it may be better to use cuda_runtime_api to expose
+! interfaces to the CUDA runtime calls described in Chapter 4 of this guide.
+! https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#fortran-host-modules
+#if __NVCOMPILER_MAJOR__ < 25 || (__NVCOMPILER_MAJOR__ == 25 && __NVCOMPILER_MINOR__ < 3)
     use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
+#else
+    use cuda_runtime_api
+#endif
     integer :: istat
 
     if (nv_uvm_pref_gpu) then
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 56beaea979..a9846124ba 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -98,7 +98,9 @@ module m_weno
     !> @name Indical bounds in the s1-, s2- and s3-directions
     !> @{
     type(int_bounds_info) :: is1_weno, is2_weno, is3_weno
+#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
     $:GPU_DECLARE(create='[is1_weno,is2_weno,is3_weno]')
+#endif
     !
     !> @}
 
diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako
index 30ebdecf2b..b0e47dc154 100644
--- a/toolchain/templates/santis.mako
+++ b/toolchain/templates/santis.mako
@@ -26,10 +26,23 @@
 % endif
 % endif
 
-# NVHPC and CUDA env vars
-export NV_ACC_USE_MALLOC=0                    # use cudaMallocManaged instead of malloc ( compiled using -gpu=mem:unified:managedalloc )
-export NVCOMPILER_ACC_NO_MEMHINTS=1           # disable implicit compiler hints
-#export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH
+# We compiled the code using -gpu=mem:unified:managedalloc, hence we use cudaMallocManaged for the dynamic allocations.
+# Using NV_ACC_USE_MALLOC we could change to malloc at runtime. We choose to not do that here and stick with cudaMallocManaged and 2MB page sizes.
+# https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#memory-model
+# https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#command-line-options-selecting-compiler-memory-modes
+export NV_ACC_USE_MALLOC=0
+
+# For NVIDIA CUDA devices, controls the use of automatic memory hints at data constructs in the managed and unified memory modes.
+# Below is a breakdown of the permitted values (case insensitive):
+# - DEFAULT: Use the default settings. On NVIDIA Grace Hopper systems, the default is currently ENABLE_ALL; on all other systems, the default is DISABLE.
+# - DISABLE: Memory hints are disabled for all data constructs.
+# - ENABLE_EXPLICIT: Memory hints are enabled for explicit data constructs only.
+# - ENABLE_ALL: Memory hints are enabled for explicit and implicit data constructs.
+# https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#environment-variables-controlling-device-memory-management
+# Here we select DISABLE, which turns memory hints off for all data constructs (explicit and implicit).
+# Using NVCOMPILER_ACC_NO_MEMHINTS is the legacy way and is still supported, but users should prefer NVCOMPILER_ACC_MEMHINTS when using newer nvhpc compilers.
+export NVCOMPILER_ACC_NO_MEMHINTS=1           # disable implicit compiler hints - legacy way
+export NVCOMPILER_ACC_MEMHINTS=DISABLE        # disable implicit compiler hints - new way
 
 # Cray MPICH
 export MPICH_GPU_SUPPORT_ENABLED=1

From 52c56087ef43dd2f4032d785daebb5e74b568863 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Mon, 11 Aug 2025 11:13:00 +0200
Subject: [PATCH 18/25] Comment out calls to cudaGetErrorString

---
 src/common/include/macros.fpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index 58b8b07190..1378f9e864 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -40,19 +40,19 @@
         istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 )
         if (istat /= cudaSuccess) then
             write(*,"('Error code: ',I0, ': ')") istat
-            write(*,*) cudaGetErrorString(istat)
+            !write(*,*) cudaGetErrorString(istat)
         endif
         ! set accessed by CPU
         istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId )
         if (istat /= cudaSuccess) then
             write(*,"('Error code: ',I0, ': ')") istat
-            write(*,*) cudaGetErrorString(istat)
+            !write(*,*) cudaGetErrorString(istat)
         endif
         ! prefetch to GPU - physically populate memory pages
         istat = cudaMemPrefetchAsync( c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0 )
         if (istat /= cudaSuccess) then
             write(*,"('Error code: ',I0, ': ')") istat
-            write(*,*) cudaGetErrorString(istat)
+            !write(*,*) cudaGetErrorString(istat)
         endif
     #:endfor
     end if

From 4ec8617e5dc104c4133e303542a4c21052cd5255 Mon Sep 17 00:00:00 2001
From: Ben Wilfong <48168887+wilfonba@users.noreply.github.com>
Date: Mon, 11 Aug 2025 12:18:00 -0400
Subject: [PATCH 19/25] prepare for merge

---
 CMakeLists.txt                                |  4 +-
 .../3D_IGR_TaylorGreenVortex_nvidia/case.py   |  8 +--
 src/common/include/macros.fpp                 | 54 +++++++++----------
 src/common/m_mpi_common.fpp                   |  5 +-
 src/simulation/m_global_parameters.fpp        |  6 +--
 src/simulation/m_igr.fpp                      | 40 +++++++-------
 src/simulation/m_time_steppers.fpp            | 16 +++---
 toolchain/mfc/build.py                        |  2 +-
 8 files changed, 67 insertions(+), 68 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 97b31ec7f3..a581a2b769 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -234,7 +234,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release")
             message(STATUS "LTO/IPO is not supported in NVHPC Version < 23.11. Use a newer version of NVHPC for best performance.")
         else()
             message(STATUS "Performing IPO using -Mextract followed by -Minline")
-            set(NVHPC_USE_TWO_PASS_IPO FALSE)
+            set(NVHPC_USE_TWO_PASS_IPO TRUE)
         endif()
     else()
         CHECK_IPO_SUPPORTED(RESULT SUPPORTS_IPO OUTPUT IPO_ERROR)
@@ -492,7 +492,7 @@ function(MFC_SETUP_TARGET)
                 endforeach()
 
                 target_compile_options(${a_target}
-                    PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath
+                    PRIVATE -gpu=keep,ptxinfo,lineinfo
                 )
 
                 # GH-200 Unified Memory Support
diff --git a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
index ccc7413d03..e2b22e8017 100644
--- a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
+++ b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
@@ -4,8 +4,8 @@
 
 N = 799
 Nx = N
-Ny = 2*(N+1)-1
-Nz = 2*(N+1)-1
+Ny = 2 * (N + 1) - 1
+Nz = 2 * (N + 1) - 1
 
 Re = 1600
 L = 1
@@ -47,8 +47,8 @@
             "cyl_coord": "F",
             "dt": dt,
             "t_step_start": 0,
-            "t_step_stop": 10, #Nt,
-            "t_step_save": 10, #int(Nt / 100),
+            "t_step_stop": 10,  # Nt,
+            "t_step_save": 10,  # int(Nt / 100),
             # Simulation Algorithm Parameters
             "num_patches": 1,
             "model_eqns": 2,
diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index 1378f9e864..69241c99ef 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -27,35 +27,35 @@
 ! interfaces to the CUDA runtime calls described in Chapter 4 of this guide.
 ! https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#fortran-host-modules
 #if __NVCOMPILER_MAJOR__ < 25 || (__NVCOMPILER_MAJOR__ == 25 && __NVCOMPILER_MINOR__ < 3)
-    use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
+        use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
 #else
-    use cuda_runtime_api
+        use cuda_runtime_api
 #endif
-    integer :: istat
-
-    if (nv_uvm_pref_gpu) then
-    #:for arg in args
-        !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
-        ! set preferred location GPU
-        istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 )
-        if (istat /= cudaSuccess) then
-            write(*,"('Error code: ',I0, ': ')") istat
-            !write(*,*) cudaGetErrorString(istat)
-        endif
-        ! set accessed by CPU
-        istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId )
-        if (istat /= cudaSuccess) then
-            write(*,"('Error code: ',I0, ': ')") istat
-            !write(*,*) cudaGetErrorString(istat)
-        endif
-        ! prefetch to GPU - physically populate memory pages
-        istat = cudaMemPrefetchAsync( c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0 )
-        if (istat /= cudaSuccess) then
-            write(*,"('Error code: ',I0, ': ')") istat
-            !write(*,*) cudaGetErrorString(istat)
-        endif
-    #:endfor
-    end if
+        integer :: istat
+
+        if (nv_uvm_pref_gpu) then
+            #:for arg in args
+                !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
+                ! set preferred location GPU
+                istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0)
+                if (istat /= cudaSuccess) then
+                    write (*, "('Error code: ',I0, ': ')") istat
+                    !write(*,*) cudaGetErrorString(istat)
+                end if
+                ! set accessed by CPU
+                istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId)
+                if (istat /= cudaSuccess) then
+                    write (*, "('Error code: ',I0, ': ')") istat
+                    !write(*,*) cudaGetErrorString(istat)
+                end if
+                ! prefetch to GPU - physically populate memory pages
+                istat = cudaMemPrefetchAsync(c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0)
+                if (istat /= cudaSuccess) then
+                    write (*, "('Error code: ',I0, ': ')") istat
+                    !write(*,*) cudaGetErrorString(istat)
+                end if
+            #:endfor
+        end if
     end block
 #endif
 #endif
diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index 8633c84ceb..dbe61317b8 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -83,9 +83,8 @@ contains
 #ifndef __NVCOMPILER_GPU_UNIFIED_MEM
         @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
 #else
-        ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
-        !$acc enter data create(capture:buff_send)
-        !$acc enter data create(capture:buff_recv)
+        allocate (buff_send(0:halo_size), buff_recv(0:halo_size))
+        $:GPU_ENTER_DATA(create='[capture:buff_send, capture:buff_recv]')
 #endif
 #endif
 
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 47c7a3a276..34c122f1e7 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -161,9 +161,9 @@ module m_global_parameters
     !> @{
     logical :: nv_uvm_out_of_core ! Enable out-of-core storage of q_cons_ts(2) in timestepping (default FALSE)
     integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU
-                                       ! 1 => jac on GPU, jac_rhs and jac_old on CPU
-                                       ! 2 => jac and jac_rhs on GPU, jac_old on CPU
-                                       ! 3 => jac, jac_rhs, and jac_old on GPU (default)
+    ! 1 => jac on GPU, jac_rhs and jac_old on CPU
+    ! 2 => jac and jac_rhs on GPU, jac_old on CPU
+    ! 3 => jac, jac_rhs, and jac_old on GPU (default)
     logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default FALSE)
     !> @}
 
diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index 0d1edad478..01fd90e7a8 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -26,7 +26,7 @@ module m_igr
 
 #ifdef __NVCOMPILER_GPU_UNIFIED_MEM
     integer, dimension(3) :: nv_uvm_temp_on_gpu
-    real(wp), pointer, contiguous, dimension(:, :, :) :: jac,jac_rhs,jac_old
+    real(wp), pointer, contiguous, dimension(:, :, :) :: jac, jac_rhs, jac_old
     real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_host_pool
     real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_rhs_host_pool
     real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_old_host_pool
@@ -124,21 +124,21 @@ contains
                 idwbuff(3)%beg:idwbuff(3)%end))
             @:PREFER_GPU(jac)
         else
-            allocate(jac_host_pool(idwbuff(1)%beg:idwbuff(1)%end, &
-                idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end))
+            allocate (jac_host_pool(idwbuff(1)%beg:idwbuff(1)%end, &
+                                    idwbuff(2)%beg:idwbuff(2)%end, &
+                                    idwbuff(3)%beg:idwbuff(3)%end))
 
             jac(idwbuff(1)%beg:idwbuff(1)%end, &
                 idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end) => jac_host_pool(:,:,:)
+                idwbuff(3)%beg:idwbuff(3)%end) => jac_host_pool(:, :, :)
         end if
 
         if (nv_uvm_temp_on_gpu(2) == 1) then
             @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))
             @:PREFER_GPU(jac_rhs)
         else
-            allocate(jac_rhs_host_pool(-1:m,-1:n,-1:p))
-            jac_rhs(-1:m,-1:n,-1:p) => jac_rhs_host_pool(:,:,:)
+            allocate (jac_rhs_host_pool(-1:m, -1:n, -1:p))
+            jac_rhs(-1:m, -1:n, -1:p) => jac_rhs_host_pool(:, :, :)
         end if
 
         if (igr_iter_solver == 1) then ! Jacobi iteration
@@ -148,13 +148,13 @@ contains
                     idwbuff(3)%beg:idwbuff(3)%end))
                 @:PREFER_GPU(jac_old)
             else
-                allocate(jac_old_host_pool(idwbuff(1)%beg:idwbuff(1)%end, &
-                    idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end))
+                allocate (jac_old_host_pool(idwbuff(1)%beg:idwbuff(1)%end, &
+                                            idwbuff(2)%beg:idwbuff(2)%end, &
+                                            idwbuff(3)%beg:idwbuff(3)%end))
 
                 jac_old(idwbuff(1)%beg:idwbuff(1)%end, &
-                    idwbuff(2)%beg:idwbuff(2)%end, &
-                    idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host_pool(:,:,:)
+                        idwbuff(2)%beg:idwbuff(2)%end, &
+                        idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host_pool(:, :, :)
             end if
         end if
 #endif
@@ -178,7 +178,7 @@ contains
 
         #:if not MFC_CASE_OPTIMIZATION
             if (igr_order == 3) then
-                vidxb = -1; vidxe = 2;
+                vidxb = -1; vidxe = 2; 
                 $:GPU_UPDATE(device='[vidxb, vidxe]')
 
                 @:ALLOCATE(coeff_L(0:2))
@@ -194,7 +194,7 @@ contains
                 $:GPU_UPDATE(device='[coeff_R]')
 
             elseif (igr_order == 5) then
-                vidxb = -2; vidxe = 3;
+                vidxb = -2; vidxe = 3; 
                 $:GPU_UPDATE(device='[vidxb, vidxe]')
 
                 @:ALLOCATE(coeff_L(-1:3))
@@ -2677,23 +2677,23 @@ contains
         if (nv_uvm_temp_on_gpu(1) == 1) then
             @:DEALLOCATE(jac)
         else
-            nullify(jac)
-            deallocate(jac_host_pool)
+            nullify (jac)
+            deallocate (jac_host_pool)
         end if
 
         if (nv_uvm_temp_on_gpu(2) == 1) then
             @:DEALLOCATE(jac_rhs)
         else
-            nullify(jac_rhs)
-            deallocate(jac_rhs_host_pool)
+            nullify (jac_rhs)
+            deallocate (jac_rhs_host_pool)
         end if
 
         if (igr_iter_solver == 1) then ! Jacobi iteration
             if (nv_uvm_temp_on_gpu(3) == 1) then
                 @:DEALLOCATE(jac_old)
             else
-                nullify(jac_old)
-                deallocate(jac_old_host_pool)
+                nullify (jac_old)
+                deallocate (jac_old_host_pool)
             end if
         end if
 #endif
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index cd5b087997..e7d4ba6017 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -117,10 +117,10 @@ contains
 #if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
         if (num_ts == 2 .and. nv_uvm_out_of_core) then
             ! host allocation for q_cons_ts(2)%vf(j)%sf for all j
-            allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
-                                        idwbuff(2)%beg:idwbuff(2)%end, &
-                                        idwbuff(3)%beg:idwbuff(3)%end, &
-                                        1:sys_size))
+            allocate (q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
+                                          idwbuff(2)%beg:idwbuff(2)%end, &
+                                          idwbuff(3)%beg:idwbuff(3)%end, &
+                                          1:sys_size))
         end if
 
         do j = 1, sys_size
@@ -133,8 +133,8 @@ contains
                 if (nv_uvm_out_of_core) then
                     ! q_cons_ts(2) lives on the host
                     q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
-                        idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:,:,:,j)
+                                          idwbuff(2)%beg:idwbuff(2)%end, &
+                                          idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:, :, :, j)
                 else
                     @:ALLOCATE(q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
                         idwbuff(2)%beg:idwbuff(2)%end, &
@@ -1385,14 +1385,14 @@ contains
             @:DEALLOCATE(q_cons_ts(1)%vf(j)%sf)
             if (num_ts == 2) then
                 if (nv_uvm_out_of_core) then
-                    nullify(q_cons_ts(2)%vf(j)%sf)
+                    nullify (q_cons_ts(2)%vf(j)%sf)
                 else
                     @:DEALLOCATE(q_cons_ts(2)%vf(j)%sf)
                 end if
             end if
         end do
         if (num_ts == 2 .and. nv_uvm_out_of_core) then
-            deallocate(q_cons_ts_pool_host)
+            deallocate (q_cons_ts_pool_host)
         end if
 #elif defined(FRONTIER_UNIFIED)
         do i = 1, num_ts
diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index 8edaeec990..70b2f6950b 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -64,7 +64,7 @@ def get_install_dirpath(self, case: Case ) -> str:
         # The install directory is located <root>/build/install/<slug>
         return os.sep.join([os.getcwd(), "build", "install", self.get_slug(case)])
 
-    def get_home_dirpath(self, case: Case) -> str:
+    def get_home_dirpath(self) -> str:
         return os.sep.join([os.getcwd()])
 
     def get_install_binpath(self, case: Case ) -> str:

From 37b17682377e37bd9df65b339efa0c6a70c3b049 Mon Sep 17 00:00:00 2001
From: Ben Wilfong <48168887+wilfonba@users.noreply.github.com>
Date: Mon, 11 Aug 2025 12:24:09 -0400
Subject: [PATCH 20/25] update capture

---
 src/common/m_mpi_common.fpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp
index dbe61317b8..4332681f11 100644
--- a/src/common/m_mpi_common.fpp
+++ b/src/common/m_mpi_common.fpp
@@ -84,7 +84,8 @@ contains
         @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
 #else
         allocate (buff_send(0:halo_size), buff_recv(0:halo_size))
-        $:GPU_ENTER_DATA(create='[capture:buff_send, capture:buff_recv]')
+        $:GPU_ENTER_DATA(create='[capture:buff_send]')
+        $:GPU_ENTER_DATA(create='[capture:buff_recv]')
 #endif
 #endif
 

From e02e9f654acc3c7401603218be68ff12312f3c59 Mon Sep 17 00:00:00 2001
From: Ben Wilfong <wilfonba@rose-hulman.edu>
Date: Mon, 11 Aug 2025 20:17:50 +0200
Subject: [PATCH 21/25] add fastmath flag and bug fix

---
 CMakeLists.txt                  | 17 +++++++++++++++++
 toolchain/mfc/build.py          |  1 +
 toolchain/mfc/lock.py           |  2 +-
 toolchain/mfc/state.py          | 13 +++++++------
 toolchain/templates/santis.mako |  7 +++----
 5 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5452590712..3fce77f001 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,7 @@ option(MFC_MPI           "Build with MPI"                                     ON
 option(MFC_OpenACC       "Build with OpenACC"                                OFF)
 option(MFC_GCov          "Build with GCov"                                   OFF)
 option(MFC_Unified       "Build with unified CPU & GPU memory (GH-200 only)" OFF)
+option(MFC_Fastmath      "Build with -fastmath on NV GPUs"                   OFF)
 option(MFC_PRE_PROCESS   "Build pre_process"                                 OFF)
 option(MFC_SIMULATION    "Build simulation"                                  OFF)
 option(MFC_POST_PROCESS  "Build post_process"                                OFF)
@@ -487,6 +488,9 @@ function(MFC_SETUP_TARGET)
                     "-foffload=amdgcn-amdhsa='-march=gfx90a'"
                     "-foffload-options=-lgfortran\ -lm"
                     "-fno-exceptions")
+                if (MFC_Fastmath)
+                    message(WARNING "--fastmath has no effect with the GNU compiler")
+                endif()
             elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
                 foreach (cc ${MFC_CUDA_CC})
                     target_compile_options(${a_target}
@@ -498,6 +502,12 @@ function(MFC_SETUP_TARGET)
                     PRIVATE -gpu=keep,ptxinfo,lineinfo
                 )
 
+                if (MFC_Fastmath)
+                    target_compile_options(${a_target}
+                        PRIVATE -gpu=fastmath
+                    )
+                endif()
+
                 # GH-200 Unified Memory Support
                 if (MFC_Unified)
                     target_compile_options(${ARGS_TARGET}
@@ -521,11 +531,18 @@ function(MFC_SETUP_TARGET)
                         PRIVATE -DFRONTIER_UNIFIED)
                 endif()
 
+                if (MFC_Fastmath)
+                    message(WARNING "--fastmath has no effect with the CCE")
+                endif()
+
                 find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
                 target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn)
             endif()
         elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
             target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
+            if (MFC_Fastmath)
+                message(WARNING "--fastmath has no effect with the CCE")
+            endif()
         endif()
 
         if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index 70b2f6950b..846763b233 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -147,6 +147,7 @@ def configure(self, case: Case):
             flags.append(f"-DMFC_OpenACC={'ON' if ARG('gpu') else 'OFF'}")
             flags.append(f"-DMFC_GCov={   'ON' if ARG('gcov') else 'OFF'}")
             flags.append(f"-DMFC_Unified={'ON' if ARG('unified') else 'OFF'}")
+            flags.append(f"-DMFC_Fastmath={'ON' if ARG('fastmath') else 'OFF'}")
 
         command = ["cmake"] + flags + ["-S", cmake_dirpath, "-B", build_dirpath]
 
diff --git a/toolchain/mfc/lock.py b/toolchain/mfc/lock.py
index aa91cc9675..eb20bd73fa 100644
--- a/toolchain/mfc/lock.py
+++ b/toolchain/mfc/lock.py
@@ -5,7 +5,7 @@
 from .printer import cons
 
 
-MFC_LOCK_CURRENT_VERSION: int = 5
+MFC_LOCK_CURRENT_VERSION: int = 6
 
 
 @dataclasses.dataclass
diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py
index fa7d438e77..ba545c5680 100644
--- a/toolchain/mfc/state.py
+++ b/toolchain/mfc/state.py
@@ -3,12 +3,13 @@
 
 @dataclasses.dataclass
 class MFCConfig:
-    mpi:     bool = True
-    gpu:     bool = False
-    debug:   bool = False
-    gcov:    bool = False
-    unified: bool = False
-    single: bool = False
+    mpi:       bool = True
+    gpu:       bool = False
+    debug:     bool = False
+    gcov:      bool = False
+    unified:   bool = False
+    single:    bool = False
+    fastmath : bool = False
 
     @staticmethod
     def from_dict(d: dict):
diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako
index b0e47dc154..1671a8f254 100644
--- a/toolchain/templates/santis.mako
+++ b/toolchain/templates/santis.mako
@@ -3,9 +3,8 @@
 <%namespace name="helpers" file="helpers.mako"/>
 
 % if engine == 'batch':
-#SBATCH --uenv=icon/25.2:v1
+#SBATCH --uenv=icon/25.2:v1@santis
 #SBATCH --nodes=${nodes}
-#SBATCH --reservation=g183
 #SBATCH --ntasks-per-node=${tasks_per_node}
 #SBATCH --job-name="${name}"
 #SBATCH --output="${name}.out"
@@ -78,9 +77,9 @@ echo
                 --gpus-per-task 1                                    \
             % endif
                 --wait 200 --bcast=/tmp/${target.name}               \
-                "${target.get_home_dirpath(case)}/misc/nvidia_uvm/bind.sh" \
+                "${target.get_home_dirpath()}/misc/nvidia_uvm/bind.sh" \
             % if target.name == 'simulation':
-                "${target.get_home_dirpath(case)}/misc/nvidia_uvm/nsys.sh" \
+                "${target.get_home_dirpath()}/misc/nvidia_uvm/nsys.sh" \
             % endif
                 "${target.get_install_binpath(case)}")
     % endif

From a6ff639d793542f87f0f99a8ca25710e58dc5100 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Tue, 12 Aug 2025 07:52:45 +0200
Subject: [PATCH 22/25] Fix typo in CMakeLists

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3fce77f001..0b349eb394 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,7 +20,7 @@ option(MFC_MPI           "Build with MPI"                                     ON
 option(MFC_OpenACC       "Build with OpenACC"                                OFF)
 option(MFC_GCov          "Build with GCov"                                   OFF)
 option(MFC_Unified       "Build with unified CPU & GPU memory (GH-200 only)" OFF)
-option(MFC_Fastmath      "Build with -fastmath on NV GPUs"                   OFF)
+option(MFC_Fastmath      "Build with -gpu=fastmath on NV GPUs"               OFF)
 option(MFC_PRE_PROCESS   "Build pre_process"                                 OFF)
 option(MFC_SIMULATION    "Build simulation"                                  OFF)
 option(MFC_POST_PROCESS  "Build post_process"                                OFF)

From 457ae607ef730495c38f6408a2772c29fc64533b Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Tue, 12 Aug 2025 14:01:22 +0200
Subject: [PATCH 23/25] Replace host_pool with host in m_igr

---
 src/simulation/m_igr.fpp | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index 01fd90e7a8..0fbc76346f 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -27,9 +27,9 @@ module m_igr
 #ifdef __NVCOMPILER_GPU_UNIFIED_MEM
     integer, dimension(3) :: nv_uvm_temp_on_gpu
     real(wp), pointer, contiguous, dimension(:, :, :) :: jac, jac_rhs, jac_old
-    real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_host_pool
-    real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_rhs_host_pool
-    real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_old_host_pool
+    real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_host
+    real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_rhs_host
+    real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_old_host
 #else
     real(wp), allocatable, dimension(:, :, :) :: jac, jac_rhs, jac_old
     $:GPU_DECLARE(create='[jac, jac_rhs, jac_old]')
@@ -124,21 +124,21 @@ contains
                 idwbuff(3)%beg:idwbuff(3)%end))
             @:PREFER_GPU(jac)
         else
-            allocate (jac_host_pool(idwbuff(1)%beg:idwbuff(1)%end, &
-                                    idwbuff(2)%beg:idwbuff(2)%end, &
-                                    idwbuff(3)%beg:idwbuff(3)%end))
+            allocate (jac_host(idwbuff(1)%beg:idwbuff(1)%end, &
+                               idwbuff(2)%beg:idwbuff(2)%end, &
+                               idwbuff(3)%beg:idwbuff(3)%end))
 
             jac(idwbuff(1)%beg:idwbuff(1)%end, &
                 idwbuff(2)%beg:idwbuff(2)%end, &
-                idwbuff(3)%beg:idwbuff(3)%end) => jac_host_pool(:, :, :)
+                idwbuff(3)%beg:idwbuff(3)%end) => jac_host(:, :, :)
         end if
 
         if (nv_uvm_temp_on_gpu(2) == 1) then
             @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))
             @:PREFER_GPU(jac_rhs)
         else
-            allocate (jac_rhs_host_pool(-1:m, -1:n, -1:p))
-            jac_rhs(-1:m, -1:n, -1:p) => jac_rhs_host_pool(:, :, :)
+            allocate (jac_rhs_host(-1:m, -1:n, -1:p))
+            jac_rhs(-1:m, -1:n, -1:p) => jac_rhs_host(:, :, :)
         end if
 
         if (igr_iter_solver == 1) then ! Jacobi iteration
@@ -148,13 +148,13 @@ contains
                     idwbuff(3)%beg:idwbuff(3)%end))
                 @:PREFER_GPU(jac_old)
             else
-                allocate (jac_old_host_pool(idwbuff(1)%beg:idwbuff(1)%end, &
-                                            idwbuff(2)%beg:idwbuff(2)%end, &
-                                            idwbuff(3)%beg:idwbuff(3)%end))
+                allocate (jac_old_host(idwbuff(1)%beg:idwbuff(1)%end, &
+                                       idwbuff(2)%beg:idwbuff(2)%end, &
+                                       idwbuff(3)%beg:idwbuff(3)%end))
 
                 jac_old(idwbuff(1)%beg:idwbuff(1)%end, &
                         idwbuff(2)%beg:idwbuff(2)%end, &
-                        idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host_pool(:, :, :)
+                        idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host(:, :, :)
             end if
         end if
 #endif
@@ -2678,14 +2678,14 @@ contains
             @:DEALLOCATE(jac)
         else
             nullify (jac)
-            deallocate (jac_host_pool)
+            deallocate (jac_host)
         end if
 
         if (nv_uvm_temp_on_gpu(2) == 1) then
             @:DEALLOCATE(jac_rhs)
         else
             nullify (jac_rhs)
-            deallocate (jac_rhs_host_pool)
+            deallocate (jac_rhs_host)
         end if
 
         if (igr_iter_solver == 1) then ! Jacobi iteration
@@ -2693,7 +2693,7 @@ contains
                 @:DEALLOCATE(jac_old)
             else
                 nullify (jac_old)
-                deallocate (jac_old_host_pool)
+                deallocate (jac_old_host)
             end if
         end if
 #endif

From a6116f24def35029ed6abe270777934add8f2719 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Tue, 12 Aug 2025 16:01:08 +0200
Subject: [PATCH 24/25] Set cpus-per-task to 72 and update binding script

---
 misc/nvidia_uvm/bind.sh         | 2 +-
 toolchain/templates/santis.mako | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/misc/nvidia_uvm/bind.sh b/misc/nvidia_uvm/bind.sh
index 0b7bf91e96..b5b4bbb945 100755
--- a/misc/nvidia_uvm/bind.sh
+++ b/misc/nvidia_uvm/bind.sh
@@ -15,7 +15,7 @@ export MPICH_OFI_NIC_POLICY=USER
 export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"
 
 # Bind to cores ( first core per socket )
-physcores=(0 72 144 216)
+physcores=(0-71 72-143 144-215 216-287)
 
 #echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY
 
diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako
index 1671a8f254..23abce3508 100644
--- a/toolchain/templates/santis.mako
+++ b/toolchain/templates/santis.mako
@@ -71,7 +71,7 @@ echo
     % else:
         (set -x; srun --unbuffered \
                 --ntasks=${nodes*tasks_per_node}                     \
-                --cpus-per-task 1                                    \
+                --cpus-per-task 72                                   \
                 --cpu-bind=none                                      \
             % if gpu:
                 --gpus-per-task 1                                    \

From fb50e908a0cd98d0cc5dfb1a24ceefba3d1749e9 Mon Sep 17 00:00:00 2001
From: Nikolaos Tselepidis <ntselepidis@nvidia.com>
Date: Tue, 12 Aug 2025 16:11:33 +0200
Subject: [PATCH 25/25] Trace CUDA in nsys, set SBATCH cpus-per-task=72,
 and fix bind.sh core-binding comment

---
 misc/nvidia_uvm/bind.sh         | 2 +-
 misc/nvidia_uvm/nsys.sh         | 2 +-
 toolchain/templates/santis.mako | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/misc/nvidia_uvm/bind.sh b/misc/nvidia_uvm/bind.sh
index b5b4bbb945..37f5a1a3cd 100755
--- a/misc/nvidia_uvm/bind.sh
+++ b/misc/nvidia_uvm/bind.sh
@@ -14,7 +14,7 @@ export CUDA_VISIBLE_DEVICES="$local_rank"
 export MPICH_OFI_NIC_POLICY=USER
 export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"
 
-# Bind to cores ( first core per socket )
+# Bind to cores ( all cores per socket )
 physcores=(0-71 72-143 144-215 216-287)
 
 #echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY
diff --git a/misc/nvidia_uvm/nsys.sh b/misc/nvidia_uvm/nsys.sh
index 172bcb2f69..205bee8fd4 100755
--- a/misc/nvidia_uvm/nsys.sh
+++ b/misc/nvidia_uvm/nsys.sh
@@ -15,7 +15,7 @@ if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then
       --cpu-socket-events=61,71,265,273 \
       --cpu-socket-metrics=103,104 \
       --event-sampling-interval=10 \
-      --trace=nvtx,openacc \
+      --trace=nvtx,cuda,openacc \
       --force-overwrite=true \
       -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
       -o "$NSYS_FILE" "$@"
diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako
index 23abce3508..cb4b330625 100644
--- a/toolchain/templates/santis.mako
+++ b/toolchain/templates/santis.mako
@@ -6,6 +6,7 @@
 #SBATCH --uenv=icon/25.2:v1@santis
 #SBATCH --nodes=${nodes}
 #SBATCH --ntasks-per-node=${tasks_per_node}
+#SBATCH --cpus-per-task=72
 #SBATCH --job-name="${name}"
 #SBATCH --output="${name}.out"
 #SBATCH --error="${name}.err"