diff --git a/CMakeLists.txt b/CMakeLists.txt index b7f087c25b..0b349eb394 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,7 @@ option(MFC_MPI "Build with MPI" ON option(MFC_OpenACC "Build with OpenACC" OFF) option(MFC_GCov "Build with GCov" OFF) option(MFC_Unified "Build with unified CPU & GPU memory (GH-200 only)" OFF) +option(MFC_Fastmath "Build with -gpu=fastmath on NV GPUs" OFF) option(MFC_PRE_PROCESS "Build pre_process" OFF) option(MFC_SIMULATION "Build simulation" OFF) option(MFC_POST_PROCESS "Build post_process" OFF) @@ -487,6 +488,9 @@ function(MFC_SETUP_TARGET) "-foffload=amdgcn-amdhsa='-march=gfx90a'" "-foffload-options=-lgfortran\ -lm" "-fno-exceptions") + if (MFC_Fastmath) + message(WARNING "--fastmath has no effect with the GNU compiler") + endif() elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") foreach (cc ${MFC_CUDA_CC}) target_compile_options(${a_target} @@ -498,14 +502,20 @@ function(MFC_SETUP_TARGET) PRIVATE -gpu=keep,ptxinfo,lineinfo ) + if (MFC_Fastmath) + target_compile_options(${a_target} + PRIVATE -gpu=fastmath + ) + endif() + # GH-200 Unified Memory Support if (MFC_Unified) target_compile_options(${ARGS_TARGET} - PRIVATE -gpu=unified + PRIVATE -gpu=mem:unified:managedalloc -cuda ) # "This option must appear in both the compile and link lines" -- NVHPC Docs target_link_options(${ARGS_TARGET} - PRIVATE -gpu=unified + PRIVATE -gpu=mem:unified:managedalloc -cuda ) endif() @@ -521,16 +531,28 @@ function(MFC_SETUP_TARGET) PRIVATE -DFRONTIER_UNIFIED) endif() + if (MFC_Fastmath) + message(WARNING "--fastmath has no effect with the CCE") + endif() + find_package(hipfort COMPONENTS hip CONFIG REQUIRED) target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn) endif() elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc") + if (MFC_Fastmath) + message(WARNING "--fastmath has no effect with the CCE") + endif() endif() if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI") find_package(CUDAToolkit REQUIRED) - target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt) + if (TARGET CUDA::nvToolsExt) # CUDA <= 12.8 + target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt) + else() # CUDA >= 12.9 + target_link_libraries(${a_target} PRIVATE nvhpcwrapnvtx) + target_link_options(${a_target} PRIVATE "-cudalib=nvtx3") + endif() endif() endforeach() diff --git a/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py new file mode 100644 index 0000000000..e2b22e8017 --- /dev/null +++ b/examples/3D_IGR_TaylorGreenVortex_nvidia/case.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +import math +import json + +N = 799 +Nx = N +Ny = 2 * (N + 1) - 1 +Nz = 2 * (N + 1) - 1 + +Re = 1600 +L = 1 +P0 = 101325 +rho0 = 1 +C0 = math.sqrt(1.4 * P0) +V0 = 0.1 * C0 +mu = V0 * L / Re + +cfl = 0.5 +dx = 2 * math.pi * L / (Ny + 1) + +dt = cfl * dx / (C0) + +tC = L / V0 +tEnd = 20 * tC + +Nt = int(tEnd / dt) +Nt = 10 + + +# Configuring case dictionary +print( + json.dumps( + { + "rdma_mpi": "T", + # Logistics + "run_time_info": "F", + # Computational Domain Parameters + "x_domain%beg": -math.pi * L, + "x_domain%end": math.pi * L, + "y_domain%beg": -math.pi * L, + "y_domain%end": math.pi * L, + "z_domain%beg": -math.pi * L, + "z_domain%end": math.pi * L, + "m": Nx, + "n": Ny, + "p": Nz, + "cyl_coord": "F", + "dt": dt, + "t_step_start": 0, + "t_step_stop": 10, # Nt, + 
"t_step_save": 10, # int(Nt / 100), + # Simulation Algorithm Parameters + "num_patches": 1, + "model_eqns": 2, + "num_fluids": 1, + "time_stepper": 3, + "bc_x%beg": -1, + "bc_x%end": -1, + "bc_y%beg": -1, + "bc_y%end": -1, + "bc_z%beg": -1, + "bc_z%end": -1, + "igr": "T", + "igr_order": 5, + "igr_iter_solver": 1, + "num_igr_iters": 3, + "num_igr_warm_start_iters": 3, + "alf_factor": 10, + "viscous": "T", + # Formatted Database Files Structure Parameters + "format": 1, + "precision": 2, + "prim_vars_wrt": "T", + "omega_wrt(1)": "T", + "omega_wrt(2)": "T", + "omega_wrt(3)": "T", + "qm_wrt": "T", + "fd_order": 4, + "parallel_io": "T", + # Patch 1: Background (AIR - 2) + "patch_icpp(1)%geometry": 9, + "patch_icpp(1)%x_centroid": 0, + "patch_icpp(1)%y_centroid": 0, + "patch_icpp(1)%z_centroid": 0, + "patch_icpp(1)%length_x": 2 * math.pi * L, + "patch_icpp(1)%length_y": 2 * math.pi * L, + "patch_icpp(1)%length_z": 2 * math.pi * L, + "patch_icpp(1)%vel(1)": 0.0, + "patch_icpp(1)%vel(2)": 0.0, + "patch_icpp(1)%vel(3)": 0, + "patch_icpp(1)%pres": 0.0, + "patch_icpp(1)%hcid": 380, + "patch_icpp(1)%alpha_rho(1)": 1, + "patch_icpp(1)%alpha(1)": 1, + # Fluids Physical Parameters + "fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1), + "fluid_pp(1)%pi_inf": 0, + "fluid_pp(1)%Re(1)": 1 / mu, + # NVIDIA UVM Options + "nv_uvm_out_of_core": "T", + "nv_uvm_igr_temps_on_gpu": 3, + "nv_uvm_pref_gpu": "T", + } + ) +) diff --git a/misc/nvidia_uvm/bind.sh b/misc/nvidia_uvm/bind.sh new file mode 100755 index 0000000000..37f5a1a3cd --- /dev/null +++ b/misc/nvidia_uvm/bind.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +# -------------------------------- # +# Binding for a single Santis node # +# -------------------------------- # + +# Local rank +export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}" + +# Bind to GPU +export CUDA_VISIBLE_DEVICES="$local_rank" + +# Binding to NIC +export MPICH_OFI_NIC_POLICY=USER +export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3" + +# Bind to cores ( all cores per socket ) +physcores=(0-71 72-143 144-215 216-287) + +#echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY + +#set -x +numactl -l --all --physcpubind=${physcores[$local_rank]} "$@" +#set +x diff --git a/misc/nvidia_uvm/nsys.sh b/misc/nvidia_uvm/nsys.sh new file mode 100755 index 0000000000..205bee8fd4 --- /dev/null +++ b/misc/nvidia_uvm/nsys.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +#set -x +set -euo pipefail + +rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}" + +[[ -z "${NSYS_FILE+x}" ]] && NSYS_FILE=report.qdrep +[[ -z "${NSYS+x}" ]] && NSYS=0 + +if [[ "$NSYS" -ne 0 && "$rank" -eq 0 ]]; then + exec nsys profile \ + --cpuctxsw=none -b none -s none \ + --event-sample=system-wide \ + --cpu-socket-events=61,71,265,273 \ + --cpu-socket-metrics=103,104 \ + --event-sampling-interval=10 \ + --trace=nvtx,cuda,openacc \ + --force-overwrite=true \ + -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \ + -o "$NSYS_FILE" "$@" +else + exec "$@" +fi diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp index c1652388c3..69241c99ef 100644 --- a/src/common/include/macros.fpp +++ b/src/common/include/macros.fpp @@ -12,6 +12,55 @@ #endif #:enddef +! Caution: +! This macro requires the use of a binding script to set CUDA_VISIBLE_DEVICES, such that we have one GPU device per MPI rank. +! That's because for both cudaMemAdvise (preferred location) and cudaMemPrefetchAsync we use location = device_id = 0. +! For an example see misc/nvidia_uvm/bind.sh. 
+#:def PREFER_GPU(*args) +#ifdef MFC_SIMULATION +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + block +! Beginning in the 25.3 release, the structure of the cudafor module has been changed slightly. +! The module now includes, or “uses” 3 submodules: cuda_runtime_api, gpu_reductions, and sort. +! The cudafor functionality has not changed. But for new users, or users who have needed to +! work-around name conflicts in the module, it may be better to use cuda_runtime_api to expose +! interfaces to the CUDA runtime calls described in Chapter 4 of this guide. +! https://docs.nvidia.com/hpc-sdk/compilers/cuda-fortran-prog-guide/index.html#fortran-host-modules +#if __NVCOMPILER_MAJOR__ < 25 || (__NVCOMPILER_MAJOR__ == 25 && __NVCOMPILER_MINOR__ < 3) + use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval +#else + use cuda_runtime_api +#endif + integer :: istat + + if (nv_uvm_pref_gpu) then + #:for arg in args + !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$) + ! set preferred location GPU + istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0) + if (istat /= cudaSuccess) then + write (*, "('Error code: ',I0, ': ')") istat + !write(*,*) cudaGetErrorString(istat) + end if + ! set accessed by CPU + istat = cudaMemAdvise(c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId) + if (istat /= cudaSuccess) then + write (*, "('Error code: ',I0, ': ')") istat + !write(*,*) cudaGetErrorString(istat) + end if + ! prefetch to GPU - physically populate memory pages + istat = cudaMemPrefetchAsync(c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0) + if (istat /= cudaSuccess) then + write (*, "('Error code: ',I0, ': ')") istat + !write(*,*) cudaGetErrorString(istat) + end if + #:endfor + end if + end block +#endif +#endif +#:enddef + #:def ALLOCATE(*args) @:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'}) #:set allocated_variables = ', '.join(args) diff --git a/src/common/m_mpi_common.fpp b/src/common/m_mpi_common.fpp index fdfcab8d25..4332681f11 100644 --- a/src/common/m_mpi_common.fpp +++ b/src/common/m_mpi_common.fpp @@ -38,7 +38,9 @@ module m_mpi_common !! average primitive variables, for a single computational domain boundary !! at the time, from the relevant neighboring processor. +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM $:GPU_DECLARE(create='[buff_send, buff_recv]') +#endif integer :: halo_size $:GPU_DECLARE(create='[halo_size]') @@ -78,7 +80,13 @@ contains $:GPU_UPDATE(device='[halo_size, v_size]') +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM @:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size)) +#else + allocate (buff_send(0:halo_size), buff_recv(0:halo_size)) + $:GPU_ENTER_DATA(create='[capture:buff_send]') + $:GPU_ENTER_DATA(create='[capture:buff_recv]') +#endif #endif end subroutine s_initialize_mpi_common_module diff --git a/src/simulation/m_checker.fpp b/src/simulation/m_checker.fpp index f0196af0e2..8917b0be46 100644 --- a/src/simulation/m_checker.fpp +++ b/src/simulation/m_checker.fpp @@ -30,6 +30,7 @@ contains if (igr) then call s_check_inputs_igr + call s_check_inputs_nvidia_uvm else if (recon_type == WENO_TYPE) then call s_check_inputs_weno @@ -411,4 +412,13 @@ contains @:PROHIBIT(powell .and. fd_order == dflt_int, "fd_order must be set if Powell's method is enabled") end subroutine s_check_inputs_mhd + impure subroutine s_check_inputs_nvidia_uvm +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + @:PROHIBIT(nv_uvm_igr_temps_on_gpu > 3 .or. 
nv_uvm_igr_temps_on_gpu < 0, & + "nv_uvm_igr_temps_on_gpu must be in the range [0, 3]") + @:PROHIBIT(nv_uvm_igr_temps_on_gpu == 3 .and. igr_iter_solver == 2, & + "nv_uvm_igr_temps_on_gpu must be in the range [0, 2] for igr_iter_solver == 2") +#endif + end subroutine s_check_inputs_nvidia_uvm + end module m_checker diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp index 2a904a1e97..34c122f1e7 100644 --- a/src/simulation/m_global_parameters.fpp +++ b/src/simulation/m_global_parameters.fpp @@ -157,6 +157,16 @@ module m_global_parameters logical :: viscous !< Viscous effects #:endif + !> @name Variables for our of core IGR computation on NVIDIA + !> @{ + logical :: nv_uvm_out_of_core ! Enable out-of-core storage of q_cons_ts(2) in timestepping (default FALSE) + integer :: nv_uvm_igr_temps_on_gpu ! 0 => jac, jac_rhs, and jac_old on CPU + ! 1 => jac on GPU, jac_rhs and jac_old on CPU + ! 2 => jac and jac_rhs on GPU, jac_old on CPU + ! 3 => jac, jac_rhs, and jac_old on GPU (default) + logical :: nv_uvm_pref_gpu ! Enable explicit gpu memory hints (default FALSE) + !> @} + real(wp) :: weno_eps !< Binding for the WENO nonlinear weights real(wp) :: teno_CT !< Smoothness threshold for TENO logical :: mp_weno !< Monotonicity preserving (MP) WENO @@ -573,6 +583,11 @@ contains t_stop = dflt_real t_save = dflt_real + ! NVIDIA UVM options + nv_uvm_out_of_core = .false. + nv_uvm_igr_temps_on_gpu = 3 ! => jac, jac_rhs, and jac_old on GPU (default) + nv_uvm_pref_gpu = .false. + ! Simulation algorithm parameters model_eqns = dflt_int mpp_lim = .false. @@ -1321,16 +1336,25 @@ contains @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size)) @:ALLOCATE(x_cc(-buff_size:m + buff_size)) @:ALLOCATE(dx(-buff_size:m + buff_size)) + @:PREFER_GPU(x_cb) + @:PREFER_GPU(x_cc) + @:PREFER_GPU(dx) if (n == 0) return; @:ALLOCATE(y_cb(-1 - buff_size:n + buff_size)) @:ALLOCATE(y_cc(-buff_size:n + buff_size)) @:ALLOCATE(dy(-buff_size:n + buff_size)) + @:PREFER_GPU(y_cb) + @:PREFER_GPU(y_cc) + @:PREFER_GPU(dy) if (p == 0) return; @:ALLOCATE(z_cb(-1 - buff_size:p + buff_size)) @:ALLOCATE(z_cc(-buff_size:p + buff_size)) @:ALLOCATE(dz(-buff_size:p + buff_size)) + @:PREFER_GPU(z_cb) + @:PREFER_GPU(z_cc) + @:PREFER_GPU(dz) end subroutine s_initialize_global_parameters_module diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp index db80bb8346..0fbc76346f 100644 --- a/src/simulation/m_igr.fpp +++ b/src/simulation/m_igr.fpp @@ -24,8 +24,16 @@ module m_igr s_igr_flux_add, & s_finalize_igr_module +#ifdef __NVCOMPILER_GPU_UNIFIED_MEM + integer, dimension(3) :: nv_uvm_temp_on_gpu + real(wp), pointer, contiguous, dimension(:, :, :) :: jac, jac_rhs, jac_old + real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_host + real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_rhs_host + real(wp), allocatable, dimension(:, :, :), pinned, target :: jac_old_host +#else real(wp), allocatable, dimension(:, :, :) :: jac, jac_rhs, jac_old $:GPU_DECLARE(create='[jac, jac_rhs, jac_old]') +#endif real(wp), allocatable, dimension(:, :) :: Res $:GPU_DECLARE(create='[Res]') @@ -73,7 +81,6 @@ module m_igr 5._wp/6._wp, & ! Index 0 2._wp/6._wp & ! 
Index 1 ] - #:endif #:endif @@ -91,8 +98,11 @@ contains end do end do $:GPU_UPDATE(device='[Res, Re_idx, Re_size]') + @:PREFER_GPU(Res) + @:PREFER_GPU(Re_idx) end if +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, & idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) @@ -103,6 +113,51 @@ contains idwbuff(2)%beg:idwbuff(2)%end, & idwbuff(3)%beg:idwbuff(3)%end)) end if +#else + ! create map + nv_uvm_temp_on_gpu(1:3) = 0 + nv_uvm_temp_on_gpu(1:nv_uvm_igr_temps_on_gpu) = 1 + + if (nv_uvm_temp_on_gpu(1) == 1) then + @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(jac) + else + allocate (jac_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + + jac(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => jac_host(:, :, :) + end if + + if (nv_uvm_temp_on_gpu(2) == 1) then + @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p)) + @:PREFER_GPU(jac_rhs) + else + allocate (jac_rhs_host(-1:m, -1:n, -1:p)) + jac_rhs(-1:m, -1:n, -1:p) => jac_rhs_host(:, :, :) + end if + + if (igr_iter_solver == 1) then ! Jacobi iteration + if (nv_uvm_temp_on_gpu(3) == 1) then + @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(jac_old) + else + allocate (jac_old_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + + jac_old(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => jac_old_host(:, :, :) + end if + end if +#endif $:GPU_PARALLEL_LOOP(collapse=3) do l = idwbuff(3)%beg, idwbuff(3)%end @@ -2612,11 +2667,36 @@ contains @:DEALLOCATE(Res) end if +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM @:DEALLOCATE(jac, jac_rhs) if (igr_iter_solver == 1) then ! Jacobi iteration @:DEALLOCATE(jac_old) end if +#else + if (nv_uvm_temp_on_gpu(1) == 1) then + @:DEALLOCATE(jac) + else + nullify (jac) + deallocate (jac_host) + end if + + if (nv_uvm_temp_on_gpu(2) == 1) then + @:DEALLOCATE(jac_rhs) + else + nullify (jac_rhs) + deallocate (jac_rhs_host) + end if + + if (igr_iter_solver == 1) then ! Jacobi iteration + if (nv_uvm_temp_on_gpu(3) == 1) then + @:DEALLOCATE(jac_old) + else + nullify (jac_old) + deallocate (jac_old_host) + end if + end if +#endif #:if not MFC_CASE_OPTIMIZATION @:DEALLOCATE(coeff_L, coeff_R) diff --git a/src/simulation/m_mpi_proxy.fpp b/src/simulation/m_mpi_proxy.fpp index 06ea632b7b..755f762166 100644 --- a/src/simulation/m_mpi_proxy.fpp +++ b/src/simulation/m_mpi_proxy.fpp @@ -237,6 +237,11 @@ contains #:endfor end do + ! 
NVIDIA UVM variables + call MPI_BCAST(nv_uvm_out_of_core, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(nv_uvm_igr_temps_on_gpu, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) + call MPI_BCAST(nv_uvm_pref_gpu, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr) + #endif end subroutine s_mpi_bcast_user_inputs diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp index 95489bedf6..313ef48f2b 100644 --- a/src/simulation/m_start_up.fpp +++ b/src/simulation/m_start_up.fpp @@ -187,9 +187,9 @@ contains surface_tension, bubbles_lagrange, lag_params, & hyperelasticity, R0ref, num_bc_patches, Bx0, powell, & cont_damage, tau_star, cont_damage_s, alpha_bar, & - alf_factor, num_igr_iters, down_sample, & - num_igr_warm_start_iters, & - int_comp, ic_eps, ic_beta + alf_factor, num_igr_iters, num_igr_warm_start_iters, & + int_comp, ic_eps, ic_beta, nv_uvm_out_of_core, & + nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample ! Checking that an input file has been provided by the user. If it ! has, then the input file is read in, otherwise, simulation exits. diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp index 4aaf0878fc..e7d4ba6017 100644 --- a/src/simulation/m_time_steppers.fpp +++ b/src/simulation/m_time_steppers.fpp @@ -77,7 +77,9 @@ module m_time_steppers $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts,rhs_mv,rhs_pb,max_dt]') -#if defined(FRONTIER_UNIFIED) +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) + real(wp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host +#elif defined(FRONTIER_UNIFIED) real(wp), pointer, contiguous, dimension(:, :, :, :) :: q_cons_ts_pool_host, q_cons_ts_pool_device integer(kind=8) :: pool_dims(4), pool_starts(4) #endif @@ -105,12 +107,47 @@ contains ! Allocating the cell-average conservative variables @:ALLOCATE(q_cons_ts(1:num_ts)) + @:PREFER_GPU(q_cons_ts) do i = 1, num_ts @:ALLOCATE(q_cons_ts(i)%vf(1:sys_size)) + @:PREFER_GPU(q_cons_ts(i)%vf) end do -#ifdef FRONTIER_UNIFIED +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) + if (num_ts == 2 .and. nv_uvm_out_of_core) then + ! host allocation for q_cons_ts(2)%vf(j)%sf for all j + allocate (q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end, & + 1:sys_size)) + end if + + do j = 1, sys_size + ! q_cons_ts(1) lives on the device + @:ALLOCATE(q_cons_ts(1)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(q_cons_ts(1)%vf(j)%sf) + if (num_ts == 2) then + if (nv_uvm_out_of_core) then + ! q_cons_ts(2) lives on the host + q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:, :, :, j) + else + @:ALLOCATE(q_cons_ts(2)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, & + idwbuff(2)%beg:idwbuff(2)%end, & + idwbuff(3)%beg:idwbuff(3)%end)) + @:PREFER_GPU(q_cons_ts(2)%vf(j)%sf) + end if + end if + end do + + do i = 1, num_ts + @:ACC_SETUP_VFs(q_cons_ts(i)) + end do +#elif defined(FRONTIER_UNIFIED) ! Allocate to memory regions using hip calls ! that we will attach pointers to do i = 1, 3 @@ -357,11 +394,13 @@ contains ! 
Allocating the cell-average RHS variables @:ALLOCATE(rhs_vf(1:sys_size)) + @:PREFER_GPU(rhs_vf) if (igr) then do i = 1, sys_size @:ALLOCATE(rhs_vf(i)%sf(-1:m+1,-1:n+1,-1:p+1)) @:ACC_SETUP_SFs(rhs_vf(i)) + @:PREFER_GPU(rhs_vf(i)%sf) end do else do i = 1, sys_size @@ -536,8 +575,8 @@ contains real(wp), intent(inout) :: time_avg integer :: i, j, k, l, q!< Generic loop iterator - integer :: dest real(wp) :: start, finish + integer :: dest ! Stage 1 of 2 @@ -567,7 +606,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) -#ifdef FRONTIER_UNIFIED +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -659,7 +698,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2) -#ifdef FRONTIER_UNIFIED +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -668,7 +707,7 @@ contains q_cons_ts(1)%vf(i)%sf(j, k, l) = & (q_cons_ts(2)%vf(i)%sf(j, k, l) & + q_cons_ts(1)%vf(i)%sf(j, k, l) & - + dt*rhs_vf(i)%sf(j, k, l))/4._wp + + dt*rhs_vf(i)%sf(j, k, l))/2._wp end do end do end do @@ -771,9 +810,8 @@ contains real(wp), intent(INOUT) :: time_avg integer :: i, j, k, l, q !< Generic loop iterator - integer :: dest - real(wp) :: start, finish + integer :: dest ! Stage 1 of 3 @@ -804,7 +842,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1) -#ifdef FRONTIER_UNIFIED +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -896,7 +934,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2) -#if defined(FRONTIER_UNIFIED) +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -989,7 +1027,7 @@ contains if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3) -#ifdef FRONTIER_UNIFIED +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) || defined(FRONTIER_UNIFIED) $:GPU_PARALLEL_LOOP(collapse=4) do i = 1, sys_size do l = 0, p @@ -1094,6 +1132,7 @@ contains end if end if + end subroutine s_3rd_order_tvd_rk !> Strang splitting scheme with 3rd order TVD RK time-stepping algorithm for @@ -1338,30 +1377,45 @@ contains use hipfort_hipmalloc use hipfort_check #endif - integer :: i, j !< Generic loop iterators ! Deallocating the cell-average conservative variables +#if defined(__NVCOMPILER_GPU_UNIFIED_MEM) + do j = 1, sys_size + @:DEALLOCATE(q_cons_ts(1)%vf(j)%sf) + if (num_ts == 2) then + if (nv_uvm_out_of_core) then + nullify (q_cons_ts(2)%vf(j)%sf) + else + @:DEALLOCATE(q_cons_ts(2)%vf(j)%sf) + end if + end if + end do + if (num_ts == 2 .and. nv_uvm_out_of_core) then + deallocate (q_cons_ts_pool_host) + end if +#elif defined(FRONTIER_UNIFIED) do i = 1, num_ts -#ifdef FRONTIER_UNIFIED do j = 1, sys_size nullify (q_cons_ts(i)%vf(j)%sf) end do + end do + + call hipCheck(hipHostFree(q_cons_ts_pool_host)) + call hipCheck(hipFree(q_cons_ts_pool_device)) #else + do i = 1, num_ts do j = 1, sys_size @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf) end do + end do #endif + do i = 1, num_ts @:DEALLOCATE(q_cons_ts(i)%vf) end do @:DEALLOCATE(q_cons_ts) -#ifdef FRONTIER_UNIFIED - call hipCheck(hipHostFree(q_cons_ts_pool_host)) - call hipCheck(hipFree(q_cons_ts_pool_device)) -#endif - ! 
Deallocating the cell-average primitive ts variables if (probe_wrt) then do i = 0, 3 diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp index 56beaea979..a9846124ba 100644 --- a/src/simulation/m_weno.fpp +++ b/src/simulation/m_weno.fpp @@ -98,7 +98,9 @@ module m_weno !> @name Indical bounds in the s1-, s2- and s3-directions !> @{ type(int_bounds_info) :: is1_weno, is2_weno, is3_weno +#ifndef __NVCOMPILER_GPU_UNIFIED_MEM $:GPU_DECLARE(create='[is1_weno,is2_weno,is3_weno]') +#endif ! !> @} diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 7c7648ae18..846763b233 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -64,6 +64,9 @@ def get_install_dirpath(self, case: Case ) -> str: # The install directory is located /build/install/ return os.sep.join([os.getcwd(), "build", "install", self.get_slug(case)]) + def get_home_dirpath(self) -> str: + return os.sep.join([os.getcwd()]) + def get_install_binpath(self, case: Case ) -> str: # /install//bin/ return os.sep.join([self.get_install_dirpath(case), "bin", self.name]) @@ -144,6 +147,7 @@ def configure(self, case: Case): flags.append(f"-DMFC_OpenACC={'ON' if ARG('gpu') else 'OFF'}") flags.append(f"-DMFC_GCov={ 'ON' if ARG('gcov') else 'OFF'}") flags.append(f"-DMFC_Unified={'ON' if ARG('unified') else 'OFF'}") + flags.append(f"-DMFC_Fastmath={'ON' if ARG('fastmath') else 'OFF'}") command = ["cmake"] + flags + ["-S", cmake_dirpath, "-B", build_dirpath] diff --git a/toolchain/mfc/lock.py b/toolchain/mfc/lock.py index aa91cc9675..eb20bd73fa 100644 --- a/toolchain/mfc/lock.py +++ b/toolchain/mfc/lock.py @@ -5,7 +5,7 @@ from .printer import cons -MFC_LOCK_CURRENT_VERSION: int = 5 +MFC_LOCK_CURRENT_VERSION: int = 6 @dataclasses.dataclass diff --git a/toolchain/mfc/run/case_dicts.py b/toolchain/mfc/run/case_dicts.py index e509d7c6b4..8378d3044d 100644 --- a/toolchain/mfc/run/case_dicts.py +++ b/toolchain/mfc/run/case_dicts.py @@ -313,6 +313,9 @@ def analytic(self): 'int_comp': ParamType.LOG, 'ic_eps': ParamType.REAL, 'ic_beta': ParamType.REAL, + 'nv_uvm_out_of_core': ParamType.LOG, + 'nv_uvm_igr_temps_on_gpu': ParamType.INT, + 'nv_uvm_pref_gpu': ParamType.LOG, }) for var in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector', diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py index fa7d438e77..ba545c5680 100644 --- a/toolchain/mfc/state.py +++ b/toolchain/mfc/state.py @@ -3,12 +3,13 @@ @dataclasses.dataclass class MFCConfig: - mpi: bool = True - gpu: bool = False - debug: bool = False - gcov: bool = False - unified: bool = False - single: bool = False + mpi: bool = True + gpu: bool = False + debug: bool = False + gcov: bool = False + unified: bool = False + single: bool = False + fastmath : bool = False @staticmethod def from_dict(d: dict): diff --git a/toolchain/mfc/test/cases.py b/toolchain/mfc/test/cases.py index 387540fb5b..7329c6e14b 100644 --- a/toolchain/mfc/test/cases.py +++ b/toolchain/mfc/test/cases.py @@ -693,17 +693,17 @@ def alter_mixlayer_perturb(dimInfo): 'patch_icpp(1)%vel(1)': 1.0, 'patch_icpp(1)%vel(2)': 0.0, 'patch_icpp(1)%vel(3)': 0.0, 'patch_icpp(1)%pres': 17.8571428571, 'patch_icpp(1)%alpha_rho(1)': 1.0, 'patch_icpp(1)%alpha(1)': 1.0, 'patch_icpp(1)%r0': -1e6, 'patch_icpp(1)%v0': -1e6, - 'patch_icpp(2)%geometry': -100, + 'patch_icpp(2)%geometry': -100, 'patch_icpp(2)%x_centroid': -1e6, 'patch_icpp(2)%length_x': -1e6, - 'patch_icpp(2)%y_centroid': -1e6, 'patch_icpp(2)%length_y': -1e6, - 'patch_icpp(2)%z_centroid': -1e6, 'patch_icpp(2)%length_z': -1e6, - 
'patch_icpp(2)%vel(1)': -1e6, 'patch_icpp(2)%vel(2)': -1e6, 'patch_icpp(2)%vel(3)': -1e6, + 'patch_icpp(2)%y_centroid': -1e6, 'patch_icpp(2)%length_y': -1e6, + 'patch_icpp(2)%z_centroid': -1e6, 'patch_icpp(2)%length_z': -1e6, + 'patch_icpp(2)%vel(1)': -1e6, 'patch_icpp(2)%vel(2)': -1e6, 'patch_icpp(2)%vel(3)': -1e6, 'patch_icpp(2)%r0': -1e6, 'patch_icpp(2)%v0': -1e6, - 'patch_icpp(3)%geometry': -100, + 'patch_icpp(3)%geometry': -100, 'patch_icpp(3)%x_centroid': -1e6, 'patch_icpp(3)%length_x': -1e6, - 'patch_icpp(3)%y_centroid': -1e6, 'patch_icpp(3)%length_y': -1e6, - 'patch_icpp(3)%z_centroid': -1e6, 'patch_icpp(3)%length_z': -1e6, - 'patch_icpp(3)%vel(1)': -1e6, 'patch_icpp(3)%vel(2)': -1e6, 'patch_icpp(3)%vel(3)': -1e6, + 'patch_icpp(3)%y_centroid': -1e6, 'patch_icpp(3)%length_y': -1e6, + 'patch_icpp(3)%z_centroid': -1e6, 'patch_icpp(3)%length_z': -1e6, + 'patch_icpp(3)%vel(1)': -1e6, 'patch_icpp(3)%vel(2)': -1e6, 'patch_icpp(3)%vel(3)': -1e6, 'patch_icpp(3)%r0': -1e6, 'patch_icpp(3)%v0': -1e6 })) @@ -993,11 +993,12 @@ def foreach_example(): "2D_lagrange_bubblescreen", "3D_lagrange_bubblescreen", "2D_triple_point", "1D_shuosher_analytical", - "1D_titarevtorro_analytical", + "1D_titarevtorro_analytical", "2D_acoustic_pulse_analytical", "2D_isentropicvortex_analytical", "2D_zero_circ_vortex_analytical", "3D_TaylorGreenVortex_analytical", + "3D_IGR_TaylorGreenVortex_nvidia", "2D_backward_facing_step", "2D_forward_facing_step"] if path in casesToSkip: diff --git a/toolchain/modules b/toolchain/modules index a124a275f1..27783e9407 100644 --- a/toolchain/modules +++ b/toolchain/modules @@ -85,3 +85,6 @@ n-cpu penguin/openmpi/4.1.5/gcc-8.5.0 n-gpu penguin/openmpi/4.1.5/nvhpc-22.3 nvidia/nvhpc/22.3 cuda/cuda-11.6 n-gpu CC=nvc CXX=nvc++ FC=nvfortran +san CSCS Santis +san-all cmake python +san-gpu nvhpc cuda cray-mpich diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako new file mode 100644 index 0000000000..cb4b330625 --- /dev/null +++ b/toolchain/templates/santis.mako @@ -0,0 +1,93 @@ +#!/usr/bin/env bash + +<%namespace name="helpers" file="helpers.mako"/> + +% if engine == 'batch': +#SBATCH --uenv=icon/25.2:v1@santis +#SBATCH --nodes=${nodes} +#SBATCH --ntasks-per-node=${tasks_per_node} +#SBATCH --cpus-per-task=72 +#SBATCH --job-name="${name}" +#SBATCH --output="${name}.out" +#SBATCH --error="${name}.err" +#SBATCH --time=${walltime} +% if account: +#SBATCH --account=${account} +% endif +% if partition: +#SBATCH --partition=${partition} +% endif +% if quality_of_service: +#SBATCH --qos=${quality_of_service} +% endif +% if email: +#SBATCH --mail-user=${email} +#SBATCH --mail-type="BEGIN, END, FAIL" +% endif +% endif + +# We compiled the code using -gpu=unified:managedalloc, hence we use cudaMallocManaged for the dynamic allocations. +# Using NV_ACC_USE_MALLOC we could change to malloc at runtime. We choose to not do that here and stick with cudaMallocManaged and 2MB page sizes. +# https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#memory-model +# https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#command-line-options-selecting-compiler-memory-modes +export NV_ACC_USE_MALLOC=0 + +# For NVIDIA CUDA devices, controls the use of automatic memory hints at data constructs in the managed and unified memory modes. +# Below is a breakdown of the permitted values (case insensitive): +# - DEFAULT: Use the default settings. 
On NVIDIA Grace Hopper systems, the default is currently ENABLE_ALL; on all other systems, the default is DISABLE. +# - DISABLE: Memory hints are disabled for all data constructs. +# - ENABLE_EXPLICIT: Memory hints are enabled for explicit data constructs only. +# - ENABLE_ALL: Memory hints are enabled for explicit and implicit data constructs. +# https://docs.nvidia.com/hpc-sdk/compilers/hpc-compilers-user-guide/index.html#environment-variables-controlling-device-memory-management +# Here we disable the implicit compiler hints. +# Using NVCOMPILER_ACC_NO_MEMHINTS is the legacy way and is still supported, but users should prefer NVCOMPILER_ACC_MEMHINTS when using newer nvhpc compilers. +export NVCOMPILER_ACC_NO_MEMHINTS=1 # disable implicit compiler hints - legacy way +export NVCOMPILER_ACC_MEMHINTS=DISABLE # disable implicit compiler hints - new way + +# Cray MPICH +export MPICH_GPU_SUPPORT_ENABLED=1 +export FI_CXI_RX_MATCH_MODE=software +export FI_MR_CACHE_MONITOR=disabled +export MPICH_NO_BUFFER_ALIAS_CHECK=1 + +# NSYS +export NSYS=0 # enable nsys profiling +export NSYS_FILE=myreport.qdrep + +${helpers.template_prologue()} + +ok ":) Loading modules:\n" +cd "${MFC_ROOT_DIR}" +% if engine == 'batch': +. ./mfc.sh load -c san -m ${'g' if gpu else 'c'} +% endif +cd - > /dev/null +echo + +% for target in targets: + ${helpers.run_prologue(target)} + + % if not mpi: + (set -x; ${profiler} "${target.get_install_binpath(case)}") + % else: + (set -x; srun --unbuffered \ + --ntasks=${nodes*tasks_per_node} \ + --cpus-per-task 72 \ + --cpu-bind=none \ + % if gpu: + --gpus-per-task 1 \ + % endif + --wait 200 --bcast=/tmp/${target.name} \ + "${target.get_home_dirpath()}/misc/nvidia_uvm/bind.sh" \ + % if target.name == 'simulation': + "${target.get_home_dirpath()}/misc/nvidia_uvm/nsys.sh" \ + % endif + "${target.get_install_binpath(case)}") + % endif + + ${helpers.run_epilogue(target)} + + echo +% endfor + +${helpers.template_epilogue()}
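
For reference, a minimal usage sketch of the new toggles, assuming the toolchain exposes the MFC_Fastmath CMake option as a `--fastmath` flag (as the warning text above suggests); exact flag spellings may differ.

# Build the GPU targets with the new fast-math toggle (only NVHPC honors it;
# GNU and CCE builds emit the CMake warning added above).
./mfc.sh build -t pre_process simulation --gpu --fastmath

# Run the new out-of-core IGR Taylor-Green example. The case file already sets
# nv_uvm_out_of_core, nv_uvm_igr_temps_on_gpu, and nv_uvm_pref_gpu; when submitted
# through the Santis template, the binary is wrapped by misc/nvidia_uvm/bind.sh
# (and by misc/nvidia_uvm/nsys.sh when NSYS=1 is set in the template).
./mfc.sh run examples/3D_IGR_TaylorGreenVortex_nvidia/case.py --gpu --fastmath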