diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8269c1cb4..6523f7877 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -135,17 +135,17 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
     if (CMAKE_BUILD_TYPE STREQUAL "Debug")
         add_compile_options(
             -Wall
-            -Wextra
+            -Wextra	
             -fcheck=all,no-array-temps
             -fbacktrace
             -fimplicit-none
             -fsignaling-nans
             -finit-real=snan
             -finit-integer=-99999999
-            -Wintrinsic-shadow
-            -Wunderflow
-            -Wrealloc-lhs
-            -Wsurprising
+            -Wintrinsic-shadow	
+            -Wunderflow	
+            -Wrealloc-lhs	
+            -Wsurprising	
 	    )
     endif()
 
@@ -163,7 +163,6 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
         "SHELL:-h acc_model=auto_async_none"
         "SHELL: -h acc_model=no_fast_addr"
         "SHELL: -h list=adm"
-        "SHELL: -munsafe-fp-atomics" # Not unsafe for operations we do
     )
 
     add_link_options("SHELL:-hkeepfiles")
@@ -173,6 +172,7 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
                 "SHELL:-h acc_model=auto_async_none"
                 "SHELL: -h acc_model=no_fast_addr"
                 "SHELL: -K trap=fp" "SHELL: -G2"
+
         )
         add_link_options("SHELL: -K trap=fp" "SHELL: -G2")
     endif()
@@ -486,23 +486,23 @@ function(MFC_SETUP_TARGET)
                 endforeach()
 
                 target_compile_options(${a_target}
-                    PRIVATE -gpu=keep,ptxinfo,lineinfo
+                    PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath
                 )
 
                 # GH-200 Unified Memory Support
                 if (MFC_Unified)
                     target_compile_options(${ARGS_TARGET}
-                        PRIVATE -gpu=unified
+                        PRIVATE -gpu=mem:unified -cuda
                     )
                     # "This option must appear in both the compile and link lines" -- NVHPC Docs
                     target_link_options(${ARGS_TARGET}
-                        PRIVATE -gpu=unified
+                        PRIVATE -gpu=mem:unified -cuda
                     )
                 endif()
 
                 if (CMAKE_BUILD_TYPE STREQUAL "Debug")
                     target_compile_options(${a_target}
-                        PRIVATE -gpu=autocompare,debug
+                        PRIVATE -gpu=autocompare,debug -cuda
                     )
                 endif()
             elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
@@ -513,9 +513,15 @@ function(MFC_SETUP_TARGET)
             target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
         endif()
 
-        if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
-            find_package(CUDAToolkit REQUIRED)
-            target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
+        if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR
+            CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
+            find_package(CUDAToolkit QUIET) # defines CUDA::nvToolsExt when available; the TARGET test below needs it
+            if (TARGET CUDA::nvToolsExt) # CUDA <= 12.8
+                target_link_libraries(${a_target} PRIVATE CUDA::nvToolsExt)
+            else() # CUDA >= 12.9, or toolkit package not found: use the NVHPC NVTX wrapper
+                target_link_libraries(${a_target} PRIVATE nvhpcwrapnvtx)
+            endif()
+            target_link_options(${a_target} PRIVATE "-cudalib=nvtx")
         endif()
     endforeach()
 
diff --git a/misc/nvidia_uvm/README.md b/misc/nvidia_uvm/README.md
new file mode 100644
index 000000000..2de72c489
--- /dev/null
+++ b/misc/nvidia_uvm/README.md
@@ -0,0 +1,48 @@
+## The Main Idea behind the implemented Out-of-Core Strategy for Grace-Hopper
+
+To run MFC out-of-core on Grace-Hopper using Unified Memory we implement a zero-copy strategy.
+
+We start by setting preferred location CPU for all buffers by hooking into the allocate macro and setting `NVIDIA_ALLOC_MODE=2`.
+This way we disable access counter based migrations and keep everything on the CPU memory, freeing up as much GPU memory as possible.
+
+Then, for the "important" buffers that are frequently accessed from the GPU, we reset preferred location to GPU in order to place them (and directly populate them) in GPU memory.
+This is done by the `PREFER_GPU` macro that has been manually placed in the code right after the allocations of the "important" buffers.
+To activate these hints we export `NVIDIA_MANUAL_GPU_HINTS=1`.
+
+To allow fine grained control and be able to simulate larger sizes, we also use the following environment variables:
+- With `NVIDIA_IGR_TEMPS_ON_GPU` we control how many temporaries from the IGR module are to be placed in GPU memory.
+- With `NVIDIA_VARS_ON_GPU` we control how many of the `q_cons_ts(1)%vf(j)%sf` arrays we place in GPU memory.
+
+It is important to note that we have rearranged the timestep updates in the 3rd order TVD Runge Kutta scheme in a way that allows us to pass only `q_cons_ts(1)` to the `compute_rhs` routines.
+This way, in order to keep the computation of `compute_rhs` (mostly) on GPU data, we only need to store `q_cons_ts(1)` (fully or even partially) in GPU memory.
+Thus, we choose to keep `q_cons_ts(2)` in CPU memory for the full lifetime of the simulation, freeing up space in GPU memory that allows for bumping up the size of the simulation, without sacrificing performance.
+In the timestep updates between the `compute_rhs` calls, we access both `q_cons_ts(1)` and `q_cons_ts(2)` directly from the physical location where they reside (zero-copy), simultaneously pulling data from GPU memory and CPU memory (through C2C), making good use of Grace-Hopper.
+
+Note: This rearrangement most likely "breaks" the timestepper for different physics cases, but we can easily fix it in a later step.
+
+## Example Workflow for Out-of-Core Strategy based on Unified Memory
+
+```shell
+# Allocate a node
+salloc -A g183 --partition normal -t 02:00:00 -N 1 -n 4 --cpus-per-task=71
+
+# Start uenv
+uenv start --view=modules icon/25.2:v1
+
+# cd to root directory of MFC
+cd MFC-Wilfong
+
+# Load modules
+. ./mfc.sh load -c san -m g
+
+# Build
+export MFC_CUDA_CC=90
+./mfc.sh build --gpu -j 71 --single --unified --verbose
+
+# Run pre_process and simulation binaries with case optimization (in an interactive job)
+./mfc.sh run examples/3D_IGR_perf_test/case.py --case-optimization -t pre_process simulation --gpu -N 1 -n 4 -j 71 -c santis
+
+# Run pre_process and simulation binaries with case optimization (in a batch job)
+./mfc.sh run examples/3D_IGR_perf_test/case.py --case-optimization -t pre_process simulation --gpu -N 1 -n 4 -j 71 -c santis -e batch -p normal -a g183 -w 00:15:00
+```
+The environment variables `NVIDIA_ALLOC_MODE`, `NVIDIA_MANUAL_GPU_HINTS`, `NVIDIA_VARS_ON_GPU`, and `NVIDIA_IGR_TEMPS_ON_GPU` can be set appropriately in `toolchain/templates/santis.mako` to configure a run with ALL buffers either in GPU or in CPU memory, or a run with SOME buffers in GPU memory and the rest in CPU memory.
diff --git a/misc/nvidia_uvm/bind.sh b/misc/nvidia_uvm/bind.sh
new file mode 100755
index 000000000..239d769e8
--- /dev/null
+++ b/misc/nvidia_uvm/bind.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+# -------------------------------- #
+# Binding for a single Santis node #
+# -------------------------------- #
+
+# Local rank (Open MPI or Slurm); abort if the launcher provides neither, since every binding below depends on it
+export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-${SLURM_LOCALID:?bind.sh: neither OMPI_COMM_WORLD_LOCAL_RANK nor SLURM_LOCALID is set}}"
+
+# Bind to GPU
+export CUDA_VISIBLE_DEVICES="$local_rank"
+
+# Binding to NIC
+export MPICH_OFI_NIC_POLICY=USER
+export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"
+
+# Bind to cores: one CPU id per local rank ( NOTE(review): was "second core per socket", but the ids look like the first core of each socket - confirm )
+physcores=(0 72 144 216)
+
+#echo rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_MAPPING
+numactl -l --all --physcpubind="${physcores[$local_rank]}" "$@"
diff --git a/misc/nvidia_uvm/nsys.sh b/misc/nvidia_uvm/nsys.sh
new file mode 100755
index 000000000..19b3d4b44
--- /dev/null
+++ b/misc/nvidia_uvm/nsys.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Optional Nsight Systems wrapper: rank 0 is profiled when NSYS is non-zero; every other case runs "$@" unchanged.
+set -euo pipefail
+
+rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}"
+
+: "${NSYS_FILE=report.qdrep}"   # default report name, applied only when NSYS_FILE is unset
+: "${NSYS=0}"                   # profiling stays off unless NSYS is provided
+
+if [[ "$NSYS" -eq 0 || "$rank" -ne 0 ]]; then
+  exec "$@"
+else
+  echo "Doing nsys"
+  exec nsys profile \
+    --cpuctxsw=none -b none -s none \
+    --event-sample=system-wide \
+    --cpu-socket-events=61,71,265,273 \
+    --cpu-socket-metrics=103,104 \
+    --event-sampling-interval=10 \
+    --trace=nvtx,openacc \
+    --force-overwrite=true \
+    -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
+    -o "$NSYS_FILE" "$@"
+fi
diff --git a/misc/nvidia_uvm/run.sh b/misc/nvidia_uvm/run.sh
new file mode 100644
index 000000000..c065ebb81
--- /dev/null
+++ b/misc/nvidia_uvm/run.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# TODO: Modify accordingly
+PATH_TO_BINARY=${SCRATCH}/projects/cfd/mfc/MFC-Wilfong/build/install/cdcd4e8762/bin/
+
+# NVHPC and CUDA env vars
+export NV_ACC_USE_MALLOC=1                    # use malloc instead of cudaMallocManaged ( compiled using -gpu=mem:unified )
+export NVCOMPILER_ACC_NO_MEMHINTS=1           # disable implicit compiler hints
+export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH
+
+# Cray MPICH
+export MPICH_GPU_SUPPORT_ENABLED=1            # MPICH with GPU support
+export FI_CXI_RX_MATCH_MODE=software
+export FI_MR_CACHE_MONITOR=disabled
+
+# CUSTOM env vars to MFC
+export NVIDIA_ALLOC_MODE=2                    # default alloc to prefloc CPU
+export NVIDIA_MANUAL_GPU_HINTS=1              # prefloc GPU on some
+export NVIDIA_IGR_TEMPS_ON_GPU=1              # jac on GPU and jac_rhs on CPU       ( NOTE: good default, tune based on size )
+export NVIDIA_VARS_ON_GPU=7                   # q_cons_ts(1)%vf%sf for j=1-7 on GPU ( NOTE: good default, tune based on size )
+
+# NSYS
+export NSYS=1                                 # enable nsys profiling
+export NSYS_FILE=report_uvm_single_N-499_nGPUs-4_params-${NVIDIA_VARS_ON_GPU}-${NVIDIA_IGR_TEMPS_ON_GPU}.qdrep
+
+# Run using --cpu-bind=none because we use our own binding script
+srun --ntasks 4 --cpu-bind=none ./bind.sh ./nsys.sh ${PATH_TO_BINARY}/simulation
diff --git a/src/common/include/macros.fpp b/src/common/include/macros.fpp
index c1652388c..c89d8e9f7 100644
--- a/src/common/include/macros.fpp
+++ b/src/common/include/macros.fpp
@@ -12,11 +12,113 @@
 #endif
 #:enddef
 
+#:def PREFER_GPU(*args)
+#ifdef MFC_SIMULATION
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+    block
+    use cudafor
+    intrinsic :: minval, maxval, sum
+    integer :: istat
+    integer :: prefer_gpu_mode
+    character(len=10) :: prefer_gpu_mode_str
+
+    ! environment variable
+    call get_environment_variable("NVIDIA_MANUAL_GPU_HINTS", prefer_gpu_mode_str)
+    if (trim(prefer_gpu_mode_str) == "0") then ! OFF
+        prefer_gpu_mode = 0
+    elseif (trim(prefer_gpu_mode_str) == "1") then ! ON
+        prefer_gpu_mode = 1
+    else ! default
+        prefer_gpu_mode = 0
+    endif
+
+    if (prefer_gpu_mode .eq. 1) then
+    #:for arg in args
+        !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
+        ! unset
+        istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseUnSetPreferredLocation, cudaCpuDeviceId )
+        if (istat /= cudaSuccess) then
+            write(*,"('Error code: ',I0, ': ')") istat
+            write(*,*) cudaGetErrorString(istat)
+        endif
+        ! set
+        istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 )
+        if (istat /= cudaSuccess) then
+            write(*,"('Error code: ',I0, ': ')") istat
+            write(*,*) cudaGetErrorString(istat)
+        endif
+    #:endfor
+    end if
+    end block
+#endif
+#endif
+#:enddef
+
+
+#:def PARSE(s)
+${s if s.rfind(')') == -1 else next((s[:i] for i in range(s.rfind(')'), -1, -1) if s[i] == '(' and s.count('(', i, s.rfind(')')+1) == s.count(')', i, s.rfind(')')+1)), s)}$
+#:enddef
+
 #:def ALLOCATE(*args)
     @:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
     #:set allocated_variables = ', '.join(args)
     allocate (${allocated_variables}$)
     $:GPU_ENTER_DATA(create=('[' + allocated_variables + ']'))
+
+
+#ifdef MFC_SIMULATION
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+    block
+    use cudafor
+    intrinsic :: minval, maxval, sum
+    integer :: istat, stream_id
+    integer :: alloc_mode
+    character(len=10) :: alloc_mode_str
+
+    ! environment variable
+    call get_environment_variable("NVIDIA_ALLOC_MODE", alloc_mode_str)
+    if (trim(alloc_mode_str) == "0") then ! no CPU first touch, no preferred location CPU
+        alloc_mode = 0
+    elseif (trim(alloc_mode_str) == "1") then ! CPU first touch, no preferred location CPU
+        alloc_mode = 1
+    elseif (trim(alloc_mode_str) == "2") then ! no CPU first touch, preferred location CPU
+        alloc_mode = 2
+    elseif (trim(alloc_mode_str) == "3") then ! CPU first touch, preferred location CPU
+        alloc_mode = 3
+    else ! default
+        alloc_mode = 0
+    endif
+
+    stream_id = 0
+
+    ! prefetch to CPU
+    if ((alloc_mode .eq. 1) .or. (alloc_mode .eq. 3)) then
+    #:for arg in args
+        istat = cudaMemPrefetchAsync( c_devloc(@{PARSE(${arg}$)}@), SIZEOF(@{PARSE(${arg}$)}@), cudaCpuDeviceId, stream_id )
+        !print*, "! @{PARSE(${arg}$)}@ with shape",  SHAPE(@{PARSE(${arg}$)}@), "=> prefetch to CPU"
+        if (istat /= cudaSuccess) then
+            write(*,"('Error code: ',I0, ': ')") istat
+            write(*,*) cudaGetErrorString(istat)
+        endif
+    #:endfor
+    endif
+
+    ! memadvise preferred location
+    if ((alloc_mode .eq. 2) .or. (alloc_mode .eq. 3)) then
+    #:for arg in args
+        istat = cudaMemAdvise( c_devloc(@{PARSE(${arg}$)}@), SIZEOF(@{PARSE(${arg}$)}@), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId )
+        !print*, "! @{PARSE(${arg}$)}@ with shape",  SHAPE(@{PARSE(${arg}$)}@), "=> preferred location CPU"
+        if (istat /= cudaSuccess) then
+            write(*,"('Error code: ',I0, ': ')") istat
+            write(*,*) cudaGetErrorString(istat)
+        endif
+    #:endfor
+    endif
+
+    end block
+#endif
+#endif
+
 #:enddef ALLOCATE
 
 #:def DEALLOCATE(*args)
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index fa6185c20..047056a51 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -1294,16 +1294,25 @@ contains
         @:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
         @:ALLOCATE(x_cc(-buff_size:m + buff_size))
         @:ALLOCATE(dx(-buff_size:m + buff_size))
+        @:PREFER_GPU(x_cb)
+        @:PREFER_GPU(x_cc)
+        @:PREFER_GPU(dx)
 
         if (n == 0) return; 
         @:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
         @:ALLOCATE(y_cc(-buff_size:n + buff_size))
         @:ALLOCATE(dy(-buff_size:n + buff_size))
+        @:PREFER_GPU(y_cb)
+        @:PREFER_GPU(y_cc)
+        @:PREFER_GPU(dy)
 
         if (p == 0) return; 
         @:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
         @:ALLOCATE(z_cc(-buff_size:p + buff_size))
         @:ALLOCATE(dz(-buff_size:p + buff_size))
+        @:PREFER_GPU(z_cb)
+        @:PREFER_GPU(z_cc)
+        @:PREFER_GPU(dz)
 
     end subroutine s_initialize_global_parameters_module
 
diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index db80bb834..1219626ef 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -24,8 +24,12 @@ module m_igr
  s_igr_flux_add, &
  s_finalize_igr_module
 
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+    real(wp), pointer, contiguous, dimension(:, :, :) :: jac,jac_rhs,jac_old
+#else
     real(wp), allocatable, dimension(:, :, :) :: jac, jac_rhs, jac_old
     $:GPU_DECLARE(create='[jac, jac_rhs, jac_old]')
+#endif
 
     real(wp), allocatable, dimension(:, :) :: Res
     $:GPU_DECLARE(create='[Res]')
@@ -79,10 +83,36 @@ module m_igr
 
     integer :: i, j, k, l, q, r
 
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+    real(wp), allocatable, dimension(:, :, :, :), pinned, target :: m_igr_pool_host
+    real(wp), allocatable, dimension(:, :, :), pinned, target :: m_igr_pool_host2
+#endif
+
 contains
 
     subroutine s_initialize_igr_module()
 
+        integer :: igr_temps_on_gpu = 3
+        integer :: igr_temps_on_cpu = 0
+        integer :: pool_idx = 1
+        character(len=10) :: igr_temps_on_gpu_str
+
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+        call get_environment_variable("NVIDIA_IGR_TEMPS_ON_GPU", igr_temps_on_gpu_str)
+
+        if (trim(igr_temps_on_gpu_str) == "0") then
+            igr_temps_on_gpu = 0 ! jac, jac_rhs and jac_old on CPU
+        elseif (trim(igr_temps_on_gpu_str) == "1") then
+            igr_temps_on_gpu = 1 ! jac on GPU, jac_rhs on CPU, jac_old on CPU
+        elseif (trim(igr_temps_on_gpu_str) == "2") then
+            igr_temps_on_gpu = 2 ! jac and jac_rhs on GPU, jac_old on CPU
+        elseif (trim(igr_temps_on_gpu_str) == "3") then
+            igr_temps_on_gpu = 3 ! jac, jac_rhs and jac_old on GPU
+        else ! default on GPU
+            igr_temps_on_gpu = 3
+        endif
+#endif
+
         if (viscous) then
             @:ALLOCATE(Res(1:2, 1:maxval(Re_size)))
             do i = 1, 2
@@ -91,8 +121,73 @@ contains
                 end do
             end do
             $:GPU_UPDATE(device='[Res, Re_idx, Re_size]')
+            @:PREFER_GPU(Res)
+            @:PREFER_GPU(Re_idx)
         end if
 
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+           igr_temps_on_cpu = 3 - igr_temps_on_gpu
+
+           if ( igr_temps_on_cpu >= 1 ) then
+               !allocate(m_igr_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
+               !                         idwbuff(2)%beg:idwbuff(2)%end, &
+               !                         idwbuff(3)%beg:idwbuff(3)%end, &
+               !                         1:igr_temps_on_cpu))
+
+               !There was a dimensionality change for jac_rhs, using a different pinned pool
+               if ( igr_temps_on_cpu == 1 ) then
+                   allocate(m_igr_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
+                                            idwbuff(2)%beg:idwbuff(2)%end, &
+                                            idwbuff(3)%beg:idwbuff(3)%end, &
+                                            1:igr_temps_on_cpu))
+
+                elseif (igr_temps_on_cpu >=2 ) then
+                   allocate(m_igr_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
+                                            idwbuff(2)%beg:idwbuff(2)%end, &
+                                            idwbuff(3)%beg:idwbuff(3)%end, &
+                                            1:igr_temps_on_cpu-1))
+                   allocate(m_igr_pool_host2(-1:m,-1:n,-1:p)) 
+               endif
+
+               pool_idx = 1
+               if ( igr_temps_on_cpu >= 1 ) then
+                   !print*, 'jac_old on CPU'
+                   jac_old(idwbuff(1)%beg:idwbuff(1)%end, &
+                       idwbuff(2)%beg:idwbuff(2)%end, &
+                       idwbuff(3)%beg:idwbuff(3)%end) => m_igr_pool_host(:,:,:,pool_idx)
+                   pool_idx = pool_idx + 1
+               end if
+               if ( igr_temps_on_cpu >= 2 ) then
+                   jac_rhs(-1:m,-1:n,-1:p) => m_igr_pool_host2(:,:,:)
+               end if
+               if ( igr_temps_on_cpu >= 3 ) then
+                   !print*, 'jac on CPU'
+                   jac(idwbuff(1)%beg:idwbuff(1)%end, &
+                       idwbuff(2)%beg:idwbuff(2)%end, &
+                       idwbuff(3)%beg:idwbuff(3)%end) => m_igr_pool_host(:,:,:,pool_idx)
+                   pool_idx = pool_idx + 1
+               end if
+           end if
+           if ( igr_temps_on_gpu >= 1 ) then
+                !print*, 'jac on GPU'
+                @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, &
+                             idwbuff(2)%beg:idwbuff(2)%end, &
+                             idwbuff(3)%beg:idwbuff(3)%end))
+                @:PREFER_GPU(jac)
+           endif
+           if ( igr_temps_on_gpu >= 2 ) then
+                !print*, 'jac_rhs on GPU'
+                @:ALLOCATE(jac_rhs(-1:m,-1:n,-1:p))
+                @:PREFER_GPU(jac_rhs)
+           endif
+           if ( igr_temps_on_gpu >= 3 ) then
+                !print*, 'jac_old on GPU'
+                @:ALLOCATE(jac_old(idwbuff(1)%beg:idwbuff(1)%end, &
+                             idwbuff(2)%beg:idwbuff(2)%end, &
+                             idwbuff(3)%beg:idwbuff(3)%end))
+                @:PREFER_GPU(jac_old)
+           endif
+#else
         @:ALLOCATE(jac(idwbuff(1)%beg:idwbuff(1)%end, &
             idwbuff(2)%beg:idwbuff(2)%end, &
             idwbuff(3)%beg:idwbuff(3)%end))
@@ -103,6 +198,7 @@ contains
                 idwbuff(2)%beg:idwbuff(2)%end, &
                 idwbuff(3)%beg:idwbuff(3)%end))
         end if
+#endif
 
         $:GPU_PARALLEL_LOOP(collapse=3)
         do l = idwbuff(3)%beg, idwbuff(3)%end
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index d040650bf..e092cc0f5 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -77,6 +77,10 @@ module m_time_steppers
 
     $:GPU_DECLARE(create='[q_cons_ts,q_prim_vf,q_T_sf,rhs_vf,q_prim_ts,rhs_mv,rhs_pb,max_dt]')
 
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+    real(wp), allocatable, dimension(:, :, :, :), pinned, target :: q_cons_ts_pool_host
+#endif
+
 contains
 
     !> The computation of parameters, the allocation of memory,
@@ -86,6 +90,33 @@ contains
 
         integer :: i, j !< Generic loop iterators
 
+        integer :: vars_on_gpu = 0
+        character(len=10) :: vars_on_gpu_str
+
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+        call get_environment_variable("NVIDIA_VARS_ON_GPU", vars_on_gpu_str)
+
+        if (trim(vars_on_gpu_str) == "0") then
+            vars_on_gpu = 0
+        elseif (trim(vars_on_gpu_str) == "1") then
+            vars_on_gpu = 1
+        elseif (trim(vars_on_gpu_str) == "2") then
+            vars_on_gpu = 2
+        elseif (trim(vars_on_gpu_str) == "3") then
+            vars_on_gpu = 3
+        elseif (trim(vars_on_gpu_str) == "4") then
+            vars_on_gpu = 4
+        elseif (trim(vars_on_gpu_str) == "5") then
+            vars_on_gpu = 5
+        elseif (trim(vars_on_gpu_str) == "6") then
+            vars_on_gpu = 6
+        elseif (trim(vars_on_gpu_str) == "7") then
+            vars_on_gpu = 7
+        else ! default
+            vars_on_gpu = 0
+        endif
+#endif
+
         ! Setting number of time-stages for selected time-stepping scheme
         if (time_stepper == 1) then
             num_ts = 1
@@ -95,11 +126,35 @@ contains
 
         ! Allocating the cell-average conservative variables
         @:ALLOCATE(q_cons_ts(1:num_ts))
+        @:PREFER_GPU(q_cons_ts)
 
         do i = 1, num_ts
             @:ALLOCATE(q_cons_ts(i)%vf(1:sys_size))
+            @:PREFER_GPU(q_cons_ts(i)%vf)
         end do
 
+        !!
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+           allocate(q_cons_ts_pool_host(idwbuff(1)%beg:idwbuff(1)%end, &
+                                        idwbuff(2)%beg:idwbuff(2)%end, &
+                                        idwbuff(3)%beg:idwbuff(3)%end, &
+                                        1:sys_size))
+           do i = 1, num_ts
+                do j = 1, sys_size
+                    if ( i == 1 ) then
+                        @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                            idwbuff(2)%beg:idwbuff(2)%end, &
+                            idwbuff(3)%beg:idwbuff(3)%end))
+                        @:PREFER_GPU(q_cons_ts(i)%vf(j)%sf)
+                    else
+                        q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
+                            idwbuff(2)%beg:idwbuff(2)%end, &
+                            idwbuff(3)%beg:idwbuff(3)%end) => q_cons_ts_pool_host(:,:,:,j)
+                    end if
+                end do
+                @:ACC_SETUP_VFs(q_cons_ts(i))
+            end do
+#else
         do i = 1, num_ts
             do j = 1, sys_size
                 @:ALLOCATE(q_cons_ts(i)%vf(j)%sf(idwbuff(1)%beg:idwbuff(1)%end, &
@@ -108,6 +163,7 @@ contains
             end do
             @:ACC_SETUP_VFs(q_cons_ts(i))
         end do
+#endif
 
         ! Allocating the cell-average primitive ts variables
         if (probe_wrt) then
@@ -682,6 +738,7 @@ contains
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=1)
 
+#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) && !defined(FRONTIER_UNIFIED)
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
@@ -694,6 +751,22 @@ contains
                 end do
             end do
         end do
+#else
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do l = 0, p
+            do k = 0, n
+                do j = 0, m
+                    do i = 1, sys_size
+                        q_cons_ts(2)%vf(i)%sf(j, k, l) = &
+                            q_cons_ts(1)%vf(i)%sf(j, k, l)
+                        q_cons_ts(1)%vf(i)%sf(j, k, l) = &
+                            q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                            + dt*rhs_vf(i)%sf(j, k, l)
+                    end do
+                end do
+            end do
+        end do
+#endif
 
         !Evolve pb and mv for non-polytropic qbmm
         if (qbmm .and. (.not. polytropic)) then
@@ -750,10 +823,15 @@ contains
 
         ! Stage 2 of 3
 
+#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) && !defined(FRONTIER_UNIFIED)
         call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 2)
+#else
+        call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg,2)
+#endif
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=2)
 
+#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) && !defined(FRONTIER_UNIFIED)
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
@@ -767,6 +845,21 @@ contains
                 end do
             end do
         end do
+#else
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do l = 0, p
+            do k = 0, n
+                do j = 0, m
+                    do i = 1, sys_size
+                        q_cons_ts(1)%vf(i)%sf(j, k, l) = &
+                            (3._wp*q_cons_ts(2)%vf(i)%sf(j, k, l) &
+                             + q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                             + dt*rhs_vf(i)%sf(j, k, l))/4._wp
+                    end do
+                end do
+            end do
+        end do
+#endif
 
         if (qbmm .and. (.not. polytropic)) then
             $:GPU_PARALLEL_LOOP(collapse=5)
@@ -823,10 +916,15 @@ contains
         end if
 
         ! Stage 3 of 3
+#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) && !defined(FRONTIER_UNIFIED)
         call s_compute_rhs(q_cons_ts(2)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg, 3)
+#else
+        call s_compute_rhs(q_cons_ts(1)%vf, q_T_sf, q_prim_vf, bc_type, rhs_vf, pb_ts(2)%sf, rhs_pb, mv_ts(2)%sf, rhs_mv, t_step, time_avg,3)
+#endif
 
         if (bubbles_lagrange .and. .not. adap_dt) call s_update_lagrange_tdv_rk(stage=3)
 
+#if !defined(__NVCOMPILER_GPU_UNIFIED_MEM) && !defined(FRONTIER_UNIFIED)
         $:GPU_PARALLEL_LOOP(collapse=4)
         do i = 1, sys_size
             do l = 0, p
@@ -840,6 +938,21 @@ contains
                 end do
             end do
         end do
+#else
+        !$acc parallel loop collapse(3) gang vector default(present)
+        do l = 0, p
+            do k = 0, n
+                do j = 0, m
+                    do i = 1, sys_size
+                        q_cons_ts(1)%vf(i)%sf(j, k, l) = &
+                            (q_cons_ts(2)%vf(i)%sf(j, k, l) &
+                             + 2._wp*q_cons_ts(1)%vf(i)%sf(j, k, l) &
+                             + 2._wp*dt*rhs_vf(i)%sf(j, k, l))/3._wp
+                    end do
+                end do
+            end do
+        end do
+#endif
 
         if (qbmm .and. (.not. polytropic)) then
             $:GPU_PARALLEL_LOOP(collapse=5)
@@ -1143,15 +1256,29 @@ contains
 
         ! Deallocating the cell-average conservative variables
         do i = 1, num_ts
+#if defined(__NVCOMPILER_GPU_UNIFIED_MEM)
+            do j = 1, sys_size
+                if ( i == 1 ) then
+                    @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf)
+                else
+                    nullify(q_cons_ts(i)%vf(j)%sf)
+                end if
+            end do
+#else
             do j = 1, sys_size
                 @:DEALLOCATE(q_cons_ts(i)%vf(j)%sf)
             end do
+#endif
 
             @:DEALLOCATE(q_cons_ts(i)%vf)
         end do
 
         @:DEALLOCATE(q_cons_ts)
 
+#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
+        deallocate(q_cons_ts_pool_host)
+#endif
+
         ! Deallocating the cell-average primitive ts variables
         if (probe_wrt) then
             do i = 0, 3
diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py
index 2de738986..750c9b294 100644
--- a/toolchain/mfc/build.py
+++ b/toolchain/mfc/build.py
@@ -64,6 +64,9 @@ def get_install_dirpath(self, case: Case ) -> str:
         # The install directory is located <root>/build/install/<slug>
         return os.sep.join([os.getcwd(), "build", "install", self.get_slug(case)])
 
+    def get_home_dirpath(self, case: Case) -> str:
+        return os.sep.join([os.getcwd()])
+
     def get_install_binpath(self, case: Case ) -> str:
         # <root>/install/<slug>/bin/<target>
         return os.sep.join([self.get_install_dirpath(case), "bin", self.name])
diff --git a/toolchain/templates/default.mako b/toolchain/templates/default.mako
index b1cdaf81e..df833ed4e 100644
--- a/toolchain/templates/default.mako
+++ b/toolchain/templates/default.mako
@@ -57,7 +57,8 @@ if engine == 'batch':
                         "${target.get_install_binpath(case)}")
             elif [ "$binary" == "mpirun" ]; then
                 (set -x; ${profiler}     \
-                    $binary -np ${nodes*tasks_per_node}            \
+                    ## Open MPI refuses to run as root unless allowed; env vars are used because other mpirun implementations abort on --allow-run-as-root.
+                    env OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 $binary -np ${nodes*tasks_per_node} \
                             "${target.get_install_binpath(case)}")
             elif [ "$binary" == "mpiexec" ]; then
                 (set -x; ${profiler}                               \
diff --git a/toolchain/templates/santis.mako b/toolchain/templates/santis.mako
new file mode 100644
index 000000000..818383d35
--- /dev/null
+++ b/toolchain/templates/santis.mako
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+
+<%namespace name="helpers" file="helpers.mako"/>
+
+% if engine == 'batch':
+#SBATCH --uenv=icon/25.2:v1
+#SBATCH --nodes=${nodes}
+#SBATCH --reservation=g183
+#SBATCH --ntasks-per-node=${tasks_per_node}
+#SBATCH --job-name="${name}"
+#SBATCH --output="${name}.out"
+#SBATCH --error="${name}.err"
+#SBATCH --time=${walltime}
+% if account:
+#SBATCH --account=${account}
+% endif
+% if partition:
+#SBATCH --partition=${partition}
+% endif
+% if quality_of_service:
+#SBATCH --qos=${quality_of_service}
+% endif
+% if email:
+#SBATCH --mail-user=${email}
+#SBATCH --mail-type="BEGIN, END, FAIL"
+% endif
+% endif
+
+# NVHPC and CUDA env vars
+export NV_ACC_USE_MALLOC=0                    # use cudaMallocManaged instead of malloc ( compiled using -gpu=mem:unified )
+export NVCOMPILER_ACC_NO_MEMHINTS=1           # disable implicit compiler hints
+#export CUDA_BUFFER_PAGE_IN_THRESHOLD_MS=0.001 # workaround for copying to/from unpopulated buffers on GH
+
+# Cray MPICH
+export MPICH_GPU_SUPPORT_ENABLED=1
+export FI_CXI_RX_MATCH_MODE=software
+export FI_MR_CACHE_MONITOR=disabled
+export MPICH_NO_BUFFER_ALIAS_CHECK=1
+
+# CUSTOM env vars to MFC
+export NVIDIA_ALLOC_MODE=0                    # do nothing
+export NVIDIA_MANUAL_GPU_HINTS=1              # prefloc GPU on some
+export NVIDIA_IGR_TEMPS_ON_GPU=3              # jac, jac_rhs, and jac_old on GPU
+export NVIDIA_VARS_ON_GPU=7                   # q_cons_ts(1)%vf%sf for j=1-7 on GPU
+
+# NSYS (variables read by misc/nvidia_uvm/nsys.sh)
+export NSYS=0                                 # nsys profiling switch: 0 = off, non-zero = on
+export NSYS_FILE=myreport.qdrep
+
+${helpers.template_prologue()}
+
+ok ":) Loading modules:\n"
+cd "${MFC_ROOT_DIR}"
+% if engine == 'batch':
+. ./mfc.sh load -c san -m ${'g' if gpu else 'c'}
+% endif
+cd - > /dev/null
+echo
+
+% for target in targets:
+    ${helpers.run_prologue(target)}
+
+    % if not mpi:
+        (set -x; ${profiler} "${target.get_install_binpath(case)}")
+    % else:
+        (set -x; srun --unbuffered \
+                --ntasks=${nodes*tasks_per_node}                     \
+                --cpus-per-task 1                                    \
+                --cpu-bind=none                                      \
+            % if gpu:
+                --gpus-per-task 1                                    \
+            % endif
+                --wait 200 --bcast=/tmp/${target.name}               \
+                "${target.get_home_dirpath(case)}/misc/nvidia_uvm/bind.sh" \
+            ## nsys.sh wrapper disabled. Mako '##' comments are not rendered, so the srun command is not cut short before the binary.
+            ## To re-enable, wrap the argument below in "% if target.name == 'simulation':" ... "% endif" control lines:
+            ##   "${target.get_home_dirpath(case)}/misc/nvidia_uvm/nsys.sh" (followed by a line-continuation backslash)
+                "${target.get_install_binpath(case)}")
+    % endif
+
+    ${helpers.run_epilogue(target)}
+
+    echo
+% endfor
+
+${helpers.template_epilogue()}