Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2358d29
Add scripts for santis/alps, example case, and captures for UVM comms…
ntselepidis Aug 1, 2025
37d393b
Add PREFER_GPU and rearrange update for out-of-core computation
ntselepidis Aug 1, 2025
693c7f4
Allow keeping q_cons_ts(2) on CPU using pinned allocations
ntselepidis Aug 1, 2025
7054b7b
Modify PREFER_GPU macro
ntselepidis Aug 1, 2025
ee1277d
Allow control in placement of IGR temps
ntselepidis Aug 1, 2025
4065c02
Do some clean up
ntselepidis Aug 2, 2025
cfb792c
ENV Vars to case file options and code structure changes
Aug 3, 2025
cacc6b0
Fix some comments
ntselepidis Aug 3, 2025
884a4d9
Merge remote-tracking branch 'upstream/master' into nvidia
wilfonba Aug 3, 2025
b3fdbff
test merge and add nv_uvm_out_of_core back
wilfonba Aug 3, 2025
51d7e90
Fix some allocs and deallocs in timesteppers
ntselepidis Aug 5, 2025
c553b78
Fix nv_uvm_out_of_core inconsistency and add to case file
ntselepidis Aug 5, 2025
f3b3851
Fix bug in 2nd order TVD RK introduced by merge
ntselepidis Aug 5, 2025
71b5976
Fix some comments
ntselepidis Aug 5, 2025
a4d6b38
Add note on binding script requirement for PREFER_GPU macro
ntselepidis Aug 6, 2025
acb2405
Flip nv_uvm_pref_gpu default to false
ntselepidis Aug 7, 2025
8fef22d
Be explicit with unified memory compilation to stay robust in changes…
ntselepidis Aug 7, 2025
5e369c3
Add some changes to future proof the unified memory build
ntselepidis Aug 11, 2025
52c5608
Comment out calls to cudaGetErrorString
ntselepidis Aug 11, 2025
4ec8617
prepare for merge
wilfonba Aug 11, 2025
bd0adee
Merge remote-tracking branch 'upstream/master' into nvidia
wilfonba Aug 11, 2025
37b1768
update capture
wilfonba Aug 11, 2025
e02e9f6
add fastmath flag and bug fix
wilfonba Aug 11, 2025
a6ff639
Fix typo in CMakeLists
ntselepidis Aug 12, 2025
457ae60
Replace host_pool with host in m_igr
ntselepidis Aug 12, 2025
a6116f2
Set cpus-per-task to 72 and update binding script
ntselepidis Aug 12, 2025
fb50e90
Add some more updates to the helper scripts
ntselepidis Aug 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -486,17 +486,17 @@ function(MFC_SETUP_TARGET)
endforeach()

target_compile_options(${a_target}
PRIVATE -gpu=keep,ptxinfo,lineinfo
PRIVATE -gpu=keep,ptxinfo,lineinfo,fastmath
)

# GH-200 Unified Memory Support
if (MFC_Unified)
target_compile_options(${ARGS_TARGET}
PRIVATE -gpu=unified
PRIVATE -gpu=mem:unified -cuda
)
# "This option must appear in both the compile and link lines" -- NVHPC Docs
target_link_options(${ARGS_TARGET}
PRIVATE -gpu=unified
PRIVATE -gpu=mem:unified -cuda
)
endif()

Expand Down
101 changes: 101 additions & 0 deletions examples/3D_IGR_TaylorGreenVortex_nvidia/case.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python3
"""Generate the MFC case dictionary for a 3D IGR Taylor-Green vortex
(NVIDIA/UVM example) and print it as JSON on stdout."""
import math
import json

# Grid resolution: MFC's m/n/p are zero-based cell counts (m+1 cells in x, ...).
N = 799
Nx = N
Ny = 2 * (N + 1) - 1
Nz = 2 * (N + 1) - 1

Re = 1600  # Reynolds number of the vortex
L = 1  # characteristic length scale
P0 = 101325  # background pressure [Pa]
rho0 = 1  # background density
C0 = math.sqrt(1.4 * P0)  # speed of sound for gamma = 1.4, rho0 = 1
V0 = 0.1 * C0  # reference velocity (Mach 0.1)
mu = V0 * L / Re  # dynamic viscosity implied by Re

cfl = 0.5
dx = 2 * math.pi * L / (Ny + 1)  # uniform spacing along the finest direction

dt = cfl * dx / C0  # acoustic-CFL time step

tC = L / V0  # convective (eddy-turnover) time scale
tEnd = 20 * tC

Nt = int(tEnd / dt)
# Short-run override kept for this example; delete the next line to run
# the full 20 convective times computed above.
Nt = 10


# Configuring case dictionary (module-level name so it can be inspected).
case = {
    "rdma_mpi": "T",
    # Logistics
    "run_time_info": "F",
    # Computational Domain Parameters
    "x_domain%beg": -math.pi * L,
    "x_domain%end": math.pi * L,
    "y_domain%beg": -math.pi * L,
    "y_domain%end": math.pi * L,
    "z_domain%beg": -math.pi * L,
    "z_domain%end": math.pi * L,
    "m": Nx,
    "n": Ny,
    "p": Nz,
    "cyl_coord": "F",
    "dt": dt,
    "t_step_start": 0,
    "t_step_stop": Nt,
    # Never 0: int(Nt / 100) truncates to 0 for short runs (e.g. the
    # Nt = 10 override above), and a zero save interval is invalid.
    "t_step_save": max(int(Nt / 100), 1),
    # Simulation Algorithm Parameters
    "num_patches": 1,
    "model_eqns": 2,
    "num_fluids": 1,
    "time_stepper": 3,
    "bc_x%beg": -1,
    "bc_x%end": -1,
    "bc_y%beg": -1,
    "bc_y%end": -1,
    "bc_z%beg": -1,
    "bc_z%end": -1,
    "igr": "T",
    "igr_order": 5,
    "igr_iter_solver": 1,
    "num_igr_iters": 3,
    "num_igr_warm_start_iters": 3,
    "alf_factor": 10,
    "viscous": "T",
    # Formatted Database Files Structure Parameters
    "format": 1,
    "precision": 2,
    "prim_vars_wrt": "T",
    "omega_wrt(1)": "T",
    "omega_wrt(2)": "T",
    "omega_wrt(3)": "T",
    "qm_wrt": "T",
    "fd_order": 4,
    "parallel_io": "T",
    # Patch 1: Background (AIR - 2)
    "patch_icpp(1)%geometry": 9,
    "patch_icpp(1)%x_centroid": 0,
    "patch_icpp(1)%y_centroid": 0,
    "patch_icpp(1)%z_centroid": 0,
    "patch_icpp(1)%length_x": 2 * math.pi * L,
    "patch_icpp(1)%length_y": 2 * math.pi * L,
    "patch_icpp(1)%length_z": 2 * math.pi * L,
    "patch_icpp(1)%vel(1)": 0.0,
    "patch_icpp(1)%vel(2)": 0.0,
    "patch_icpp(1)%vel(3)": 0,
    "patch_icpp(1)%pres": 0.0,
    # Hardcoded initial condition id — presumably the Taylor-Green
    # velocity/pressure field; confirm against the patch module.
    "patch_icpp(1)%hcid": 380,
    "patch_icpp(1)%alpha_rho(1)": 1,
    "patch_icpp(1)%alpha(1)": 1,
    # Fluids Physical Parameters
    "fluid_pp(1)%gamma": 1.0e00 / (1.4 - 1),
    "fluid_pp(1)%pi_inf": 0,
    "fluid_pp(1)%Re(1)": 1 / mu,
}

print(json.dumps(case))
24 changes: 24 additions & 0 deletions misc/nvidia_uvm/bind.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

# -------------------------------- #
# Binding for a single Santis node #
# -------------------------------- #
#
# MPI launch wrapper: pins each local rank to its own GPU, NIC, and CPU
# core, then runs the wrapped command ("$@") under numactl.
# NOTE(review): assumes exactly 4 ranks per node and 72 cores per socket
# (physcores below); a local rank > 3 would index past the array — confirm
# against the job script's --ntasks-per-node / --cpus-per-task settings.

# Local rank (OpenMPI variable first, Slurm fallback)
export local_rank="${OMPI_COMM_WORLD_LOCAL_RANK:-$SLURM_LOCALID}"

# Bind to GPU: each rank sees only the device matching its local rank
export CUDA_VISIBLE_DEVICES="$local_rank"

# Binding to NIC: USER policy with an explicit index mapping so rank i
# uses NIC i (Cray MPICH OFI; mapping format is "nic:rank[,rank...]")
export MPICH_OFI_NIC_POLICY=USER
export MPICH_OFI_NIC_MAPPING="0:0; 1:1; 2:2; 3:3"

# Bind to cores ( first core per socket )
physcores=(0 72 144 216)

#echo hostname: $(hostname), rank: $local_rank, cores: ${physcores[$local_rank]}, GPU: $CUDA_VISIBLE_DEVICES, NIC mapping: $MPICH_OFI_NIC_POLICY

# -l allocates memory on the local NUMA node; --physcpubind pins the
# process to its socket's first core.
#set -x
numactl -l --all --physcpubind=${physcores[$local_rank]} "$@"
#set +x
24 changes: 24 additions & 0 deletions misc/nvidia_uvm/nsys.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash

# Launch wrapper: run the wrapped command ("$@") under Nsight Systems on
# global rank 0 when profiling is requested; every other rank — and all
# ranks when NSYS is 0 — exec the command directly.
#
# Environment:
#   NSYS      - enable profiling when non-zero (default: 0)
#   NSYS_FILE - report output path (default: report.qdrep)

#set -x
set -euo pipefail

rank="${OMPI_COMM_WORLD_RANK:-$SLURM_PROCID}"

# Assign defaults only when the variables are entirely unset; the '='
# expansion (unlike ':=') leaves empty-but-set values untouched.
: "${NSYS_FILE=report.qdrep}"
: "${NSYS=0}"

# Fast path: no profiling for this rank.
if [[ "$NSYS" -eq 0 || "$rank" -ne 0 ]]; then
    exec "$@"
fi

exec nsys profile \
    --cpuctxsw=none -b none -s none \
    --event-sample=system-wide \
    --cpu-socket-events=61,71,265,273 \
    --cpu-socket-metrics=103,104 \
    --event-sampling-interval=10 \
    --trace=nvtx,openacc \
    --force-overwrite=true \
    -e NSYS_MPI_STORE_TEAMS_PER_RANK=1 \
    -o "$NSYS_FILE" "$@"
47 changes: 47 additions & 0 deletions src/common/include/macros.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,53 @@
#endif
#:enddef

#:def PREFER_GPU(*args)
    ! Fypp macro: hint the CUDA unified-memory driver that each listed array
    ! should reside on the GPU. For every argument it (1) sets the preferred
    ! location to device 0, (2) marks the pages as accessed-by the CPU so
    ! host touches need not migrate them back, and (3) prefetches the pages
    ! to the GPU so the memory is physically populated there.
    ! Active only in simulation builds compiled for unified memory
    ! (__NVCOMPILER_GPU_UNIFIED_MEM — presumably defined by nvfortran under
    ! -gpu=mem:unified; confirm against the NVHPC docs). Otherwise the macro
    ! expands to nothing.
    ! Runtime opt-in: set NVIDIA_MANUAL_GPU_HINTS=1; any other value,
    ! including unset or "0", leaves the hints disabled.
#ifdef MFC_SIMULATION
#ifdef __NVCOMPILER_GPU_UNIFIED_MEM
    block
        ! Rename cudafor's sum/maxval/minval so they do not shadow the
        ! Fortran intrinsics in the enclosing scope.
        use cudafor, gpu_sum => sum, gpu_maxval => maxval, gpu_minval => minval
        integer :: istat                           ! CUDA API return code
        integer :: prefer_gpu_mode                 ! 0 = hints off, 1 = hints on
        character(len=10) :: prefer_gpu_mode_str   ! raw environment value

        ! environment variable
        call get_environment_variable("NVIDIA_MANUAL_GPU_HINTS", prefer_gpu_mode_str)
        if (trim(prefer_gpu_mode_str) == "0") then ! OFF
            prefer_gpu_mode = 0
        elseif (trim(prefer_gpu_mode_str) == "1") then ! ON
            prefer_gpu_mode = 1
        else ! default
            prefer_gpu_mode = 0
        endif

        if (prefer_gpu_mode .eq. 1) then
            #:for arg in args
                !print*, "Moving ${arg}$ to GPU => ", SHAPE(${arg}$)
                ! set preferred location GPU (device index 0 — assumes each
                ! rank sees a single GPU, e.g. via a binding script setting
                ! CUDA_VISIBLE_DEVICES; TODO confirm)
                istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetPreferredLocation, 0 )
                if (istat /= cudaSuccess) then
                    write(*,"('Error code: ',I0, ': ')") istat
                    write(*,*) cudaGetErrorString(istat)
                endif
                ! set accessed by CPU
                istat = cudaMemAdvise( c_devloc(${arg}$), SIZEOF(${arg}$), cudaMemAdviseSetAccessedBy, cudaCpuDeviceId )
                if (istat /= cudaSuccess) then
                    write(*,"('Error code: ',I0, ': ')") istat
                    write(*,*) cudaGetErrorString(istat)
                endif
                ! prefetch to GPU - physically populate memory pages
                ! (device 0, default stream 0; errors are reported, not fatal)
                istat = cudaMemPrefetchAsync( c_devloc(${arg}$), SIZEOF(${arg}$), 0, 0 )
                if (istat /= cudaSuccess) then
                    write(*,"('Error code: ',I0, ': ')") istat
                    write(*,*) cudaGetErrorString(istat)
                endif
            #:endfor
        end if
    end block
#endif
#endif
#:enddef

#:def ALLOCATE(*args)
@:LOG({'@:ALLOCATE(${re.sub(' +', ' ', ', '.join(args))}$)'})
#:set allocated_variables = ', '.join(args)
Expand Down
8 changes: 8 additions & 0 deletions src/common/m_mpi_common.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ module m_mpi_common
!! average primitive variables, for a single computational domain boundary
!! at the time, from the relevant neighboring processor.

#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
$:GPU_DECLARE(create='[buff_send, buff_recv]')
#endif

integer :: halo_size
$:GPU_DECLARE(create='[halo_size]')
Expand Down Expand Up @@ -78,7 +80,13 @@ contains

$:GPU_UPDATE(device='[halo_size, v_size]')

#ifndef __NVCOMPILER_GPU_UNIFIED_MEM
@:ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
#else
ALLOCATE(buff_send(0:halo_size), buff_recv(0:halo_size))
!$acc enter data create(capture:buff_send)
!$acc enter data create(capture:buff_recv)
#endif
#endif

end subroutine s_initialize_mpi_common_module
Expand Down
9 changes: 9 additions & 0 deletions src/simulation/m_global_parameters.fpp
Original file line number Diff line number Diff line change
Expand Up @@ -1308,16 +1308,25 @@ contains
@:ALLOCATE(x_cb(-1 - buff_size:m + buff_size))
@:ALLOCATE(x_cc(-buff_size:m + buff_size))
@:ALLOCATE(dx(-buff_size:m + buff_size))
@:PREFER_GPU(x_cb)
@:PREFER_GPU(x_cc)
@:PREFER_GPU(dx)

if (n == 0) return;
@:ALLOCATE(y_cb(-1 - buff_size:n + buff_size))
@:ALLOCATE(y_cc(-buff_size:n + buff_size))
@:ALLOCATE(dy(-buff_size:n + buff_size))
@:PREFER_GPU(y_cb)
@:PREFER_GPU(y_cc)
@:PREFER_GPU(dy)

if (p == 0) return;
@:ALLOCATE(z_cb(-1 - buff_size:p + buff_size))
@:ALLOCATE(z_cc(-buff_size:p + buff_size))
@:ALLOCATE(dz(-buff_size:p + buff_size))
@:PREFER_GPU(z_cb)
@:PREFER_GPU(z_cc)
@:PREFER_GPU(dz)

end subroutine s_initialize_global_parameters_module

Expand Down
Loading
Loading