pinton-lab · MasashiSode · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 30, 2026
diff --git a/.bumpversion.toml b/.bumpversion.toml
@@ -1,5 +1,5 @@
 [tool.bumpversion]
-current_version = "1.2.6-dev8"
+current_version = "1.2.6-dev9"
 parse = """(?x)
     (?P<major>0|[1-9]\\d*)\\.
     (?P<minor>0|[1-9]\\d*)\\.

diff --git a/fullwave/__init__.py b/fullwave/__init__.py
@@ -60,7 +60,7 @@
     __version__ = version("fullwave")
 except PackageNotFoundError:
     # Update via bump-my-version, not manually
-    __version__ = "1.2.6-dev8"
+    __version__ = "1.2.6-dev9"
 
 VERSION = __version__  # for convenience
 logger.info("Fullwave version: %s", __version__)
diff --git a/fullwave/solver/pml_builder.py b/fullwave/solver/pml_builder.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import concurrent.futures
+import gc
 import logging
 from collections import OrderedDict
 from dataclasses import dataclass, field
@@ -563,6 +564,33 @@ def _extend_map_for_pml(
 
         return output
 
+    def _move_named_arrays_to_cpu(
+        self,
+        named_arrays: list[tuple[str, NDArray]],
+        attr_names: list[str] | None = None,
+    ) -> list[tuple[str, NDArray]]:
+        """Convert CuPy arrays to numpy and free GPU memory pools.
+
+        Also replaces the corresponding ``medium_org`` attributes (given by
+        *attr_names*) so the original GPU arrays can be garbage-collected.
+        """
+        import cupy as cp  # noqa: PLC0415
+
+        numpy_arrays = []
+        for name, arr in named_arrays:
+            if hasattr(arr, "get"):
+                arr_np = arr.get()
+                numpy_arrays.append((name, arr_np))
+                # Update medium_org so the CuPy array can be freed
+                if attr_names and name in attr_names:
+                    setattr(self.medium_org, name, arr_np)
+            else:
+                numpy_arrays.append((name, arr))
+        gc.collect()
+        cp.get_default_memory_pool().free_all_blocks()
+        cp.get_default_pinned_memory_pool().free_all_blocks()
+        return numpy_arrays
+
     def _extend_arrays_gpu(
         self,
         named_arrays: list[tuple[str, NDArray]],
@@ -571,8 +599,8 @@ def _extend_arrays_gpu(
 
         Parameters
         ----------
-        named_arrays : list of (name, numpy_array) pairs
-            Arrays to extend. Must be numpy arrays (CPU).
+        named_arrays : list of (name, numpy_array) or (name, cupy_array) pairs
+            Arrays to extend.
 
         Returns
         -------
@@ -585,6 +613,8 @@ def _extend_arrays_gpu(
         n_gpus = cp.cuda.runtime.getDeviceCount()
         logger.info("CUDA devices available: %d", n_gpus)
 
+        attr_names = [name for name, _ in named_arrays]
+
         # Strategy 1: Multi-GPU parallel
         if n_gpus > 1:
             n_workers = min(n_gpus, len(named_arrays))
@@ -597,16 +627,15 @@ def _extend_arrays_gpu(
                 )
                 return self._extend_arrays_multi_gpu(named_arrays, n_gpus)
             except Exception:
-                logger.warning("Multi-GPU extension failed. Falling back to sequential single-GPU.")
+                logger.warning(
+                    "Multi-GPU extension failed. Falling back to CPU.",
+                    exc_info=True,
+                )
 
-        # Strategy 2: Sequential single-GPU (extend one, free, repeat)
-        try:
-            logger.info("Extending %d medium arrays sequentially on GPU 0.", len(named_arrays))
-            return self._extend_arrays_sequential_gpu(named_arrays)
-        except Exception:
-            logger.warning("Single-GPU extension failed. Falling back to CPU.")
+        # Move arrays to CPU and free GPU memory before CPU fallback.
+        # The original medium arrays may still occupy GPU memory.
+        named_arrays = self._move_named_arrays_to_cpu(named_arrays, attr_names)
 
-        # Strategy 3: CPU
         logger.info("Extending %d medium arrays on CPU.", len(named_arrays))
         return self._extend_arrays_cpu(named_arrays)
 
@@ -669,13 +698,21 @@ def _extend_arrays_cpu(
         named_arrays: list[tuple[str, NDArray]],
     ) -> dict[str, NDArray]:
         """Extend arrays on CPU using ThreadPoolExecutor."""
+        # Convert any CuPy arrays to numpy before CPU fallback
+        numpy_arrays = []
+        for name, arr in named_arrays:
+            if hasattr(arr, "get"):
+                numpy_arrays.append((name, arr.get()))
+            else:
+                numpy_arrays.append((name, arr))
+
         orig_xp = self.xp
         self.xp = np
         try:
             with concurrent.futures.ThreadPoolExecutor() as executor:
                 futures = {
                     name: executor.submit(self._extend_map_for_pml, arr)
-                    for name, arr in named_arrays
+                    for name, arr in numpy_arrays
                 }
                 return {name: future.result() for name, future in futures.items()}
         finally:

diff --git a/fullwave/utils/pulse.py b/fullwave/utils/pulse.py
@@ -61,14 +61,25 @@ def gaussian_modulated_sinusoidal_signal(
     dt = duration / nt
     t_offset = ncycles / f0 + delay_sec
 
+    phase_correction = 0.0
     if i_layer is not None:
         if dt_for_layer_delay is None:
             error_msg = "dt_for_layer_delay must be provided if i_layer is provided"
             raise ValueError(error_msg)
         if cfl_for_layer_delay is None:
             error_msg = "cfl_for_layer_delay must be provided if i_layer is provided"
             raise ValueError(error_msg)
-        t_offset += (dt_for_layer_delay / cfl_for_layer_delay) * i_layer
+        layer_delay_sec = (dt_for_layer_delay / cfl_for_layer_delay) * i_layer
+        t_offset += layer_delay_sec
+        # Correct carrier phase for fractional-sample delays.
+        # A continuous time shift of a fractional number of samples changes
+        # the carrier phase at each discrete sample point, causing sign
+        # inversion and wrong peak positions. Remove the fractional phase
+        # offset so the carrier stays on the same discrete phase grid as
+        # the base (i_layer=0) signal.
+        delay_samples = layer_delay_sec / dt
+        fractional_samples = delay_samples - round(delay_samples)
+        phase_correction = 2.0 * np.pi * f0 * fractional_samples * dt
 
     t = np.arange(nt, dtype=dtype) * dt - t_offset
 
@@ -87,5 +98,5 @@ def gaussian_modulated_sinusoidal_signal(
     else:
         env = np.exp(-(a_sq**drop_off))
 
-    # Compute final signal
-    return env * np.sin(w_t) * p0
+    # Compute final signal with phase-corrected carrier
+    return env * np.sin(w_t + phase_correction) * p0
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "fullwave25"
-version = "1.2.6-dev8" # Update via bump-my-version, not manually
+version = "1.2.6-dev9" # Update via bump-my-version, not manually
 description = "Fullwave 2.5: Ultrasound wave propagation simulation with heterogeneous power law attenuation modelling capabilities"
 readme = "README.md"
 requires-python = ">=3.10"

diff --git a/tests/test_pulse.py b/tests/test_pulse.py
@@ -76,6 +76,68 @@ def test_float32_dtype(base_params):
     assert y.dtype == np.float32
 
 
+def test_fractional_layer_delay_preserves_phase(base_params):
+    """Fractional-sample layer delays must match a manual integer shift.
+
+    Previously, fractional delays caused carrier phase errors that inverted
+    the peak sign for odd layers (Finding 19 in cross-physics experiment).
+    The fix corrects carrier phase so that the layered signal matches an
+    integer-sample-shifted version of the base (high NCC, same peak sign).
+    """
+    # cfl=0.3 => dt_layer/cfl = 3.33e-8 s per i_layer
+    # dt_sim = 2.5e-9 s => 13.33 sim samples per i_layer (fractional)
+    dt_layer_frac = 1e-8
+    cfl_frac = 0.3
+    dt_sim = base_params["duration"] / base_params["nt"]
+    delay_per_layer_sec = dt_layer_frac / cfl_frac
+
+    y_base = gaussian_modulated_sinusoidal_signal(**base_params)
+
+    for i in range(1, 6):
+        y_layer = gaussian_modulated_sinusoidal_signal(
+            **base_params,
+            i_layer=i,
+            dt_for_layer_delay=dt_layer_frac,
+            cfl_for_layer_delay=cfl_frac,
+        )
+        # Manual integer shift of base signal
+        shift = round(delay_per_layer_sec * i / dt_sim)
+        y_manual = np.zeros_like(y_base)
+        y_manual[shift:] = y_base[: len(y_base) - shift]
+
+        # NCC must be high (signals should be nearly identical)
+        ncc = np.dot(y_layer, y_manual) / (np.linalg.norm(y_layer) * np.linalg.norm(y_manual))
+        assert ncc > 0.999, f"i_layer={i}: NCC={ncc:.6f}, expected > 0.999"
+
+
+def test_integer_layer_delay_unchanged(base_params):
+    """Integer-sample layer delays should produce NCC ~ 1.0 vs manual shift."""
+    dt_layer = 1e-8
+    cfl_layer = 0.4
+    # delay per i_layer = 2.5e-8 s = 10 sim samples (integer)
+    dt_sim = base_params["duration"] / base_params["nt"]
+    delay_per_layer_samples = (dt_layer / cfl_layer) / dt_sim
+    assert delay_per_layer_samples == 10.0  # confirm integer
+
+    y_base = gaussian_modulated_sinusoidal_signal(**base_params)
+
+    for i_layer in [1, 2, 4]:
+        y_layer = gaussian_modulated_sinusoidal_signal(
+            **base_params,
+            i_layer=i_layer,
+            dt_for_layer_delay=dt_layer,
+            cfl_for_layer_delay=cfl_layer,
+        )
+        # Manual integer shift
+        shift = int(round(delay_per_layer_samples * i_layer))
+        y_manual = np.zeros_like(y_base)
+        y_manual[shift:] = y_base[: len(y_base) - shift]
+
+        # NCC should be very high
+        ncc = np.dot(y_layer, y_manual) / (np.linalg.norm(y_layer) * np.linalg.norm(y_manual))
+        assert ncc > 0.999, f"i_layer={i_layer}: NCC={ncc:.6f}, expected > 0.999"
+
+
 def test_performance(base_params):
     """Ensure the function completes in well under 1 second."""
     start = time.perf_counter()

diff --git a/uv.lock b/uv.lock