Skip to content

Commit e9eb9d6

Browse files
devyanic11, g-husam, and leahlijuan
authored
feat(adapter/nemo): add fully parallel save/load wrapper option (#49)
Fixes #48 Add optional `use_fully_parallel_wrapper` flag to the NeMo wrapper utility. When enabled, save/load strategies are wrapped with `FullyParallelSaveStrategyWrapper` and `FullyParallelLoadStrategyWrapper`. Default behavior remains unchanged. - [x] Tests pass --------- Co-authored-by: g-husam <husameldawi@google.com> Co-authored-by: leahlijuan <leahlijuan@google.com>
1 parent 5d0e076 commit e9eb9d6

File tree

2 files changed

+145
-0
lines changed

2 files changed

+145
-0
lines changed

src/ml_flashpoint/adapter/nemo/wrapper_util.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@
1919

2020
import torch
2121
import torch.distributed as dist
22+
from megatron.core.dist_checkpointing.strategies.fully_parallel import (
23+
FullyParallelLoadStrategyWrapper,
24+
FullyParallelSaveStrategyWrapper,
25+
)
2226
from nemo import lightning as nl
2327
from nemo.lightning.io.pl import MegatronCheckpointIO
2428
from nemo.lightning.pytorch import strategies as nl_strategies
@@ -53,6 +57,7 @@ def wrap_trainer_and_auto_resume_with_mlflashpoint(
5357
initial_write_buffer_size_bytes: Optional[int] = DEFAULT_INITIAL_BUFFER_SIZE_BYTES,
5458
use_optimized_save: bool = True,
5559
use_cached_ckpt_structure: bool = False,
60+
use_fully_parallel_wrapper: bool = False,
5661
) -> MLFlashpointAutoResume:
5762
"""Wraps the trainer and creates an MLFlashpointAutoResume instance wrapping `default_auto_resume`.
5863
@@ -72,6 +77,10 @@ def wrap_trainer_and_auto_resume_with_mlflashpoint(
7277
in bytes. Defaults to `DEFAULT_INITIAL_BUFFER_SIZE_BYTES`, even if set to None explicitly.
7378
use_cached_ckpt_structure: Whether to reuse the checkpoint structure (plan) from the previous save.
7479
Defaults to False.
80+
use_fully_parallel_wrapper: Whether to use the fully parallel wrapper for save and load.
81+
This will evenly distribute checkpoint data across all ranks.
82+
Defaults to False.
83+
7584
Returns:
7685
An MLFlashpointAutoResume instance configured for ML Flashpoint, wrapping `default_auto_resume`.
7786
"""
@@ -114,6 +123,7 @@ def wrap_trainer_and_auto_resume_with_mlflashpoint(
114123
initial_write_buffer_size_bytes=initial_write_buffer_size_bytes,
115124
use_optimized_save=use_optimized_save,
116125
use_cached_ckpt_structure=use_cached_ckpt_structure,
126+
use_fully_parallel_wrapper=use_fully_parallel_wrapper,
117127
)
118128

119129
default_auto_resume_args = vars(default_auto_resume) if default_auto_resume else {}
@@ -136,6 +146,7 @@ def wrap_trainer_checkpoint_io_with_mlflashpoint(
136146
initial_write_buffer_size_bytes: Optional[int] = DEFAULT_INITIAL_BUFFER_SIZE_BYTES,
137147
use_optimized_save: bool = True,
138148
use_cached_ckpt_structure: bool = False,
149+
use_fully_parallel_wrapper: bool = False,
139150
):
140151
"""Wraps the trainer's checkpoint I/O with ML Flashpoint capabilities.
141152
@@ -165,6 +176,9 @@ def wrap_trainer_checkpoint_io_with_mlflashpoint(
165176
in bytes. Defaults to `DEFAULT_INITIAL_BUFFER_SIZE_BYTES`, even if set to None explicitly.
166177
use_cached_ckpt_structure: Whether to reuse the checkpoint structure (plan) from the previous save.
167178
Defaults to False.
179+
use_fully_parallel_wrapper: Whether to use the fully parallel wrapper for save and load.
180+
This will evenly distribute checkpoint data across all ranks.
181+
Defaults to False.
168182
169183
Returns:
170184
None. The trainer's checkpoint_io is modified in-place.
@@ -263,6 +277,10 @@ def start_manager():
263277
checkpoint_loader=checkpoint_loader,
264278
)
265279

280+
if use_fully_parallel_wrapper:
281+
save_strategy = FullyParallelSaveStrategyWrapper(save_strategy)
282+
load_strategy = FullyParallelLoadStrategyWrapper(load_strategy)
283+
266284
ml_flashpoint_checkpoint_io = MLFlashpointCheckpointIO(
267285
flashpoint_base_path=flashpoint_base_container,
268286
alt_checkpoint_io=checkpoint_io,

tests/adapter/nemo/test_wrapper_util.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717
import dataclasses
1818

1919
import pytest
20+
from megatron.core.dist_checkpointing.strategies.fully_parallel import (
21+
FullyParallelLoadStrategyWrapper,
22+
FullyParallelSaveStrategyWrapper,
23+
)
2024
from nemo import lightning as nl
2125
from nemo.lightning.io.pl import MegatronCheckpointIO
2226
from nemo.lightning.pytorch import strategies as nl_strategies
@@ -128,6 +132,7 @@ def test_successful_wrap_and_resume_creation(self, mocker, mock_ckpt_obj_manager
128132
initial_write_buffer_size_bytes=DEFAULT_INITIAL_BUFFER_SIZE_BYTES,
129133
use_optimized_save=True,
130134
use_cached_ckpt_structure=False,
135+
use_fully_parallel_wrapper=False,
131136
)
132137

133138
# 3. Result is correct type and has correct attributes
@@ -343,6 +348,58 @@ def test_use_cached_ckpt_structure_default_value(self, mocker, mock_ckpt_obj_man
343348
_, kwargs = mock_wrap_trainer.call_args
344349
assert kwargs["use_cached_ckpt_structure"] is False
345350

351+
@pytest.mark.parametrize("use_fully_parallel_wrapper", [True, False])
352+
def test_use_fully_parallel_wrapper_forwarding(self, mocker, use_fully_parallel_wrapper):
353+
"""Tests that use_fully_parallel_wrapper is forwarded correctly."""
354+
# Given
355+
mocker.patch("ml_flashpoint.adapter.nemo.wrapper_util.ReplicationManager")
356+
mock_wrap_trainer = mocker.patch(
357+
"ml_flashpoint.adapter.nemo.wrapper_util.wrap_trainer_checkpoint_io_with_mlflashpoint"
358+
)
359+
trainer = mocker.MagicMock(spec=nl_trainer.Trainer)
360+
trainer.global_rank = 0
361+
flashpoint_base_container = "/tmp/test_container"
362+
default_auto_resume = nl.AutoResume()
363+
364+
# When
365+
wrap_trainer_and_auto_resume_with_mlflashpoint(
366+
trainer,
367+
flashpoint_base_container,
368+
async_save=True,
369+
default_auto_resume=default_auto_resume,
370+
use_fully_parallel_wrapper=use_fully_parallel_wrapper,
371+
)
372+
373+
# Then
374+
mock_wrap_trainer.assert_called_once()
375+
_, kwargs = mock_wrap_trainer.call_args
376+
assert kwargs["use_fully_parallel_wrapper"] is use_fully_parallel_wrapper
377+
378+
def test_use_fully_parallel_wrapper_default_value(self, mocker):
379+
"""Tests that use_fully_parallel_wrapper defaults to False."""
380+
# Given
381+
mocker.patch("ml_flashpoint.adapter.nemo.wrapper_util.ReplicationManager")
382+
mock_wrap_trainer = mocker.patch(
383+
"ml_flashpoint.adapter.nemo.wrapper_util.wrap_trainer_checkpoint_io_with_mlflashpoint"
384+
)
385+
trainer = mocker.MagicMock(spec=nl_trainer.Trainer)
386+
trainer.global_rank = 0
387+
flashpoint_base_container = "/tmp/test_container"
388+
default_auto_resume = nl.AutoResume()
389+
390+
# When
391+
wrap_trainer_and_auto_resume_with_mlflashpoint(
392+
trainer,
393+
flashpoint_base_container,
394+
async_save=True,
395+
default_auto_resume=default_auto_resume,
396+
)
397+
398+
# Then
399+
mock_wrap_trainer.assert_called_once()
400+
_, kwargs = mock_wrap_trainer.call_args
401+
assert kwargs["use_fully_parallel_wrapper"] is False
402+
346403

347404
class TestWrapTrainerCheckpointIOWithMLFlashpoint:
348405
"""Tests for the wrap_trainer_checkpoint_io_with_mlflashpoint function."""
@@ -547,6 +604,76 @@ def test_successful_wrapping_no_async_wrapper(self, mocker, mock_ckpt_obj_manage
547604
assert trainer.strategy.checkpoint_io.fallback_checkpoint_io is original_checkpoint_io
548605
assert trainer.strategy.checkpoint_io.async_save is True
549606

607+
def test_fully_parallel_wrapper_enabled(self, mocker, mock_ckpt_obj_manager, mock_replication_manager):
608+
"""Tests that FullyParallel wrappers are applied when flag=True."""
609+
610+
# Given
611+
trainer = mocker.MagicMock(spec=nl_trainer.Trainer)
612+
trainer.callbacks = [mocker.MagicMock(spec=MLFlashpointCheckpointCallback)]
613+
trainer.strategy = mocker.MagicMock(spec=nl_strategies.MegatronStrategy)
614+
original_checkpoint_io = mocker.MagicMock(spec=MegatronCheckpointIO)
615+
trainer.strategy.checkpoint_io = original_checkpoint_io
616+
base_container = "/test_base_container"
617+
618+
# When
619+
wrap_trainer_checkpoint_io_with_mlflashpoint(
620+
trainer,
621+
base_container,
622+
mock_ckpt_obj_manager,
623+
mock_replication_manager,
624+
async_save=True,
625+
checkpoint_loader=mocker.MagicMock(spec=DefaultMLFlashpointCheckpointLoader),
626+
use_fully_parallel_wrapper=True, # 🔥 enable it
627+
)
628+
629+
# Then
630+
wrapped_io = trainer.strategy.checkpoint_io
631+
assert isinstance(wrapped_io, MLFlashpointCheckpointIO)
632+
633+
assert isinstance(
634+
wrapped_io.save_strategy,
635+
FullyParallelSaveStrategyWrapper,
636+
)
637+
assert isinstance(
638+
wrapped_io.load_strategy,
639+
FullyParallelLoadStrategyWrapper,
640+
)
641+
642+
def test_fully_parallel_wrapper_disabled_by_default(self, mocker, mock_ckpt_obj_manager, mock_replication_manager):
643+
"""Tests that FullyParallel wrappers are NOT applied when flag=False."""
644+
645+
# Given
646+
trainer = mocker.MagicMock(spec=nl_trainer.Trainer)
647+
trainer.callbacks = [mocker.MagicMock(spec=MLFlashpointCheckpointCallback)]
648+
trainer.strategy = mocker.MagicMock(spec=nl_strategies.MegatronStrategy)
649+
original_checkpoint_io = mocker.MagicMock(spec=MegatronCheckpointIO)
650+
trainer.strategy.checkpoint_io = original_checkpoint_io
651+
base_container = "/test_base_container"
652+
653+
# When
654+
wrap_trainer_checkpoint_io_with_mlflashpoint(
655+
trainer,
656+
base_container,
657+
mock_ckpt_obj_manager,
658+
mock_replication_manager,
659+
async_save=True,
660+
checkpoint_loader=mocker.MagicMock(spec=DefaultMLFlashpointCheckpointLoader),
661+
use_fully_parallel_wrapper=False, # default behavior
662+
)
663+
664+
# Then
665+
wrapped_io = trainer.strategy.checkpoint_io
666+
assert isinstance(wrapped_io, MLFlashpointCheckpointIO)
667+
668+
assert not isinstance(
669+
wrapped_io.save_strategy,
670+
FullyParallelSaveStrategyWrapper,
671+
)
672+
assert not isinstance(
673+
wrapped_io.load_strategy,
674+
FullyParallelLoadStrategyWrapper,
675+
)
676+
550677
def test_successful_wrapping_with_async_wrapper(self, mocker, mock_ckpt_obj_manager, mock_replication_manager):
551678
"""Tests successful wrapping when an async wrapper is present."""
552679
# Given

0 commit comments

Comments (0)