Skip to content

Commit bce6888

Browse files
committed
Merge branch 'main' of https://github.com/google/ml-flashpoint into rename-thread-count-to-files-per-rank
2 parents 353a891 + ae95fb6 commit bce6888

14 files changed

+590
-305
lines changed

docs/user-guide.md

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ auto_resume = wrap_trainer_and_auto_resume_with_mlflashpoint(
9292
# always_save_context=False, # Optional, defaults to False
9393
# write_files_per_rank=1, # Optional, defaults to 1
9494
# initial_write_buffer_size_bytes=DESIRED_NUM_BYTES, # Optional, defaults to 16 GB
95+
# use_optimized_save=True, # Optional, defaults to True. Uses the optimized save method to reduce write time.
96+
# use_cached_ckpt_structure=True, # Optional, defaults to False. Caches the checkpoint structure after identifying 2 consecutive save plan structures that are equal.
9597
)
9698
```
9799

@@ -126,6 +128,7 @@ from ml_flashpoint.adapter.megatron.save_strategies import (
126128
)
127129

128130
# Loading
131+
import torch.distributed as dist
129132
from ml_flashpoint.adapter.megatron.load_strategies import MLFlashpointMegatronLoadStrategy
130133
from ml_flashpoint.checkpoint_object_manager.checkpoint_object_manager import CheckpointObjectManager
131134
from ml_flashpoint.core.checkpoint_loader import DefaultMLFlashpointCheckpointLoader
@@ -148,6 +151,7 @@ memory_storage_writer = MemoryStorageWriter(...)
148151
# Use it to instantiate the Save Strategy
149152
megatron_save_strategy = MLFlashpointMegatronAsyncSaveStrategy(
150153
storage_writer=memory_storage_writer,
154+
# use_cached_ckpt_structure=True, # Optional, defaults to False. Caches the checkpoint structure after identifying 2 consecutive save plan structures that are equal.
151155
)
152156
```
153157

@@ -167,7 +171,7 @@ async_request = save_local_aware_megatron_checkpoint(
167171

168172
!!! note
169173

170-
Make sure to specify the checkpoint ID/path when saving based on the current step using:
174+
Make sure to specify the checkpoint ID/path when saving based on the current step using:
171175
`CheckpointContainerId.create_child(base_container, CheckpointContainerId.format_version_container(current_step))`
172176
where `base_container` is the base path CheckpointContainerId used for all checkpoints for the current job, e.g. `"/tmp/mlf-checkpoints/job123"`.
173177

@@ -188,6 +192,11 @@ replication_manager.initialize(checkpoint_object_manager)
188192
checkpoint_loader = DefaultMLFlashpointCheckpointLoader(
189193
checkpoint_object_manager=checkpoint_object_manager,
190194
replication_manager=replication_manager,
195+
global_rank_getter=dist.get_rank,
196+
local_rank_getter=dist.get_node_local_rank,
197+
broadcast_object_list_func=dist.broadcast_object_list,
198+
all_gather_object_func=dist.all_gather_object,
199+
world_size_getter=dist.get_world_size,
191200
)
192201

193202
# Instantiate the Load Strategy with the dependencies
@@ -229,11 +238,12 @@ Code: See the [`ml_flashpoint.adapter.pytorch`](https://github.com/google/ml-fla
229238
To use directly with PyTorch DCP, use the provided `StorageWriter` and `StorageReader` implementations.
230239
You can use whatever `Planner` implementations work for your use case, or resort to the defaults.
231240

232-
If your per-rank checkpoint data exceeds the default buffer size (16 GB as of this writing), you can increase it using the optional `initial_buffer_size_bytes` parameter.
241+
If your per-rank checkpoint data exceeds the default buffer size (16 GB as of this writing), you can increase it using the optional `initial_buffer_size_bytes` parameter.
233242

234243
#### Imports
235244
```python
236245
import torch
246+
import torch.distributed as dist
237247
from torch import multiprocessing as torch_mp
238248
import torch.distributed.checkpoint as dcp
239249

@@ -262,6 +272,7 @@ memory_storage_writer = MemoryStorageWriter(
262272
ckpt_obj_manager=checkpoint_object_manager,
263273
replication_manager=replication_manager,
264274
# initial_buffer_size_bytes=initial_write_buffer_size_bytes, # Optional - increase for larger checkpoint sizes per rank
275+
# use_optimized_save=True, # Optional, defaults to True. Uses the optimized save method to reduce write time.
265276
),
266277
mp_manager=torch_mp.Manager(),
267278
)
@@ -270,6 +281,11 @@ memory_storage_writer = MemoryStorageWriter(
270281
checkpoint_loader = DefaultMLFlashpointCheckpointLoader(
271282
checkpoint_object_manager=checkpoint_object_manager,
272283
replication_manager=replication_manager,
284+
global_rank_getter=dist.get_rank,
285+
local_rank_getter=dist.get_node_local_rank,
286+
broadcast_object_list_func=dist.broadcast_object_list,
287+
all_gather_object_func=dist.all_gather_object,
288+
world_size_getter=dist.get_world_size,
273289
)
274290
memory_storage_reader = MemoryStorageReader(
275291
path=checkpoint_dir,

src/ml_flashpoint/adapter/megatron/save_strategies.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ def async_save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Union
139139
# 1b. Re-initialize the StorageWriter to use a new instance per save to avoid hangs from shared state.
140140
self._storage_writer = MemoryStorageWriter(
141141
checkpoint_saver=self._checkpoint_saver,
142-
mp_manager=self._storage_writer._main_process_torchmp_manager,
142+
mp_manager_future=self._storage_writer._main_process_torchmp_manager_future,
143143
files_per_rank=self._storage_writer._files_per_rank,
144144
)
145145
# 1c. Reset the StorageWriter for this checkpoint version.

src/ml_flashpoint/adapter/nemo/nemo_checkpoint_loader.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
import os
1616
from pathlib import Path
17-
from typing import List, Set
17+
from typing import Callable, List, Set
1818

1919
from typing_extensions import override
2020

@@ -33,6 +33,12 @@ def __init__(
3333
self,
3434
checkpoint_object_manager: CheckpointObjectManager,
3535
replication_manager: ReplicationManager,
36+
*,
37+
global_rank_getter: Callable[[], int],
38+
local_rank_getter: Callable[[], int],
39+
broadcast_object_list_func: Callable[..., None],
40+
all_gather_object_func: Callable[..., None],
41+
world_size_getter: Callable[[], int],
3642
recover_context: bool = False,
3743
):
3844
"""Initializes the NeMoMLFlashpointCheckpointLoader.
@@ -42,9 +48,24 @@ def __init__(
4248
reading data.
4349
replication_manager: The replication manager to use for retrieving
4450
missing checkpoint objects from peer nodes.
51+
global_rank_getter: A callable that returns the global rank.
52+
local_rank_getter: A callable that returns the node-local rank.
53+
broadcast_object_list_func: A callable with the same signature as
54+
``torch.distributed.broadcast_object_list``.
55+
all_gather_object_func: A callable with the same signature as
56+
``torch.distributed.all_gather_object``.
57+
world_size_getter: A callable that returns the world size.
4558
recover_context: Whether to recover the context directory if missing.
4659
"""
47-
super().__init__(checkpoint_object_manager, replication_manager)
60+
super().__init__(
61+
checkpoint_object_manager,
62+
replication_manager,
63+
global_rank_getter=global_rank_getter,
64+
local_rank_getter=local_rank_getter,
65+
broadcast_object_list_func=broadcast_object_list_func,
66+
all_gather_object_func=all_gather_object_func,
67+
world_size_getter=world_size_getter,
68+
)
4869
self._recover_context = recover_context
4970

5071
@override

src/ml_flashpoint/adapter/nemo/wrapper_util.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,12 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import concurrent.futures
16+
import threading
1517
from typing import Union
1618

1719
import torch
20+
import torch.distributed as dist
1821
from nemo import lightning as nl
1922
from nemo.lightning.io.pl import MegatronCheckpointIO
2023
from nemo.lightning.pytorch import strategies as nl_strategies
@@ -79,6 +82,11 @@ def wrap_trainer_and_auto_resume_with_mlflashpoint(
7982
ckpt_loader = NeMoMLFlashpointCheckpointLoader(
8083
checkpoint_object_manager=ckpt_obj_manager,
8184
replication_manager=replication_manager,
85+
global_rank_getter=dist.get_rank,
86+
local_rank_getter=dist.get_node_local_rank,
87+
broadcast_object_list_func=dist.broadcast_object_list,
88+
all_gather_object_func=dist.all_gather_object,
89+
world_size_getter=dist.get_world_size,
8290
recover_context=always_save_context,
8391
)
8492

@@ -212,6 +220,14 @@ def wrap_trainer_checkpoint_io_with_mlflashpoint(
212220
# (OOM) errors upon restart. 'spawn' launches a clean interpreter without
213221
# the inherited CUDA state, allowing the GPU memory to be freed instantly.
214222
ctx = torch_mp.get_context("spawn")
223+
mp_manager_future = concurrent.futures.Future()
224+
225+
def start_manager():
226+
mp_manager_future.set_result(ctx.Manager())
227+
228+
thread = threading.Thread(target=start_manager, daemon=True)
229+
thread.start()
230+
215231
save_strategy = MLFlashpointMegatronAsyncSaveStrategy(
216232
storage_writer=MemoryStorageWriter(
217233
checkpoint_saver=DefaultMLFlashpointCheckpointSaver(
@@ -223,7 +239,7 @@ def wrap_trainer_checkpoint_io_with_mlflashpoint(
223239
initial_buffer_size_bytes=initial_write_buffer_size_bytes,
224240
use_optimized_save=use_optimized_save,
225241
),
226-
mp_manager=ctx.Manager(),
242+
mp_manager_future=mp_manager_future,
227243
files_per_rank=write_files_per_rank,
228244
),
229245
use_cached_ckpt_structure=use_cached_ckpt_structure,

src/ml_flashpoint/adapter/pytorch/memory_storage_writer.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from typing import Optional, Union
2121

2222
import torch
23-
from torch import multiprocessing as torch_mp
23+
import torch.multiprocessing as torch_mp
2424
from torch.distributed.checkpoint import Metadata, SavePlan, SavePlanner, StorageWriter, staging
2525
from torch.distributed.checkpoint.filesystem import _StorageInfo
2626
from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE, MetadataIndex, StorageMeta
@@ -87,16 +87,17 @@ class MemoryStorageWriter(StorageWriter, staging.BlockingAsyncStager):
8787
def __init__(
8888
self,
8989
checkpoint_saver: MLFlashpointCheckpointSaver,
90-
mp_manager: torch_mp.Manager,
90+
mp_manager_future: concurrent.futures.Future,
9191
files_per_rank: int = 1,
9292
):
9393
"""Initializes the MemoryStorageWriter.
9494
9595
Args:
9696
checkpoint_saver: An instance of `MLFlashpointCheckpointSaver` used for
9797
handling the actual checkpoint saving logic.
98-
mp_manager: A `torch.multiprocessing.Manager` instance for managing
99-
shared state across processes, particularly for write results and events.
98+
mp_manager_future: A `concurrent.futures.Future` that resolves to a
99+
`torch.multiprocessing.Manager` instance for managing shared state
100+
across processes, particularly for write results and events.
100101
It is highly recommended to create this manager using a 'spawn'
101102
multiprocessing context to avoid inheriting the parent's CUDA context,
102103
which prevents CUDA OOM errors during failure recoveries
@@ -112,23 +113,22 @@ def __init__(
112113
_LOGGER.warning("files_per_rank must be >= 1, but was %d. Setting to 1.", files_per_rank)
113114
files_per_rank = 1
114115
self._files_per_rank = files_per_rank
115-
# _main_process_torchmp_manager should only be used in the main process, not in the spawned processes.
116-
# This is because mp_manager is not picklable.
117-
self._main_process_torchmp_manager = mp_manager
118-
self._write_events_per_checkpoint_id: dict[CheckpointContainerId, torch_mp.Event] = mp_manager.dict()
119-
self._write_results_per_checkpoint_id: dict[CheckpointContainerId, list[WriteResult]] = mp_manager.dict()
116+
# _main_process_torchmp_manager_future should only be used in the main process, not in the spawned processes.
117+
# This is because the mp_manager it resolves to is not picklable.
118+
self._main_process_torchmp_manager_future = mp_manager_future
119+
self._write_events_per_checkpoint_id: Optional[dict[CheckpointContainerId, torch_mp.Event]] = None
120+
self._write_results_per_checkpoint_id: Optional[dict[CheckpointContainerId, list[WriteResult]]] = None
120121

121122
def __getstate__(self):
122-
"""Custom pickling to exclude unpicklable mp_manager."""
123+
"""Custom pickling to exclude unpicklable mp_manager_future."""
123124
state = self.__dict__.copy()
124-
if "_main_process_torchmp_manager" in state:
125-
del state["_main_process_torchmp_manager"]
125+
state.pop("_main_process_torchmp_manager_future", None)
126126
return state
127127

128128
def __setstate__(self, state):
129-
"""Custom unpickling to restore state and set mp_manager to None."""
129+
"""Custom unpickling to restore state and set mp_manager_future to None."""
130130
self.__dict__.update(state)
131-
self._main_process_torchmp_manager = None
131+
self._main_process_torchmp_manager_future = None
132132

133133
def _check_checkpoint_id(self) -> None:
134134
if self._current_checkpoint_id is None:
@@ -154,6 +154,11 @@ def reset(self, checkpoint_id: Union[str, os.PathLike, None] = None) -> None:
154154
# Mimicking existing StorageWriter impls (e.g. `_FileSystemWriter`) by using a random ID as the save ID.
155155
self._current_save_id = generate_hfid("memwritersave")
156156

157+
if self._write_events_per_checkpoint_id is None and self._main_process_torchmp_manager_future is not None:
158+
mp_manager = self._main_process_torchmp_manager_future.result()
159+
self._write_events_per_checkpoint_id = mp_manager.dict()
160+
self._write_results_per_checkpoint_id = mp_manager.dict()
161+
157162
def storage_meta(self) -> Optional[StorageMeta]:
158163
self._check_checkpoint_id()
159164
return StorageMeta(checkpoint_id=self._current_checkpoint_id.data, save_id=self._current_save_id)
@@ -194,7 +199,9 @@ def prepare_write_data_buckets(
194199
) -> list[ObjectWriteBucket]:
195200
# Create a new, unset Event for this specific checkpoint save
196201
if checkpoint_id not in self._write_events_per_checkpoint_id:
197-
self._write_events_per_checkpoint_id[checkpoint_id] = self._main_process_torchmp_manager.Event()
202+
self._write_events_per_checkpoint_id[checkpoint_id] = (
203+
self._main_process_torchmp_manager_future.result().Event()
204+
)
198205

199206
write_buckets = self.checkpoint_saver.prepare_write_data(
200207
checkpoint_id, plan.items, planner, plan.storage_data.prefix, bucket_count=self._files_per_rank

src/ml_flashpoint/core/checkpoint_loader.py

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,9 @@
2222
import struct
2323
from collections import defaultdict
2424
from pathlib import Path
25-
from typing import IO, List, Optional, Set, Tuple, TypeVar, cast
25+
from typing import IO, Callable, List, Optional, Set, Tuple, TypeVar, cast
2626

2727
import torch
28-
import torch.distributed as dist
2928
from torch.distributed._shard._utils import narrow_tensor_by_index
3029
from torch.distributed.checkpoint import Metadata
3130
from torch.distributed.checkpoint.filesystem import _StorageInfo
@@ -128,6 +127,12 @@ def __init__(
128127
self,
129128
checkpoint_object_manager: CheckpointObjectManager,
130129
replication_manager: ReplicationManager,
130+
*,
131+
global_rank_getter: Callable[[], int],
132+
local_rank_getter: Callable[[], int],
133+
broadcast_object_list_func: Callable[..., None],
134+
all_gather_object_func: Callable[..., None],
135+
world_size_getter: Callable[[], int],
131136
):
132137
"""Initializes the DefaultMLFlashpointCheckpointLoader.
133138
@@ -136,9 +141,21 @@ def __init__(
136141
reading data.
137142
replication_manager: The replication manager to use for retrieving
138143
missing checkpoint objects from peer nodes.
144+
global_rank_getter: A callable that returns the global rank.
145+
local_rank_getter: A callable that returns the node-local rank.
146+
broadcast_object_list_func: A callable with the same signature as
147+
``torch.distributed.broadcast_object_list``.
148+
all_gather_object_func: A callable with the same signature as
149+
``torch.distributed.all_gather_object``.
150+
world_size_getter: A callable that returns the world size.
139151
"""
140152
self._checkpoint_object_manager = checkpoint_object_manager
141153
self._replication_manager = replication_manager
154+
self._global_rank_getter = global_rank_getter
155+
self._local_rank_getter = local_rank_getter
156+
self._broadcast_object_list_func = broadcast_object_list_func
157+
self._all_gather_object_func = all_gather_object_func
158+
self._world_size_getter = world_size_getter
142159
# Cache for available objects: CheckpointContainerId -> dict[object_path, list[rank]]
143160
self._available_objects_cache: dict[CheckpointContainerId, dict[str, List[int]]] = {}
144161

@@ -337,8 +354,7 @@ def get_latest_complete_checkpoint(
337354
else continue to the next candidate checkpoint
338355
- return the checkpoint container id of the latest complete checkpoint
339356
"""
340-
# TODO: use global_rank_getter and local_rank_getter.
341-
rank = dist.get_rank()
357+
rank = self._global_rank_getter()
342358
_LOGGER.debug(
343359
"Rank %s: Getting latest complete checkpoint for '%s'",
344360
rank,
@@ -382,7 +398,7 @@ def get_latest_complete_checkpoint(
382398
retrieval_plan = self._compute_retrieval_plan(checkpoint, available_objects_by_rank)
383399
# Broadcast the retrieval plan to all ranks.
384400
plan_container = [retrieval_plan]
385-
dist.broadcast_object_list(plan_container, src=planner_rank)
401+
self._broadcast_object_list_func(plan_container, src=planner_rank)
386402
retrieval_plan = plan_container[0]
387403

388404
if retrieval_plan is None:
@@ -451,7 +467,7 @@ def _compute_retrieval_plan(
451467

452468
objects_needed_by_local_rank_0.update(self._get_extra_needed_objects(checkpoint, available_objects_by_rank))
453469

454-
world_size = dist.get_world_size()
470+
world_size = self._world_size_getter()
455471
num_nodes = get_num_of_nodes()
456472
ranks_per_node = world_size // num_nodes
457473

@@ -507,8 +523,8 @@ def get_candidate_checkpoints(
507523

508524
# Scan locally only on the first rank of each node
509525
base_path = Path(checkpoint_base_container.data)
510-
rank = dist.get_rank()
511-
local_rank = dist.get_node_local_rank()
526+
rank = self._global_rank_getter()
527+
local_rank = self._local_rank_getter()
512528

513529
local_candidate_ckpt_ids = []
514530

@@ -532,8 +548,8 @@ def get_candidate_checkpoints(
532548
else:
533549
_LOGGER.debug("Rank %s: Base path '%s' is not a directory or does not exist.", rank, base_path)
534550

535-
all_checkpoint_container_path_lists = [None for _ in range(dist.get_world_size())]
536-
dist.all_gather_object(all_checkpoint_container_path_lists, local_candidate_ckpt_ids)
551+
all_checkpoint_container_path_lists = [None for _ in range(self._world_size_getter())]
552+
self._all_gather_object_func(all_checkpoint_container_path_lists, local_candidate_ckpt_ids)
537553
_LOGGER.debug(
538554
"Rank %s: Gathered checkpoint container paths from all ranks: '%s'",
539555
rank,
@@ -589,8 +605,8 @@ def get_checkpoint_objects_by_rank(
589605

590606
local_objects.extend(self._get_extra_local_objects(container_path))
591607

592-
all_objects_by_rank_paths = [None for _ in range(dist.get_world_size())]
593-
dist.all_gather_object(all_objects_by_rank_paths, local_objects)
608+
all_objects_by_rank_paths = [None for _ in range(self._world_size_getter())]
609+
self._all_gather_object_func(all_objects_by_rank_paths, local_objects)
594610

595611
result = {}
596612
object_locations = defaultdict(list)
@@ -620,7 +636,7 @@ def retrieve_checkpoint(
620636
If empty for this rank, no retrieval is needed.
621637
"""
622638

623-
rank = dist.get_rank()
639+
rank = self._global_rank_getter()
624640
all_success = True
625641

626642
# Only proceed with retrieval if we have items to retrieve
@@ -656,8 +672,8 @@ def retrieve_checkpoint(
656672

657673
# Gather success status from all ranks
658674
_LOGGER.debug("Gathering success status from all ranks")
659-
all_success_list = [None for _ in range(dist.get_world_size())]
660-
dist.all_gather_object(all_success_list, all_success)
675+
all_success_list = [None for _ in range(self._world_size_getter())]
676+
self._all_gather_object_func(all_success_list, all_success)
661677
_LOGGER.debug("All success list: '%s'", all_success_list)
662678
return all(all_success_list)
663679

0 commit comments

Comments
 (0)