Skip to content

Commit 353a891

Browse files
committed
Merge branch 'main' of https://github.com/google/ml-flashpoint into rename-thread-count-to-files-per-rank
2 parents d7a4e3a + dcb28da commit 353a891

File tree

8 files changed

+620
-46
lines changed

8 files changed

+620
-46
lines changed

cloudbuild.yaml

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,9 @@
1313
# limitations under the License.
1414

1515
steps:
16-
# 1. Build the distribution (sdist and wheel).
17-
# This uses scikit-build-core as defined in pyproject.toml to compile C++ extensions.
18-
# We set BUILD_TESTING=OFF to ignore tests during the artifact build.
16+
# 1. Build the source distribution (sdist).
1917
- name: 'python:3.10'
20-
id: 'build'
18+
id: 'build-sdist'
2119
entrypoint: 'bash'
2220
args:
2321
- '-c'
@@ -27,16 +25,37 @@ steps:
2725
exit 1
2826
fi
2927
pip install build
30-
31-
echo "Building C++ extensions with editable install..."
32-
# Note: pip install -e . will also respect the build-system requirements
33-
SKBUILD_CMAKE_ARGS="-DBUILD_TESTING=OFF" pip install -e .
34-
35-
echo "Building sdist and wheel..."
36-
SKBUILD_CMAKE_ARGS="-DBUILD_TESTING=OFF" python -m build
28+
python -m build --sdist
3729
38-
# 2. Upload to internal Artifact Registry (AR) for OSS Exit Gate.
39-
# OSS Exit Gate fetches artifacts from this repository.
30+
# 2. Build the manylinux wheels.
31+
# We use the manylinux image directly to avoid the "reserved" docker.sock issue.
32+
# This image is pre-loaded with Python versions and build tools.
33+
- name: '$_MANYLINUX_IMAGE'
34+
id: 'build-wheels'
35+
entrypoint: 'bash'
36+
args:
37+
- '-c'
38+
- |
39+
# Use Python 3.10 as the build controller (it will produce the abi3 wheel)
40+
PYBIN="/opt/python/cp310-cp310/bin"
41+
42+
# Install build dependencies
43+
$$PYBIN/pip install build
44+
45+
echo "Building wheel (scikit-build-core will handle abi3 and C++ extension compilation)..."
46+
SKBUILD_CMAKE_ARGS="-DBUILD_TESTING=OFF" $$PYBIN/python -m build --wheel
47+
48+
echo "Repairing wheel with auditwheel to ensure manylinux compliance..."
49+
# auditwheel repair will bundle any external shared libraries and fix the platform tag
50+
$$PYBIN/auditwheel repair dist/*.whl --wheel-dir dist/
51+
52+
# Remove the original non-compliant wheel (the one with the 'linux' tag)
53+
# to ensure only the manylinux version is uploaded.
54+
rm dist/*-linux_*.whl
55+
waitFor: ['build-sdist']
56+
57+
58+
# 3. Upload to internal Artifact Registry (AR) for OSS Exit Gate.
4059
- name: 'python:3.10'
4160
id: 'upload-to-ar'
4261
entrypoint: 'bash'
@@ -45,9 +64,9 @@ steps:
4564
- |
4665
pip install -U twine keyring keyrings.google-artifactregistry-auth
4766
twine upload --repository-url https://us-python.pkg.dev/oss-exit-gate-prod/${_PROJECT_NAME}--pypi dist/*
48-
waitFor: ['build']
67+
waitFor: ['build-wheels']
4968

50-
# 3. Create and upload the manifest to GCS to trigger the Exit Gate publication.
69+
# 4. Create and upload the manifest to GCS to trigger the Exit Gate publication.
5170
# The presence of this file in the specific GCS bucket triggers the verification and publishing process.
5271
- name: 'gcr.io/cloud-builders/gcloud'
5372
id: 'trigger-exit-gate'
@@ -67,3 +86,5 @@ options:
6786

6887
substitutions:
6988
_PROJECT_NAME: 'ml-flashpoint'
89+
# Default to x86_64; can be overridden in the Trigger for ARM64.
90+
_MANYLINUX_IMAGE: 'quay.io/pypa/manylinux_2_28_x86_64'

pyproject.toml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,10 @@ build-backend = "scikit_build_core.build"
132132
# Tells scikit-build-core to use setuptools-scm to retrieve the version from git.
133133
metadata.version.provider = "scikit_build_core.metadata.setuptools_scm"
134134

135+
# Enable Stable ABI (abi3) for Python 3.10 and later.
136+
# This produces a single wheel per architecture that works on all future Python versions.
137+
wheel.py-api = "cp310"
138+
135139
# Specifies the minimum version of CMake that must be present on the system.
136140
cmake.version = ">=3.18"
137141

@@ -193,3 +197,27 @@ fail_under = 90
193197
[tool.gcovr]
194198
fail-under-line = "80"
195199
#fail-under-branch = "85"
200+
201+
# ===================================================================
202+
# Tool-specific Configuration for cibuildwheel
203+
# ===================================================================
204+
[tool.cibuildwheel]
205+
# Build only once per architecture (using Python 3.10).
206+
# Because abi3 is enabled, this wheel will work for 3.10, 3.11, 3.12, 3.13, etc.
207+
build = "cp310-*"
208+
# Target both Intel (x86_64) and ARM (aarch64) architectures.
209+
archs = ["x86_64", "aarch64"]
210+
# Skip 32-bit builds and musllinux (less common, so skipping for simplicity)
211+
skip = "*-manylinux_i686 *-musllinux_*"
212+
213+
[tool.cibuildwheel.linux]
214+
# Pass the flag to skip tests during the build inside the container
215+
environment = { SKBUILD_CMAKE_ARGS="-DBUILD_TESTING=OFF" }
216+
# We need to install the build requirements inside the cibuildwheel container.
217+
# We also print the architecture and environment for debugging.
218+
before-build = """
219+
uname -m && \
220+
pip install pybind11 scikit-build-core cmake ninja setuptools-scm && \
221+
cmake --version
222+
"""
223+

src/ml_flashpoint/adapter/megatron/save_strategies.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import copy
1516
import json
1617
import logging
1718
import os
@@ -28,6 +29,8 @@
2829
_replace_state_dict_keys_with_sharded_keys,
2930
mcore_to_pyt_state_dict,
3031
)
32+
from torch.distributed.checkpoint.metadata import Metadata
33+
from torch.distributed.checkpoint.planner import SavePlan
3134
from torch.distributed.checkpoint.utils import _DistWrapper
3235
from typing_extensions import override
3336

@@ -83,17 +86,27 @@ def __init__(
8386
storage_writer: MemoryStorageWriter,
8487
backend: str = default_backend_format_name(),
8588
version: int = default_backend_format_version(),
89+
use_cached_ckpt_structure: bool = False,
8690
):
8791
"""
8892
Args:
8993
storage_writer (MemoryStorageWriter): The storage writer to use for saving operations.
9094
backend (str, optional): The name of the backend format. Defaults to "ml_flashpoint", which is recommended.
9195
version (int, optional): The version of the checkpoint format. Defaults to the latest version.
96+
use_cached_ckpt_structure (bool, optional): Whether to reuse the checkpoint structure (plan)
97+
from the previous save. Defaults to False.
9298
"""
9399
super().__init__(backend=backend, version=version)
94100
self._storage_writer: MemoryStorageWriter = storage_writer
95101
self._checkpoint_saver: MLFlashpointCheckpointSaver = storage_writer.checkpoint_saver
96102

103+
# Cache for state dict saving
104+
self._cached_central_plan: SavePlan | None = None
105+
self._cached_local_plan: SavePlan | None = None
106+
self._cached_global_metadata: Metadata | None = None
107+
self._validated_cache_reuse: bool = False
108+
self._use_cached_ckpt_structure: bool = use_cached_ckpt_structure
109+
97110
@override
98111
def can_handle_sharded_objects(self) -> bool:
99112
# Not currently used, but in case it is, ensure this strategy is used for ShardedObjects as well.
@@ -157,14 +170,42 @@ def async_save(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Union
157170
# we also use Megatron's SavePlanner during saving for compatibility.
158171
planner: MCoreSavePlanner = MCoreSavePlanner(can_run_decentralized_global_plan=False)
159172
world_dist_wrapper = _DistWrapper(group=None, use_dist=not disable_dist, coordinator_rank=0)
160-
plan, write_buckets, global_metadata = statedictsaver.generate_plan(
173+
# Try twice to validate the generated `central_plan` is the same across iterations
174+
# If so, reuse `cached_central_plan` and `cached_global_metadata`
175+
# From the 3rd iteration, `generate_plan` will not generate `global_metadata`
176+
# (it returns None), so `self._cached_global_metadata` is reused
177+
cached_structure_args = None
178+
if self._use_cached_ckpt_structure:
179+
cached_structure_args = (
180+
self._cached_central_plan,
181+
self._cached_local_plan,
182+
self._validated_cache_reuse,
183+
)
184+
185+
(
186+
write_buckets,
187+
global_metadata,
188+
self._cached_central_plan,
189+
self._cached_local_plan,
190+
self._validated_cache_reuse,
191+
) = statedictsaver.generate_plan(
161192
checkpoint_id=checkpoint_id,
162193
state_dict=pyt_state_dict,
163194
storage_writer=self._storage_writer,
164195
planner=planner,
165196
world_dist_wrapper=world_dist_wrapper,
197+
cached_ckpt_structure=cached_structure_args,
166198
)
167199

200+
if global_metadata is None:
201+
# We want to use the cached metadata structure, but ensure any modifications (like adding storage data)
202+
# are done on a copy so the cache remains clean.
203+
global_metadata = copy.deepcopy(self._cached_global_metadata)
204+
else:
205+
# Checkpoint structure (and thus metadata) changed or was generated for the first time.
206+
# Cache a clean copy of the metadata before storage data is potentially added later.
207+
self._cached_global_metadata = copy.deepcopy(global_metadata)
208+
168209
# 5. Stage to CPU.
169210
staged_write_buckets = self._storage_writer.stage_write_data_buckets(
170211
checkpoint_id, write_buckets, non_blocking=True

src/ml_flashpoint/adapter/nemo/wrapper_util.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def wrap_trainer_and_auto_resume_with_mlflashpoint(
4545
write_files_per_rank: int = 1,
4646
initial_write_buffer_size_bytes: int = DEFAULT_INITIAL_BUFFER_SIZE_BYTES,
4747
use_optimized_save: bool = True,
48+
use_cached_ckpt_structure: bool = False,
4849
) -> MLFlashpointAutoResume:
4950
"""Wraps the trainer and creates an MLFlashpointAutoResume instance wrapping `default_auto_resume`.
5051
@@ -62,6 +63,8 @@ def wrap_trainer_and_auto_resume_with_mlflashpoint(
6263
write_files_per_rank: Optional. The number of files each rank writes to for checkpoint data. Defaults to 1.
6364
initial_write_buffer_size_bytes: Optional. The initial size of the buffer for writing checkpoint data
6465
in bytes. Defaults to `DEFAULT_INITIAL_BUFFER_SIZE_BYTES`.
66+
use_cached_ckpt_structure: Whether to reuse the checkpoint structure (plan) from the previous save.
67+
Defaults to False.
6568
Returns:
6669
An MLFlashpointAutoResume instance configured for ML Flashpoint, wrapping `default_auto_resume`.
6770
"""
@@ -90,6 +93,7 @@ def wrap_trainer_and_auto_resume_with_mlflashpoint(
9093
write_files_per_rank=write_files_per_rank,
9194
initial_write_buffer_size_bytes=initial_write_buffer_size_bytes,
9295
use_optimized_save=use_optimized_save,
96+
use_cached_ckpt_structure=use_cached_ckpt_structure,
9397
)
9498

9599
default_auto_resume_args = vars(default_auto_resume) if default_auto_resume else {}
@@ -111,6 +115,7 @@ def wrap_trainer_checkpoint_io_with_mlflashpoint(
111115
write_files_per_rank: int = 1,
112116
initial_write_buffer_size_bytes: int = DEFAULT_INITIAL_BUFFER_SIZE_BYTES,
113117
use_optimized_save: bool = True,
118+
use_cached_ckpt_structure: bool = False,
114119
):
115120
"""Wraps the trainer's checkpoint I/O with ML Flashpoint capabilities.
116121
@@ -138,6 +143,8 @@ def wrap_trainer_checkpoint_io_with_mlflashpoint(
138143
write_files_per_rank: Optional. The number of files each rank writes to for checkpoint data. Defaults to 1.
139144
initial_write_buffer_size_bytes: Optional. The initial size of the buffer for writing checkpoint data
140145
in bytes. Defaults to `DEFAULT_INITIAL_BUFFER_SIZE_BYTES`.
146+
use_cached_ckpt_structure: Whether to reuse the checkpoint structure (plan) from the previous save.
147+
Defaults to False.
141148
142149
Returns:
143150
None. The trainer's checkpoint_io is modified in-place.
@@ -218,7 +225,8 @@ def wrap_trainer_checkpoint_io_with_mlflashpoint(
218225
),
219226
mp_manager=ctx.Manager(),
220227
files_per_rank=write_files_per_rank,
221-
)
228+
),
229+
use_cached_ckpt_structure=use_cached_ckpt_structure,
222230
)
223231
load_strategy = MLFlashpointMegatronLoadStrategy(
224232
replication_manager=replication_manager,

src/ml_flashpoint/adapter/pytorch/custom_state_dict_saver.py

Lines changed: 56 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222

2323
import torch.cuda
2424
from torch import distributed as torchdist
25-
from torch.distributed.checkpoint import Metadata
2625
from torch.distributed.checkpoint import state_dict_saver as torchdistsaver
2726
from torch.distributed.checkpoint.logger import _dcp_method_logger
2827
from torch.distributed.checkpoint.planner import SavePlan
@@ -46,7 +45,14 @@ def generate_plan(
4645
storage_writer: MemoryStorageWriter,
4746
planner: torchdistsaver.SavePlanner,
4847
world_dist_wrapper: _DistWrapper,
49-
) -> tuple[SavePlan, list[ObjectWriteBucket], Metadata]:
48+
cached_ckpt_structure: tuple[SavePlan, SavePlan, bool] | None = None,
49+
) -> tuple[
50+
list[ObjectWriteBucket],
51+
torchdistsaver.Metadata | None,
52+
SavePlan,
53+
SavePlan,
54+
bool,
55+
]:
5056
"""Performs the planning phase of checkpointing.
5157
5258
This function is similar to PyTorch's `state_dict_saver.save` but only
@@ -62,9 +68,27 @@ def generate_plan(
6268
planner: The SavePlanner to use for the save.
6369
world_dist_wrapper: The distributed wrapper for world (all ranks) communication.
6470
Typically created as `_DistWrapper(process_group, not no_dist, coordinator_rank)`.
71+
cached_ckpt_structure: Tuple of (cached_central_plan, cached_local_plan, validated_cache_reuse).
72+
6573
Returns:
66-
A tuple containing the updated local plan, write buckets, and global metadata.
74+
A tuple containing:
75+
- write_buckets: The buckets of data to be written.
76+
- global_metadata: The global metadata for the checkpoint.
77+
- central_plan (for caching): The centralized plan generated by the coordinator.
78+
- local_plan (for caching): The local plan generated by this rank.
79+
- validated_cache_reuse (bool): Whether the cached plan was successfully validated against the current
80+
plan.
81+
- If True: The structure of the checkpoint has not changed (e.g., same tensor shapes and sharding),
82+
so the cached plan can be safely reused for future steps to skip expensive planning.
83+
- If False: The structure has changed or this is the first run, so the plan was re-generated.
84+
- After 1st run: cached_central_plan is None, this value stays False -> 2nd run will validate cache.
85+
- After 2nd run: cached_central_plan == central_plan (if structure stable), so this value becomes True
86+
- After 3rd run+: reuse cached plan if structure stable, otherwise regenerate.
6787
"""
88+
cached_central_plan, cached_local_plan, validated_cache_reuse = (None, None, False)
89+
if cached_ckpt_structure:
90+
cached_central_plan, cached_local_plan, validated_cache_reuse = cached_ckpt_structure
91+
6892
global_metadata: torchdistsaver.Metadata | None = None
6993

7094
ckpt_kwargs = {"checkpoint_id": storage_writer.current_checkpoint_id, "process_group": world_dist_wrapper.group}
@@ -79,9 +103,12 @@ def local_step() -> SavePlan:
79103
)
80104
storage_writer.set_up_storage_writer(world_dist_wrapper.is_coordinator)
81105

82-
local_plan = planner.create_local_plan()
83-
local_plan = storage_writer.prepare_local_plan(local_plan)
84-
return local_plan
106+
if cached_local_plan and validated_cache_reuse:
107+
plan = cached_local_plan
108+
else:
109+
plan = planner.create_local_plan()
110+
111+
return storage_writer.prepare_local_plan(plan)
85112

86113
@_dcp_method_logger(**ckpt_kwargs)
87114
def global_step(all_local_plans: list[SavePlan]) -> list[SavePlan]:
@@ -91,19 +118,31 @@ def global_step(all_local_plans: list[SavePlan]) -> list[SavePlan]:
91118
all_local_plans = storage_writer.prepare_global_plan(all_local_plans)
92119
return all_local_plans
93120

94-
with log_execution_time(logger=_LOGGER, name="generate_plan__reduce_scatter_plan"):
95-
_LOGGER.debug("Executing plan reduce_scatter to get updated_local_plan...")
96-
updated_local_plan = world_dist_wrapper.reduce_scatter("plan", local_step, global_step)
97-
98-
with log_execution_time(logger=_LOGGER, name="generate_plan__broadcast_metadata"):
99-
_LOGGER.debug("Executing global_metadata broadcast...")
100-
# TODO(perf): - can broadcast only to local rank 0 to reduce comms
101-
global_metadata = world_dist_wrapper.broadcast_object(global_metadata)
102-
103-
final_local_plan = planner.finish_plan(updated_local_plan)
121+
central_plan = None
122+
if validated_cache_reuse and cached_central_plan:
123+
_LOGGER.debug("Passed cache reusable")
124+
local_plan = local_step()
125+
central_plan = cached_central_plan
126+
else:
127+
with log_execution_time(logger=_LOGGER, name="generate_plan__reduce_scatter_plan"):
128+
_LOGGER.debug("Executing plan reduce_scatter to get central_plan...")
129+
local_plan = local_step()
130+
central_plan = world_dist_wrapper.reduce_scatter("plan", lambda: local_plan, global_step)
131+
132+
with log_execution_time(logger=_LOGGER, name="generate_plan__broadcast_metadata"):
133+
_LOGGER.debug("Executing global_metadata broadcast...")
134+
global_metadata = world_dist_wrapper.broadcast_object(global_metadata)
135+
136+
final_local_plan = planner.finish_plan(central_plan)
104137
write_buckets = storage_writer.prepare_write_data_buckets(checkpoint_id, final_local_plan, planner)
105138

106-
return final_local_plan, write_buckets, global_metadata
139+
return (
140+
write_buckets,
141+
global_metadata,
142+
central_plan,
143+
local_plan,
144+
cached_central_plan == central_plan,
145+
)
107146

108147

109148
@log_execution_time(logger=_LOGGER, name="write_data", level=logging.INFO)

0 commit comments

Comments
 (0)