Skip to content

Commit cb2c7d5

Browse files
committed
update
1 parent 6cd4d62 commit cb2c7d5

File tree

5 files changed

+25
-40
lines changed

5 files changed

+25
-40
lines changed

src/lightning/fabric/plugins/io/checkpoint_io.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -80,14 +80,15 @@ def requires_cpu_collectives(self) -> bool:
8080
return False
8181

8282
@property
83-
def _requires_state_conversion(self) -> bool:
84-
"""Whether the Strategy must pre-convert stateful objects into ``state_dict`` form before calling this
85-
CheckpointIO.
86-
87-
CheckpointIO implementations that perform in-place loading may expect the provided
88-
``state`` to already contain plain dictionaries instead of high-level objects such
89-
as ``nn.Module`` or ``Optimizer``. When this returns ``True``, the Strategy should
90-
convert the state using its internal state-extraction logic prior to save/load.
83+
def _restore_after_setup(self) -> bool:
84+
"""Whether checkpoint restoration should be delayed until after the Strategy setup phase.
85+
86+
Some checkpoint implementations require the distributed environment, device placement,
87+
or wrapped modules to be fully initialized before loading state. When this returns
88+
``True``, the Trainer/Strategy will restore the checkpoint only after setup has completed.
89+
90+
This is primarily used by distributed checkpointing backends that depend on collective
91+
communication during load.
9192
9293
"""
9394
return False

src/lightning/fabric/plugins/io/distributed_async_io.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def __init__(
112112
async_type = state_dict_saver.AsyncCheckpointerType(self._checkpointer_type)
113113
default_save_options["async_checkpointer_type"] = async_type
114114
default_save_options["planner"] = DefaultSavePlanner(enable_plan_caching=enable_plan_caching)
115-
print(f"{default_save_options=}")
115+
116116
self.save_options = {**default_save_options, **(save_options or {})}
117117
self.load_options = dict(load_options or {})
118118
self._disable_safe_warnings()
@@ -137,7 +137,8 @@ def _wait(self) -> None:
137137

138138
@override
139139
@property
140-
def _requires_state_conversion(self) -> bool:
140+
def _restore_after_setup(self) -> bool:
141+
"""Requires delayed restoration until after Strategy setup."""
141142
return True
142143

143144
@property

src/lightning/fabric/strategies/strategy.py

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -336,25 +336,17 @@ def load_checkpoint(
336336
337337
"""
338338
torch.cuda.empty_cache()
339-
if self.checkpoint_io._requires_state_conversion and state is not None:
340-
if not isinstance(state, dict):
341-
raise ValueError(
342-
"When using a CheckpointIO that requires state conversion, the `state` argument must be a dict."
343-
)
344-
# update in_place so non-tensor objects get updated as well when using in-place loading
345-
state = self._convert_stateful_objects_in_state(state, filter={}, in_place=True)
346-
347-
# in-place loading requires state to be a dict
348-
_state = state if isinstance(state, dict) else None
349-
checkpoint = self.checkpoint_io.load_checkpoint(path, state=_state, weights_only=weights_only)
339+
converted_state = state
340+
if state is not None:
341+
converted_state = self._convert_stateful_objects_in_state(
342+
state,
343+
filter={},
344+
)
345+
346+
checkpoint = self.checkpoint_io.load_checkpoint(path, state=converted_state, weights_only=weights_only)
350347
if not state:
351348
return checkpoint
352349

353-
if checkpoint == {}:
354-
# In-place loaders (e.g., DCP) return {} to signal that the state
355-
# has already been fully restored by the CheckpointIO implementation.
356-
return {}
357-
358350
if isinstance(state, Module):
359351
self.load_module_state_dict(module=state, state_dict=checkpoint, strict=strict)
360352
return {}
@@ -422,13 +414,7 @@ def _convert_stateful_objects_in_state(
422414
self,
423415
state: dict[str, Union[Module, Optimizer, Any]],
424416
filter: dict[str, Callable[[str, Any], bool]],
425-
in_place: bool = False,
426417
) -> dict[str, Any]:
427-
if in_place and filter != {}:
428-
raise ValueError(
429-
"In-place conversion does not support filtering. Please set `in_place=False` to apply the filter."
430-
)
431-
432418
converted_state: dict[str, Any] = {}
433419
for key, obj in state.items():
434420
# convert the state
@@ -441,11 +427,8 @@ def _convert_stateful_objects_in_state(
441427
else:
442428
converted = obj
443429

444-
if in_place:
445-
state[key] = converted
446-
else:
447-
_apply_filter(key, filter, converted, converted_state)
448-
return converted_state if not in_place else state
430+
_apply_filter(key, filter, converted, converted_state)
431+
return converted_state
449432

450433

451434
class _BackwardSyncControl(ABC):

src/lightning/pytorch/strategies/strategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -457,7 +457,7 @@ def restore_checkpoint_after_setup(self) -> bool:
457457
If ``True``, restore checkpoint after strategy setup.
458458
459459
"""
460-
return self.checkpoint_io._requires_state_conversion
460+
return self.checkpoint_io._restore_after_setup
461461

462462
@property
463463
def lightning_restore_optimizer(self) -> bool:

tests/tests_fabric/plugins/io/test_distributed_async_io.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,8 @@ def test_async_checkpointio_requires_cpu_collectives():
111111

112112

113113
@RunIf(min_torch="2.4")
114-
def test_async_checkpointio_requires_state_conversion():
115-
assert DistributedAsyncCheckpointIO()._requires_state_conversion is True
114+
def test_async_checkpointio_requires_restore_after_setup():
115+
assert DistributedAsyncCheckpointIO()._restore_after_setup is True
116116

117117

118118
@RunIf(min_torch="2.4")

0 commit comments

Comments (0)