
Commit 662a297

Schwidola0607 committed
remove ignore_missing_optim config from zero ds_config
Signed-off-by: Schwidola0607 <[email protected]>
1 parent 2930f2a commit 662a297
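
For context, the flag removed by this commit lived in the ZeRO section of the DeepSpeed config (the "zero_optimization" block of ds_config, backed by DeepSpeedZeroConfig below). A minimal sketch of the user-facing change; every value other than ignore_missing_optim_state is illustrative:

# Before this commit (illustrative ds_config; only "ignore_missing_optim_state"
# is the field being removed here):
ds_config = {
    "train_batch_size": 8,  # illustrative
    "zero_optimization": {
        "stage": 3,  # illustrative
        "ignore_missing_optim_state": True,  # removed by this commit
    },
}
# After this commit the knob no longer exists: the loaders below simply check
# whether <checkpoint_dir>/zero/optimizer_state.pt is present and warn if not.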

File tree

deepspeed/runtime/base_optimizer.py
deepspeed/runtime/engine.py
deepspeed/runtime/state_dict_factory.py
deepspeed/runtime/zero/config.py
deepspeed/runtime/zero/stage3.py
deepspeed/runtime/zero/stage_1_and_2.py

6 files changed (+35, -49 lines)


deepspeed/runtime/base_optimizer.py

Lines changed: 6 additions & 6 deletions
@@ -17,16 +17,16 @@ class DeepSpeedOptimizer(object):
 
 class ZeROOptimizer(DeepSpeedOptimizer):
 
-    def load_hp_checkpoint_state_from_checkpoint_dir(self, lp_groups_name: str, checkpoint_dir: str, ignore_missing_optim_state: bool = False) -> None:
+    def load_hp_checkpoint_state_from_checkpoint_dir(self, lp_groups_name: str, checkpoint_dir: str) -> None:
         checkpoint_dir = os.path.join(checkpoint_dir, "zero")
-        optim_state_path = os.path.join(checkpoint_dir, "optimizer_state.pt")
-        if not ignore_missing_optim_state:
-            assert os.path.isfile(
-                optim_state_path), f'{optim_state_path} containing optimizer global state is missing! Cannot proceed.'
-
+        optim_state_path = os.path.join(checkpoint_dir, "optimizer_state.pt")
+        if os.path.isfile(optim_state_path):
+            ignore_missing_optim_state = False
             optim_sd = torch.load(optim_state_path, weights_only=False)
             self._load_global_state(optim_sd)
         else:
+            logger.warning(f'{optim_state_path} containing optimizer global state is missing!')
+            ignore_missing_optim_state = True
             optim_sd = {}
 
         tp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu)
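
The net effect of the hunk above: callers no longer pass ignore_missing_optim_state; the loader derives it from whether optimizer_state.pt exists and degrades to a warning instead of an assertion. A simplified, standalone sketch of that pattern (not the actual ZeROOptimizer method, which goes on to restore the per-group state):

import logging
import os

import torch

logger = logging.getLogger(__name__)


def load_global_optim_state(checkpoint_dir: str) -> dict:
    # Mirror of the new control flow: presence of the file decides the behavior.
    optim_state_path = os.path.join(checkpoint_dir, "zero", "optimizer_state.pt")
    if os.path.isfile(optim_state_path):
        # Global optimizer state is available; load it exactly as before.
        return torch.load(optim_state_path, weights_only=False)
    # Missing state is no longer fatal: warn and continue with an empty dict.
    logger.warning(f"{optim_state_path} containing optimizer global state is missing!")
    return {}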

deepspeed/runtime/engine.py

Lines changed: 12 additions & 21 deletions
@@ -2952,24 +2952,16 @@ def load_checkpoint(self,
         if self._optimizer_has_ckpt_event_prologue():
             # Prepare for checkpoint load by ensuring all parameters are partitioned
             self.optimizer.checkpoint_event_prologue()
-
-        if not self.zero_ignore_missing_optim_state():
-            # Temporary skip this path for HF-based UCP
-            load_path, client_states = self._load_checkpoint(load_dir,
-                                                             tag,
-                                                             load_module_strict=load_module_strict,
-                                                             load_optimizer_states=load_optimizer_states,
-                                                             load_lr_scheduler_states=load_lr_scheduler_states,
-                                                             load_module_only=load_module_only,
-                                                             custom_load_fn=custom_load_fn)
-
-            load_zero_checkpoint = load_path is not None and (self.zero_optimization() or self.bfloat16_enabled())
-
-        else:
-            # What should load_path and client_states be?
-            load_path, client_states = None, {}
-            load_zero_checkpoint = (self.zero_optimization() or self.bfloat16_enabled())
-
+
+        load_path, client_states = self._load_checkpoint(load_dir,
+                                                         tag,
+                                                         load_module_strict=load_module_strict,
+                                                         load_optimizer_states=load_optimizer_states,
+                                                         load_lr_scheduler_states=load_lr_scheduler_states,
+                                                         load_module_only=load_module_only,
+                                                         custom_load_fn=custom_load_fn)
+
+        load_zero_checkpoint = load_path is not None and (self.zero_optimization() or self.bfloat16_enabled())
         if load_zero_checkpoint:
             if (load_optimizer_states and not load_module_only) or self.load_universal_checkpoint():
                 success = self._load_zero_checkpoint(load_dir, tag, load_optimizer_states=load_optimizer_states)

@@ -3009,7 +3001,7 @@ def _load_checkpoint(self,
                          custom_load_fn=None):
 
         from deepspeed.runtime.state_dict_factory import SDLoaderFactory
-        logger.info(f"Loading checkpoint from {load_dir} with tag {tag}")
+
         ckpt_list = self._get_all_ckpt_names(load_dir, tag)
         sd_loader = SDLoaderFactory.get_sd_loader(ckpt_list, checkpoint_engine=self.checkpoint_engine)
 

@@ -3167,8 +3159,7 @@ def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=True):
                                           load_from_fp32_weights=self.zero_load_from_fp32_weights(),
                                           checkpoint_folder=checkpoint_folder,
                                           load_serial=load_serial,
-                                          param_shapes=param_shapes,
-                                          ignore_missing_optim_state=self.zero_ignore_missing_optim_state())
+                                          param_shapes=param_shapes)
 
         if self.load_universal_checkpoint():
             logger.info(f'loaded universal zero checkpoints from {checkpoint_folder} for rank {self.global_rank}')
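
Because the zero_ignore_missing_optim_state() branch is gone, load_checkpoint now always routes through _load_checkpoint and needs no extra configuration. A hedged usage sketch; engine is assumed to be the DeepSpeedEngine returned by deepspeed.initialize, and the directory and tag below are placeholders, not values from this commit:

# 'engine' is assumed to come from deepspeed.initialize(...); path and tag are placeholders.
load_path, client_state = engine.load_checkpoint("./checkpoints",
                                                 tag="global_step1000",
                                                 load_optimizer_states=True,
                                                 load_module_only=False)
# If zero/optimizer_state.pt is absent from the checkpoint, the ZeRO loaders now
# log a warning and proceed with empty optimizer global state instead of asserting.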

deepspeed/runtime/state_dict_factory.py

Lines changed: 1 addition & 0 deletions
@@ -164,6 +164,7 @@ def set_module(self, sd, module):
         return sd
 
     def check_ckpt_list(self):
+        #logger.info(f'checkpoint file list: {self.ckpt_list}')
         assert len(self.ckpt_list) > 0
 
         sd = self.checkpoint_engine.load(self.ckpt_list[0], map_location=lambda storage, loc: storage)

deepspeed/runtime/zero/config.py

Lines changed: 1 addition & 6 deletions
@@ -272,7 +272,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
     ignore_unused_parameters: bool = True
     """
     Unused parameters in modules may be unexpected in static networks, but
-    could be normal in dynamic networks. This controls whether or not training
+    could be normal in dynamic networks. This controls whether or not training
     should terminate with an error message when unused parameters are detected.
     This is set to ``True`` by default, which means unused parameters are
     ignored and training continues. Now is just used in stage 2.

@@ -345,11 +345,6 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel):
     """
     Whether to log warnings from trace cache, such as invalidation events.
    """
-
-    ignore_missing_optim_state: bool = False
-    """
-    Ignore missing optimizer states when loading checkpoint
-    """
 
     # Validators
     @model_validator(mode="after")

deepspeed/runtime/zero/stage3.py

Lines changed: 11 additions & 11 deletions
@@ -2692,8 +2692,7 @@ def load_state_dict(self,
                         load_from_fp32_weights=False,
                         checkpoint_folder=None,
                         load_serial=None,
-                        param_shapes=None,
-                        ignore_missing_optim_state: bool = False):
+                        param_shapes=None):
         r"""Loading a ZeRO checkpoint
         Arguments:
             state_dict_list: List of all saved ZeRO checkpoints, one for each saved partition.

@@ -2724,7 +2723,7 @@ def load_state_dict(self,
 
         if checkpoint_folder:
             self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights,
-                                            param_shapes, ignore_missing_optim_state=ignore_missing_optim_state)
+                                            param_shapes)
         else:
             self._rigid_load_state_dict(state_dict_list[dist.get_rank(group=self.dp_process_group)],
                                         load_optimizer_states=load_optimizer_states)

@@ -2746,19 +2745,20 @@ def load_state_dict(self,
         # self.persistent_parameters[0].all_gather(self.persistent_parameters) # this will be done in checkpoint_event_epilogue() so remove it to prevent double all_gather
 
     def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights,
-                                   param_shapes, ignore_missing_optim_state):
-        self.load_hp_checkpoint_state_from_checkpoint_dir_stage3(checkpoint_folder, param_shapes, ignore_missing_optim_state)
+                                   param_shapes):
+        self.load_hp_checkpoint_state_from_checkpoint_dir_stage3(checkpoint_folder, param_shapes)
 
-    def load_hp_checkpoint_state_from_checkpoint_dir_stage3(self, checkpoint_dir, param_shapes, ignore_missing_optim_state):
+    def load_hp_checkpoint_state_from_checkpoint_dir_stage3(self, checkpoint_dir, param_shapes):
         """ Load optimizer and model states from the checkpoint directory. """
         checkpoint_dir = os.path.join(checkpoint_dir, "zero")
         optim_state_path = os.path.join(checkpoint_dir, "optimizer_state.pt")
-        if not ignore_missing_optim_state:
-            assert os.path.isfile(
-                optim_state_path), f'{optim_state_path} containing optimizer global state is missing! Cannot proceed.'
-
+        if os.path.isfile(optim_state_path):
+            ignore_missing_optim_state = False
             optim_sd = torch.load(optim_state_path, weights_only=False)
             self._load_global_state_stage3(optim_sd)
+        else:
+            logger.warning(f'{optim_state_path} containing optimizer global state is missing!')
+            ignore_missing_optim_state = True
 
         key_list = ["fp32", "exp_avg", "exp_avg_sq"]
 

@@ -2777,6 +2777,7 @@ def load_hp_checkpoint_state_from_checkpoint_dir_stage3(self, checkpoint_dir, pa
             # Purge the swapped optimizer state, it was initialized to the freshly created model and not the checkpoint
             self.optimizer_swapper.purge_state()
 
+        if self.swap_optimizer:
             # Touch all parameters to synchronize all buffers
             timer_names = set()
             self._partition_all_parameters()

@@ -2813,7 +2814,6 @@ def load_hp_checkpoint_state(self, folder, key):
         local_rank = dist.get_local_rank()
 
         # Load tensors from files and reshape them to flat vectors
-
         loaded_state = torch.load(os.path.join(folder, f"{key}.pt"), weights_only=False)
         if isinstance(loaded_state, dict):
             loaded_checkpoint_state = loaded_state['param'].view(-1)

deepspeed/runtime/zero/stage_1_and_2.py

Lines changed: 4 additions & 5 deletions
@@ -2309,15 +2309,14 @@ def load_state_dict(self,
                         load_from_fp32_weights=False,
                         checkpoint_folder=None,
                         load_serial=None,
-                        param_shapes=None,
-                        ignore_missing_optim_state: bool = False):
+                        param_shapes=None):
         if checkpoint_folder:
-            self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights, ignore_missing_optim_state=ignore_missing_optim_state)
+            self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights)
         else:
            self._load_legacy_checkpoint(state_dict_list, load_optimizer_states, load_from_fp32_weights)
 
-    def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights, ignore_missing_optim_state: bool = False):
-        self.load_hp_checkpoint_state_from_checkpoint_dir("bit16_groups", checkpoint_folder, ignore_missing_optim_state=ignore_missing_optim_state)
+    def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights):
+        self.load_hp_checkpoint_state_from_checkpoint_dir("bit16_groups", checkpoint_folder)
 
     def _load_global_state(self, sd):
         self.loss_scaler = sd.get(LOSS_SCALER, self.loss_scaler)
