@@ -1,7 +1,13 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
 import torch
 import os
 import shutil
 import logging
-from concurrent.futures import ProcessPoolExecutor, as_completed
+from concurrent.futures import ProcessPoolExecutor
+from deepspeed.accelerator import get_accelerator
 from tqdm import tqdm
 from typing import List
@@ -14,8 +20,8 @@
     'word_embeddings',
     'embed_tokens',
     'embedding',
-    'wte', # GPT style embeddings
-    'lm_head' # Language model head, often tied with embeddings
+    'wte',  # GPT style embeddings
+    'lm_head'  # Language model head, often tied with embeddings
 ]
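These patterns flag vocabulary-sized tensors (embedding matrices and tied LM heads) so the universal-checkpoint loader can treat their vocabulary dimension specially. As a quick illustration of how get_parameter_type (defined in the next hunk) reacts to them, with hypothetical parameter names:

    >>> get_parameter_type('model.embed_tokens.weight')               # matches 'embed_tokens'
    {'cat_dim': 0, 'vocab_tensor': True}
    >>> get_parameter_type('model.layers.0.self_attn.q_proj.weight')  # no vocab pattern
    {'cat_dim': 0}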
@@ -24,20 +30,27 @@ def get_parameter_type(name: str) -> dict:
     param_info = {
         'cat_dim': 0  # Default concatenation dimension
     }
-
+
     # Check for vocabulary tensors (embeddings, etc.)
     if any(pattern in name.lower() for pattern in VOCAB_PARAMETER_PATTERNS):
         param_info['vocab_tensor'] = True
-
+
     # TODO: figure out if we need to check for row-parallel parameters
     return param_info
 
+
 if __name__ == '__main__':
     import argparse
-
+
     parser = argparse.ArgumentParser(description='Convert HuggingFace checkpoint to Universal Checkpoint format')
-    parser.add_argument('--hf_checkpoint_dir', type=str, required=True, help='Path to the HuggingFace checkpoint directory')
-    parser.add_argument('--safe_serialization', action='store_true', default=False, help='Use safetensors for serialization')
+    parser.add_argument('--hf_checkpoint_dir',
+                        type=str,
+                        required=True,
+                        help='Path to the HuggingFace checkpoint directory')
+    parser.add_argument('--safe_serialization',
+                        action='store_true',
+                        default=False,
+                        help='Use safetensors for serialization')
     parser.add_argument('--num_workers', type=int, default=4, help='Number of workers to use for saving checkpoints')
     parser.add_argument('--save_dir', type=str, required=True, help='Directory to save checkpoints')
     args = parser.parse_args()
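For reference, a typical invocation of this converter might look like the following; the script filename and the paths are illustrative only, not taken from the PR:

    python hf_to_universal.py \
        --hf_checkpoint_dir /data/llama-7b-hf \
        --save_dir /data/llama-7b-universal/global_step0 \
        --num_workers 8 \
        --safe_serialization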
@@ -50,19 +63,19 @@ def save_parameter(name: str, param: torch.Tensor, save_dir: str):
         # Create parameter directory under zero/
         param_dir = os.path.join(save_dir, name)
         os.makedirs(param_dir, exist_ok=True)
-
+
         # Get parameter type and required fields
         param_info = get_parameter_type(name)
-
+
         # Save parameter in fp32 with proper dictionary structure
         param_path = os.path.join(param_dir, "fp32.pt")
         param_dict = {
             'param': param.to(torch.float32),  # Main tensor goes in 'param' field
             **param_info  # Include all determined parameter info
         }
         torch.save(param_dict, param_path)
-
-        # Since HuggingFace checkpoints do not have optimizer states,
+
+        # Since HuggingFace checkpoints do not have optimizer states,
         # we initialize them with zeros
         for state in ("exp_avg", "exp_avg_sq"):
             state_path = os.path.join(param_dir, f"{state}.pt")
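save_parameter thus gives every tensor its own directory under zero/, holding the fp32 weights plus (per the comment above) zero-initialized Adam moments, so the output mimics a ZeRO optimizer checkpoint. A sketch of the resulting layout and a sanity check, assuming a hypothetical embedding parameter:

    # <save_dir>/model.embed_tokens.weight/
    #     fp32.pt        -> {'param': <fp32 tensor>, 'cat_dim': 0, 'vocab_tensor': True}
    #     exp_avg.pt     -> zero-filled first moment
    #     exp_avg_sq.pt  -> zero-filled second moment
    import torch
    d = torch.load('model.embed_tokens.weight/fp32.pt')
    assert d['param'].dtype == torch.float32 and d['cat_dim'] == 0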
@@ -77,30 +90,30 @@ def process_shard(shard_file, checkpoint_dir, save_dir, safe_serialization):
         try:
             shard_path = os.path.join(checkpoint_dir, shard_file)
             logger.info(f"Loading shard from: {shard_path}")
-
+
             if safe_serialization:
                 from safetensors.torch import load_file
                 shard_dict = load_file(shard_path)
             else:
                 shard_dict = torch.load(shard_path, map_location='cpu')
-
+
             # Create progress bar for parameters within this shard
-            pbar = tqdm(total=len(shard_dict),
-                       desc=f"Processing {os.path.basename(shard_file)}",
-                       position=1,
-                       leave=False)
-
+            pbar = tqdm(total=len(shard_dict),
+                        desc=f"Processing {os.path.basename(shard_file)}",
+                        position=1,
+                        leave=False)
+
             for key, param in shard_dict.items():
                 save_parameter(key, param, save_dir)
                 del param
                 pbar.update(1)
                 pbar.set_postfix({'key': key[:20] + '...' if len(key) > 20 else key})
-
+
             pbar.close()
             del shard_dict
-            torch.cuda.empty_cache()
+            get_accelerator().empty_cache()
             logger.info(f"Completed processing shard: {shard_file}")
-
+
         except Exception as e:
             logger.error(f"Error processing shard {shard_file}: {str(e)}")
             raise
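The torch.cuda.empty_cache() → get_accelerator().empty_cache() change routes the cache flush through DeepSpeed's accelerator abstraction, so the script is no longer CUDA-only. A minimal sketch of the pattern:

    from deepspeed.accelerator import get_accelerator

    accel = get_accelerator()  # resolves the active backend (CUDA, XPU, NPU, ...) at runtime
    accel.empty_cache()        # device-agnostic counterpart of torch.cuda.empty_cache()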
@@ -111,7 +124,7 @@ def get_shard_list(checkpoint_dir):
             index_file = os.path.join(checkpoint_dir, "model.safetensors.index.json")
         else:
             index_file = os.path.join(checkpoint_dir, "pytorch_model.bin.index.json")
-
+
         if os.path.exists(index_file):
             import json
             with open(index_file, 'r') as f:
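get_shard_list reads the standard HuggingFace shard index, whose weight_map ties each tensor name to the shard file that stores it. An abridged, hypothetical model.safetensors.index.json for illustration:

    {
      "metadata": {"total_size": 13476839424},
      "weight_map": {
        "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
        "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
        "lm_head.weight": "model-00003-of-00003.safetensors"
      }
    }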
@@ -131,18 +144,11 @@ def process_shard_batch(shard_files: List[str], checkpoint_dir: str, save_dir: s
         with ProcessPoolExecutor(max_workers=args.num_workers) as executor:
             futures = []
             for shard_file in shard_files:
-                future = executor.submit(process_shard,
-                                        shard_file,
-                                        checkpoint_dir,
-                                        save_dir,
-                                        safe_serialization)
+                future = executor.submit(process_shard, shard_file, checkpoint_dir, save_dir, safe_serialization)
                 futures.append((shard_file, future))
-
+
             # Create progress bar for this batch
-            batch_pbar = tqdm(total=len(futures),
-                             desc=f"Processing shard batch",
-                             position=0,
-                             leave=True)
+            batch_pbar = tqdm(total=len(futures), desc=f"Processing shard batch", position=0, leave=True)
 
             # Wait for all futures to complete
             for shard_file, future in futures:
@@ -153,7 +159,7 @@ def process_shard_batch(shard_files: List[str], checkpoint_dir: str, save_dir: s
                 except Exception as e:
                     logger.error(f"Failed processing shard {shard_file}: {str(e)}")
                     raise
-
+
             batch_pbar.close()
 
     try:
@@ -162,42 +168,45 @@ def process_shard_batch(shard_files: List[str], checkpoint_dir: str, save_dir: s
         if os.path.exists(temp_zero_dir):
             logger.info(f"Removing existing temp directory: {temp_zero_dir}")
             shutil.rmtree(temp_zero_dir)
-
+
         shard_files = get_shard_list(args.hf_checkpoint_dir)
         total_shards = len(shard_files)
         logger.info(f"Found {total_shards} shards to process")
         # Process shards in batches equal to the number of workers
         batch_size = args.num_workers
         for i in range(0, total_shards, batch_size):
             batch_shards = shard_files[i:i + batch_size]
-            logger.info(f"Processing batch of {len(batch_shards)} shards ({i+1}-{min(i+batch_size, total_shards)} of {total_shards})")
-            process_shard_batch(batch_shards,
-                                args.hf_checkpoint_dir,
-                                temp_zero_dir,  # Changed from temp_save_dir to temp_zero_dir
-                                args.safe_serialization)
-
+            logger.info(
+                f"Processing batch of {len(batch_shards)} shards ({i+1}-{min(i+batch_size, total_shards)} of {total_shards})"
+            )
+            process_shard_batch(
+                batch_shards,
+                args.hf_checkpoint_dir,
+                temp_zero_dir,  # Changed from temp_save_dir to temp_zero_dir
+                args.safe_serialization)
+
             # Clear CUDA cache after each batch to free up memory
-            torch.cuda.empty_cache()
-
+            get_accelerator().empty_cache()
+
         logger.info("All shard batches processed successfully")
-
+
         final_save_dir = os.path.join(args.save_dir, 'zero')
         if os.path.exists(final_save_dir):
             shutil.rmtree(final_save_dir)
-
+
         # Create the parent directory if it doesn't exist
         os.makedirs(os.path.dirname(final_save_dir), exist_ok=True)
         # Move the zero directory to its final location
         os.rename(temp_zero_dir, final_save_dir)
-
+
         # Clean up the temporary directory
         if os.path.exists(temp_save_dir):
             shutil.rmtree(temp_save_dir)
-
+
         # Write identifier file
         with open(os.path.join(args.save_dir, 'source.txt'), 'w') as f:
             f.write("Huggingface checkpoint")
-
+
         logger.info(f"Successfully saved checkpoint to {final_save_dir}")
 
         # Update latest file
@@ -206,7 +215,7 @@ def process_shard_batch(shard_files: List[str], checkpoint_dir: str, save_dir: s
         latest_file = os.path.join(checkpoint_root_folder, 'latest_universal')
         with open(latest_file, 'w') as f:
             f.write(step_folder)
-
+
         logger.info(f"Checkpoint conversion completed successfully. Latest file updated at {latest_file}")
 
     except Exception as e:
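Assuming the conversion succeeds, the output tree should look roughly like the sketch below; checkpoint_root_folder and step_folder are derived in lines not shown in this diff (presumably the parent directory and basename of --save_dir), so the names here are hypothetical:

    <checkpoint_root>/
        latest_universal                  # contains the step folder name, e.g. "global_step0"
        <step_folder>/                    # == --save_dir
            source.txt                    # "Huggingface checkpoint"
            zero/
                model.embed_tokens.weight/{fp32.pt, exp_avg.pt, exp_avg_sq.pt}
                ...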