
Commit 61543a9

SW publisher authored and Jenkins committed
DeepSpeed content for 1.20.0
Signed-off-by: SW publisher <[email protected]>
1 parent 95ead2a commit 61543a9

File tree: 139 files changed (+12121 lines, -1139 lines)


.pre-commit-config.yaml

Lines changed: 0 additions & 89 deletions
This file was deleted.

CODEOWNERS

Lines changed: 2 additions & 48 deletions
@@ -5,52 +5,6 @@
 # Learn more about CODEOWNERS syntax here:
 # https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
 
+
 
-# top-level repo folders
-/.github/ @loadams
-/azure/ @awan-10
-/benchmarks/ @awan-10 @tjruwase
-/bin/ @loadams
-/csrc/ @awan-10
-/deepspeed/ @loadams @tjruwase
-/docker/ @awan-10
-/docs/ @loadams @tjruwase
-/examples/ @awan-10 @tohtana
-/op_builder/ @loadams @tjruwase @jomayeri
-/release/ @loadams
-/requirements/ @loadams
-/scripts/ @awan-10
-/tests/ @tjruwase @loadams @tohtana
-
-# deepspeed
-/deepspeed/autotuning/ @loadams
-/deepspeed/checkpoint/ @tjruwase
-/deepspeed/comm/ @awan-10
-/deepspeed/compression/ @tjruwase
-/deepspeed/elasticity/ @awan-10
-/deepspeed/launcher/ @loadams
-/deepspeed/module_inject/ @awan-10
-/deepspeed/moe/ @tohtana
-/deepspeed/monitor/ @awan-10
-/deepspeed/nebula/ @tjruwase
-/deepspeed/ops/ @tohtana
-/deepspeed/pipe/ @tohtana @loadams
-/deepspeed/profiling/ @loadams
-/deepspeed/utils/ @tjruwase @awan-10
-
-# inference
-/deepspeed/inference/ @awan-10
-/deepspeed/model_implementations/ @awan-10
-
-# training
-/deepspeed/runtime/ @tjruwase @tohtana
-/deepspeed/runtime/activation_checkpointing/ @tjruwase
-/deepspeed/runtime/checkpoint_engine/ @tjruwase
-/deepspeed/runtime/comm/ @awan-10
-/deepspeed/runtime/compression/ @awan-10
-/deepspeed/runtime/data_pipeline/ @tjruwase
-/deepspeed/runtime/fp16/ @tjruwase
-/deepspeed/runtime/fp16/onebit/ @awan-10
-/deepspeed/runtime/pipe/ @loadams
-/deepspeed/runtime/swap_tensor/ @tjruwase
-/deepspeed/runtime/zero/ @tjruwase
+

accelerator/hpu_accelerator.py

Lines changed: 16 additions & 1 deletion
@@ -21,8 +21,14 @@ def __init__(self):
         self.apply_hpu_workarounds()
         try:
             import habana_frameworks.torch.hpu as hpu
-            hpu.setDeterministic(True)
             self.hpu = hpu
+            # TODO: [SW-215614] remove this WA when SW-208658 is resolved.
+            use_habana_fw_str = os.getenv('DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API', default='')
+            if use_habana_fw_str.lower() in ['true', '1']:
+                hpu.setDeterministic(True)
+            else:
+                torch.use_deterministic_algorithms(True)
+
         except ImportError as e:
             raise ValueError(
                 f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
@@ -299,6 +305,14 @@ def get_op_builder(self, class_name):
         else:
             return self.class_dict['NotImplementedBuilder'] if 'NotImplementedBuilder' in self.class_dict else None
 
+    #shall be removed once moving to torch.compile
+    def wrap_in_hpu_graph(self, module):
+        if self.hpu.is_lazy():
+            module = self.hpu.wrap_in_hpu_graph(module)
+        else:
+            print("Warning: hpu graphs in eager mode is not supported, ignoring")
+        return module
+
     def build_extension(self):
         from torch.utils.cpp_extension import BuildExtension
         return BuildExtension
@@ -307,6 +321,7 @@ def export_envs(self):
         return []
 
     def visible_devices_envs(self):
+        # TODO SW-195658: remove WA to not return HABANA_VISIBLE_MODULES once SW-195657 is resolved
         # Current way deepspeed set this env var is not applicable with all HPU instances
         # User has to follow instructions in:
         # https://docs.habana.ai/en/latest/PyTorch/Reference/PT_Multiple_Tenants_on_HPU/Multiple_Workloads_Single_Docker.html
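Note: the deterministic-mode selection above is driven entirely by the DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API environment variable; only an explicit 'true' or '1' opts into the Habana frameworks API, and everything else falls back to stock PyTorch. A minimal standalone sketch of that decision logic (illustrative only, not the accelerator's actual code path):

    import os

    def use_habana_deterministic_api() -> bool:
        # 'true'/'1' selects hpu.setDeterministic(True);
        # any other value falls back to torch.use_deterministic_algorithms(True).
        value = os.getenv('DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API', default='')
        return value.lower() in ['true', '1']

    os.environ['DEEPSPEED_USE_HABANA_FRAMEWORKS_DETERMINISTIC_API'] = '1'
    print(use_habana_deterministic_api())  # True -> hpu.setDeterministic(True)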

build.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+hpu.synapse.v1.20.0

csrc/fp_quantizer/fp_quantize.cpp

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,7 @@
 
 at::Tensor quantize(torch::Tensor& out,
                     torch::Tensor& val,
+                    torch::Tensor& scale,
                     int group_size,
                     int stochastic_rounding,
                     int q_bits,
@@ -59,6 +60,7 @@ at::Tensor quantize(torch::Tensor& out,
 
 void dequantize(torch::Tensor& val,
                 torch::Tensor& val_q,
+                torch::Tensor& scale,
                 int group_size,
                 int q_mantisa_bits,
                 int q_exponent_bits)
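Note: both kernel entry points now take the group-scale buffer as an explicit torch::Tensor& scale argument. A rough PyTorch sketch of what a caller-owned per-group scale looks like; the tensor names mirror the C++ signature above, but the quantization math here is illustrative, not the kernel's:

    import torch

    def quantize_per_group(val, scale, group_size, q_bits=8):
        # Each group of `group_size` elements gets its own entry in the
        # caller-provided `scale` buffer, written in place.
        groups = val.reshape(-1, group_size)
        qmax = 2 ** (q_bits - 1) - 1
        scale[:] = groups.abs().amax(dim=1) / qmax
        return torch.clamp(torch.round(groups / scale.unsqueeze(1)), -qmax - 1, qmax).to(torch.int8)

    def dequantize_per_group(val_q, scale):
        return val_q.float() * scale.unsqueeze(1)

    val = torch.randn(4 * 128)
    scale = torch.empty(4)  # separate, caller-owned scale tensor, as in the new signature
    q = quantize_per_group(val, scale, group_size=128)
    restored = dequantize_per_group(q, scale).reshape(-1)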

deepspeed/checkpoint/zero_checkpoint.py

Lines changed: 1 addition & 1 deletion
@@ -105,7 +105,7 @@ def _strip_tensor_paddings(self, sd):
             if group_paddings[key] == 0:
                 continue
             for state_name, state_value in group_state.items():
-                if state_name != "step" and torch.is_tensor(state_value):
+                if state_name != "step" and torch.is_tensor(state_value) and state_value.dim():
                     raw_length = state_value.numel() - group_paddings[key]
                     group_state[state_name] = torch.narrow(state_value, 0, 0, raw_length).clone()
                 else:
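Note: the added state_value.dim() guard skips zero-dimensional (scalar) optimizer state, since torch.narrow along dim 0 only makes sense for tensors with at least one dimension. A small sketch of the behaviour (function name and values are illustrative):

    import torch

    def strip_padding(state_value, padding):
        # Only rank >= 1 tensors carry flat-buffer padding to strip;
        # 0-dim scalars (e.g. a step counter stored as a tensor) pass through untouched.
        if torch.is_tensor(state_value) and state_value.dim():
            raw_length = state_value.numel() - padding
            return torch.narrow(state_value, 0, 0, raw_length).clone()
        return state_value

    print(strip_padding(torch.arange(10.0), padding=2).shape)  # torch.Size([8])
    print(strip_padding(torch.tensor(3.0), padding=2))         # tensor(3.) returned as-is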

deepspeed/comm/torch.py

Lines changed: 28 additions & 22 deletions
@@ -20,6 +20,12 @@
 DS_COMM_REDUCE_OFF = False
 
 
+def disable_compiler_collective(func):
+    if required_torch_version(min_version=2.3):
+        return func
+    return compiler.disable(func)
+
+
 def build_shm_op():
     builder = get_accelerator().create_op_builder("ShareMemCommBuilder")
     if builder is None or not deepspeed.ops.__compatible_ops__[builder.NAME]:
@@ -114,7 +120,7 @@ def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name='
             self.shm_comm_op.initialize(self.get_world_size(), self.get_rank())
 
     @classmethod
-    @compiler.disable
+    @disable_compiler_collective
     def get_all_gather_function(self):
         if hasattr(torch.distributed, "all_gather_into_tensor"):
             return torch.distributed.all_gather_into_tensor
@@ -123,7 +129,7 @@ def get_all_gather_function(self):
         return None
 
     @classmethod
-    @compiler.disable
+    @disable_compiler_collective
     def get_reduce_scatter_function(self):
         if hasattr(torch.distributed, "reduce_scatter_tensor"):
             return torch.distributed.reduce_scatter_tensor
@@ -146,7 +152,7 @@ def init_process_group(self, backend, timeout, init_method, rank, world_size):
                                                  world_size=world_size)
         self.using_mpi = torch.distributed.get_backend() == 'mpi'
 
-    @compiler.disable
+    @disable_compiler_collective
     def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False):
         op = self._reduce_op(op)
         return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op)
@@ -158,7 +164,7 @@ def inference_all_reduce(self, tensor, op, group=None):
         else:
             return torch.ops.deepspeed.inference_all_reduce_(tensor)
 
-    @compiler.disable
+    @disable_compiler_collective
     def all_reduce_coalesced(self, tensors, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False):
         """ proxy func to torch.distributed.all_reduce_coalesced,
         which is included in PyTorch 1.13 and above
@@ -169,15 +175,15 @@ def all_reduce_coalesced(self, tensors, op=torch.distributed.ReduceOp.SUM, group
         op = self._reduce_op(op)
         return torch.distributed.all_reduce_coalesced(tensors=tensors, op=op, group=group, async_op=async_op)
 
-    @compiler.disable
+    @disable_compiler_collective
     def reduce(self, tensor, dst, op=ReduceOp.SUM, group=None, async_op=False):
         if DS_COMM_REDUCE_OFF:
             if int(os.getenv('RANK', '0')) == 0:
                 utils.logger.warning("REDUCE is OFF")
             return Noop()
         return torch.distributed.reduce(tensor=tensor, dst=dst, op=self._reduce_op(op), group=group, async_op=async_op)
 
-    @compiler.disable
+    @disable_compiler_collective
     def reduce_scatter(self, output, input_list, op=ReduceOp.SUM, group=None, async_op=False):
         if DS_COMM_REDUCE_SCATTER_OFF:
             if int(os.getenv('RANK', '0')) == 0:
@@ -190,7 +196,7 @@ def reduce_scatter(self, output, input_list, op=ReduceOp.SUM, group=None, async_
                                                  group=group,
                                                  async_op=async_op)
 
-    @compiler.disable
+    @disable_compiler_collective
     def broadcast(self, tensor, src, group=None, async_op=False):
         if DS_COMM_BROADCAST_OFF:
             if int(os.getenv('RANK', '0')) == 0:
@@ -199,7 +205,7 @@ def broadcast(self, tensor, src, group=None, async_op=False):
         else:
             return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op)
 
-    @compiler.disable
+    @disable_compiler_collective
     def all_gather(self, tensor_list, tensor, group=None, async_op=False):
         if DS_COMM_ALL_GATHER_OFF:
             if int(os.getenv('RANK', '0')) == 0:
@@ -208,15 +214,15 @@ def all_gather(self, tensor_list, tensor, group=None, async_op=False):
         else:
            return torch.distributed.all_gather(tensor_list=tensor_list, tensor=tensor, group=group, async_op=async_op)
 
-    @compiler.disable
+    @disable_compiler_collective
     def all_gather_into_tensor(self, output_tensor, input_tensor, group=None, async_op=False):
         if self.has_all_gather_into_tensor():
             return self.all_gather_function(output_tensor=output_tensor,
                                             input_tensor=input_tensor,
                                             group=group,
                                             async_op=async_op)
 
-    @compiler.disable
+    @disable_compiler_collective
     def all_gather_base(self, output_tensor, input_tensor, group=None, async_op=False):
         if DS_COMM_ALL_GATHER_OFF:
             if int(os.getenv('RANK', '0')) == 0:
@@ -234,7 +240,7 @@ def all_gather_base(self, output_tensor, input_tensor, group=None, async_op=Fals
                        "please consider upgrading your pytorch installation.")
             pass
 
-    @compiler.disable
+    @disable_compiler_collective
     def all_gather_coalesced(self, output_tensors, input_tensors, group=None, async_op=False):
         """"""
         assert len(output_tensors) == len(input_tensors), ""
@@ -258,7 +264,7 @@ def all_gather_coalesced(self, output_tensors, input_tensors, group=None, async_
             else:
                 reqs[-1].wait()
 
-    @compiler.disable
+    @disable_compiler_collective
     def reduce_scatter_tensor(self, output_tensor, input_tensor, op=ReduceOp.SUM, group=None, async_op=False):
         if self.has_reduce_scatter_tensor():
             return self.reduce_scatter_function(output_tensor,
@@ -272,7 +278,7 @@ def reduce_scatter_tensor(self, output_tensor, input_tensor, op=ReduceOp.SUM, gr
                        "please consider upgrading your pytorch installation.")
             pass
 
-    @compiler.disable
+    @disable_compiler_collective
     def all_to_all_single(self,
                           output,
                           input,
@@ -287,49 +293,49 @@ def all_to_all_single(self,
                                                  group=group,
                                                  async_op=async_op)
 
-    @compiler.disable
+    @disable_compiler_collective
     def all_to_all(self, output_tensor_list, input_tensor_list, group=None, async_op=False):
         return torch.distributed.all_to_all(output_tensor_list, input_tensor_list, group=group, async_op=async_op)
 
-    @compiler.disable
+    @disable_compiler_collective
     def send(self, tensor, dst, group=None, tag=0):
         return torch.distributed.send(tensor=tensor, dst=dst, group=group, tag=tag)
 
-    @compiler.disable
+    @disable_compiler_collective
     def recv(self, tensor, src=None, group=None, tag=0):
         return torch.distributed.recv(tensor=tensor, src=src, group=group, tag=tag)
 
-    @compiler.disable
+    @disable_compiler_collective
     def isend(self, tensor, dst, group=None, tag=0):
         return torch.distributed.isend(tensor=tensor, dst=dst, group=group, tag=tag)
 
-    @compiler.disable
+    @disable_compiler_collective
     def irecv(self, tensor, src=None, group=None, tag=0):
         return torch.distributed.irecv(tensor=tensor, src=src, group=group, tag=tag)
 
-    @compiler.disable
+    @disable_compiler_collective
     def gather(self, tensor, gather_list=None, dst=0, group=None, async_op=False):
         return torch.distributed.gather(tensor=tensor,
                                         gather_list=gather_list,
                                         dst=dst,
                                         group=group,
                                         async_op=async_op)
 
-    @compiler.disable
+    @disable_compiler_collective
     def scatter(self, tensor, scatter_list=None, src=0, group=None, async_op=False):
         return torch.distributed.scatter(tensor=tensor,
                                          scatter_list=scatter_list,
                                          src=src,
                                          group=group,
                                          async_op=async_op)
 
-    @compiler.disable
+    @disable_compiler_collective
     def barrier(self, group=torch.distributed.GroupMember.WORLD, async_op=False, device_ids=None):
         if group is None:
             group = torch.distributed.GroupMember.WORLD
         return torch.distributed.barrier(group=group, async_op=async_op, device_ids=device_ids)
 
-    @compiler.disable
+    @disable_compiler_collective
     def monitored_barrier(self, group=torch.distributed.GroupMember.WORLD, timeout=None, wait_all_ranks=False):
         if group is None:
             group = torch.distributed.GroupMember.WORLD
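Note: the new disable_compiler_collective helper (first hunk above) leaves collectives visible to torch.compile on PyTorch >= 2.3 and only applies compiler.disable on older versions. A minimal sketch of the same conditional-decorator pattern against the stock PyTorch API, assuming torch >= 2.1 so that torch.compiler.disable exists; the version parsing below is a simplified stand-in for DeepSpeed's required_torch_version/compiler helpers:

    import torch

    def disable_compiler_collective(func):
        # On torch >= 2.3 the wrapped collective can stay inside compiled graphs,
        # so return it unchanged; otherwise exclude it from tracing.
        major, minor = (int(x) for x in torch.__version__.split('.')[:2])
        if (major, minor) >= (2, 3):
            return func
        return torch.compiler.disable(func)  # assumes torch.compiler is available (torch >= 2.1)

    @disable_compiler_collective
    def my_all_reduce(tensor, group=None):
        # placeholder collective; torch.distributed must be initialized before calling this
        return torch.distributed.all_reduce(tensor, group=group)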
