
Commit bb4eb3b

Implement XLAShardedTensor._spec and test
1 parent 93a5e58 commit bb4eb3b

6 files changed: +239 additions, −8 deletions

test/neuron/run_tests.sh

Lines changed: 10 additions & 1 deletion
@@ -56,6 +56,14 @@ function run_test {
   PJRT_DEVICE=NEURON NEURON_NUM_DEVICES=1 run_coverage "$@"
 }

+function run_test_multi_device {
+  if ! test_is_selected "$1"; then
+    return
+  fi
+  echo "Running in PjRt runtime: $@"
+  PJRT_DEVICE=NEURON run_coverage "$@"
+}
+
 function run_test_without_functionalization {
   if ! test_is_selected "$1"; then
     return
@@ -246,7 +254,8 @@ function run_xla_op_tests3 {
   run_test "$_TEST_DIR/spmd/test_xla_spmd_python_api_interaction.py"
   #run_test "$_TEST_DIR/spmd/test_dtensor_integration.py"
   #run_test "$_TEST_DIR/spmd/test_dtensor_integration2.py"
-  run_test "$_TEST_DIR/spmd/test_dtensor_convert_mesh.py"
+  run_test_multi_device "$_TEST_DIR/spmd/test_dtensor_convert_mesh.py"
+  run_test_multi_device "$_TEST_DIR/spmd/test_xla_dtensor_spec_conv.py"
   run_test "$_TEST_DIR/spmd/test_xla_auto_sharding.py"
   #run_test "$_TEST_DIR/spmd/test_spmd_parameter_wrapping.py"
   run_test "$_TEST_DIR/spmd/test_train_spmd_linear_model.py"

test/run_tests.sh

Lines changed: 1 addition & 0 deletions
@@ -254,6 +254,7 @@ function run_xla_op_tests3 {
   run_test "$_TEST_DIR/spmd/test_dtensor_integration2.py"
   run_test_multi_devices_without_func "$_TEST_DIR/spmd/test_dtensor_integration3.py"
   run_test_multi_devices "$_TEST_DIR/spmd/test_dtensor_convert_mesh.py"
+  run_test_multi_devices "$_TEST_DIR/spmd/test_xla_dtensor_spec_conversion.py"
   run_test "$_TEST_DIR/spmd/test_xla_auto_sharding.py"
   run_test "$_TEST_DIR/spmd/test_spmd_parameter_wrapping.py"
   run_test "$_TEST_DIR/spmd/test_mp_input_sharding.py"

test/spmd/test_xla_dtensor_spec_conversion.py

Lines changed: 140 additions & 0 deletions

@@ -0,0 +1,140 @@
+import os
+import sys
+
+import torch
+from torch.distributed.tensor import DeviceMesh, Shard, distribute_tensor
+
+import torch_xla
+import torch_xla.runtime as xr
+
+import unittest
+import test_xla_sharding_base
+
+
+class XLADTensorSpecConversionTest(test_xla_sharding_base.XlaShardingTest):
+
+  @classmethod
+  def setUpClass(cls):
+    super().setUpClass()
+
+  def test_sample_test_case(self):
+    world_size = xr.global_runtime_device_count()
+    mesh = DeviceMesh("xla", torch.arange(world_size))
+    big_tensor = torch.randn(100000, 88)
+    my_dtensor = distribute_tensor(big_tensor, mesh, [Shard(0)])
+
+    assert my_dtensor._spec.mesh.device_type == mesh.device_type
+    assert my_dtensor._spec.placements == (Shard(0),)
+
+  def test_xla_to_dtensor_spec_conversion(self):
+    device_count = xr.global_runtime_device_count()
+    mesh = DeviceMesh("xla", list(range(device_count)))
+
+    # Test different sharding patterns
+    from torch.distributed.tensor.placement_types import Replicate
+    test_cases = [
+        (torch.randn(100, 50), [Shard(0)]),
+        (torch.randn(100, 50), [Shard(1)]),
+        (torch.randn(100, 50, 25), [Shard(0)]),
+        (torch.randn(100, 50), [Replicate()]),
+    ]
+
+    for tensor, placements in test_cases:
+      xla_tensor = distribute_tensor(tensor, mesh, placements)
+      spec = xla_tensor._spec
+
+      assert spec is not None
+      assert spec.mesh.device_type == "xla"
+      assert spec.tensor_meta.shape == tensor.shape
+      assert spec.tensor_meta.dtype == tensor.dtype
+      assert len(spec.placements) >= 1
+      assert spec.placements == tuple(placements)
+
+  def test_mesh_conversion(self):
+    device_count = xr.global_runtime_device_count()
+    original_mesh = DeviceMesh("xla", list(range(device_count)))
+    tensor = torch.randn(50, 50)
+    xla_tensor = distribute_tensor(tensor, original_mesh, [Shard(0)])
+
+    converted_spec = xla_tensor._spec
+
+    assert converted_spec.mesh.device_type == "xla"
+    assert converted_spec.mesh.size() == device_count
+
+  def test_spec_caching(self):
+    """Test that _spec property caches results for better performance"""
+    import time
+    device_count = xr.global_runtime_device_count()
+    mesh = DeviceMesh("xla", list(range(device_count)))
+    tensor = torch.randn(1000, 1000)  # Large tensor to make spec creation noticeable
+    xla_tensor = distribute_tensor(tensor, mesh, [Shard(0)])
+
+    # first access should create and cache the spec
+    start_time = time.time()
+    spec1 = xla_tensor._spec
+    first_access_time = time.time() - start_time
+
+    # should be much faster due to caching
+    start_time = time.time()
+    spec2 = xla_tensor._spec
+    second_access_time = time.time() - start_time
+
+    assert spec1 is spec2
+    print(f"First access: {first_access_time:.6f}s, Second access: {second_access_time:.6f}s")
+    assert second_access_time * 10 < first_access_time, \
+        f"Cached access should be much faster: {first_access_time:.6f}s vs {second_access_time:.6f}s"
+
+  def _create_test_tensor_and_mesh(self, tensor_shape, mesh_shape, placements):
+    """Helper to create tensor and mesh for testing"""
+    device_count = xr.global_runtime_device_count()
+    if device_count < max(mesh_shape):
+      self.skipTest(f"Need at least {max(mesh_shape)} devices, got {device_count}")
+
+    mesh = DeviceMesh("xla", torch.arange(device_count).reshape(mesh_shape))
+    tensor = torch.randn(*tensor_shape)
+    return distribute_tensor(tensor, mesh, placements), mesh
+
+  def test_multi_dim_sharding_spec(self):
+    """Test _spec for multi-dimensional sharding"""
+    device_count = xr.global_runtime_device_count()
+    if device_count < 4:
+      self.skipTest("Need at least 4 devices for 2D mesh")
+
+    mesh_shape = (2, device_count // 2)
+    xla_tensor, mesh = self._create_test_tensor_and_mesh((100, 50), mesh_shape, [Shard(0), Shard(1)])
+    spec = xla_tensor._spec
+
+    assert len(spec.placements) == 2
+    assert spec.mesh.ndim == 2
+
+  def test_tensor_operations_preserve_spec(self):
+    """Test that tensor operations preserve sharding metadata"""
+    xla_tensor, mesh = self._create_test_tensor_and_mesh((100, 50), (-1,), [Shard(0)])
+
+    result_add = xla_tensor + 1
+    result_mul = xla_tensor * 2
+    result_relu = torch.relu(xla_tensor)
+
+    for result in [result_add, result_mul, result_relu]:
+      assert hasattr(result, '_spec')
+      assert result._spec.mesh.device_type == "xla"
+
+  def test_mixed_placement_spec(self):
+    """Test _spec for tensors with mixed shard/replicate placements"""
+    from torch.distributed.tensor.placement_types import Replicate
+    device_count = xr.global_runtime_device_count()
+    if device_count < 4:
+      self.skipTest("Need at least 4 devices for 2D mesh")
+
+    mesh_shape = (2, device_count // 2)
+    xla_tensor, mesh = self._create_test_tensor_and_mesh((100, 50), mesh_shape, [Shard(0), Replicate()])
+    spec = xla_tensor._spec
+
+    assert len(spec.placements) == 2
+    assert isinstance(spec.placements[0], Shard)
+    assert isinstance(spec.placements[1], Replicate)
+
+
+if __name__ == '__main__':
+  test = unittest.main()
+  sys.exit(0 if test.result.wasSuccessful() else 1)

test/tpu/run_tests.sh

Lines changed: 1 addition & 0 deletions
@@ -61,6 +61,7 @@ run_test "$_TEST_DIR/spmd/test_xla_spmd_python_api_interaction.py"
 run_test "$_TEST_DIR/spmd/test_xla_auto_sharding.py"
 run_test "$_TEST_DIR/spmd/test_fsdp_v2.py"
 run_test "$_TEST_DIR/spmd/test_dtensor_convert_mesh.py"
+run_test "$_TEST_DIR/spmd/test_xla_dtensor_spec_conversion.py"
 run_test "$_TEST_DIR/test_gradient_accumulation.py"
 XLA_EXPERIMENTAL=nonzero:masked_select:nms run_test "$_TEST_DIR/ds/test_dynamic_shape_models.py" -v
 run_test "$_TEST_DIR/test_autocast.py"

torch_xla/distributed/spmd/xla_sharded_tensor.py

Lines changed: 76 additions & 3 deletions
@@ -91,10 +91,10 @@ class XLAShardedTensor(torch.Tensor):
   # >> assert len(input.shape) == len(partition_spec)
   partition_spec: Tuple[int, None]

-  __slots__ = ['global_tensor']
+  __slots__ = ['global_tensor', 'mesh_shape', 'partition_spec', '_cached_spec']

   @staticmethod
-  def __new__(cls, elem: torch.Tensor, *args, **kwargs):
+  def __new__(cls, elem: torch.Tensor, mesh_shape=None, partition_spec=None, *args, **kwargs):
     # TODO(yeounoh) wrapper can take different arguments
     r = torch.Tensor._make_wrapper_subclass(  # type: ignore[attr-defined]
         cls,
@@ -106,6 +106,11 @@ def __new__(cls, elem: torch.Tensor, *args, **kwargs):
         device=elem.device,
         requires_grad=kwargs.get("requires_grad", False))
     r.global_tensor = elem.detach() if r.requires_grad else elem
+    # Store mesh and partition information for DTensor compatibility
+    if mesh_shape is not None:
+      r.mesh_shape = mesh_shape
+    if partition_spec is not None:
+      r.partition_spec = partition_spec
     return r

   # Shards on the devices are materialized/available after the lazy
@@ -159,7 +164,25 @@ def unwrap(elem):
       return elem.global_tensor if isinstance(elem, XLAShardedTensor) else elem

     def wrap(elem):
-      return XLAShardedTensor(elem) if isinstance(elem, torch.Tensor) else elem
+      if isinstance(elem, torch.Tensor) and not isinstance(elem, XLAShardedTensor):
+        # Try to get mesh/partition info from any XLAShardedTensor in args
+        mesh_shape = None
+        partition_spec = None
+
+        def find_sharded_info(x):
+          nonlocal mesh_shape, partition_spec
+          if isinstance(x, XLAShardedTensor):
+            if hasattr(x, 'mesh_shape') and x.mesh_shape:
+              mesh_shape = x.mesh_shape
+            if hasattr(x, 'partition_spec') and x.partition_spec:
+              partition_spec = x.partition_spec
+
+        tree_map(find_sharded_info, args)
+        if kwargs:
+          tree_map(find_sharded_info, kwargs)
+
+        return XLAShardedTensor(elem, mesh_shape=mesh_shape, partition_spec=partition_spec)
+      return elem

     # no_dispatch is only needed if you use enable_python_mode.
     # It prevents infinite recursion.
@@ -169,6 +192,56 @@ def wrap(elem):
           func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs)))
     return rs

+  @property
+  def _spec(self):
+    """
+    Convert XLA sharding information to DTensorSpec for DTensor interface compatibility.
+    """
+    # Return cached spec if available
+    if hasattr(self, '_cached_spec'):
+      return self._cached_spec
+
+    from torch.distributed.tensor._dtensor_spec import DTensorSpec, TensorMeta
+    from torch.distributed.device_mesh import DeviceMesh
+    from torch.distributed.tensor.placement_types import Shard, Replicate
+
+    # use existing mesh_shape
+    if hasattr(self, 'mesh_shape') and self.mesh_shape:
+      import torch_xla.runtime as xr
+      device_count = xr.global_runtime_device_count()
+      device_list = list(range(device_count))
+      mesh = DeviceMesh("xla", torch.tensor(device_list).reshape(self.mesh_shape))
+    else:
+      # default to 1D mesh
+      import torch_xla.runtime as xr
+      device_count = xr.global_runtime_device_count()
+      mesh = DeviceMesh("xla", list(range(device_count)))
+
+    # use existing partition_spec
+    if hasattr(self, 'partition_spec') and self.partition_spec:
+      placements = []
+      for mesh_dim in range(len(self.mesh_shape) if hasattr(self, 'mesh_shape') and self.mesh_shape else 1):
+        # find tensor dimension sharded on this mesh dimension
+        tensor_dim = None
+        for t_dim, m_dim in enumerate(self.partition_spec):
+          if m_dim == mesh_dim:
+            tensor_dim = t_dim
+            break
+        placements.append(Shard(tensor_dim) if tensor_dim is not None else Replicate())
+    else:
+      placements = [Replicate()]
+
+    # tensor metadata
+    tensor_meta = TensorMeta(
+        shape=self.global_tensor.shape,
+        stride=self.global_tensor.stride(),
+        dtype=self.global_tensor.dtype
+    )
+
+    # Create and cache the spec
+    self._cached_spec = DTensorSpec(mesh=mesh, placements=tuple(placements), tensor_meta=tensor_meta)
+    return self._cached_spec
+
   @classmethod
   def __torch_function__(cls, func, types, args=(), kwargs=None):
     return super().__torch_function__(func, types, args, kwargs)
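
For context, a minimal usage sketch of the new property from the torch_xla side (assuming an SPMD-enabled multi-device XLA runtime; the tensor size, 1D mesh, and variable names below are illustrative, not part of the commit):

import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr
import torch_xla.distributed.spmd as xs
from torch.distributed.tensor.placement_types import Shard

xr.use_spmd()
n = xr.global_runtime_device_count()
mesh = xs.Mesh(list(range(n)), (n,))          # 1D mesh over all devices

t = torch.randn(16, 8).to(xm.xla_device())
st = xs.mark_sharding(t, mesh, (0, None))     # shard dim 0, replicate dim 1

spec = st._spec                               # DTensorSpec built from mesh_shape/partition_spec
assert spec.mesh.device_type == "xla"
assert spec.placements == (Shard(0),)

With this commit, mark_sharding threads mesh_shape and partition_spec into the returned XLAShardedTensor, so the DTensorSpec can be constructed lazily (and cached) on first access to _spec.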

torch_xla/distributed/spmd/xla_sharding.py

Lines changed: 11 additions & 4 deletions
@@ -651,7 +651,8 @@ def mark_sharding(t: Union[torch.Tensor, XLAShardedTensor], mesh: Mesh,
   op_sharding = mesh.get_op_sharding(partition_spec)
   annotate_func = torch_xla._XLAC._xla_mark_sharding
   annotate_func(unwrap_sharded_tensor(t), op_sharding)
-  return wrap_as_sharded_tensor(t)
+  # Pass mesh and partition spec information for DTensor compatibility
+  return wrap_as_sharded_tensor(t, mesh_shape=mesh.mesh_shape, partition_spec=partition_spec)


 def mark_sharding_with_gradients(
@@ -756,10 +757,16 @@ def clear_sharding(t: Union[torch.Tensor, XLAShardedTensor]) -> torch.Tensor:


 def wrap_as_sharded_tensor(
-    t: Union[torch.Tensor, XLAShardedTensor]) -> XLAShardedTensor:
+    t: Union[torch.Tensor, XLAShardedTensor], mesh_shape=None, partition_spec=None) -> XLAShardedTensor:
+  # pass along mesh and partition spec information
   if not isinstance(t, XLAShardedTensor):
-    return XLAShardedTensor(t)
-  return t
+    return XLAShardedTensor(t, mesh_shape=mesh_shape, partition_spec=partition_spec)
+  else:
+    if mesh_shape is not None:
+      t.mesh_shape = mesh_shape
+    if partition_spec is not None:
+      t.partition_spec = partition_spec
+    return t


 def unwrap_sharded_tensor(
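
As a side note on the conversion rule itself: in the new _spec property, each mesh dimension becomes Shard(tensor_dim) if some entry of partition_spec assigns that tensor dimension to it, and Replicate() otherwise. The helper below is a hypothetical standalone restatement of that loop for illustration only; it is not part of this commit.

from torch.distributed.tensor.placement_types import Replicate, Shard

def placements_from_partition_spec(partition_spec, mesh_ndim):
  # Hypothetical helper mirroring the loop in XLAShardedTensor._spec:
  # for each mesh dim, shard along the tensor dim assigned to it, else replicate.
  placements = []
  for mesh_dim in range(mesh_ndim):
    tensor_dim = next(
        (t for t, m in enumerate(partition_spec) if m == mesh_dim), None)
    placements.append(Shard(tensor_dim) if tensor_dim is not None else Replicate())
  return tuple(placements)

# A rank-2 tensor sharded on dim 0 over mesh axis 0 and replicated on mesh axis 1:
print(placements_from_partition_spec((0, None), mesh_ndim=2))
# -> (Shard(dim=0), Replicate())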
