
Commit 4752558

fix: use lowering_cntxt to save/get denormalized_tile_assignment

1 parent: 4556faa

9 files changed: +162 additions, −59 deletions

test/cpp/test_xla_sharding.cpp

Lines changed: 23 additions & 16 deletions

```diff
@@ -51,7 +51,8 @@ TEST_F(XLAShardingTest, GetShardShape) {
       {2, 3},
   });
   auto xla_sharding = xla::HloSharding::Tile(mesh).ToProto();
-  torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
+  std::vector<int64_t> denormalized_tile_assignment = {0, 1, 2, 3};
+  torch_xla::OpSharding sharding(xla_sharding, denormalized_tile_assignment);
   auto sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
@@ -60,7 +61,7 @@ TEST_F(XLAShardingTest, GetShardShape) {
   EXPECT_EQ(shard_shape, std::vector<int64_t>({4, 4}));
 
   xla_sharding = xla::HloSharding::Replicate().ToProto();
-  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding = torch_xla::OpSharding(xla_sharding, denormalized_tile_assignment);
   sharding_spec->sharding = sharding;
   shard_shape = ShardingUtil::GetShardShape(sharding_spec);
   // For replicated sharding, each dimension should be preserved
@@ -78,7 +79,8 @@ TEST_F(XLAShardingTest, GetShardIndicesForDevices) {
       {2, 3},
   });
   auto xla_sharding = xla::HloSharding::Tile(mesh).ToProto();
-  torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
+  std::vector<int64_t> denormalized_tile_assignment = {0, 1, 2, 3};
+  torch_xla::OpSharding sharding(xla_sharding, denormalized_tile_assignment);
   auto sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
   auto shard_shape = ShardingUtil::GetShardShape(sharding_spec);
@@ -108,7 +110,7 @@ TEST_F(XLAShardingTest, GetShardIndicesForDevices) {
     }
   }
   xla_sharding = xla::HloSharding::Replicate().ToProto();
-  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding = torch_xla::OpSharding(xla_sharding, denormalized_tile_assignment);
   sharding_spec->sharding = sharding;
   shard_shape = ShardingUtil::GetShardShape(sharding_spec);
   replica_and_indices = ShardingUtil::GetShardReplicaAndIndicesForDevices(
@@ -126,6 +128,7 @@ TEST_F(XLAShardingTest, GetShardIndicesForDevices) {
 TEST_F(XLAShardingTest, ShardTensor) {
   std::vector<std::string> devices = {"TPU:0", "TPU:1", "TPU:2", "TPU:3",
                                       "TPU:4", "TPU:5", "TPU:6", "TPU:7"};
+  std::vector<int64_t> denormalized_tile_assignment = {0, 1, 2, 3, 4, 5, 6, 7};
 
   // 1D tiled
   at::Tensor tensor = at::ones({8}, at::TensorOptions(at::kFloat));
@@ -136,7 +139,7 @@ TEST_F(XLAShardingTest, ShardTensor) {
           CreateComputationShapeFromTensor(tensor, bridge::GetDefaultDevice()),
           devices.size())
           .ToProto();
-  torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
+  torch_xla::OpSharding sharding(xla_sharding, denormalized_tile_assignment);
   auto sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
   auto shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
@@ -155,7 +158,7 @@ TEST_F(XLAShardingTest, ShardTensor) {
       {4, 5, 6, 7},
   });
   xla_sharding = xla::HloSharding::Tile(mesh).ToProto();
-  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding = torch_xla::OpSharding(xla_sharding, denormalized_tile_assignment);
   sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
   shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
@@ -168,7 +171,7 @@ TEST_F(XLAShardingTest, ShardTensor) {
   // size should be smaller in dim=1 because it's not evenly divisible.
   xla::Array3D<int64_t> cube({{{0, 1}, {2, 3}, {4, 5}, {6, 7}}});
   xla_sharding = xla::HloSharding::Tile(cube).ToProto();
-  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding = torch_xla::OpSharding(xla_sharding, denormalized_tile_assignment);
   sharding_spec->sharding = sharding;
   shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
                                      /*padded=*/false);
@@ -178,7 +181,7 @@ TEST_F(XLAShardingTest, ShardTensor) {
 
   // Replicated, all shards should be identical.
   xla_sharding = xla::HloSharding::Replicate().ToProto();
-  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding = torch_xla::OpSharding(xla_sharding, denormalized_tile_assignment);
   sharding_spec->sharding = sharding;
   shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
                                      /*padded=*/false);
@@ -194,7 +197,7 @@ TEST_F(XLAShardingTest, ShardTensor) {
       CreateComputationShapeFromTensor(tensor, bridge::GetDefaultDevice());
   xla::Array4D<int64_t> tesseract({{{{0, 1}, {2, 3}, {4, 5}, {6, 7}}}});
   xla_sharding = xla::HloSharding::Tile(tesseract).ToProto();
-  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding = torch_xla::OpSharding(xla_sharding, denormalized_tile_assignment);
   sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
   shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
@@ -219,7 +222,7 @@ TEST_F(XLAShardingTest, ShardTensor) {
   xla::Array<int64_t> hypercube(std::vector<int64_t>{1, 1, 2, 2, 2});
   hypercube.FillIota(0);
   xla_sharding = xla::HloSharding::Tile(hypercube).ToProto();
-  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding = torch_xla::OpSharding(xla_sharding, denormalized_tile_assignment);
   sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
   shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
@@ -248,7 +251,8 @@ TEST_F(XLAShardingTest, ShardTensorMultiHost) {
       {6, 7, 2, 3},
   });
   auto xla_sharding = xla::HloSharding::Tile(mesh).ToProto();
-  torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
+  std::vector<int64_t> denormalized_tile_assignment = {4, 5, 0, 1, 6, 7, 2, 3};
+  torch_xla::OpSharding sharding(xla_sharding, denormalized_tile_assignment);
   auto sharding_spec =
       std::make_shared<XLATensor::ShardingSpec>(sharding, tensor_shape);
   // For devices at the start of the mesh, all shards should have the same
@@ -266,7 +270,8 @@ TEST_F(XLAShardingTest, ShardTensorMultiHost) {
       {2, 3, 6, 7},
   });
   xla_sharding = xla::HloSharding::Tile(mesh).ToProto();
-  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  denormalized_tile_assignment = {0, 1, 4, 5, 2, 3, 6, 7};
+  sharding = torch_xla::OpSharding(xla_sharding, denormalized_tile_assignment);
   sharding_spec->sharding = sharding;
   shards = ShardingUtil::ShardTensor(tensor, sharding_spec, devices,
                                      /*padded=*/false);
@@ -295,7 +300,8 @@ TEST_F(XLAShardingTest, ShardTensorMiniBatch) {
   });
 
   auto xla_sharding = xla::HloSharding::Tile(mesh).ToProto();
-  torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
+  std::vector<int64_t> denormalized_tile_assignment = {0, 1, 2, 3, 4, 5, 6, 7};
+  torch_xla::OpSharding sharding(xla_sharding, denormalized_tile_assignment);
   auto sharding_spec = std::make_shared<XLATensor::ShardingSpec>(
       sharding, global_shape, /*minibatch=*/true);
   auto shards = ShardingUtil::ShardTensor(minibatch_tensor, sharding_spec,
@@ -314,14 +320,15 @@ TEST_F(XLAShardingTest, EqualShardingSpecs) {
       {4, 5, 6, 7},
   })
       .ToProto();
-  torch_xla::OpSharding sharding(xla_sharding, std::nullopt);
+  std::vector<int64_t> denormalized_tile_assignment = {0, 1, 2, 3, 4, 5, 6, 7};
+  torch_xla::OpSharding sharding(xla_sharding, denormalized_tile_assignment);
   XLATensor::ShardingSpec tiled_2d(sharding, tensor_shape);
   xla_sharding =
       xla::HloSharding::Tile({{{0, 1}, {2, 3}, {4, 5}, {6, 7}}}).ToProto();
-  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding = torch_xla::OpSharding(xla_sharding, denormalized_tile_assignment);
   XLATensor::ShardingSpec tiled_3d(sharding, tensor_shape);
   xla_sharding = xla::HloSharding::Replicate().ToProto();
-  sharding = torch_xla::OpSharding(xla_sharding, std::nullopt);
+  sharding = torch_xla::OpSharding(xla_sharding, denormalized_tile_assignment);
   XLATensor::ShardingSpec replicated(sharding, tensor_shape);
   EXPECT_TRUE(ShardingUtil::EqualShardingSpecs(tiled_2d, tiled_2d));
   EXPECT_FALSE(ShardingUtil::EqualShardingSpecs(tiled_2d, tiled_3d));
```
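Note: the test updates make the meaning of the new constructor argument concrete. The denormalized tile assignment lists, for each logical tile slot, the physical device ID that owns that shard; the multi-host cases use permuted orders such as {4, 5, 0, 1, 6, 7, 2, 3}. A minimal standalone sketch of that mapping (plain C++, independent of the torch_xla types):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Illustration only: for each logical tile slot i, assignment[i] is the
// physical device ID holding shard i. Values mirror the first
// ShardTensorMultiHost case above.
int main() {
  const std::vector<int64_t> denormalized_tile_assignment = {4, 5, 0, 1,
                                                             6, 7, 2, 3};
  for (size_t slot = 0; slot < denormalized_tile_assignment.size(); ++slot) {
    std::cout << "logical shard " << slot << " -> TPU:"
              << denormalized_tile_assignment[slot] << "\n";
  }
}
```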

torch_xla/csrc/ir.h

Lines changed: 5 additions & 0 deletions

```diff
@@ -141,6 +141,11 @@ class XlaNode : public torch::lazy::Node {
     return output_shardings_[index];
   }
 
+  const std::vector<std::shared_ptr<torch_xla::OpSharding>> GetShardings()
+      const {
+    return output_shardings_;
+  }
+
   void SetSharding(const torch_xla::OpSharding& sharding, size_t index);
 
   void ClearSharding() {
```

torch_xla/csrc/lowering_context.cpp

Lines changed: 18 additions & 0 deletions

```diff
@@ -238,13 +238,31 @@ xla::XlaOp LoweringContext::GetOutputOp(const torch::lazy::Output& output) {
   return it->second;
 }
 
+void LoweringContext::ExtractShardingAndSetDenormalizedTileAssignments(
+    std::vector<std::shared_ptr<torch_xla::OpSharding>> shardings) {
+  for (auto sharding : shardings) {
+    std::vector<int64_t> denormalized_tile_assignment =
+        sharding->GetDenormalizedTileAssignment();
+    if (!denormalized_tile_assignment.empty()) {
+      denormalized_tile_assignments_.push_back(
+          sharding->GetDenormalizedTileAssignment());
+    }
+  }
+}
+
 XlaOpVector LoweringContext::LowerNode(const torch::lazy::Node& node) {
   XlaOpVector result_ops;
   try {
     const HloMetadataSetter meta_setter(*this, node);
     const XlaNode* const casted = dynamic_cast<const XlaNode*>(&node);
 
     result_ops = casted->Lower(this);
+    // save the denormalized_tile_assignment from all nodes and then use it
+    // during Compile
+    auto shardings = casted->GetShardings();
+    if (!shardings.empty()) {
+      ExtractShardingAndSetDenormalizedTileAssignments(shardings);
+    }
     if (!casted->dynamic_dims().empty()) {
       const xla::internal::XlaBuilderFriend builder_friend;
       auto* const inst = builder_friend.GetInstruction(result_ops[0]);
```
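Note: the extraction step records only non-empty assignments, one entry per sharded node output, in lowering order. A rough standalone model of that filtering behavior (`StubOpSharding` is a stand-in for torch_xla::OpSharding, reduced to the single accessor used here):

```cpp
#include <cstdint>
#include <memory>
#include <vector>

// Stand-in for torch_xla::OpSharding; illustration only.
struct StubOpSharding {
  std::vector<int64_t> assignment;
  const std::vector<int64_t>& GetDenormalizedTileAssignment() const {
    return assignment;
  }
};

// Mirrors ExtractShardingAndSetDenormalizedTileAssignments: skip shardings
// with no denormalized assignment, keep the rest in lowering order.
std::vector<std::vector<int64_t>> ExtractAssignments(
    const std::vector<std::shared_ptr<StubOpSharding>>& shardings) {
  std::vector<std::vector<int64_t>> saved;
  for (const auto& sharding : shardings) {
    if (!sharding->GetDenormalizedTileAssignment().empty()) {
      saved.push_back(sharding->GetDenormalizedTileAssignment());
    }
  }
  return saved;
}

int main() {
  std::vector<std::shared_ptr<StubOpSharding>> shardings = {
      std::make_shared<StubOpSharding>(StubOpSharding{{0, 1, 2, 3}}),
      std::make_shared<StubOpSharding>(StubOpSharding{{}}),  // dropped
  };
  return ExtractAssignments(shardings).size() == 1 ? 0 : 1;
}
```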

torch_xla/csrc/lowering_context.h

Lines changed: 9 additions & 0 deletions

```diff
@@ -117,6 +117,14 @@ class LoweringContext : public torch::lazy::LoweringContext {
   int64_t AddStackFrameLocation(const torch::lazy::SourceLocation& source,
                                 int64_t parent_id);
 
+  void ExtractShardingAndSetDenormalizedTileAssignments(
+      std::vector<std::shared_ptr<torch_xla::OpSharding>>);
+
+  const std::vector<std::vector<int64_t>>& GetDenormalizedTileAssignments()
+      const {
+    return denormalized_tile_assignments_;
+  }
+
  private:
   struct Parameter {
     xla::XlaOp param;
@@ -135,6 +143,7 @@ class LoweringContext : public torch::lazy::LoweringContext {
   std::string name_;
 
   std::shared_ptr<StackFrameIndexBuilder> stack_frame_index_builder_;
+  std::vector<std::vector<int64_t>> denormalized_tile_assignments_;
 };  // namespace torch_xla
 
 }  // namespace torch_xla
```
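Note: the header change establishes the lifecycle: assignments accumulate in `denormalized_tile_assignments_` while `LowerNode` runs and are read back through the const accessor once lowering is done (per the comment in the .cpp change, during Compile). A minimal sketch of that store-then-read pattern, using a stripped-down context class with a hypothetical `Save` helper standing in for the real extraction call:

```cpp
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Stripped-down model of the new LoweringContext surface; illustration only.
class StubLoweringContext {
 public:
  // Hypothetical recording hook, standing in for the extraction that
  // LowerNode performs for every sharded node.
  void Save(std::vector<int64_t> assignment) {
    denormalized_tile_assignments_.push_back(std::move(assignment));
  }
  const std::vector<std::vector<int64_t>>& GetDenormalizedTileAssignments()
      const {
    return denormalized_tile_assignments_;
  }

 private:
  std::vector<std::vector<int64_t>> denormalized_tile_assignments_;
};

int main() {
  StubLoweringContext ctx;
  ctx.Save({0, 1, 2, 3});  // recorded while lowering a node
  assert(ctx.GetDenormalizedTileAssignments().size() == 1);  // read later
}
```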

torch_xla/csrc/tensor_util.cpp

Lines changed: 48 additions & 4 deletions

```diff
@@ -838,6 +838,39 @@ std::vector<torch::lazy::BackendDataPtr> CreateTensorsData(
       runtime::GetComputationClientOrDie()->TransferToDevice(source_tensors));
 }
 
+namespace {
+
+/**
+ * Filters a list of device strings to include only those with IDs matching
+ * the provided indices.
+ *
+ * @param devices List of device strings in format "TYPE:ID" (e.g., "TPU:0")
+ * @param indices List of device IDs to filter by
+ * @return Filtered list of device strings, or error status if parsing fails
+ *
+ * Example:
+ *   devices = ["TPU:0", "TPU:1", "TPU:2", "TPU:3"]
+ *   indices = [1, 3]
+ *   result = ["TPU:1", "TPU:3"]
+ */
+std::vector<std::string> FilterDevicesByAddressableDevices(
+    std::vector<std::string> devices, const std::vector<int64_t>& indices) {
+  std::vector<std::string> filtered_devices_;
+  filtered_devices_.reserve(indices.size());
+  for (auto& index : indices) {
+    for (auto& device : devices) {
+      std::vector<std::string> device_spec_parts = absl::StrSplit(device, ':');
+      if (std::stoi(device_spec_parts[1]) == index) {
+        filtered_devices_.push_back(device);
+        break;
+      }
+    }
+  }
+  return filtered_devices_;
+}
+
+}  // namespace
+
 std::vector<torch::lazy::BackendDataPtr> CreateTensorsData(
     const std::vector<at::Tensor>& tensors,
     const std::vector<XLATensor::ShardingSpecPtr>& shardings,
@@ -860,14 +893,25 @@ std::vector<torch::lazy::BackendDataPtr> CreateTensorsData(
 
       std::vector<std::string> local_devices =
           runtime::GetComputationClientOrDie()->GetLocalDevices();
+      std::vector<std::string> addressable_devices = std::move(local_devices);
+      if (shardings[i]) {
+        const std::vector<int64_t>& denormalized_tile_assignment =
+            shardings[i]->sharding.GetDenormalizedTileAssignment();
+        if ((!denormalized_tile_assignment.empty()) &&
+            (denormalized_tile_assignment.size() !=
+             addressable_devices.size())) {
+          addressable_devices = FilterDevicesByAddressableDevices(
+              addressable_devices, denormalized_tile_assignment);
+        }
+      }
       // Shards the input tensors with padding, to split evenly.
       // The execution requires consistent shard sizes, and the zero-padded
       // values should be ignored.
-      std::vector<at::Tensor> local_shards =
-          ShardingUtil::ShardTensor(tensors[i], shardings[i], local_devices,
-                                    /*padded=*/true);
+      std::vector<at::Tensor> local_shards = ShardingUtil::ShardTensor(
+          tensors[i], shardings[i], addressable_devices,
+          /*padded=*/true);
       new_handles.push_back(ShardingUtil::CreateShardedData(
-          local_shards, local_devices, shardings[i]));
+          local_shards, addressable_devices, shardings[i]));
     } else {
       source_tensors.push_back(std::make_shared<runtime::AtenSource>(
           tensors[i], std::move(shape), devices[i]));
```
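Note: to sanity-check the helper's documented example, here is a self-contained re-implementation (std::string parsing in place of absl::StrSplit; illustration only). The output order follows `indices`, not the input device order:

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

// Standalone re-implementation of FilterDevicesByAddressableDevices for
// illustration; parses the "TYPE:ID" suffix without absl::StrSplit.
std::vector<std::string> FilterDevices(const std::vector<std::string>& devices,
                                       const std::vector<int64_t>& indices) {
  std::vector<std::string> filtered;
  filtered.reserve(indices.size());
  for (const int64_t index : indices) {
    for (const std::string& device : devices) {
      const int64_t id = std::stoll(device.substr(device.find(':') + 1));
      if (id == index) {
        filtered.push_back(device);
        break;
      }
    }
  }
  return filtered;
}

int main() {
  const std::vector<std::string> devices = {"TPU:0", "TPU:1", "TPU:2",
                                            "TPU:3"};
  const std::vector<std::string> expected = {"TPU:1", "TPU:3"};
  assert(FilterDevices(devices, {1, 3}) == expected);
}
```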
