Commit 7c4a3cd

Author kvshbg-aws committed:
use tensors to get denormalized_tile_assignment directly instead of po_data
1 parent 0072e85 commit 7c4a3cd

File tree: 8 files changed, +47 −37 lines changed

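Summary of the change: the compile path no longer threads po_data->parameters_data through CompileInstance and re-derives the sharding from the first parameter inside the runtime clients. Instead, callers read the denormalized tile assignment straight from each tensor's sharding spec and hand the collected vectors to CompileInstance. A minimal sketch of the new collection pattern, with names taken from the diffs below:

  // `tensors` is a std::vector<XLATensorPtr>; tensors without a sharding
  // spec are simply skipped.
  std::vector<std::vector<int64_t>> denormalized_tile_assignments;
  for (const auto& tensor : tensors) {
    auto sharding_spec = tensor->sharding_spec();
    if (sharding_spec) {
      denormalized_tile_assignments.push_back(
          sharding_spec->sharding.GetDenormalizedTileAssignment());
    }
  }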

test/cpp/test_xla_sharding.cpp

Lines changed: 19 additions & 14 deletions
@@ -413,20 +413,28 @@ TEST_F(XLAShardingTest, PrepareOutputShardingPropagation) {
   xla::XlaComputation xla_computation =
       GetValueOrThrow(b.Build(/*remove_dynamic_dimensions=*/false));
 
-  std::vector<torch::lazy::BackendDataPtr> parameters_data;
-  parameters_data.push_back(
+  std::vector<XLATensorPtr> tensors{XLATensor::Create(
       torch_xla::runtime::GetComputationClientOrDie()->CreateDataPlaceholder(
-          bridge::GetDefaultDevice()->toString(), std::move(shape)));
+          bridge::GetDefaultDevice()->toString(), std::move(shape)))};
+  std::vector<std::vector<int64_t>> denormalized_tile_assignments;
+  for (auto tensor : tensors) {
+    auto sharding_spec = tensor->sharding_spec();
+    if (sharding_spec) {
+      denormalized_tile_assignments.push_back(
+          sharding_spec->sharding.GetDenormalizedTileAssignment());
+    }
+  }
 
   std::vector<torch_xla::runtime::ComputationClient::CompileInstance> instances;
-  instances.push_back({std::move(xla_computation),
-                       bridge::GetDefaultDevice()->toString(),
-                       {bridge::GetDefaultDevice()->toString()},
-                       &shape,
-                       /*should_wrap_parameter=*/false,
-                       /*is_sharded=*/true,
-                       /*allow_spmd_sharding_propagation_to_output=*/true,
-                       /*parameters_data=*/parameters_data});
+  instances.push_back(
+      {std::move(xla_computation),
+       bridge::GetDefaultDevice()->toString(),
+       {bridge::GetDefaultDevice()->toString()},
+       &shape,
+       /*should_wrap_parameter=*/false,
+       /*is_sharded=*/true,
+       /*allow_spmd_sharding_propagation_to_output=*/true,
+       /*denormalized_tile_assignments=*/denormalized_tile_assignments});
 
   std::vector<
       std::shared_ptr<torch_xla::runtime::ComputationClient::Computation>>
@@ -437,9 +445,6 @@ TEST_F(XLAShardingTest, PrepareOutputShardingPropagation) {
       "add", std::move(computations[0]->move_computation()));
 
   // Prepare output sharding propagation, expect a sharded output placeholder.
-  std::vector<XLATensorPtr> tensors{XLATensor::Create(
-      torch_xla::runtime::GetComputationClientOrDie()->CreateDataPlaceholder(
-          bridge::GetDefaultDevice()->toString(), std::move(shape)))};
   std::vector<torch::lazy::BackendDataPtr> data_placeholders;
   std::vector<XLATensor::ShardingSpecPtr> sharding_specs;
   ShardingUtil::PrepareOutputShardingPropagation(

torch_xla/csrc/ir.cpp

Lines changed: 4 additions & 3 deletions
@@ -208,9 +208,10 @@ void XlaNode::UpdateShardingHash() {
   for (size_t i = 0; i < output_shardings_.size(); i++) {
     // keep the index as part of the hash
    sharding_hash_ = torch::lazy::HashCombine(sharding_hash_, (uint32_t)i);
-    std::shared_ptr<xla::OpSharding> sharding =
-        std::make_shared<xla::OpSharding>(
-            output_shardings_[i]->GetXlaOpSharding());
+    std::shared_ptr<torch_xla::OpSharding> sharding =
+        std::make_shared<torch_xla::OpSharding>(
+            output_shardings_[i]->GetXlaOpSharding(),
+            output_shardings_[i]->GetDenormalizedTileAssignment());
     // skip the hash compute for empty sharding
     if (!sharding) {
       continue;
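Note on this hunk: UpdateShardingHash() now wraps each output sharding in torch_xla::OpSharding, passing the denormalized tile assignment alongside the xla::OpSharding proto, so the per-node sharding hash is computed from a wrapper that carries both. A minimal illustration of the two-argument construction used above (the `proto` variable is hypothetical and only stands in for GetXlaOpSharding()):

  // Two wrappers over the same proto but with different denormalized tile
  // assignments are distinct inputs to the downstream hash computation.
  std::vector<int64_t> assignment_a = {0, 1, 2, 3};
  std::vector<int64_t> assignment_b = {2, 3, 0, 1};
  torch_xla::OpSharding a(proto, assignment_a);
  torch_xla::OpSharding b(proto, assignment_b);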

torch_xla/csrc/runtime/computation_client.h

Lines changed: 3 additions & 3 deletions
@@ -228,7 +228,7 @@ class ComputationClient {
         std::vector<std::string> devices, const xla::Shape* output_shape,
         bool parameter_is_tupled_arguments = false, bool is_sharded = false,
         bool allow_spmd_sharding_propagation_to_output = true,
-        std::vector<torch::lazy::BackendDataPtr> parameters_data = {},
+        std::vector<std::vector<int64_t>> denormalized_tile_assignments = {},
         bool use_auto_spmd_partitioning = false,
         std::vector<int64_t> auto_spmd_mesh_shape = {},
         std::vector<int64_t> auto_spmd_mesh_ids = {}, bool eager_mode = false)
@@ -240,7 +240,7 @@ class ComputationClient {
           is_sharded(is_sharded),
           allow_spmd_sharding_propagation_to_output(
               allow_spmd_sharding_propagation_to_output),
-          parameters_data(parameters_data),
+          denormalized_tile_assignments(denormalized_tile_assignments),
           use_auto_spmd_partitioning(use_auto_spmd_partitioning),
           auto_spmd_mesh_shape(auto_spmd_mesh_shape),
           auto_spmd_mesh_ids(auto_spmd_mesh_ids),
@@ -250,7 +250,7 @@ class ComputationClient {
     std::string compilation_device;
     std::vector<std::string> devices;
     const xla::Shape* output_shape = nullptr;
-    std::vector<torch::lazy::BackendDataPtr> parameters_data;
+    std::vector<std::vector<int64_t>> denormalized_tile_assignments;
     bool parameter_is_tupled_arguments;
     bool is_sharded;
     bool allow_spmd_sharding_propagation_to_output;
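For context, CompileInstance now carries the raw per-tensor tile assignments instead of parameter data handles. A minimal construction sketch mirroring the test above; the `xla_computation`, `shape`, and `device` (a device string) variables are assumed to already exist, and the single assignment value is hypothetical:

  std::vector<std::vector<int64_t>> denormalized_tile_assignments = {
      {0, 1, 2, 3}};  // hypothetical flattened device order for one tensor
  std::vector<torch_xla::runtime::ComputationClient::CompileInstance> instances;
  instances.push_back(
      {std::move(xla_computation), device, {device}, &shape,
       /*should_wrap_parameter=*/false,
       /*is_sharded=*/true,
       /*allow_spmd_sharding_propagation_to_output=*/true,
       /*denormalized_tile_assignments=*/denormalized_tile_assignments});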

torch_xla/csrc/runtime/ifrt_computation_client.cpp

Lines changed: 2 additions & 8 deletions
@@ -471,14 +471,8 @@ std::vector<ComputationClient::ComputationPtr> IfrtComputationClient::Compile(
 
   for (auto& instance : instances) {
     std::vector<int64_t> denormalized_tile_assignment;
-    if (!instance.parameters_data.empty() && instance.parameters_data[0]) {
-      auto sharding_opt = GetDataSharding(
-          std::dynamic_pointer_cast<runtime::ComputationClient::Data>(
-              instance.parameters_data[0]));
-      if (sharding_opt.has_value()) {
-        denormalized_tile_assignment =
-            sharding_opt.value().GetDenormalizedTileAssignment();
-      }
+    if (!instance.denormalized_tile_assignments.empty()) {
+      denormalized_tile_assignment = instance.denormalized_tile_assignments[0];
     }
 
     xla::CompileOptions compile_options;

torch_xla/csrc/runtime/pjrt_computation_client.cpp

Lines changed: 2 additions & 8 deletions
@@ -547,14 +547,8 @@ std::vector<ComputationClient::ComputationPtr> PjRtComputationClient::Compile(
 
   for (auto& instance : instances) {
     std::vector<int64_t> denormalized_tile_assignment;
-    if (!instance.parameters_data.empty() && instance.parameters_data[0]) {
-      auto sharding_opt = GetDataSharding(
-          std::dynamic_pointer_cast<runtime::ComputationClient::Data>(
-              instance.parameters_data[0]));
-      if (sharding_opt.has_value()) {
-        denormalized_tile_assignment =
-            sharding_opt.value().GetDenormalizedTileAssignment();
-      }
+    if (!instance.denormalized_tile_assignments.empty()) {
+      denormalized_tile_assignment = instance.denormalized_tile_assignments[0];
     }
     xla::CompileOptions compile_options;
     if (enable_cm_in_mp) {

torch_xla/csrc/torch_xla_op_sharding.cpp

Lines changed: 5 additions & 0 deletions
@@ -99,6 +99,11 @@ OpSharding::iota_transpose_perm() const {
   return op_sharding_->iota_transpose_perm();
 }
 
+const ::google::protobuf::RepeatedField<int32_t>& OpSharding::last_tile_dims()
+    const {
+  return op_sharding_->last_tile_dims();
+}
+
 const xla::ShapeProto& OpSharding::tile_shape() const {
   return op_sharding_->tile_shape();
 }

torch_xla/csrc/torch_xla_op_sharding.h

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ class OpSharding {
   const ::google::protobuf::RepeatedField<int64_t>& tile_assignment_devices()
       const;
   const ::google::protobuf::RepeatedField<int32_t>& iota_transpose_perm() const;
+  const ::google::protobuf::RepeatedField<int32_t>& last_tile_dims() const;
   const xla::ShapeProto& tile_shape() const;
 
   // Access to underlying xla::OpSharding
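The new last_tile_dims() accessor simply forwards to the wrapped proto (see torch_xla_op_sharding.cpp above), so callers can stay on the torch_xla::OpSharding wrapper instead of unwrapping the xla::OpSharding first. A hedged usage sketch; the `sharding` object and how a caller interprets the values are assumptions, not part of this commit:

  // Iterate the trailing tile dimensions recorded in the underlying proto.
  for (int32_t dim_kind : sharding.last_tile_dims()) {
    // e.g. inspect how trailing dimensions of the tile assignment are used
  }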

torch_xla/csrc/xla_graph_executor.cpp

Lines changed: 11 additions & 1 deletion
@@ -1435,14 +1435,24 @@ XLAGraphExecutor::CompilationResult XLAGraphExecutor::Compile(
   xla::Shape shape = MakeShapeWithDeviceLayout(
       program_shape.result(), static_cast<XlaDeviceType>(coll.device.type()));
 
+  std::vector<std::vector<int64_t>> denormalized_tile_assignments;
+  for (auto tensor : tensors) {
+    auto sharding_spec = tensor->sharding_spec();
+    if (sharding_spec) {
+      denormalized_tile_assignments.push_back(
+          sharding_spec->sharding.GetDenormalizedTileAssignment());
+    } else {
+      TF_VLOG(5) << "no sharding spec for tensor - " << tensor;
+    }
+  }
   std::vector<runtime::ComputationClient::CompileInstance> instances;
   instances.push_back(
       {std::move(computation), coll.device.toString(),
        runtime::GetComputationClientOrDie()->GetCompilationDevices(
            coll.device.toString(), devices),
        &shape, should_wrap_parameter, is_sharded,
       /*allow_spmd_sharding_propagation_to_output=*/true,
-       /*parameters_data=*/po_data->parameters_data});
+       /*denormalized_tile_assignments=*/denormalized_tile_assignments});
   instances.front().eager_mode = UseEagerMode();
   if (use_autosharding) {
     TF_VLOG(5) << "use_auto_spmd_partitioning is set.";
