Skip to content

Error Handling: propagate status for ReleaseGilAndTransferData and XlaDataToTensors. #9431

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion test/cpp/test_xla_sharding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ bool XlaDataValuesEqual(torch::lazy::BackendDataPtr a,
torch::lazy::BackendDataPtr b,
at::ScalarType element_type) {
std::vector<at::Tensor> tensors =
XlaDataToTensors({a, b}, {element_type, element_type});
GetValueOrThrow(XlaDataToTensors({a, b}, {element_type, element_type}));
return TensorCompare(tensors[0], tensors[1]);
}
} // namespace
Expand Down
2 changes: 1 addition & 1 deletion torch_xla/csrc/init_python_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2712,7 +2712,7 @@ void InitXlaModuleBindings(py::module m) {
}

std::vector<at::Tensor> cpu_shards =
XlaDataToTensors(WrapXlaData(handles), element_types);
GetValueOrThrow(XlaDataToTensors(WrapXlaData(handles), element_types));
// Populate the resulting vector of shards and device strings
std::vector<std::vector<std::pair<at::Tensor, std::string>>> result;
int shards_per_tensor =
Expand Down
3 changes: 2 additions & 1 deletion torch_xla/csrc/tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "torch_xla/csrc/runtime/pjrt_computation_client.h"
#include "torch_xla/csrc/runtime/sys_util.h"
#include "torch_xla/csrc/runtime/xla_util.h"
#include "torch_xla/csrc/status.h"
#include "torch_xla/csrc/tensor_util.h"
#include "torch_xla/csrc/torch_util.h"
#include "torch_xla/csrc/xla_graph_executor.h"
Expand Down Expand Up @@ -512,7 +513,7 @@ at::Tensor XLATensor::ToTensor(bool detached) {
// The GetXlaData() call will trigger an ApplyPendingGraph() if an IR
// XlaNode is available on the tensor.
std::vector<at::Tensor> tensors =
XlaDataToTensors({GetXlaData()}, {dtype()});
GetValueOrThrow(XlaDataToTensors({GetXlaData()}, {dtype()}));
tensor = std::move(tensors.front());
if (!detached) {
SetTensorData(tensor);
Expand Down
16 changes: 10 additions & 6 deletions torch_xla/csrc/tensor_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -896,7 +896,7 @@ xla::Literal GetTensorLiteral(const at::Tensor& tensor, const xla::Shape* shape,
return literal;
}

std::vector<xla::Literal> ReleaseGilAndTransferData(
absl::StatusOr<std::vector<xla::Literal>> ReleaseGilAndTransferData(
absl::Span<const torch::lazy::BackendDataPtr> xla_data) {
// HACK: This method may be called outside of Python (mainly in C++ tests) or
// when the GIL is already released, so we must check both cases here. If
Expand All @@ -909,20 +909,24 @@ std::vector<xla::Literal> ReleaseGilAndTransferData(
if (release_gil && Py_IsInitialized() && PyGILState_Check()) {
save = PyEval_SaveThread();
}
std::vector<xla::Literal> literals =
GetValueOrThrow(runtime::GetComputationClientOrDie()->TransferFromDevice(
UnwrapXlaData(xla_data)));

XLA_ASSIGN_OR_RETURN(runtime::ComputationClient * client,
runtime::GetComputationClient());
XLA_ASSIGN_OR_RETURN(std::vector<xla::Literal> literals,
client->TransferFromDevice(UnwrapXlaData(xla_data)));

if (save) {
PyEval_RestoreThread(save);
}

return literals;
}

std::vector<at::Tensor> XlaDataToTensors(
absl::StatusOr<std::vector<at::Tensor>> XlaDataToTensors(
absl::Span<const torch::lazy::BackendDataPtr> xla_data,
absl::Span<const at::ScalarType> dest_element_type) {
std::vector<xla::Literal> literals = ReleaseGilAndTransferData(xla_data);
XLA_ASSIGN_OR_RETURN(std::vector<xla::Literal> literals,
ReleaseGilAndTransferData(xla_data));
std::vector<at::Tensor> tensors(literals.size());
absl::BlockingCounter counter(literals.size());
for (size_t i = 0; i < tensors.size(); ++i) {
Expand Down
4 changes: 2 additions & 2 deletions torch_xla/csrc/tensor_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ at::Tensor MakeTensorFromXlaLiteral(const xla::Literal& literal,
// Execution and data transfer are async in PJRT, so TransferFromDevice may
// block until `DataPtr`s are ready. Release the GIL so other threads can
// proceed and unblock any transfers or collective computations.
std::vector<xla::Literal> ReleaseGilAndTransferData(
absl::StatusOr<std::vector<xla::Literal>> ReleaseGilAndTransferData(
absl::Span<const torch::lazy::BackendDataPtr> xla_data);

// TODO LTC @wonjoo - Migrate to upstream after Device -> BackendDevice
std::vector<at::Tensor> XlaDataToTensors(
absl::StatusOr<std::vector<at::Tensor>> XlaDataToTensors(
absl::Span<const torch::lazy::BackendDataPtr> xla_data,
absl::Span<const at::ScalarType> dest_element_type);

Expand Down
4 changes: 3 additions & 1 deletion torch_xla/csrc/xla_backend_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include "torch_xla/csrc/runtime/computation_client.h"
#include "torch_xla/csrc/runtime/debug_macros.h"
#include "torch_xla/csrc/runtime/runtime.h"
#include "torch_xla/csrc/status.h"
#include "torch_xla/csrc/tensor_util.h"

namespace at {
// This function is defined in the codegenerated RegisterDispatchKey.cpp file.
Expand Down Expand Up @@ -92,7 +94,7 @@ class XlaBackendImpl : public torch::lazy::BackendImplInterface {
const torch::lazy::BackendDataPtr data,
std::optional<at::ScalarType> logical_scalar_type) const override {
// TODO(JackCaoG): handle the case where logical_scalar_type is std::nullopt
return XlaDataToTensors({data}, {*logical_scalar_type})[0];
return GetValueOrThrow(XlaDataToTensors({data}, {*logical_scalar_type}))[0];
}

std::unique_ptr<torch::lazy::LoweringContext> CreateLoweringContext(
Expand Down
3 changes: 2 additions & 1 deletion torch_xla/csrc/xla_graph_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,8 @@ std::vector<at::Tensor> XLAGraphExecutor::GetTensors(
async != nullptr ? async->tensors_data
: absl::Span<const torch::lazy::BackendDataPtr>());

std::vector<xla::Literal> literals = ReleaseGilAndTransferData(tensors_data);
std::vector<xla::Literal> literals =
GetValueOrThrow(ReleaseGilAndTransferData(tensors_data));

return FetchTensors(tensors, literals,
async != nullptr ? &async->indices : nullptr);
Expand Down
Loading