
Commit 0350c7e

malfet authored and pytorchmergebot committed
[BE] Introduce torch.AcceleratorError (pytorch#152023)
Which inherits from `RuntimeError` and contains `error_code`, which in the case of CUDA should contain the error returned by `cudaGetLastError`.

`torch::detail::_new_accelerator_error_object(c10::AcceleratorError&)` follows the pattern of CPython's [`PyErr_SetString`](https://github.com/python/cpython/blob/cb8a72b301f47e76d93a7fe5b259e9a5758792e1/Python/errors.c#L282), namely:
- Convert the C string into a Python string with `PyUnicode_FromString`
- Create the new exception object using `PyObject_CallOneArg`, just like it's done in [`_PyErr_CreateException`](https://github.com/python/cpython/blob/cb8a72b301f47e76d93a7fe5b259e9a5758792e1/Python/errors.c#L32)
- Set the `error_code` property using `PyObject_SetAttrString`
- Decref all temporary references

Test that it works and captures the C++ backtrace (in addition to CI) by running

```python
import os

os.environ['TORCH_SHOW_CPP_STACKTRACES'] = '1'

import torch

x = torch.rand(10, device="cuda")
y = torch.arange(20, device="cuda")
try:
    x[y] = 2
    print(x)
except torch.AcceleratorError as e:
    print("Exception was raised", e.args[0])
    print("Captured error code is ", e.error_code)
```

which produces the following output

```
Exception was raised CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Exception raised from c10_cuda_check_implementation at /home/ubuntu/pytorch/c10/cuda/CUDAException.cpp:41 (most recent call first):
C++ CapturedTraceback:
#4 std::_Function_handler<std::shared_ptr<c10::LazyValue<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > const> (), c10::SetStackTraceFetcher(std::function<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) from Logging.cpp:0
#5 c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >) from ??:0
#6 c10::cuda::c10_cuda_check_implementation(int, char const*, char const*, int, bool) [clone .cold] from CUDAException.cpp:0
#7 void at::native::gpu_kernel_impl<at::native::AbsFunctor<float> >(at::TensorIteratorBase&, at::native::AbsFunctor<float> const&) [clone .isra.0] from tmpxft_000191fc_00000000-6_AbsKernel.cudafe1.cpp:0
#8 at::native::abs_kernel_cuda(at::TensorIteratorBase&) from ??:0
#9 at::Tensor& at::native::unary_op_impl_with_complex_to_float_out<at::native::abs_stub_DECLARE_DISPATCH_type>(at::Tensor&, at::Tensor const&, at::native::abs_stub_DECLARE_DISPATCH_type&, bool) [clone .constprop.0] from UnaryOps.cpp:0
#10 at::(anonymous namespace)::(anonymous namespace)::wrapper_CUDA_out_abs_out(at::Tensor const&, at::Tensor&) from RegisterCUDA_0.cpp:0
#11 at::_ops::abs_out::call(at::Tensor const&, at::Tensor&) from ??:0
#12 at::native::abs(at::Tensor const&) from ??:0
#13 c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&), &at::(anonymous namespace)::(anonymous namespace)::wrapper_CompositeExplicitAutograd__abs>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&> >, at::Tensor (at::Tensor const&)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&) from RegisterCompositeExplicitAutograd_0.cpp:0
#14 at::_ops::abs::redispatch(c10::DispatchKeySet, at::Tensor const&) from ??:0
#15 torch::autograd::VariableType::(anonymous namespace)::abs(c10::DispatchKeySet, at::Tensor const&) from VariableType_1.cpp:0
#16 c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (c10::DispatchKeySet, at::Tensor const&), &torch::autograd::VariableType::(anonymous namespace)::abs>, at::Tensor, c10::guts::typelist::typelist<c10::DispatchKeySet, at::Tensor const&> >, at::Tensor (c10::DispatchKeySet, at::Tensor const&)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&) from VariableType_1.cpp:0
#17 at::_ops::abs::call(at::Tensor const&) from ??:0
#18 at::native::isfinite(at::Tensor const&) from ??:0
#19 c10::impl::wrap_kernel_functor_unboxed_<c10::impl::detail::WrapFunctionIntoFunctor_<c10::CompileTimeFunctionPointer<at::Tensor (at::Tensor const&), &at::(anonymous namespace)::(anonymous namespace)::wrapper_CompositeImplicitAutograd__isfinite>, at::Tensor, c10::guts::typelist::typelist<at::Tensor const&> >, at::Tensor (at::Tensor const&)>::call(c10::OperatorKernel*, c10::DispatchKeySet, at::Tensor const&) from RegisterCompositeImplicitAutograd_0.cpp:0
#20 at::_ops::isfinite::call(at::Tensor const&) from ??:0
#21 torch::autograd::THPVariable_isfinite(_object*, _object*, _object*) from python_torch_functions_2.cpp:0
#22 PyObject_CallFunctionObjArgs from ??:0
#23 _PyObject_MakeTpCall from ??:0
#24 _PyEval_EvalFrameDefault from ??:0
#25 _PyObject_FastCallDictTstate from ??:0
#26 _PyStack_AsDict from ??:0
#27 _PyObject_MakeTpCall from ??:0
#28 _PyEval_EvalFrameDefault from ??:0
#29 _PyFunction_Vectorcall from ??:0
#30 _PyEval_EvalFrameDefault from ??:0
#31 _PyFunction_Vectorcall from ??:0
#32 _PyEval_EvalFrameDefault from ??:0
#33 _PyFunction_Vectorcall from ??:0
#34 _PyEval_EvalFrameDefault from ??:0
#35 PyFrame_GetCode from ??:0
#36 PyNumber_Xor from ??:0
#37 PyObject_Str from ??:0
#38 PyFile_WriteObject from ??:0
#39 _PyWideStringList_AsList from ??:0
#40 _PyDict_NewPresized from ??:0
#41 _PyEval_EvalFrameDefault from ??:0
#42 PyEval_EvalCode from ??:0
#43 PyEval_EvalCode from ??:0
#44 PyUnicode_Tailmatch from ??:0
#45 PyInit__collections from ??:0
#46 PyUnicode_Tailmatch from ??:0
#47 _PyRun_SimpleFileObject from ??:0
#48 _PyRun_AnyFileObject from ??:0
#49 Py_RunMain from ??:0
#50 Py_BytesMain from ??:0
#51 __libc_init_first from ??:0
#52 __libc_start_main from ??:0
#53 _start from ??:0

Captured error code is 710
```

Pull Request resolved: pytorch#152023
Approved by: https://github.com/eqy, https://github.com/mradmila, https://github.com/ngimel
ghstack dependencies: pytorch#154436
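Because the new exception subclasses `RuntimeError`, existing `except RuntimeError` handlers keep catching accelerator failures, while `error_code` lets callers branch on the specific runtime error. A minimal sketch of that kind of handling, assuming a CUDA build that contains this change (the helper name `guarded_assign` is hypothetical; 710 is `cudaErrorAssert`, as checked in `test_cuda.py` below):

```python
import torch


def guarded_assign(x, y):
    # Hypothetical helper, not part of this commit: wraps an indexing
    # assignment and inspects torch.AcceleratorError.error_code on failure.
    try:
        x[y] = 2
        return x
    except torch.AcceleratorError as e:  # still catchable as RuntimeError
        if e.error_code == 710:  # cudaErrorAssert, e.g. out-of-bounds index
            raise RuntimeError(
                "device-side assert; rerun with CUDA_LAUNCH_BLOCKING=1 for an exact stack"
            ) from e
        raise
```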
1 parent f7c09f8 commit 0350c7e

File tree

9 files changed, +59 -4 lines


c10/cuda/CUDAException.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -38,8 +38,8 @@ void c10_cuda_check_implementation(
         "Device-side assertions were explicitly omitted for this error check; the error probably arose while initializing the DSA handlers.");
   }
 #endif
-
-  TORCH_CHECK(false, check_message);
+  throw c10::AcceleratorError(
+      {__func__, __FILE__, int32_t(__LINE__)}, err, check_message);
 }

 } // namespace c10::cuda
```
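Other accelerator backends could adopt the same shape of throw site. A rough sketch under that assumption; `myAccelGetLastError` and `myAccelGetErrorString` are made-up stand-ins for a vendor runtime, only `c10::AcceleratorError` comes from this commit:

```cpp
#include <c10/util/Exception.h>

#include <cstdint>
#include <string>

// Hypothetical vendor runtime API, declared here only to keep the sketch
// self-contained; a real backend would call its own error-query functions.
int32_t myAccelGetLastError();
const char* myAccelGetErrorString(int32_t err);

void my_accel_check_implementation(const char* api_name) {
  const int32_t err = myAccelGetLastError();
  if (err == 0) {
    return; // success, nothing to report
  }
  std::string check_message = std::string("Accelerator error in ") + api_name +
      ": " + myAccelGetErrorString(err);
  // Same shape as c10_cuda_check_implementation above: preserve the raw
  // error code so Python code can read it back as AcceleratorError.error_code.
  throw c10::AcceleratorError(
      {__func__, __FILE__, int32_t(__LINE__)}, err, check_message);
}
```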

c10/util/Exception.h

Lines changed: 13 additions & 0 deletions
```diff
@@ -295,6 +295,19 @@ class C10_API SyntaxError : public Error {
   using Error::Error;
 };

+// Raised when an accelerator API call hits an error.
+// These turn into AcceleratorError when they cross into Python
+class C10_API AcceleratorError : public Error {
+  int32_t error_code;
+
+ public:
+  AcceleratorError(SourceLocation loc, int32_t code, const std::string& msg)
+      : Error(loc, msg), error_code(code) {}
+  int32_t get_error_code() const {
+    return error_code;
+  }
+};
+
 // Base error type for all distributed errors.
 // These turn into DistError when they cross into Python.
 class C10_API DistError : public Error {
```
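On the C++ side the error code travels with the exception and can be read back through `get_error_code()`. A small sketch, assuming some function `launch_kernel()` that may throw (the name is made up for illustration):

```cpp
#include <c10/util/Exception.h>

#include <iostream>

// Hypothetical function that drives an accelerator runtime and may throw
// c10::AcceleratorError; not part of this commit.
void launch_kernel();

void run_with_reporting() {
  try {
    launch_kernel();
  } catch (const c10::AcceleratorError& e) {
    // get_error_code() returns the raw runtime error code (for CUDA, the
    // value captured at the throw site after cudaGetLastError).
    std::cerr << "accelerator failure, code " << e.get_error_code() << ": "
              << e.what_without_backtrace() << '\n';
    throw; // let the Python binding layer translate it at the boundary
  }
}
```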

docs/source/cuda.rst

Lines changed: 1 addition & 0 deletions
```diff
@@ -40,6 +40,7 @@ torch.cuda
     temperature
     power_draw
     clock_rate
+    AcceleratorError
     OutOfMemoryError

 Random Number Generator
```

test/test_cuda.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1387,6 +1387,8 @@ def _spawn_method(self, method, arg):
         for e in errors:
             if "device-side assert triggered" not in str(e):
                 self.fail(e)
+            if e.error_code != 710:  # cudaErrorAssert == 710
+                self.fail(e)

     @staticmethod
     def _test_index_bounds_cuda(idx):
```

test/test_public_bindings.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -59,6 +59,7 @@ def test_no_new_bindings(self):
         #
         # {elem for elem in dir(torch._C) if not elem.startswith("_")}
         torch_C_allowlist_superset = {
+            "AcceleratorError",
             "AggregationType",
             "AliasDb",
             "AnyType",
```

torch/_C/__init__.pyi.in

Lines changed: 1 addition & 0 deletions
```diff
@@ -2628,6 +2628,7 @@ def _will_engine_execute_node(node: _Node) -> _bool: ...
 def _dispatch_key_set(tensor) -> str: ...

 # Defined in torch/csrc/Exceptions.cpp
+class AcceleratorError(RuntimeError): ...
 class OutOfMemoryError(RuntimeError): ...
 class _DistError(RuntimeError): ...
 class _DistBackendError(RuntimeError): ...
```

torch/csrc/Exceptions.cpp

Lines changed: 28 additions & 1 deletion
```diff
@@ -14,7 +14,8 @@
 PyObject *THPException_FatalError, *THPException_LinAlgError,
     *THPException_OutOfMemoryError, *THPException_DistError,
     *THPException_DistBackendError, *THPException_DistNetworkError,
-    *THPException_DistStoreError, *THPException_DistQueueEmptyError;
+    *THPException_DistStoreError, *THPException_DistQueueEmptyError,
+    *THPException_AcceleratorError;

 #define ASSERT_TRUE(cond) \
   if (!(cond))            \
@@ -125,6 +126,18 @@ could not be completed because the input matrix is singular.",
           module, "_DistQueueEmptyError", THPException_DistQueueEmptyError) ==
       0);

+  // NOLINTNEXTLINE(bugprone-assignment-in-if-condition)
+  ASSERT_TRUE(
+      THPException_AcceleratorError = PyErr_NewExceptionWithDoc(
+          "torch.AcceleratorError",
+          "Exception raised while executing on device",
+          PyExc_RuntimeError,
+          nullptr));
+  type = (PyTypeObject*)THPException_AcceleratorError;
+  ASSERT_TRUE(
+      PyModule_AddObject(
+          module, "AcceleratorError", THPException_AcceleratorError) == 0);
+
   return true;
 }

@@ -341,4 +354,18 @@ PyWarningHandler::~PyWarningHandler() noexcept(false) {
   }
 }

+namespace detail {
+PyObject* _new_accelerator_error_object(const c10::AcceleratorError& e) {
+  auto msg = torch::get_cpp_stacktraces_enabled() ? e.what()
+                                                  : e.what_without_backtrace();
+
+  auto py_msg = PyUnicode_FromString(msg);
+  auto rc = PyObject_CallOneArg(THPException_AcceleratorError, py_msg);
+  auto error_code = PyLong_FromLong(e.get_error_code());
+  PyObject_SetAttrString(rc, "error_code", error_code);
+  Py_XDECREF(py_msg);
+  Py_XDECREF(error_code);
+  return rc;
+}
+} // namespace detail
 } // namespace torch
```
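The net effect of the registration above is a new public exception type on the extension module. A quick sanity check one could run against a build containing this commit (a hedged expectation, not an official test):

```python
import torch

# torch.AcceleratorError is the object registered above; it was created with
# PyExc_RuntimeError as its base, so existing RuntimeError handlers still work.
assert issubclass(torch.AcceleratorError, RuntimeError)
assert torch.AcceleratorError.__doc__ == "Exception raised while executing on device"
```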

torch/csrc/Exceptions.h

Lines changed: 10 additions & 1 deletion
```diff
@@ -86,6 +86,12 @@ inline void PyErr_SetString(PyObject* type, const std::string& message) {
       DistQueueEmptyError, THPException_DistQueueEmptyError, retstmnt)        \
   _CATCH_GENERIC_ERROR(DistStoreError, THPException_DistStoreError, retstmnt) \
   _CATCH_GENERIC_ERROR(DistError, THPException_DistError, retstmnt)           \
+  catch (c10::AcceleratorError & e) {                                         \
+    auto exc = torch::detail::_new_accelerator_error_object(e);               \
+    PyErr_SetObject(THPException_AcceleratorError, exc);                      \
+    Py_XDECREF(exc);                                                          \
+    retstmnt;                                                                 \
+  }                                                                           \
   _CATCH_GENERIC_ERROR(Error, PyExc_RuntimeError, retstmnt)                   \
   catch (torch::PyTorchError & e) {                                           \
     auto msg = torch::processErrorMsg(e.what());                              \
@@ -141,7 +147,8 @@ inline void PyErr_SetString(PyObject* type, const std::string& message) {
 extern PyObject *THPException_FatalError, *THPException_LinAlgError,
     *THPException_OutOfMemoryError, *THPException_DistError,
     *THPException_DistBackendError, *THPException_DistNetworkError,
-    *THPException_DistStoreError, *THPException_DistQueueEmptyError;
+    *THPException_DistStoreError, *THPException_DistQueueEmptyError,
+    *THPException_AcceleratorError;

 // Throwing this exception means that the python error flags have been already
 // set and control should be immediately returned to the interpreter.
@@ -369,6 +376,8 @@ auto wrap_pybind_function_impl_(
     END_HANDLE_TH_ERRORS_PYBIND
   };
 }
+
+PyObject* _new_accelerator_error_object(const c10::AcceleratorError&);
 } // namespace detail

 // Wrap a function with TH error and warning handling.
```
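The new catch clause sits inside the generic error-translation macros that wrap the Python bindings, so a `c10::AcceleratorError` escaping C++ is converted automatically. A sketch of a hypothetical binding (`example_binding` is not part of this commit) showing where the conversion happens:

```cpp
#include <torch/csrc/Exceptions.h>

// Hypothetical binding: anything thrown between HANDLE_TH_ERRORS and
// END_HANDLE_TH_ERRORS is translated into a Python exception, and with the
// catch clause above a c10::AcceleratorError becomes torch.AcceleratorError
// with its error_code attribute populated via _new_accelerator_error_object.
static PyObject* example_binding(PyObject* self, PyObject* args) {
  HANDLE_TH_ERRORS
  const bool device_failed = true; // stand-in for a real runtime status check
  if (device_failed) {
    throw c10::AcceleratorError(
        {__func__, __FILE__, int32_t(__LINE__)}, /*code=*/710, "example failure");
  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
```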

torch/cuda/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -333,6 +333,7 @@ class DeferredCudaCallError(Exception):
     pass


+AcceleratorError = torch._C.AcceleratorError
 OutOfMemoryError = torch._C.OutOfMemoryError


```
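`torch.cuda` simply aliases the class defined on `torch._C`, mirroring the existing `OutOfMemoryError` re-export, so either spelling catches the same exception. A small hedged check, assuming a build with this commit:

```python
import torch

# The alias added above means either name can appear in an except clause.
assert torch.cuda.AcceleratorError is torch._C.AcceleratorError
assert torch.cuda.AcceleratorError is torch.AcceleratorError
```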