diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index f574f800608..993fc34f5f5 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -ab43fe4bdf5ccd82897f0e982c451a0127bd175e +2dccff7dcf56b0d168ebfd7ca08bdeca37273c56 diff --git a/examples/openvino/aot_optimize_and_infer.py b/examples/openvino/aot_optimize_and_infer.py index acd9c896f42..8ec3b89b325 100644 --- a/examples/openvino/aot_optimize_and_infer.py +++ b/examples/openvino/aot_optimize_and_infer.py @@ -278,7 +278,7 @@ def transform_fn(x): return x[0] quantized_model = quantize_model( - cast(torch.fx.GraphModule, aten_dialect.module()), + cast(torch.fx.GraphModule, aten_dialect.module()), # type: ignore[redundant-cast] calibration_dataset, subset_size=subset_size, transform_fn=transform_fn, diff --git a/install_requirements.py b/install_requirements.py index edd799b3bdd..a2372a12bb9 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -71,7 +71,7 @@ def python_is_compatible(): # # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/ -NIGHTLY_VERSION = "dev20250716" +NIGHTLY_VERSION = "dev20250723" def install_requirements(use_pytorch_nightly): diff --git a/runtime/core/portable_type/c10/c10/macros/Export.h b/runtime/core/portable_type/c10/c10/macros/Export.h index 3d912661026..1b8a6811c53 100644 --- a/runtime/core/portable_type/c10/c10/macros/Export.h +++ b/runtime/core/portable_type/c10/c10/macros/Export.h @@ -1,78 +1 @@ -#ifndef C10_MACROS_EXPORT_H_ -#define C10_MACROS_EXPORT_H_ - -#ifndef C10_USING_CUSTOM_GENERATED_MACROS -#include -#endif // C10_USING_CUSTOM_GENERATED_MACROS - #include - -// This one is being used by libtorch.so -#ifdef CAFFE2_BUILD_MAIN_LIB -#define TORCH_API C10_EXPORT -#else -#define TORCH_API C10_IMPORT -#endif - -// You may be wondering: Whose brilliant idea was it to split torch_cuda into -// two pieces with confusing names? -// Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we -// tried to compile PyTorch for CUDA 11.1, which ran into relocation marker -// issues when linking big binaries. -// (https://github.com/pytorch/pytorch/issues/39968) We had two choices: -// (1) Stop supporting so many GPU architectures -// (2) Do something else -// We chose #2 and decided to split the behemoth that was torch_cuda into two -// smaller libraries, one with most of the core kernel functions (torch_cuda_cu) -// and the other that had..well..everything else (torch_cuda_cpp). The idea was -// this: instead of linking our static libraries (like the hefty -// libcudnn_static.a) with another huge library, torch_cuda, and run into pesky -// relocation marker issues, we could link our static libraries to a smaller -// part of torch_cuda (torch_cuda_cpp) and avoid the issues. 
- -// libtorch_cuda_cu.so -#ifdef TORCH_CUDA_CU_BUILD_MAIN_LIB -#define TORCH_CUDA_CU_API C10_EXPORT -#elif defined(BUILD_SPLIT_CUDA) -#define TORCH_CUDA_CU_API C10_IMPORT -#endif - -// libtorch_cuda_cpp.so -#ifdef TORCH_CUDA_CPP_BUILD_MAIN_LIB -#define TORCH_CUDA_CPP_API C10_EXPORT -#elif defined(BUILD_SPLIT_CUDA) -#define TORCH_CUDA_CPP_API C10_IMPORT -#endif - -// libtorch_cuda.so (where torch_cuda_cu and torch_cuda_cpp are a part of the -// same api) -#ifdef TORCH_CUDA_BUILD_MAIN_LIB -#define TORCH_CUDA_CPP_API C10_EXPORT -#define TORCH_CUDA_CU_API C10_EXPORT -#elif !defined(BUILD_SPLIT_CUDA) -#define TORCH_CUDA_CPP_API C10_IMPORT -#define TORCH_CUDA_CU_API C10_IMPORT -#endif - -#if defined(TORCH_HIP_BUILD_MAIN_LIB) -#define TORCH_HIP_CPP_API C10_EXPORT -#define TORCH_HIP_API C10_EXPORT -#else -#define TORCH_HIP_CPP_API C10_IMPORT -#define TORCH_HIP_API C10_IMPORT -#endif - -#if defined(TORCH_XPU_BUILD_MAIN_LIB) -#define TORCH_XPU_API C10_EXPORT -#else -#define TORCH_XPU_API C10_IMPORT -#endif - -// Enums only need to be exported on windows for non-CUDA files -#if defined(_WIN32) && defined(__CUDACC__) -#define C10_API_ENUM C10_API -#else -#define C10_API_ENUM -#endif - -#endif // C10_MACROS_EXPORT_H_ diff --git a/runtime/core/portable_type/c10/c10/macros/Macros.h b/runtime/core/portable_type/c10/c10/macros/Macros.h index 55a79ee6743..87ebc4f422c 100644 --- a/runtime/core/portable_type/c10/c10/macros/Macros.h +++ b/runtime/core/portable_type/c10/c10/macros/Macros.h @@ -1,548 +1 @@ -#ifndef C10_MACROS_MACROS_H_ -#define C10_MACROS_MACROS_H_ -#include - -/* Main entry for c10/macros. - * - * In your code, include c10/macros/Macros.h directly, instead of individual - * files in this folder. - */ - -// For build systems that do not directly depend on CMake and directly build -// from the source directory (such as Buck), one may not have a cmake_macros.h -// file at all. In this case, the build system is responsible for providing -// correct macro definitions corresponding to the cmake_macros.h.in file. -// -// In such scenarios, one should define the macro -// C10_USING_CUSTOM_GENERATED_MACROS -// to inform this header that it does not need to include the cmake_macros.h -// file. 
- -#ifndef C10_USING_CUSTOM_GENERATED_MACROS -#include -#endif // C10_USING_CUSTOM_GENERATED_MACROS - -#include - -#if defined(__clang__) -#define __ubsan_ignore_float_divide_by_zero__ \ - __attribute__((no_sanitize("float-divide-by-zero"))) -#define __ubsan_ignore_undefined__ __attribute__((no_sanitize("undefined"))) -#define __ubsan_ignore_signed_int_overflow__ \ - __attribute__((no_sanitize("signed-integer-overflow"))) -#define __ubsan_ignore_pointer_overflow__ \ - __attribute__((no_sanitize("pointer-overflow"))) -#define __ubsan_ignore_function__ __attribute__((no_sanitize("function"))) -#define __ubsan_ignore_float_cast_overflow__ \ - __attribute__((no_sanitize("float-cast-overflow"))) -#else -#define __ubsan_ignore_float_divide_by_zero__ -#define __ubsan_ignore_undefined__ -#define __ubsan_ignore_signed_int_overflow__ -#define __ubsan_ignore_pointer_overflow__ -#define __ubsan_ignore_function__ -#define __ubsan_ignore_float_cast_overflow__ -#endif - -// Detect address sanitizer as some stuff doesn't work with it -#undef C10_ASAN_ENABLED - -// for clang -#if defined(__has_feature) -#if ((__has_feature(address_sanitizer))) -#define C10_ASAN_ENABLED 1 -#endif -#endif - -// for gcc -#if defined(__SANITIZE_ADDRESS__) -#if __SANITIZE_ADDRESS__ -#if !defined(C10_ASAN_ENABLED) -#define C10_ASAN_ENABLED 1 -#endif -#endif -#endif - -#if !defined(C10_ASAN_ENABLED) -#define C10_ASAN_ENABLED 0 -#endif - -// Detect undefined-behavior sanitizer (UBSAN) -#undef C10_UBSAN_ENABLED - -// for clang or gcc >= 14 -// NB: gcc 14 adds support for Clang's __has_feature -// https://gcc.gnu.org/gcc-14/changes.html -// gcc < 14 doesn't have a macro for UBSAN -// (e.g. __SANITIZE_UNDEFINED__ does not exist in gcc) -// https://github.com/google/sanitizers/issues/765 -#if defined(__has_feature) -#if ((__has_feature(undefined_behavior_sanitizer))) -#define C10_UBSAN_ENABLED 1 -#endif -#endif - -#if !defined(C10_UBSAN_ENABLED) -#define C10_UBSAN_ENABLED 0 -#endif - -// Disable the copy and assignment operator for a class. Note that this will -// disable the usage of the class in std containers. -#define C10_DISABLE_COPY_AND_ASSIGN(classname) \ - classname(const classname&) = delete; \ - classname& operator=(const classname&) = delete - -#define C10_CONCATENATE_IMPL(s1, s2) s1##s2 -#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2) - -#define C10_MACRO_EXPAND(args) args - -#define C10_STRINGIZE_IMPL(x) #x -#define C10_STRINGIZE(x) C10_STRINGIZE_IMPL(x) - -/** - * C10_ANONYMOUS_VARIABLE(str) introduces a new identifier which starts with - * str and ends with a unique number. - */ -#ifdef __COUNTER__ -#define C10_UID __COUNTER__ -#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__) -#else -#define C10_UID __LINE__ -#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__) -#endif - -#ifdef __has_cpp_attribute -#define C10_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) -#else -#define C10_HAS_CPP_ATTRIBUTE(x) (0) -#endif - -#ifndef FBCODE_CAFFE2 -/// DEPRECATED: Warn if a type or return value is discarded. -#define C10_NODISCARD [[nodiscard]] - -/// DEPRECATED: Suppress an unused variable. -#define C10_UNUSED [[maybe_unused]] -#endif - -#if !defined(__has_attribute) -#define __has_attribute(x) 0 -#endif - -// Direct port of LLVM_ATTRIBUTE_USED. 
-#if __has_attribute(used) -#define C10_USED __attribute__((__used__)) -#else -#define C10_USED -#endif - -#define C10_RESTRICT __restrict - -// Simply define the namespace, in case a dependent library want to refer to -// the c10 namespace but not any nontrivial files. -namespace c10 {} -namespace c10::cuda {} -namespace c10::hip {} -namespace c10::xpu {} - -// Since C10 is the core library for caffe2 (and aten), we will simply reroute -// all abstractions defined in c10 to be available in caffe2 as well. -// This is only for backwards compatibility. Please use the symbols from the -// c10 namespace where possible. -namespace caffe2 { -using namespace c10; -} -namespace at { -using namespace c10; -} -namespace at::cuda { -using namespace c10::cuda; -} // namespace at::cuda - -// WARNING!!! THIS IS A GIANT HACK!!! -// This line means you cannot simultaneously include c10/hip -// and c10/cuda and then use them from the at::cuda namespace. -// This is true in practice, because HIPIFY works inplace on -// files in ATen/cuda, so it assumes that c10::hip is available -// from at::cuda. This namespace makes that happen. When -// HIPIFY is no longer out-of-place, we can switch the cuda -// here to hip and everyone is happy. -namespace at::cuda { -using namespace c10::hip; -} // namespace at::cuda - -namespace at::xpu { -using namespace c10::xpu; -} // namespace at::xpu - -// C10_LIKELY/C10_UNLIKELY -// -// These macros provide parentheses, so you can use these macros as: -// -// if C10_LIKELY(some_expr) { -// ... -// } -// -// NB: static_cast to boolean is mandatory in C++, because __builtin_expect -// takes a long argument, which means you may trigger the wrong conversion -// without it. -// -#if defined(__GNUC__) || defined(__ICL) || defined(__clang__) -#define C10_LIKELY(expr) (__builtin_expect(static_cast(expr), 1)) -#define C10_UNLIKELY(expr) (__builtin_expect(static_cast(expr), 0)) -#else -#define C10_LIKELY(expr) (expr) -#define C10_UNLIKELY(expr) (expr) -#endif - -/// C10_NOINLINE - Functions whose declaration is annotated with this will not -/// be inlined. -#ifdef __GNUC__ -#define C10_NOINLINE __attribute__((noinline)) -#elif _MSC_VER -#define C10_NOINLINE __declspec(noinline) -#else -#define C10_NOINLINE -#endif - -#if defined(_MSC_VER) -#define C10_ALWAYS_INLINE __forceinline -#elif __has_attribute(always_inline) || defined(__GNUC__) -#define C10_ALWAYS_INLINE __attribute__((__always_inline__)) inline -#else -#define C10_ALWAYS_INLINE inline -#endif - -// Unlike C10_ALWAYS_INLINE, C10_ALWAYS_INLINE_ATTRIBUTE can be used -// on a lambda. -#if defined(_MSC_VER) -// MSVC 14.39 is reasonably recent and doesn't like -// [[msvc::forceinline]] on a lambda, so don't try to use it. -#define C10_ALWAYS_INLINE_ATTRIBUTE -#elif __has_attribute(always_inline) || defined(__GNUC__) -#define C10_ALWAYS_INLINE_ATTRIBUTE __attribute__((__always_inline__)) -#else -#define C10_ALWAYS_INLINE_ATTRIBUTE -#endif - -#if defined(_MSC_VER) -#define C10_ATTR_VISIBILITY_HIDDEN -#elif defined(__GNUC__) -#define C10_ATTR_VISIBILITY_HIDDEN __attribute__((__visibility__("hidden"))) -#else -#define C10_ATTR_VISIBILITY_HIDDEN -#endif - -#define C10_ERASE C10_ALWAYS_INLINE C10_ATTR_VISIBILITY_HIDDEN - -#include - -#ifdef __HIPCC__ -// Unlike CUDA, HIP requires a HIP header to be included for __host__ to work. -// We do this #include here so that C10_HOST_DEVICE and friends will Just Work. 
-// See https://github.com/ROCm/hip/issues/441 -#include -#endif - -#if defined(__CUDACC__) || defined(__HIPCC__) -// Designates functions callable from the host (CPU) and the device (GPU) -#define C10_HOST_DEVICE __host__ __device__ -#define C10_DEVICE __device__ -#define C10_HOST __host__ -// constants from -// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications) -// The maximum number of threads per multiprocessor is 1024 for Turing -// architecture (7.5), 1536 for Geforce Ampere (8.6)/Jetson Orin (8.7), and -// 2048 for all other architectures. You'll get warnings if you exceed these -// constants. Hence, the following macros adjust the input values from the user -// to resolve potential warnings. -#if __CUDA_ARCH__ == 750 -constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024; -#elif __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 -constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536; -#else -constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048; -#endif -// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently -constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024; -// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block -// size. 256 is a good number for this fallback and should give good occupancy -// and versatility across all architectures. -constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; -// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it -// turns out that although __launch_bounds__ can take constexpr, it -// can't take a constexpr that has anything to do with templates. -// Currently we use launch_bounds that depend on template arguments in -// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK -// and C10_MIN_BLOCKS_PER_SM are kept as macros. -// Suppose you were planning to write __launch_bounds__(a, b), based on your -// performance tuning on a modern GPU. Instead, you should write -// __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)), -// which will also properly respect limits on old architectures. -#define C10_MAX_THREADS_PER_BLOCK(val) \ - (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) \ - : CUDA_THREADS_PER_BLOCK_FALLBACK) -#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) \ - ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \ - ? (blocks_per_sm) \ - : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) / \ - (threads_per_block)))) -// C10_LAUNCH_BOUNDS is analogous to __launch_bounds__ -#define C10_LAUNCH_BOUNDS_0 \ - __launch_bounds__( \ - 256, 4) // default launch bounds that should give good occupancy and - // versatility across all architectures. -#define C10_LAUNCH_BOUNDS_1(max_threads_per_block) \ - __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block)))) -#define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) \ - __launch_bounds__( \ - (C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))), \ - (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm)))) -#else -#define C10_HOST_DEVICE -#define C10_HOST -#define C10_DEVICE -#endif - -#if defined(USE_ROCM) -#define C10_HIP_HOST_DEVICE __host__ __device__ -#else -#define C10_HIP_HOST_DEVICE -#endif - -#if defined(USE_ROCM) -// C10_WARP_SIZE is only allowed for device code. 
-// Host code _must_ use at::cuda::warp_size() -// HIP header used to define warpSize as a constexpr that was either 32 or 64 -// depending on the target device, and then always set it to 64 for host code. -// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we -// set it to something unreasonable to trigger obvious host code errors. -#if defined(__HIP_DEVICE_COMPILE__) -#if defined(__GFX9__) -static constexpr int C10_WARP_SIZE = 64; -#else // __GFX9__ -static constexpr int C10_WARP_SIZE = 32; -#endif // __GFX9__ -#else -static constexpr int C10_WARP_SIZE = 1; -#endif // __HIP_DEVICE_COMPILE__ -#else -#define C10_WARP_SIZE 32 -#endif - -#if defined(_MSC_VER) && _MSC_VER <= 1900 -#define __func__ __FUNCTION__ -#endif - -// CUDA_KERNEL_ASSERT checks the assertion -// even when NDEBUG is defined. This is useful for important assertions in CUDA -// code that would otherwise be suppressed when building Release. -#if defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__) -// Those platforms do not support assert() -#define CUDA_KERNEL_ASSERT(cond) -#define CUDA_KERNEL_ASSERT_MSG(cond, msg) -#define SYCL_KERNEL_ASSERT(cond) -#elif defined(_MSC_VER) -#if defined(NDEBUG) -extern "C" { -C10_IMPORT -#if defined(__SYCL_DEVICE_ONLY__) -extern SYCL_EXTERNAL void _wassert( - const wchar_t* wexpr, - const wchar_t* wfile, - unsigned line); -#else -#if defined(__CUDA_ARCH__) -__host__ __device__ -#endif // __CUDA_ARCH__ - void - _wassert(wchar_t const* _Message, wchar_t const* _File, unsigned _Line); -#endif // __SYCL_DEVICE_ONLY__ -} -#endif // NDEBUG -#define CUDA_KERNEL_ASSERT(cond) \ - if (C10_UNLIKELY(!(cond))) { \ - (void)(_wassert( \ - _CRT_WIDE(#cond), \ - _CRT_WIDE(__FILE__), \ - static_cast(__LINE__)), \ - 0); \ - } -// TODO: This doesn't assert the message because I (chilli) couldn't figure out -// a nice way to convert a char* to a wchar_t* -#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ - if (C10_UNLIKELY(!(cond))) { \ - (void)(_wassert( \ - _CRT_WIDE(#cond), \ - _CRT_WIDE(__FILE__), \ - static_cast(__LINE__)), \ - 0); \ - } -#define SYCL_KERNEL_ASSERT(cond) \ - if (C10_UNLIKELY(!(cond))) { \ - (void)(_wassert( \ - _CRT_WIDE(#cond), \ - _CRT_WIDE(__FILE__), \ - static_cast(__LINE__)), \ - 0); \ - } -#else // __APPLE__, _MSC_VER -#if defined(NDEBUG) -extern "C" { -#if defined(__SYCL_DEVICE_ONLY__) -extern SYCL_EXTERNAL void __assert_fail( - const char* expr, - const char* file, - unsigned int line, - const char* func); -#else // __SYCL_DEVICE_ONLY__ -#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__))) -// CUDA supports __assert_fail function which are common for both device -// and host side code. -__host__ __device__ -#endif - - // This forward declaration matching the declaration of __assert_fail - // exactly how it is in glibc in case parts of the program are compiled with - // different NDEBUG settings. Otherwise we might get 'ambiguous declaration' - // error. Note: On ROCm - this declaration serves for host side compilation. - void - __assert_fail( - const char* assertion, - const char* file, - unsigned int line, - const char* function) noexcept __attribute__((__noreturn__)); - -#endif // __SYCL_DEVICE_ONLY__ -} -#endif // NDEBUG -// ROCm disables kernel assert by default for performance considerations. -// Though ROCm supports __assert_fail, it uses kernel printf which has -// a non-negligible performance impact even if the assert condition is -// never triggered. 
We choose to use abort() instead which will still -// terminate the application but without a more useful error message. -#if !defined(C10_USE_ROCM_KERNEL_ASSERT) and defined(USE_ROCM) -#define CUDA_KERNEL_ASSERT(cond) \ - if C10_UNLIKELY (!(cond)) { \ - abort(); \ - } -#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ - if C10_UNLIKELY (!(cond)) { \ - abort(); \ - } -#define SYCL_KERNEL_ASSERT(cond) \ - if C10_UNLIKELY (!(cond)) { \ - abort(); \ - } -#else -#define CUDA_KERNEL_ASSERT(cond) \ - if (C10_UNLIKELY(!(cond))) { \ - __assert_fail( \ - #cond, __FILE__, static_cast(__LINE__), __func__); \ - } -#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ - if (C10_UNLIKELY(!(cond))) { \ - __assert_fail( \ - msg, __FILE__, static_cast(__LINE__), __func__); \ - } -#define SYCL_KERNEL_ASSERT(cond) \ - if (C10_UNLIKELY(!(cond))) { \ - __assert_fail( \ - #cond, __FILE__, static_cast(__LINE__), __func__); \ - } -#endif // C10_USE_ROCM_KERNEL_ASSERT and USE_ROCM -#endif // __APPLE__ - -#ifdef __APPLE__ -#include -#endif - -#if defined(__ANDROID__) -#define C10_ANDROID 1 -#define C10_MOBILE 1 -#elif ( \ - defined(__APPLE__) && \ - (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) -#define C10_IOS 1 -#define C10_MOBILE 1 -#endif // ANDROID / IOS - -#if defined(C10_MOBILE) && C10_MOBILE -#define C10_ALWAYS_INLINE_UNLESS_MOBILE inline -#else -#define C10_ALWAYS_INLINE_UNLESS_MOBILE C10_ALWAYS_INLINE -#endif - -#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) -#define CONSTEXPR_EXCEPT_WIN_CUDA constexpr -#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA constexpr - -#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ - static constexpr const char field[] = val; -#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) -#endif // !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) - -#ifndef HAS_DEMANGLE -#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) -#define HAS_DEMANGLE 0 -#elif defined(__APPLE__) && \ - (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE) -#define HAS_DEMANGLE 0 -#else -#define HAS_DEMANGLE 1 -#endif -#endif // HAS_DEMANGLE - -#define _C10_PRAGMA__(string) _Pragma(#string) -#define _C10_PRAGMA_(string) _C10_PRAGMA__(string) - -#ifdef __clang__ -#define C10_CLANG_DIAGNOSTIC_PUSH() _Pragma("clang diagnostic push") -#define C10_CLANG_DIAGNOSTIC_POP() _Pragma("clang diagnostic pop") -#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) \ - _C10_PRAGMA_(clang diagnostic ignored flag) -#define C10_CLANG_HAS_WARNING(flag) __has_warning(flag) -#else -#define C10_CLANG_DIAGNOSTIC_PUSH() -#define C10_CLANG_DIAGNOSTIC_POP() -#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) -#define C10_CLANG_HAS_WARNING(flag) 0 -#endif - -#ifdef __clang__ - -#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \ - _C10_PRAGMA_(clang diagnostic push) \ - _C10_PRAGMA_(clang diagnostic ignored "-Wunknown-warning-option") \ - _C10_PRAGMA_(clang diagnostic ignored warning) - -#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(clang diagnostic pop) - -#elif __GNUC__ - -#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \ - _C10_PRAGMA_(GCC diagnostic push) \ - _C10_PRAGMA_(GCC diagnostic ignored "-Wpragmas") \ - _C10_PRAGMA_(GCC diagnostic ignored warning) - -#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(GCC diagnostic pop) - -#else - -#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) -#define C10_DIAGNOSTIC_POP() - -#endif - -// This macro is used to find older C++ compilers -// that don't support move optimization for return values. 
- -#if (defined(__GNUC__) && __GNUC__ < 13) || \ - (defined(__clang_major__) && __clang_major__ < 13) -#define C10_RETURN_MOVE_IF_OLD_COMPILER 1 -#else -#define C10_RETURN_MOVE_IF_OLD_COMPILER 0 -#endif - -#endif // C10_MACROS_MACROS_H_ +#include diff --git a/runtime/core/portable_type/c10/c10/util/TypeSafeSignMath.h b/runtime/core/portable_type/c10/c10/util/TypeSafeSignMath.h index 2853ff48d18..58c05067830 100644 --- a/runtime/core/portable_type/c10/c10/util/TypeSafeSignMath.h +++ b/runtime/core/portable_type/c10/c10/util/TypeSafeSignMath.h @@ -79,7 +79,7 @@ template inline constexpr bool greater_than_max(const T& x) { constexpr bool can_overflow = std::numeric_limits::digits > std::numeric_limits::digits; - return can_overflow && x > std::numeric_limits::max(); + return can_overflow && x > (std::numeric_limits::max)(); } #ifdef __GNUC__ diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h index 183aeab5634..8dd25419efb 100644 --- a/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h +++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h @@ -1,5 +1,12 @@ #pragma once +#ifndef C10_MACROS_EXPORT_H_ +#define C10_MACROS_EXPORT_H_ + +#ifndef C10_USING_CUSTOM_GENERATED_MACROS +#include +#endif // C10_USING_CUSTOM_GENERATED_MACROS + /* Header file to define the common scaffolding for exported symbols. * * Export is by itself a quite tricky situation to deal with, and if you are @@ -85,3 +92,62 @@ #else #define C10_API C10_IMPORT #endif + +// This one is being used by libtorch.so +#ifdef CAFFE2_BUILD_MAIN_LIB +#define TORCH_API C10_EXPORT +#else +#define TORCH_API C10_IMPORT +#endif + +// You may be wondering why we have TORCH_CUDA_CPP_API and TORCH_CUDA_CU_API +// belonging to the same library instead of just one TORCH_CUDA_API. Well, it +// can indeed just be one TORCH_CUDA_API (and used to be)! TORCH_CUDA_CPP_API +// and TORCH_CUDA_CU_API are artifacts of when we needed a split build to +// avoid relocation marker linking errors. The context is as follows: +// +// Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we +// tried to compile PyTorch for CUDA 11.1, which ran into relocation marker +// issues when linking big binaries. +// (https://github.com/pytorch/pytorch/issues/39968) We had two choices: +// (1) Stop supporting so many GPU architectures +// (2) Do something else +// We chose #2 and decided to split the behemoth that was torch_cuda into two +// smaller libraries, one with most of the core kernel functions (torch_cuda_cu) +// and the other that had..well..everything else (torch_cuda_cpp). The idea was +// this: instead of linking our static libraries (like the hefty +// libcudnn_static.a) with another huge library, torch_cuda, and run into pesky +// relocation marker issues, we could link our static libraries to a smaller +// part of torch_cuda (torch_cuda_cpp) and avoid the issues. 
+ +// libtorch_cuda.so (where torch_cuda_cu and torch_cuda_cpp are a part of the +// same api) +#ifdef TORCH_CUDA_BUILD_MAIN_LIB +#define TORCH_CUDA_CPP_API C10_EXPORT +#define TORCH_CUDA_CU_API C10_EXPORT +#else +#define TORCH_CUDA_CPP_API C10_IMPORT +#define TORCH_CUDA_CU_API C10_IMPORT +#endif + +#if defined(TORCH_HIP_BUILD_MAIN_LIB) +#define TORCH_HIP_CPP_API C10_EXPORT +#define TORCH_HIP_API C10_EXPORT +#else +#define TORCH_HIP_CPP_API C10_IMPORT +#define TORCH_HIP_API C10_IMPORT +#endif + +#if defined(TORCH_XPU_BUILD_MAIN_LIB) +#define TORCH_XPU_API C10_EXPORT +#else +#define TORCH_XPU_API C10_IMPORT +#endif + +// Enums only need to be exported on windows for non-CUDA files +#if defined(_WIN32) && defined(__CUDACC__) +#define C10_API_ENUM C10_API +#else +#define C10_API_ENUM +#endif +#endif // C10_MACROS_EXPORT_H_ diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h new file mode 100644 index 00000000000..1e07ab0446e --- /dev/null +++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Macros.h @@ -0,0 +1,564 @@ +#ifndef C10_MACROS_MACROS_H_ +#define C10_MACROS_MACROS_H_ +#include + +/* Main entry for torch/headeronly/macros (used to be c10/macros). + * + * In your code, include torch/headeronly/macros/Macros.h directly, instead of + * individual files in this folder. + */ + +// For build systems that do not directly depend on CMake and directly build +// from the source directory (such as Buck), one may not have a cmake_macros.h +// file at all. In this case, the build system is responsible for providing +// correct macro definitions corresponding to the cmake_macros.h.in file. +// +// In such scenarios, one should define the macro +// C10_USING_CUSTOM_GENERATED_MACROS +// to inform this header that it does not need to include the cmake_macros.h +// file. 
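// As a hedged illustration only (these flags are not part of this patch): a
// Buck-style build might supply the equivalent configuration directly on the
// compiler command line, for example
//   clang++ -DC10_USING_CUSTOM_GENERATED_MACROS -DC10_BUILD_SHARED_LIBS ...
// where C10_BUILD_SHARED_LIBS stands in for whichever macros cmake_macros.h.in
// would have generated for that particular configuration.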
+ +#ifndef C10_USING_CUSTOM_GENERATED_MACROS +#include +#endif // C10_USING_CUSTOM_GENERATED_MACROS + +#include + +#if defined(__clang__) +#define __ubsan_ignore_float_divide_by_zero__ \ + __attribute__((no_sanitize("float-divide-by-zero"))) +#define __ubsan_ignore_undefined__ __attribute__((no_sanitize("undefined"))) +#define __ubsan_ignore_signed_int_overflow__ \ + __attribute__((no_sanitize("signed-integer-overflow"))) +#define __ubsan_ignore_pointer_overflow__ \ + __attribute__((no_sanitize("pointer-overflow"))) +#define __ubsan_ignore_function__ __attribute__((no_sanitize("function"))) +#define __ubsan_ignore_float_cast_overflow__ \ + __attribute__((no_sanitize("float-cast-overflow"))) +#else +#define __ubsan_ignore_float_divide_by_zero__ +#define __ubsan_ignore_undefined__ +#define __ubsan_ignore_signed_int_overflow__ +#define __ubsan_ignore_pointer_overflow__ +#define __ubsan_ignore_function__ +#define __ubsan_ignore_float_cast_overflow__ +#endif + +// Detect address sanitizer as some stuff doesn't work with it +#undef C10_ASAN_ENABLED + +// for clang +#if defined(__has_feature) +#if ((__has_feature(address_sanitizer))) +#define C10_ASAN_ENABLED 1 +#endif +#endif + +// for gcc +#if defined(__SANITIZE_ADDRESS__) +#if __SANITIZE_ADDRESS__ +#if !defined(C10_ASAN_ENABLED) +#define C10_ASAN_ENABLED 1 +#endif +#endif +#endif + +#if !defined(C10_ASAN_ENABLED) +#define C10_ASAN_ENABLED 0 +#endif + +// Detect undefined-behavior sanitizer (UBSAN) +#undef C10_UBSAN_ENABLED + +// for clang or gcc >= 14 +// NB: gcc 14 adds support for Clang's __has_feature +// https://gcc.gnu.org/gcc-14/changes.html +// gcc < 14 doesn't have a macro for UBSAN +// (e.g. __SANITIZE_UNDEFINED__ does not exist in gcc) +// https://github.com/google/sanitizers/issues/765 +#if defined(__has_feature) +#if ((__has_feature(undefined_behavior_sanitizer))) +#define C10_UBSAN_ENABLED 1 +#endif +#endif + +#if !defined(C10_UBSAN_ENABLED) +#define C10_UBSAN_ENABLED 0 +#endif + +// Disable the copy and assignment operator for a class. Note that this will +// disable the usage of the class in std containers. +#define C10_DISABLE_COPY_AND_ASSIGN(classname) \ + classname(const classname&) = delete; \ + classname& operator=(const classname&) = delete + +#define C10_CONCATENATE_IMPL(s1, s2) s1##s2 +#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2) + +#define C10_MACRO_EXPAND(args) args + +#define C10_STRINGIZE_IMPL(x) #x +#define C10_STRINGIZE(x) C10_STRINGIZE_IMPL(x) + +/** + * C10_ANONYMOUS_VARIABLE(str) introduces a new identifier which starts with + * str and ends with a unique number. + */ +#ifdef __COUNTER__ +#define C10_UID __COUNTER__ +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__) +#else +#define C10_UID __LINE__ +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__) +#endif + +#ifdef __has_cpp_attribute +#define C10_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +#define C10_HAS_CPP_ATTRIBUTE(x) (0) +#endif + +#ifndef FBCODE_CAFFE2 +/// DEPRECATED: Warn if a type or return value is discarded. +#define C10_NODISCARD [[nodiscard]] + +/// DEPRECATED: Suppress an unused variable. +#define C10_UNUSED [[maybe_unused]] +#endif + +#if !defined(__has_attribute) +#define __has_attribute(x) 0 +#endif + +// Direct port of LLVM_ATTRIBUTE_USED. 
+#if __has_attribute(used) +#define C10_USED __attribute__((__used__)) +#else +#define C10_USED +#endif + +#define C10_RESTRICT __restrict + +// Simply define the namespace, in case a dependent library want to refer to +// the c10 namespace but not any nontrivial files. +namespace c10 {} +namespace c10::cuda {} +namespace c10::hip {} +namespace c10::xpu {} + +// Since C10 is the core library for caffe2 (and aten), we will simply reroute +// all abstractions defined in c10 to be available in caffe2 as well. +// This is only for backwards compatibility. Please use the symbols from the +// c10 namespace where possible. +namespace caffe2 { +using namespace c10; +} +namespace at { +using namespace c10; +} +namespace at::cuda { +using namespace c10::cuda; +} // namespace at::cuda + +// WARNING!!! THIS IS A GIANT HACK!!! +// This line means you cannot simultaneously include c10/hip +// and c10/cuda and then use them from the at::cuda namespace. +// This is true in practice, because HIPIFY works inplace on +// files in ATen/cuda, so it assumes that c10::hip is available +// from at::cuda. This namespace makes that happen. When +// HIPIFY is no longer out-of-place, we can switch the cuda +// here to hip and everyone is happy. +namespace at::cuda { +using namespace c10::hip; +} // namespace at::cuda + +namespace at::xpu { +using namespace c10::xpu; +} // namespace at::xpu + +// C10_LIKELY/C10_UNLIKELY +// +// These macros provide parentheses, so you can use these macros as: +// +// if C10_LIKELY(some_expr) { +// ... +// } +// +// NB: static_cast to boolean is mandatory in C++, because __builtin_expect +// takes a long argument, which means you may trigger the wrong conversion +// without it. +// +#if defined(__GNUC__) || defined(__ICL) || defined(__clang__) +#define C10_LIKELY(expr) (__builtin_expect(static_cast(expr), 1)) +#define C10_UNLIKELY(expr) (__builtin_expect(static_cast(expr), 0)) +#else +#define C10_LIKELY(expr) (expr) +#define C10_UNLIKELY(expr) (expr) +#endif + +/// C10_NOINLINE - Functions whose declaration is annotated with this will not +/// be inlined. +#ifdef __GNUC__ +#define C10_NOINLINE __attribute__((noinline)) +#elif _MSC_VER +#define C10_NOINLINE __declspec(noinline) +#else +#define C10_NOINLINE +#endif + +#if defined(_MSC_VER) +#define C10_ALWAYS_INLINE __forceinline +#elif __has_attribute(always_inline) || defined(__GNUC__) +#define C10_ALWAYS_INLINE __attribute__((__always_inline__)) inline +#else +#define C10_ALWAYS_INLINE inline +#endif + +// Unlike C10_ALWAYS_INLINE, C10_ALWAYS_INLINE_ATTRIBUTE can be used +// on a lambda. +#if defined(_MSC_VER) +// MSVC 14.39 is reasonably recent and doesn't like +// [[msvc::forceinline]] on a lambda, so don't try to use it. +#define C10_ALWAYS_INLINE_ATTRIBUTE +#elif __has_attribute(always_inline) || defined(__GNUC__) +#define C10_ALWAYS_INLINE_ATTRIBUTE __attribute__((__always_inline__)) +#else +#define C10_ALWAYS_INLINE_ATTRIBUTE +#endif + +#if defined(_MSC_VER) +#define C10_ATTR_VISIBILITY_HIDDEN +#elif defined(__GNUC__) +#define C10_ATTR_VISIBILITY_HIDDEN __attribute__((__visibility__("hidden"))) +#else +#define C10_ATTR_VISIBILITY_HIDDEN +#endif + +#define C10_ERASE C10_ALWAYS_INLINE C10_ATTR_VISIBILITY_HIDDEN + +#include + +#ifdef __HIPCC__ +// Unlike CUDA, HIP requires a HIP header to be included for __host__ to work. +// We do this #include here so that C10_HOST_DEVICE and friends will Just Work. 
+// See https://github.com/ROCm/hip/issues/441 +#include +#endif + +#if defined(__CUDACC__) || defined(__HIPCC__) +// Designates functions callable from the host (CPU) and the device (GPU) +#define C10_HOST_DEVICE __host__ __device__ +#define C10_DEVICE __device__ +#define C10_HOST __host__ +// constants from +// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications) +// The maximum number of threads per multiprocessor is 1024 for Turing +// architecture (7.5), 1536 for Geforce Ampere (8.6)/Jetson Orin (8.7), and +// 2048 for all other architectures. You'll get warnings if you exceed these +// constants. Hence, the following macros adjust the input values from the user +// to resolve potential warnings. +#if __CUDA_ARCH__ == 750 +constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024; +#elif __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 +constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536; +#else +constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048; +#endif +// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently +constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024; +// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block +// size. 256 is a good number for this fallback and should give good occupancy +// and versatility across all architectures. +constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; +// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it +// turns out that although __launch_bounds__ can take constexpr, it +// can't take a constexpr that has anything to do with templates. +// Currently we use launch_bounds that depend on template arguments in +// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK +// and C10_MIN_BLOCKS_PER_SM are kept as macros. +// Suppose you were planning to write __launch_bounds__(a, b), based on your +// performance tuning on a modern GPU. Instead, you should write +// __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)), +// which will also properly respect limits on old architectures. +#define C10_MAX_THREADS_PER_BLOCK(val) \ + (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) \ + : CUDA_THREADS_PER_BLOCK_FALLBACK) +#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) \ + ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \ + ? (blocks_per_sm) \ + : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) / \ + (threads_per_block)))) +// C10_LAUNCH_BOUNDS is analogous to __launch_bounds__ +#define C10_LAUNCH_BOUNDS_0 \ + __launch_bounds__( \ + 256, 4) // default launch bounds that should give good occupancy and + // versatility across all architectures. +#define C10_LAUNCH_BOUNDS_1(max_threads_per_block) \ + __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block)))) +#define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) \ + __launch_bounds__( \ + (C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))), \ + (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm)))) +#else +#define C10_HOST_DEVICE +#define C10_HOST +#define C10_DEVICE +#endif + +#if defined(USE_ROCM) +#define C10_HIP_HOST_DEVICE __host__ __device__ +#else +#define C10_HIP_HOST_DEVICE +#endif + +#if defined(USE_ROCM) +// C10_WARP_SIZE is only allowed for device code. 
+// Host code _must_ use at::cuda::warp_size() +// HIP header used to define warpSize as a constexpr that was either 32 or 64 +// depending on the target device, and then always set it to 64 for host code. +// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we +// set it to something unreasonable to trigger obvious host code errors. + +namespace at::cuda { +TORCH_CUDA_CPP_API int warp_size(); +} +#ifdef __HIPCC__ +static inline int __host__ C10_WARP_SIZE_INTERNAL() { + return at::cuda::warp_size(); +} + +static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() { +#if defined(__GFX9__) + return 64; +#else // __GFX9__ + return 32; +#endif // __GFX9__ +} +#else // __HIPCC__ +static inline int C10_WARP_SIZE_INTERNAL() { + return at::cuda::warp_size(); +} +#endif // __HIPCC__ + +#define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL()) +#define C10_WARP_SIZE_STATIC 64 + +#else // defined(USE_ROCM) +#define C10_WARP_SIZE 32 +#endif + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +// CUDA_KERNEL_ASSERT checks the assertion +// even when NDEBUG is defined. This is useful for important assertions in CUDA +// code that would otherwise be suppressed when building Release. +#if defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__) +// Those platforms do not support assert() +#define CUDA_KERNEL_ASSERT(cond) +#define CUDA_KERNEL_ASSERT_MSG(cond, msg) +#define SYCL_KERNEL_ASSERT(cond) +#elif defined(_MSC_VER) +#if defined(NDEBUG) +extern "C" { +C10_IMPORT +#if defined(__SYCL_DEVICE_ONLY__) +extern SYCL_EXTERNAL void _wassert( + const wchar_t* wexpr, + const wchar_t* wfile, + unsigned line); +#else +#if defined(__CUDA_ARCH__) +__host__ __device__ +#endif // __CUDA_ARCH__ + void + _wassert(wchar_t const* _Message, wchar_t const* _File, unsigned _Line); +#endif // __SYCL_DEVICE_ONLY__ +} +#endif // NDEBUG +#define CUDA_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + (void)(_wassert( \ + _CRT_WIDE(#cond), \ + _CRT_WIDE(__FILE__), \ + static_cast(__LINE__)), \ + 0); \ + } +// TODO: This doesn't assert the message because I (chilli) couldn't figure out +// a nice way to convert a char* to a wchar_t* +#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ + if (C10_UNLIKELY(!(cond))) { \ + (void)(_wassert( \ + _CRT_WIDE(#cond), \ + _CRT_WIDE(__FILE__), \ + static_cast(__LINE__)), \ + 0); \ + } +#define SYCL_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + (void)(_wassert( \ + _CRT_WIDE(#cond), \ + _CRT_WIDE(__FILE__), \ + static_cast(__LINE__)), \ + 0); \ + } +#else // __APPLE__, _MSC_VER +#if defined(NDEBUG) +extern "C" { +#if defined(__SYCL_DEVICE_ONLY__) +extern SYCL_EXTERNAL void __assert_fail( + const char* expr, + const char* file, + unsigned int line, + const char* func); +#else // __SYCL_DEVICE_ONLY__ +#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__))) +// CUDA supports __assert_fail function which are common for both device +// and host side code. +__host__ __device__ +#endif + + // This forward declaration matching the declaration of __assert_fail + // exactly how it is in glibc in case parts of the program are compiled with + // different NDEBUG settings. Otherwise we might get 'ambiguous declaration' + // error. Note: On ROCm - this declaration serves for host side compilation. 
+ void + __assert_fail( + const char* assertion, + const char* file, + unsigned int line, + const char* function) noexcept __attribute__((__noreturn__)); + +#endif // __SYCL_DEVICE_ONLY__ +} +#endif // NDEBUG +// ROCm disables kernel assert by default for performance considerations. +// Though ROCm supports __assert_fail, it uses kernel printf which has +// a non-negligible performance impact even if the assert condition is +// never triggered. We choose to use abort() instead which will still +// terminate the application but without a more useful error message. +#if !defined(C10_USE_ROCM_KERNEL_ASSERT) and defined(USE_ROCM) +#define CUDA_KERNEL_ASSERT(cond) \ + if C10_UNLIKELY (!(cond)) { \ + abort(); \ + } +#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ + if C10_UNLIKELY (!(cond)) { \ + abort(); \ + } +#define SYCL_KERNEL_ASSERT(cond) \ + if C10_UNLIKELY (!(cond)) { \ + abort(); \ + } +#else +#define CUDA_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + __assert_fail( \ + #cond, __FILE__, static_cast(__LINE__), __func__); \ + } +#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ + if (C10_UNLIKELY(!(cond))) { \ + __assert_fail( \ + msg, __FILE__, static_cast(__LINE__), __func__); \ + } +#define SYCL_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + __assert_fail( \ + #cond, __FILE__, static_cast(__LINE__), __func__); \ + } +#endif // C10_USE_ROCM_KERNEL_ASSERT and USE_ROCM +#endif // __APPLE__ + +#ifdef __APPLE__ +#include +#endif + +#if defined(__ANDROID__) +#define C10_ANDROID 1 +#define C10_MOBILE 1 +#elif ( \ + defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) +#define C10_IOS 1 +#define C10_MOBILE 1 +#endif // ANDROID / IOS + +#if defined(C10_MOBILE) && C10_MOBILE +#define C10_ALWAYS_INLINE_UNLESS_MOBILE inline +#else +#define C10_ALWAYS_INLINE_UNLESS_MOBILE C10_ALWAYS_INLINE +#endif + +#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) +#define CONSTEXPR_EXCEPT_WIN_CUDA constexpr +#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA constexpr + +#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ + static constexpr const char field[] = val; +#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) +#endif // !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) + +#ifndef HAS_DEMANGLE +#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) +#define HAS_DEMANGLE 0 +#elif defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE) +#define HAS_DEMANGLE 0 +#else +#define HAS_DEMANGLE 1 +#endif +#endif // HAS_DEMANGLE + +#define _C10_PRAGMA__(string) _Pragma(#string) +#define _C10_PRAGMA_(string) _C10_PRAGMA__(string) + +#ifdef __clang__ +#define C10_CLANG_DIAGNOSTIC_PUSH() _Pragma("clang diagnostic push") +#define C10_CLANG_DIAGNOSTIC_POP() _Pragma("clang diagnostic pop") +#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) \ + _C10_PRAGMA_(clang diagnostic ignored flag) +#define C10_CLANG_HAS_WARNING(flag) __has_warning(flag) +#else +#define C10_CLANG_DIAGNOSTIC_PUSH() +#define C10_CLANG_DIAGNOSTIC_POP() +#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) +#define C10_CLANG_HAS_WARNING(flag) 0 +#endif + +#ifdef __clang__ + +#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \ + _C10_PRAGMA_(clang diagnostic push) \ + _C10_PRAGMA_(clang diagnostic ignored "-Wunknown-warning-option") \ + _C10_PRAGMA_(clang diagnostic ignored warning) + +#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(clang diagnostic pop) + +#elif __GNUC__ + +#define 
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \ + _C10_PRAGMA_(GCC diagnostic push) \ + _C10_PRAGMA_(GCC diagnostic ignored "-Wpragmas") \ + _C10_PRAGMA_(GCC diagnostic ignored warning) + +#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(GCC diagnostic pop) + +#else + +#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) +#define C10_DIAGNOSTIC_POP() + +#endif + +// This macro is used to find older C++ compilers +// that don't support move optimization for return values. + +#if (defined(__GNUC__) && __GNUC__ < 13) || \ + (defined(__clang_major__) && __clang_major__ < 13) +#define C10_RETURN_MOVE_IF_OLD_COMPILER 1 +#else +#define C10_RETURN_MOVE_IF_OLD_COMPILER 0 +#endif + +#endif // C10_MACROS_MACROS_H_
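The TypeSafeSignMath.h hunk above rewrites std::numeric_limits<T>::max() as (std::numeric_limits<T>::max)(). The extra parentheses keep the token max from being followed directly by "(", so a function-style max(a, b) macro (such as the one <windows.h> defines when NOMINMAX is not set) is never expanded and the member function is called as intended. A minimal standalone sketch of the idiom, not taken from the patch:

#include <limits>

// If a max(a, b) macro were in scope, the unparenthesized spelling
// std::numeric_limits<T>::max() would be macro-expanded into something
// that does not compile. The parenthesized spelling below is immune,
// because the macro name is not immediately followed by "(".
template <typename T>
constexpr T largest_value() {
  return (std::numeric_limits<T>::max)();
}

// Sanity check, assuming the usual 8-bit char.
static_assert(largest_value<unsigned char>() == 255, "unexpected UCHAR_MAX");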
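Both the deleted c10/macros/Macros.h and the new torch/headeronly/macros/Macros.h define C10_ANONYMOUS_VARIABLE in terms of __COUNTER__ (with an __LINE__ fallback) to mint unique identifiers. A small self-contained sketch of the same pattern, using stand-in names rather than the C10 macros themselves:

#include <cstdio>

// Mirrors C10_CONCATENATE / C10_ANONYMOUS_VARIABLE; the MY_ prefix marks
// these as illustrative stand-ins, not the macros from the header above.
#define MY_CONCATENATE_IMPL(s1, s2) s1##s2
#define MY_CONCATENATE(s1, s2) MY_CONCATENATE_IMPL(s1, s2)
#define MY_ANONYMOUS_VARIABLE(str) MY_CONCATENATE(str, __COUNTER__)

struct ScopeLogger {
  ScopeLogger() { std::puts("enter"); }
  ~ScopeLogger() { std::puts("exit"); }
};

// Each expansion produces a distinct identifier (e.g. guard_0, guard_1),
// so several guards can coexist in one scope without name collisions.
#define LOG_SCOPE() ScopeLogger MY_ANONYMOUS_VARIABLE(guard_)

int main() {
  LOG_SCOPE();
  LOG_SCOPE();  // no redefinition error: a different __COUNTER__ value
  return 0;
}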