From a29ebf2b0b0ca624ed43b57a8b79b9e405ae9f4e Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Fri, 18 Jul 2025 13:02:40 -0700
Subject: [PATCH 1/5] bump pt core pin to 0718

Differential Revision: [D78579692](https://our.internmc.facebook.com/intern/diff/D78579692/)

[ghstack-poisoned]
---
 .ci/docker/ci_commit_pins/pytorch.txt | 2 +-
 install_requirements.py               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index f574f800608..a56cae9d85a 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-ab43fe4bdf5ccd82897f0e982c451a0127bd175e
+744d29186e941d5fe8cb141de5bb9b89d6d77351
diff --git a/install_requirements.py b/install_requirements.py
index 368e7cd079d..4b80e78764d 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -71,7 +71,7 @@ def python_is_compatible():
 #
 # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt
 # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/
-NIGHTLY_VERSION = "dev20250716"
+NIGHTLY_VERSION = "dev20250718"
 
 
 def install_requirements(use_pytorch_nightly):

From 45246049a3a9fdbfccd725896b4d7b78e7d85585 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Fri, 18 Jul 2025 14:04:09 -0700
Subject: [PATCH 2/5] Update on "bump pt core pin to 0718"

Differential Revision: [D78579692](https://our.internmc.facebook.com/intern/diff/D78579692/)

[ghstack-poisoned]
---
 .../portable_type/c10/c10/macros/Export.h |  77 ---
 .../portable_type/c10/c10/macros/Macros.h | 549 +-----------------
 2 files changed, 1 insertion(+), 625 deletions(-)

diff --git a/runtime/core/portable_type/c10/c10/macros/Export.h b/runtime/core/portable_type/c10/c10/macros/Export.h
index 3d912661026..1b8a6811c53 100644
--- a/runtime/core/portable_type/c10/c10/macros/Export.h
+++ b/runtime/core/portable_type/c10/c10/macros/Export.h
@@ -1,78 +1 @@
-#ifndef C10_MACROS_EXPORT_H_
-#define C10_MACROS_EXPORT_H_
-
-#ifndef C10_USING_CUSTOM_GENERATED_MACROS
-#include
-#endif // C10_USING_CUSTOM_GENERATED_MACROS
-
 #include
-
-// This one is being used by libtorch.so
-#ifdef CAFFE2_BUILD_MAIN_LIB
-#define TORCH_API C10_EXPORT
-#else
-#define TORCH_API C10_IMPORT
-#endif
-
-// You may be wondering: Whose brilliant idea was it to split torch_cuda into
-// two pieces with confusing names?
-// Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we
-// tried to compile PyTorch for CUDA 11.1, which ran into relocation marker
-// issues when linking big binaries.
-// (https://github.com/pytorch/pytorch/issues/39968) We had two choices:
-// (1) Stop supporting so many GPU architectures
-// (2) Do something else
-// We chose #2 and decided to split the behemoth that was torch_cuda into two
-// smaller libraries, one with most of the core kernel functions (torch_cuda_cu)
-// and the other that had..well..everything else (torch_cuda_cpp). The idea was
-// this: instead of linking our static libraries (like the hefty
-// libcudnn_static.a) with another huge library, torch_cuda, and run into pesky
-// relocation marker issues, we could link our static libraries to a smaller
-// part of torch_cuda (torch_cuda_cpp) and avoid the issues.
- -// libtorch_cuda_cu.so -#ifdef TORCH_CUDA_CU_BUILD_MAIN_LIB -#define TORCH_CUDA_CU_API C10_EXPORT -#elif defined(BUILD_SPLIT_CUDA) -#define TORCH_CUDA_CU_API C10_IMPORT -#endif - -// libtorch_cuda_cpp.so -#ifdef TORCH_CUDA_CPP_BUILD_MAIN_LIB -#define TORCH_CUDA_CPP_API C10_EXPORT -#elif defined(BUILD_SPLIT_CUDA) -#define TORCH_CUDA_CPP_API C10_IMPORT -#endif - -// libtorch_cuda.so (where torch_cuda_cu and torch_cuda_cpp are a part of the -// same api) -#ifdef TORCH_CUDA_BUILD_MAIN_LIB -#define TORCH_CUDA_CPP_API C10_EXPORT -#define TORCH_CUDA_CU_API C10_EXPORT -#elif !defined(BUILD_SPLIT_CUDA) -#define TORCH_CUDA_CPP_API C10_IMPORT -#define TORCH_CUDA_CU_API C10_IMPORT -#endif - -#if defined(TORCH_HIP_BUILD_MAIN_LIB) -#define TORCH_HIP_CPP_API C10_EXPORT -#define TORCH_HIP_API C10_EXPORT -#else -#define TORCH_HIP_CPP_API C10_IMPORT -#define TORCH_HIP_API C10_IMPORT -#endif - -#if defined(TORCH_XPU_BUILD_MAIN_LIB) -#define TORCH_XPU_API C10_EXPORT -#else -#define TORCH_XPU_API C10_IMPORT -#endif - -// Enums only need to be exported on windows for non-CUDA files -#if defined(_WIN32) && defined(__CUDACC__) -#define C10_API_ENUM C10_API -#else -#define C10_API_ENUM -#endif - -#endif // C10_MACROS_EXPORT_H_ diff --git a/runtime/core/portable_type/c10/c10/macros/Macros.h b/runtime/core/portable_type/c10/c10/macros/Macros.h index 55a79ee6743..87ebc4f422c 100644 --- a/runtime/core/portable_type/c10/c10/macros/Macros.h +++ b/runtime/core/portable_type/c10/c10/macros/Macros.h @@ -1,548 +1 @@ -#ifndef C10_MACROS_MACROS_H_ -#define C10_MACROS_MACROS_H_ -#include - -/* Main entry for c10/macros. - * - * In your code, include c10/macros/Macros.h directly, instead of individual - * files in this folder. - */ - -// For build systems that do not directly depend on CMake and directly build -// from the source directory (such as Buck), one may not have a cmake_macros.h -// file at all. In this case, the build system is responsible for providing -// correct macro definitions corresponding to the cmake_macros.h.in file. -// -// In such scenarios, one should define the macro -// C10_USING_CUSTOM_GENERATED_MACROS -// to inform this header that it does not need to include the cmake_macros.h -// file. 
- -#ifndef C10_USING_CUSTOM_GENERATED_MACROS -#include -#endif // C10_USING_CUSTOM_GENERATED_MACROS - -#include - -#if defined(__clang__) -#define __ubsan_ignore_float_divide_by_zero__ \ - __attribute__((no_sanitize("float-divide-by-zero"))) -#define __ubsan_ignore_undefined__ __attribute__((no_sanitize("undefined"))) -#define __ubsan_ignore_signed_int_overflow__ \ - __attribute__((no_sanitize("signed-integer-overflow"))) -#define __ubsan_ignore_pointer_overflow__ \ - __attribute__((no_sanitize("pointer-overflow"))) -#define __ubsan_ignore_function__ __attribute__((no_sanitize("function"))) -#define __ubsan_ignore_float_cast_overflow__ \ - __attribute__((no_sanitize("float-cast-overflow"))) -#else -#define __ubsan_ignore_float_divide_by_zero__ -#define __ubsan_ignore_undefined__ -#define __ubsan_ignore_signed_int_overflow__ -#define __ubsan_ignore_pointer_overflow__ -#define __ubsan_ignore_function__ -#define __ubsan_ignore_float_cast_overflow__ -#endif - -// Detect address sanitizer as some stuff doesn't work with it -#undef C10_ASAN_ENABLED - -// for clang -#if defined(__has_feature) -#if ((__has_feature(address_sanitizer))) -#define C10_ASAN_ENABLED 1 -#endif -#endif - -// for gcc -#if defined(__SANITIZE_ADDRESS__) -#if __SANITIZE_ADDRESS__ -#if !defined(C10_ASAN_ENABLED) -#define C10_ASAN_ENABLED 1 -#endif -#endif -#endif - -#if !defined(C10_ASAN_ENABLED) -#define C10_ASAN_ENABLED 0 -#endif - -// Detect undefined-behavior sanitizer (UBSAN) -#undef C10_UBSAN_ENABLED - -// for clang or gcc >= 14 -// NB: gcc 14 adds support for Clang's __has_feature -// https://gcc.gnu.org/gcc-14/changes.html -// gcc < 14 doesn't have a macro for UBSAN -// (e.g. __SANITIZE_UNDEFINED__ does not exist in gcc) -// https://github.com/google/sanitizers/issues/765 -#if defined(__has_feature) -#if ((__has_feature(undefined_behavior_sanitizer))) -#define C10_UBSAN_ENABLED 1 -#endif -#endif - -#if !defined(C10_UBSAN_ENABLED) -#define C10_UBSAN_ENABLED 0 -#endif - -// Disable the copy and assignment operator for a class. Note that this will -// disable the usage of the class in std containers. -#define C10_DISABLE_COPY_AND_ASSIGN(classname) \ - classname(const classname&) = delete; \ - classname& operator=(const classname&) = delete - -#define C10_CONCATENATE_IMPL(s1, s2) s1##s2 -#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2) - -#define C10_MACRO_EXPAND(args) args - -#define C10_STRINGIZE_IMPL(x) #x -#define C10_STRINGIZE(x) C10_STRINGIZE_IMPL(x) - -/** - * C10_ANONYMOUS_VARIABLE(str) introduces a new identifier which starts with - * str and ends with a unique number. - */ -#ifdef __COUNTER__ -#define C10_UID __COUNTER__ -#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__) -#else -#define C10_UID __LINE__ -#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__) -#endif - -#ifdef __has_cpp_attribute -#define C10_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) -#else -#define C10_HAS_CPP_ATTRIBUTE(x) (0) -#endif - -#ifndef FBCODE_CAFFE2 -/// DEPRECATED: Warn if a type or return value is discarded. -#define C10_NODISCARD [[nodiscard]] - -/// DEPRECATED: Suppress an unused variable. -#define C10_UNUSED [[maybe_unused]] -#endif - -#if !defined(__has_attribute) -#define __has_attribute(x) 0 -#endif - -// Direct port of LLVM_ATTRIBUTE_USED. 
-#if __has_attribute(used) -#define C10_USED __attribute__((__used__)) -#else -#define C10_USED -#endif - -#define C10_RESTRICT __restrict - -// Simply define the namespace, in case a dependent library want to refer to -// the c10 namespace but not any nontrivial files. -namespace c10 {} -namespace c10::cuda {} -namespace c10::hip {} -namespace c10::xpu {} - -// Since C10 is the core library for caffe2 (and aten), we will simply reroute -// all abstractions defined in c10 to be available in caffe2 as well. -// This is only for backwards compatibility. Please use the symbols from the -// c10 namespace where possible. -namespace caffe2 { -using namespace c10; -} -namespace at { -using namespace c10; -} -namespace at::cuda { -using namespace c10::cuda; -} // namespace at::cuda - -// WARNING!!! THIS IS A GIANT HACK!!! -// This line means you cannot simultaneously include c10/hip -// and c10/cuda and then use them from the at::cuda namespace. -// This is true in practice, because HIPIFY works inplace on -// files in ATen/cuda, so it assumes that c10::hip is available -// from at::cuda. This namespace makes that happen. When -// HIPIFY is no longer out-of-place, we can switch the cuda -// here to hip and everyone is happy. -namespace at::cuda { -using namespace c10::hip; -} // namespace at::cuda - -namespace at::xpu { -using namespace c10::xpu; -} // namespace at::xpu - -// C10_LIKELY/C10_UNLIKELY -// -// These macros provide parentheses, so you can use these macros as: -// -// if C10_LIKELY(some_expr) { -// ... -// } -// -// NB: static_cast to boolean is mandatory in C++, because __builtin_expect -// takes a long argument, which means you may trigger the wrong conversion -// without it. -// -#if defined(__GNUC__) || defined(__ICL) || defined(__clang__) -#define C10_LIKELY(expr) (__builtin_expect(static_cast(expr), 1)) -#define C10_UNLIKELY(expr) (__builtin_expect(static_cast(expr), 0)) -#else -#define C10_LIKELY(expr) (expr) -#define C10_UNLIKELY(expr) (expr) -#endif - -/// C10_NOINLINE - Functions whose declaration is annotated with this will not -/// be inlined. -#ifdef __GNUC__ -#define C10_NOINLINE __attribute__((noinline)) -#elif _MSC_VER -#define C10_NOINLINE __declspec(noinline) -#else -#define C10_NOINLINE -#endif - -#if defined(_MSC_VER) -#define C10_ALWAYS_INLINE __forceinline -#elif __has_attribute(always_inline) || defined(__GNUC__) -#define C10_ALWAYS_INLINE __attribute__((__always_inline__)) inline -#else -#define C10_ALWAYS_INLINE inline -#endif - -// Unlike C10_ALWAYS_INLINE, C10_ALWAYS_INLINE_ATTRIBUTE can be used -// on a lambda. -#if defined(_MSC_VER) -// MSVC 14.39 is reasonably recent and doesn't like -// [[msvc::forceinline]] on a lambda, so don't try to use it. -#define C10_ALWAYS_INLINE_ATTRIBUTE -#elif __has_attribute(always_inline) || defined(__GNUC__) -#define C10_ALWAYS_INLINE_ATTRIBUTE __attribute__((__always_inline__)) -#else -#define C10_ALWAYS_INLINE_ATTRIBUTE -#endif - -#if defined(_MSC_VER) -#define C10_ATTR_VISIBILITY_HIDDEN -#elif defined(__GNUC__) -#define C10_ATTR_VISIBILITY_HIDDEN __attribute__((__visibility__("hidden"))) -#else -#define C10_ATTR_VISIBILITY_HIDDEN -#endif - -#define C10_ERASE C10_ALWAYS_INLINE C10_ATTR_VISIBILITY_HIDDEN - -#include - -#ifdef __HIPCC__ -// Unlike CUDA, HIP requires a HIP header to be included for __host__ to work. -// We do this #include here so that C10_HOST_DEVICE and friends will Just Work. 
-// See https://github.com/ROCm/hip/issues/441 -#include -#endif - -#if defined(__CUDACC__) || defined(__HIPCC__) -// Designates functions callable from the host (CPU) and the device (GPU) -#define C10_HOST_DEVICE __host__ __device__ -#define C10_DEVICE __device__ -#define C10_HOST __host__ -// constants from -// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications) -// The maximum number of threads per multiprocessor is 1024 for Turing -// architecture (7.5), 1536 for Geforce Ampere (8.6)/Jetson Orin (8.7), and -// 2048 for all other architectures. You'll get warnings if you exceed these -// constants. Hence, the following macros adjust the input values from the user -// to resolve potential warnings. -#if __CUDA_ARCH__ == 750 -constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024; -#elif __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 -constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536; -#else -constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048; -#endif -// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently -constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024; -// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block -// size. 256 is a good number for this fallback and should give good occupancy -// and versatility across all architectures. -constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; -// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it -// turns out that although __launch_bounds__ can take constexpr, it -// can't take a constexpr that has anything to do with templates. -// Currently we use launch_bounds that depend on template arguments in -// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK -// and C10_MIN_BLOCKS_PER_SM are kept as macros. -// Suppose you were planning to write __launch_bounds__(a, b), based on your -// performance tuning on a modern GPU. Instead, you should write -// __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)), -// which will also properly respect limits on old architectures. -#define C10_MAX_THREADS_PER_BLOCK(val) \ - (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) \ - : CUDA_THREADS_PER_BLOCK_FALLBACK) -#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) \ - ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \ - ? (blocks_per_sm) \ - : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) / \ - (threads_per_block)))) -// C10_LAUNCH_BOUNDS is analogous to __launch_bounds__ -#define C10_LAUNCH_BOUNDS_0 \ - __launch_bounds__( \ - 256, 4) // default launch bounds that should give good occupancy and - // versatility across all architectures. -#define C10_LAUNCH_BOUNDS_1(max_threads_per_block) \ - __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block)))) -#define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) \ - __launch_bounds__( \ - (C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))), \ - (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm)))) -#else -#define C10_HOST_DEVICE -#define C10_HOST -#define C10_DEVICE -#endif - -#if defined(USE_ROCM) -#define C10_HIP_HOST_DEVICE __host__ __device__ -#else -#define C10_HIP_HOST_DEVICE -#endif - -#if defined(USE_ROCM) -// C10_WARP_SIZE is only allowed for device code. 
-// Host code _must_ use at::cuda::warp_size() -// HIP header used to define warpSize as a constexpr that was either 32 or 64 -// depending on the target device, and then always set it to 64 for host code. -// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we -// set it to something unreasonable to trigger obvious host code errors. -#if defined(__HIP_DEVICE_COMPILE__) -#if defined(__GFX9__) -static constexpr int C10_WARP_SIZE = 64; -#else // __GFX9__ -static constexpr int C10_WARP_SIZE = 32; -#endif // __GFX9__ -#else -static constexpr int C10_WARP_SIZE = 1; -#endif // __HIP_DEVICE_COMPILE__ -#else -#define C10_WARP_SIZE 32 -#endif - -#if defined(_MSC_VER) && _MSC_VER <= 1900 -#define __func__ __FUNCTION__ -#endif - -// CUDA_KERNEL_ASSERT checks the assertion -// even when NDEBUG is defined. This is useful for important assertions in CUDA -// code that would otherwise be suppressed when building Release. -#if defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__) -// Those platforms do not support assert() -#define CUDA_KERNEL_ASSERT(cond) -#define CUDA_KERNEL_ASSERT_MSG(cond, msg) -#define SYCL_KERNEL_ASSERT(cond) -#elif defined(_MSC_VER) -#if defined(NDEBUG) -extern "C" { -C10_IMPORT -#if defined(__SYCL_DEVICE_ONLY__) -extern SYCL_EXTERNAL void _wassert( - const wchar_t* wexpr, - const wchar_t* wfile, - unsigned line); -#else -#if defined(__CUDA_ARCH__) -__host__ __device__ -#endif // __CUDA_ARCH__ - void - _wassert(wchar_t const* _Message, wchar_t const* _File, unsigned _Line); -#endif // __SYCL_DEVICE_ONLY__ -} -#endif // NDEBUG -#define CUDA_KERNEL_ASSERT(cond) \ - if (C10_UNLIKELY(!(cond))) { \ - (void)(_wassert( \ - _CRT_WIDE(#cond), \ - _CRT_WIDE(__FILE__), \ - static_cast(__LINE__)), \ - 0); \ - } -// TODO: This doesn't assert the message because I (chilli) couldn't figure out -// a nice way to convert a char* to a wchar_t* -#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ - if (C10_UNLIKELY(!(cond))) { \ - (void)(_wassert( \ - _CRT_WIDE(#cond), \ - _CRT_WIDE(__FILE__), \ - static_cast(__LINE__)), \ - 0); \ - } -#define SYCL_KERNEL_ASSERT(cond) \ - if (C10_UNLIKELY(!(cond))) { \ - (void)(_wassert( \ - _CRT_WIDE(#cond), \ - _CRT_WIDE(__FILE__), \ - static_cast(__LINE__)), \ - 0); \ - } -#else // __APPLE__, _MSC_VER -#if defined(NDEBUG) -extern "C" { -#if defined(__SYCL_DEVICE_ONLY__) -extern SYCL_EXTERNAL void __assert_fail( - const char* expr, - const char* file, - unsigned int line, - const char* func); -#else // __SYCL_DEVICE_ONLY__ -#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__))) -// CUDA supports __assert_fail function which are common for both device -// and host side code. -__host__ __device__ -#endif - - // This forward declaration matching the declaration of __assert_fail - // exactly how it is in glibc in case parts of the program are compiled with - // different NDEBUG settings. Otherwise we might get 'ambiguous declaration' - // error. Note: On ROCm - this declaration serves for host side compilation. - void - __assert_fail( - const char* assertion, - const char* file, - unsigned int line, - const char* function) noexcept __attribute__((__noreturn__)); - -#endif // __SYCL_DEVICE_ONLY__ -} -#endif // NDEBUG -// ROCm disables kernel assert by default for performance considerations. -// Though ROCm supports __assert_fail, it uses kernel printf which has -// a non-negligible performance impact even if the assert condition is -// never triggered. 
We choose to use abort() instead which will still -// terminate the application but without a more useful error message. -#if !defined(C10_USE_ROCM_KERNEL_ASSERT) and defined(USE_ROCM) -#define CUDA_KERNEL_ASSERT(cond) \ - if C10_UNLIKELY (!(cond)) { \ - abort(); \ - } -#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ - if C10_UNLIKELY (!(cond)) { \ - abort(); \ - } -#define SYCL_KERNEL_ASSERT(cond) \ - if C10_UNLIKELY (!(cond)) { \ - abort(); \ - } -#else -#define CUDA_KERNEL_ASSERT(cond) \ - if (C10_UNLIKELY(!(cond))) { \ - __assert_fail( \ - #cond, __FILE__, static_cast(__LINE__), __func__); \ - } -#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ - if (C10_UNLIKELY(!(cond))) { \ - __assert_fail( \ - msg, __FILE__, static_cast(__LINE__), __func__); \ - } -#define SYCL_KERNEL_ASSERT(cond) \ - if (C10_UNLIKELY(!(cond))) { \ - __assert_fail( \ - #cond, __FILE__, static_cast(__LINE__), __func__); \ - } -#endif // C10_USE_ROCM_KERNEL_ASSERT and USE_ROCM -#endif // __APPLE__ - -#ifdef __APPLE__ -#include -#endif - -#if defined(__ANDROID__) -#define C10_ANDROID 1 -#define C10_MOBILE 1 -#elif ( \ - defined(__APPLE__) && \ - (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) -#define C10_IOS 1 -#define C10_MOBILE 1 -#endif // ANDROID / IOS - -#if defined(C10_MOBILE) && C10_MOBILE -#define C10_ALWAYS_INLINE_UNLESS_MOBILE inline -#else -#define C10_ALWAYS_INLINE_UNLESS_MOBILE C10_ALWAYS_INLINE -#endif - -#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) -#define CONSTEXPR_EXCEPT_WIN_CUDA constexpr -#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA constexpr - -#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ - static constexpr const char field[] = val; -#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) -#endif // !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) - -#ifndef HAS_DEMANGLE -#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) -#define HAS_DEMANGLE 0 -#elif defined(__APPLE__) && \ - (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE) -#define HAS_DEMANGLE 0 -#else -#define HAS_DEMANGLE 1 -#endif -#endif // HAS_DEMANGLE - -#define _C10_PRAGMA__(string) _Pragma(#string) -#define _C10_PRAGMA_(string) _C10_PRAGMA__(string) - -#ifdef __clang__ -#define C10_CLANG_DIAGNOSTIC_PUSH() _Pragma("clang diagnostic push") -#define C10_CLANG_DIAGNOSTIC_POP() _Pragma("clang diagnostic pop") -#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) \ - _C10_PRAGMA_(clang diagnostic ignored flag) -#define C10_CLANG_HAS_WARNING(flag) __has_warning(flag) -#else -#define C10_CLANG_DIAGNOSTIC_PUSH() -#define C10_CLANG_DIAGNOSTIC_POP() -#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) -#define C10_CLANG_HAS_WARNING(flag) 0 -#endif - -#ifdef __clang__ - -#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \ - _C10_PRAGMA_(clang diagnostic push) \ - _C10_PRAGMA_(clang diagnostic ignored "-Wunknown-warning-option") \ - _C10_PRAGMA_(clang diagnostic ignored warning) - -#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(clang diagnostic pop) - -#elif __GNUC__ - -#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \ - _C10_PRAGMA_(GCC diagnostic push) \ - _C10_PRAGMA_(GCC diagnostic ignored "-Wpragmas") \ - _C10_PRAGMA_(GCC diagnostic ignored warning) - -#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(GCC diagnostic pop) - -#else - -#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) -#define C10_DIAGNOSTIC_POP() - -#endif - -// This macro is used to find older C++ compilers -// that don't support move optimization for return values. 
- -#if (defined(__GNUC__) && __GNUC__ < 13) || \ - (defined(__clang_major__) && __clang_major__ < 13) -#define C10_RETURN_MOVE_IF_OLD_COMPILER 1 -#else -#define C10_RETURN_MOVE_IF_OLD_COMPILER 0 -#endif - -#endif // C10_MACROS_MACROS_H_ +#include From c7df2c576a747e30cea334240e96462a5e4d1b3a Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Fri, 18 Jul 2025 14:09:49 -0700 Subject: [PATCH 3/5] Update on "bump pt core pin to 0718" Differential Revision: [D78579692](https://our.internmc.facebook.com/intern/diff/D78579692/) [ghstack-poisoned] --- .../c10/torch/headeronly/macros/Export.h | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h index 183aeab5634..8dd25419efb 100644 --- a/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h +++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h @@ -1,5 +1,12 @@ #pragma once +#ifndef C10_MACROS_EXPORT_H_ +#define C10_MACROS_EXPORT_H_ + +#ifndef C10_USING_CUSTOM_GENERATED_MACROS +#include +#endif // C10_USING_CUSTOM_GENERATED_MACROS + /* Header file to define the common scaffolding for exported symbols. * * Export is by itself a quite tricky situation to deal with, and if you are @@ -85,3 +92,62 @@ #else #define C10_API C10_IMPORT #endif + +// This one is being used by libtorch.so +#ifdef CAFFE2_BUILD_MAIN_LIB +#define TORCH_API C10_EXPORT +#else +#define TORCH_API C10_IMPORT +#endif + +// You may be wondering why we have TORCH_CUDA_CPP_API and TORCH_CUDA_CU_API +// belonging to the same library instead of just one TORCH_CUDA_API. Well, it +// can indeed just be one TORCH_CUDA_API (and used to be)! TORCH_CUDA_CPP_API +// and TORCH_CUDA_CU_API are artifacts of when we needed a split build to +// avoid relocation marker linking errors. The context is as follows: +// +// Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we +// tried to compile PyTorch for CUDA 11.1, which ran into relocation marker +// issues when linking big binaries. +// (https://github.com/pytorch/pytorch/issues/39968) We had two choices: +// (1) Stop supporting so many GPU architectures +// (2) Do something else +// We chose #2 and decided to split the behemoth that was torch_cuda into two +// smaller libraries, one with most of the core kernel functions (torch_cuda_cu) +// and the other that had..well..everything else (torch_cuda_cpp). The idea was +// this: instead of linking our static libraries (like the hefty +// libcudnn_static.a) with another huge library, torch_cuda, and run into pesky +// relocation marker issues, we could link our static libraries to a smaller +// part of torch_cuda (torch_cuda_cpp) and avoid the issues. 
+
+// libtorch_cuda.so (where torch_cuda_cu and torch_cuda_cpp are a part of the
+// same api)
+#ifdef TORCH_CUDA_BUILD_MAIN_LIB
+#define TORCH_CUDA_CPP_API C10_EXPORT
+#define TORCH_CUDA_CU_API C10_EXPORT
+#else
+#define TORCH_CUDA_CPP_API C10_IMPORT
+#define TORCH_CUDA_CU_API C10_IMPORT
+#endif
+
+#if defined(TORCH_HIP_BUILD_MAIN_LIB)
+#define TORCH_HIP_CPP_API C10_EXPORT
+#define TORCH_HIP_API C10_EXPORT
+#else
+#define TORCH_HIP_CPP_API C10_IMPORT
+#define TORCH_HIP_API C10_IMPORT
+#endif
+
+#if defined(TORCH_XPU_BUILD_MAIN_LIB)
+#define TORCH_XPU_API C10_EXPORT
+#else
+#define TORCH_XPU_API C10_IMPORT
+#endif
+
+// Enums only need to be exported on windows for non-CUDA files
+#if defined(_WIN32) && defined(__CUDACC__)
+#define C10_API_ENUM C10_API
+#else
+#define C10_API_ENUM
+#endif
+#endif // C10_MACROS_EXPORT_H_

From c05b0cf671dbecbd329a47fe9cab7db271e27233 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Fri, 18 Jul 2025 14:14:49 -0700
Subject: [PATCH 4/5] Update on "bump pt core pin to 0718"

Differential Revision: [D78579692](https://our.internmc.facebook.com/intern/diff/D78579692/)

[ghstack-poisoned]

From 447fde39d56ffc0430b0cc5650e2115fbe3cadf0 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Fri, 18 Jul 2025 15:44:57 -0700
Subject: [PATCH 5/5] Update on "bump pt core pin to 0718"

Differential Revision: [D78579692](https://our.internmc.facebook.com/intern/diff/D78579692/)

[ghstack-poisoned]
---
 .ci/docker/ci_commit_pins/pytorch.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index a56cae9d85a..cdf993d3eb1 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-744d29186e941d5fe8cb141de5bb9b89d6d77351
+89d842fec5229fff0df5342b2db121368d51e717
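For readers unfamiliar with the export/import scheme that the Export.h changes above rely on, here is a minimal, self-contained sketch of the general pattern in isolation. It is hypothetical: the names mylib_export.h, MYLIB_BUILD_MAIN_LIB, MYLIB_API, and mylib_answer do not appear in this patch and stand in for C10_EXPORT/C10_IMPORT and TORCH_API-style macros; it assumes the conventional setup where only the library's own build defines the *_BUILD_MAIN_LIB-style macro.

// mylib_export.h -- hypothetical sketch of the dllexport/visibility pattern
// that macros like C10_EXPORT / C10_IMPORT / TORCH_API implement.
#pragma once

#if defined(_WIN32)
// On Windows, symbols must be explicitly exported from the DLL that defines
// them and imported everywhere else.
#define MYLIB_EXPORT __declspec(dllexport)
#define MYLIB_IMPORT __declspec(dllimport)
#else
// On gcc/clang, both sides simply request default (public) visibility.
#define MYLIB_EXPORT __attribute__((__visibility__("default")))
#define MYLIB_IMPORT __attribute__((__visibility__("default")))
#endif

// The library's own build defines MYLIB_BUILD_MAIN_LIB (e.g. via a
// -DMYLIB_BUILD_MAIN_LIB compile flag), so its public declarations expand to
// the export form; consumers of the installed header see the import form.
#ifdef MYLIB_BUILD_MAIN_LIB
#define MYLIB_API MYLIB_EXPORT
#else
#define MYLIB_API MYLIB_IMPORT
#endif

// A public entry point annotated with the macro; its definition lives in one
// of the library's own translation units.
MYLIB_API int mylib_answer();

A consumer would simply include this header and call mylib_answer(); only the library's own translation units are compiled with the MYLIB_BUILD_MAIN_LIB definition, which is the same build-side/consumer-side split selected by CAFFE2_BUILD_MAIN_LIB and the TORCH_*_BUILD_MAIN_LIB macros in the headers touched by this patch.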