From a29ebf2b0b0ca624ed43b57a8b79b9e405ae9f4e Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Fri, 18 Jul 2025 13:02:40 -0700
Subject: [PATCH 1/5] bump pt core pin to 0718

Differential Revision: [D78579692](https://our.internmc.facebook.com/intern/diff/D78579692/)

[ghstack-poisoned]
---
 .ci/docker/ci_commit_pins/pytorch.txt | 2 +-
 install_requirements.py               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index f574f800608..a56cae9d85a 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-ab43fe4bdf5ccd82897f0e982c451a0127bd175e
+744d29186e941d5fe8cb141de5bb9b89d6d77351
diff --git a/install_requirements.py b/install_requirements.py
index 368e7cd079d..4b80e78764d 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -71,7 +71,7 @@ def python_is_compatible():
 #
 # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt
 # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/
-NIGHTLY_VERSION = "dev20250716"
+NIGHTLY_VERSION = "dev20250718"
 
 
 def install_requirements(use_pytorch_nightly):

From 45246049a3a9fdbfccd725896b4d7b78e7d85585 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Fri, 18 Jul 2025 14:04:09 -0700
Subject: [PATCH 2/5] Update on "bump pt core pin to 0718"

Differential Revision: [D78579692](https://our.internmc.facebook.com/intern/diff/D78579692/)

[ghstack-poisoned]
---
 .../portable_type/c10/c10/macros/Export.h |  77 ---
 .../portable_type/c10/c10/macros/Macros.h | 549 +-----------------
 2 files changed, 1 insertion(+), 625 deletions(-)

diff --git a/runtime/core/portable_type/c10/c10/macros/Export.h b/runtime/core/portable_type/c10/c10/macros/Export.h
index 3d912661026..1b8a6811c53 100644
--- a/runtime/core/portable_type/c10/c10/macros/Export.h
+++ b/runtime/core/portable_type/c10/c10/macros/Export.h
@@ -1,78 +1 @@
-#ifndef C10_MACROS_EXPORT_H_
-#define C10_MACROS_EXPORT_H_
-
-#ifndef C10_USING_CUSTOM_GENERATED_MACROS
-#include
-#endif // C10_USING_CUSTOM_GENERATED_MACROS
-
 #include
-
-// This one is being used by libtorch.so
-#ifdef CAFFE2_BUILD_MAIN_LIB
-#define TORCH_API C10_EXPORT
-#else
-#define TORCH_API C10_IMPORT
-#endif
-
-// You may be wondering: Whose brilliant idea was it to split torch_cuda into
-// two pieces with confusing names?
-// Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we
-// tried to compile PyTorch for CUDA 11.1, which ran into relocation marker
-// issues when linking big binaries.
-// (https://github.com/pytorch/pytorch/issues/39968) We had two choices:
-// (1) Stop supporting so many GPU architectures
-// (2) Do something else
-// We chose #2 and decided to split the behemoth that was torch_cuda into two
-// smaller libraries, one with most of the core kernel functions (torch_cuda_cu)
-// and the other that had..well..everything else (torch_cuda_cpp). The idea was
-// this: instead of linking our static libraries (like the hefty
-// libcudnn_static.a) with another huge library, torch_cuda, and run into pesky
-// relocation marker issues, we could link our static libraries to a smaller
-// part of torch_cuda (torch_cuda_cpp) and avoid the issues.
- -// libtorch_cuda_cu.so -#ifdef TORCH_CUDA_CU_BUILD_MAIN_LIB -#define TORCH_CUDA_CU_API C10_EXPORT -#elif defined(BUILD_SPLIT_CUDA) -#define TORCH_CUDA_CU_API C10_IMPORT -#endif - -// libtorch_cuda_cpp.so -#ifdef TORCH_CUDA_CPP_BUILD_MAIN_LIB -#define TORCH_CUDA_CPP_API C10_EXPORT -#elif defined(BUILD_SPLIT_CUDA) -#define TORCH_CUDA_CPP_API C10_IMPORT -#endif - -// libtorch_cuda.so (where torch_cuda_cu and torch_cuda_cpp are a part of the -// same api) -#ifdef TORCH_CUDA_BUILD_MAIN_LIB -#define TORCH_CUDA_CPP_API C10_EXPORT -#define TORCH_CUDA_CU_API C10_EXPORT -#elif !defined(BUILD_SPLIT_CUDA) -#define TORCH_CUDA_CPP_API C10_IMPORT -#define TORCH_CUDA_CU_API C10_IMPORT -#endif - -#if defined(TORCH_HIP_BUILD_MAIN_LIB) -#define TORCH_HIP_CPP_API C10_EXPORT -#define TORCH_HIP_API C10_EXPORT -#else -#define TORCH_HIP_CPP_API C10_IMPORT -#define TORCH_HIP_API C10_IMPORT -#endif - -#if defined(TORCH_XPU_BUILD_MAIN_LIB) -#define TORCH_XPU_API C10_EXPORT -#else -#define TORCH_XPU_API C10_IMPORT -#endif - -// Enums only need to be exported on windows for non-CUDA files -#if defined(_WIN32) && defined(__CUDACC__) -#define C10_API_ENUM C10_API -#else -#define C10_API_ENUM -#endif - -#endif // C10_MACROS_EXPORT_H_ diff --git a/runtime/core/portable_type/c10/c10/macros/Macros.h b/runtime/core/portable_type/c10/c10/macros/Macros.h index 55a79ee6743..87ebc4f422c 100644 --- a/runtime/core/portable_type/c10/c10/macros/Macros.h +++ b/runtime/core/portable_type/c10/c10/macros/Macros.h @@ -1,548 +1 @@ -#ifndef C10_MACROS_MACROS_H_ -#define C10_MACROS_MACROS_H_ -#include - -/* Main entry for c10/macros. - * - * In your code, include c10/macros/Macros.h directly, instead of individual - * files in this folder. - */ - -// For build systems that do not directly depend on CMake and directly build -// from the source directory (such as Buck), one may not have a cmake_macros.h -// file at all. In this case, the build system is responsible for providing -// correct macro definitions corresponding to the cmake_macros.h.in file. -// -// In such scenarios, one should define the macro -// C10_USING_CUSTOM_GENERATED_MACROS -// to inform this header that it does not need to include the cmake_macros.h -// file. 
- -#ifndef C10_USING_CUSTOM_GENERATED_MACROS -#include -#endif // C10_USING_CUSTOM_GENERATED_MACROS - -#include - -#if defined(__clang__) -#define __ubsan_ignore_float_divide_by_zero__ \ - __attribute__((no_sanitize("float-divide-by-zero"))) -#define __ubsan_ignore_undefined__ __attribute__((no_sanitize("undefined"))) -#define __ubsan_ignore_signed_int_overflow__ \ - __attribute__((no_sanitize("signed-integer-overflow"))) -#define __ubsan_ignore_pointer_overflow__ \ - __attribute__((no_sanitize("pointer-overflow"))) -#define __ubsan_ignore_function__ __attribute__((no_sanitize("function"))) -#define __ubsan_ignore_float_cast_overflow__ \ - __attribute__((no_sanitize("float-cast-overflow"))) -#else -#define __ubsan_ignore_float_divide_by_zero__ -#define __ubsan_ignore_undefined__ -#define __ubsan_ignore_signed_int_overflow__ -#define __ubsan_ignore_pointer_overflow__ -#define __ubsan_ignore_function__ -#define __ubsan_ignore_float_cast_overflow__ -#endif - -// Detect address sanitizer as some stuff doesn't work with it -#undef C10_ASAN_ENABLED - -// for clang -#if defined(__has_feature) -#if ((__has_feature(address_sanitizer))) -#define C10_ASAN_ENABLED 1 -#endif -#endif - -// for gcc -#if defined(__SANITIZE_ADDRESS__) -#if __SANITIZE_ADDRESS__ -#if !defined(C10_ASAN_ENABLED) -#define C10_ASAN_ENABLED 1 -#endif -#endif -#endif - -#if !defined(C10_ASAN_ENABLED) -#define C10_ASAN_ENABLED 0 -#endif - -// Detect undefined-behavior sanitizer (UBSAN) -#undef C10_UBSAN_ENABLED - -// for clang or gcc >= 14 -// NB: gcc 14 adds support for Clang's __has_feature -// https://gcc.gnu.org/gcc-14/changes.html -// gcc < 14 doesn't have a macro for UBSAN -// (e.g. __SANITIZE_UNDEFINED__ does not exist in gcc) -// https://github.com/google/sanitizers/issues/765 -#if defined(__has_feature) -#if ((__has_feature(undefined_behavior_sanitizer))) -#define C10_UBSAN_ENABLED 1 -#endif -#endif - -#if !defined(C10_UBSAN_ENABLED) -#define C10_UBSAN_ENABLED 0 -#endif - -// Disable the copy and assignment operator for a class. Note that this will -// disable the usage of the class in std containers. -#define C10_DISABLE_COPY_AND_ASSIGN(classname) \ - classname(const classname&) = delete; \ - classname& operator=(const classname&) = delete - -#define C10_CONCATENATE_IMPL(s1, s2) s1##s2 -#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2) - -#define C10_MACRO_EXPAND(args) args - -#define C10_STRINGIZE_IMPL(x) #x -#define C10_STRINGIZE(x) C10_STRINGIZE_IMPL(x) - -/** - * C10_ANONYMOUS_VARIABLE(str) introduces a new identifier which starts with - * str and ends with a unique number. - */ -#ifdef __COUNTER__ -#define C10_UID __COUNTER__ -#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__) -#else -#define C10_UID __LINE__ -#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__) -#endif - -#ifdef __has_cpp_attribute -#define C10_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) -#else -#define C10_HAS_CPP_ATTRIBUTE(x) (0) -#endif - -#ifndef FBCODE_CAFFE2 -/// DEPRECATED: Warn if a type or return value is discarded. -#define C10_NODISCARD [[nodiscard]] - -/// DEPRECATED: Suppress an unused variable. -#define C10_UNUSED [[maybe_unused]] -#endif - -#if !defined(__has_attribute) -#define __has_attribute(x) 0 -#endif - -// Direct port of LLVM_ATTRIBUTE_USED. 
-#if __has_attribute(used) -#define C10_USED __attribute__((__used__)) -#else -#define C10_USED -#endif - -#define C10_RESTRICT __restrict - -// Simply define the namespace, in case a dependent library want to refer to -// the c10 namespace but not any nontrivial files. -namespace c10 {} -namespace c10::cuda {} -namespace c10::hip {} -namespace c10::xpu {} - -// Since C10 is the core library for caffe2 (and aten), we will simply reroute -// all abstractions defined in c10 to be available in caffe2 as well. -// This is only for backwards compatibility. Please use the symbols from the -// c10 namespace where possible. -namespace caffe2 { -using namespace c10; -} -namespace at { -using namespace c10; -} -namespace at::cuda { -using namespace c10::cuda; -} // namespace at::cuda - -// WARNING!!! THIS IS A GIANT HACK!!! -// This line means you cannot simultaneously include c10/hip -// and c10/cuda and then use them from the at::cuda namespace. -// This is true in practice, because HIPIFY works inplace on -// files in ATen/cuda, so it assumes that c10::hip is available -// from at::cuda. This namespace makes that happen. When -// HIPIFY is no longer out-of-place, we can switch the cuda -// here to hip and everyone is happy. -namespace at::cuda { -using namespace c10::hip; -} // namespace at::cuda - -namespace at::xpu { -using namespace c10::xpu; -} // namespace at::xpu - -// C10_LIKELY/C10_UNLIKELY -// -// These macros provide parentheses, so you can use these macros as: -// -// if C10_LIKELY(some_expr) { -// ... -// } -// -// NB: static_cast to boolean is mandatory in C++, because __builtin_expect -// takes a long argument, which means you may trigger the wrong conversion -// without it. -// -#if defined(__GNUC__) || defined(__ICL) || defined(__clang__) -#define C10_LIKELY(expr) (__builtin_expect(static_cast(expr), 1)) -#define C10_UNLIKELY(expr) (__builtin_expect(static_cast(expr), 0)) -#else -#define C10_LIKELY(expr) (expr) -#define C10_UNLIKELY(expr) (expr) -#endif - -/// C10_NOINLINE - Functions whose declaration is annotated with this will not -/// be inlined. -#ifdef __GNUC__ -#define C10_NOINLINE __attribute__((noinline)) -#elif _MSC_VER -#define C10_NOINLINE __declspec(noinline) -#else -#define C10_NOINLINE -#endif - -#if defined(_MSC_VER) -#define C10_ALWAYS_INLINE __forceinline -#elif __has_attribute(always_inline) || defined(__GNUC__) -#define C10_ALWAYS_INLINE __attribute__((__always_inline__)) inline -#else -#define C10_ALWAYS_INLINE inline -#endif - -// Unlike C10_ALWAYS_INLINE, C10_ALWAYS_INLINE_ATTRIBUTE can be used -// on a lambda. -#if defined(_MSC_VER) -// MSVC 14.39 is reasonably recent and doesn't like -// [[msvc::forceinline]] on a lambda, so don't try to use it. -#define C10_ALWAYS_INLINE_ATTRIBUTE -#elif __has_attribute(always_inline) || defined(__GNUC__) -#define C10_ALWAYS_INLINE_ATTRIBUTE __attribute__((__always_inline__)) -#else -#define C10_ALWAYS_INLINE_ATTRIBUTE -#endif - -#if defined(_MSC_VER) -#define C10_ATTR_VISIBILITY_HIDDEN -#elif defined(__GNUC__) -#define C10_ATTR_VISIBILITY_HIDDEN __attribute__((__visibility__("hidden"))) -#else -#define C10_ATTR_VISIBILITY_HIDDEN -#endif - -#define C10_ERASE C10_ALWAYS_INLINE C10_ATTR_VISIBILITY_HIDDEN - -#include - -#ifdef __HIPCC__ -// Unlike CUDA, HIP requires a HIP header to be included for __host__ to work. -// We do this #include here so that C10_HOST_DEVICE and friends will Just Work. 
-// See https://github.com/ROCm/hip/issues/441 -#include -#endif - -#if defined(__CUDACC__) || defined(__HIPCC__) -// Designates functions callable from the host (CPU) and the device (GPU) -#define C10_HOST_DEVICE __host__ __device__ -#define C10_DEVICE __device__ -#define C10_HOST __host__ -// constants from -// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications) -// The maximum number of threads per multiprocessor is 1024 for Turing -// architecture (7.5), 1536 for Geforce Ampere (8.6)/Jetson Orin (8.7), and -// 2048 for all other architectures. You'll get warnings if you exceed these -// constants. Hence, the following macros adjust the input values from the user -// to resolve potential warnings. -#if __CUDA_ARCH__ == 750 -constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024; -#elif __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 -constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536; -#else -constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048; -#endif -// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently -constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024; -// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block -// size. 256 is a good number for this fallback and should give good occupancy -// and versatility across all architectures. -constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; -// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it -// turns out that although __launch_bounds__ can take constexpr, it -// can't take a constexpr that has anything to do with templates. -// Currently we use launch_bounds that depend on template arguments in -// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK -// and C10_MIN_BLOCKS_PER_SM are kept as macros. -// Suppose you were planning to write __launch_bounds__(a, b), based on your -// performance tuning on a modern GPU. Instead, you should write -// __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)), -// which will also properly respect limits on old architectures. -#define C10_MAX_THREADS_PER_BLOCK(val) \ - (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) \ - : CUDA_THREADS_PER_BLOCK_FALLBACK) -#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) \ - ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \ - ? (blocks_per_sm) \ - : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block) - 1) / \ - (threads_per_block)))) -// C10_LAUNCH_BOUNDS is analogous to __launch_bounds__ -#define C10_LAUNCH_BOUNDS_0 \ - __launch_bounds__( \ - 256, 4) // default launch bounds that should give good occupancy and - // versatility across all architectures. -#define C10_LAUNCH_BOUNDS_1(max_threads_per_block) \ - __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block)))) -#define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) \ - __launch_bounds__( \ - (C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))), \ - (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm)))) -#else -#define C10_HOST_DEVICE -#define C10_HOST -#define C10_DEVICE -#endif - -#if defined(USE_ROCM) -#define C10_HIP_HOST_DEVICE __host__ __device__ -#else -#define C10_HIP_HOST_DEVICE -#endif - -#if defined(USE_ROCM) -// C10_WARP_SIZE is only allowed for device code. 
-// Host code _must_ use at::cuda::warp_size() -// HIP header used to define warpSize as a constexpr that was either 32 or 64 -// depending on the target device, and then always set it to 64 for host code. -// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we -// set it to something unreasonable to trigger obvious host code errors. -#if defined(__HIP_DEVICE_COMPILE__) -#if defined(__GFX9__) -static constexpr int C10_WARP_SIZE = 64; -#else // __GFX9__ -static constexpr int C10_WARP_SIZE = 32; -#endif // __GFX9__ -#else -static constexpr int C10_WARP_SIZE = 1; -#endif // __HIP_DEVICE_COMPILE__ -#else -#define C10_WARP_SIZE 32 -#endif - -#if defined(_MSC_VER) && _MSC_VER <= 1900 -#define __func__ __FUNCTION__ -#endif - -// CUDA_KERNEL_ASSERT checks the assertion -// even when NDEBUG is defined. This is useful for important assertions in CUDA -// code that would otherwise be suppressed when building Release. -#if defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__) -// Those platforms do not support assert() -#define CUDA_KERNEL_ASSERT(cond) -#define CUDA_KERNEL_ASSERT_MSG(cond, msg) -#define SYCL_KERNEL_ASSERT(cond) -#elif defined(_MSC_VER) -#if defined(NDEBUG) -extern "C" { -C10_IMPORT -#if defined(__SYCL_DEVICE_ONLY__) -extern SYCL_EXTERNAL void _wassert( - const wchar_t* wexpr, - const wchar_t* wfile, - unsigned line); -#else -#if defined(__CUDA_ARCH__) -__host__ __device__ -#endif // __CUDA_ARCH__ - void - _wassert(wchar_t const* _Message, wchar_t const* _File, unsigned _Line); -#endif // __SYCL_DEVICE_ONLY__ -} -#endif // NDEBUG -#define CUDA_KERNEL_ASSERT(cond) \ - if (C10_UNLIKELY(!(cond))) { \ - (void)(_wassert( \ - _CRT_WIDE(#cond), \ - _CRT_WIDE(__FILE__), \ - static_cast(__LINE__)), \ - 0); \ - } -// TODO: This doesn't assert the message because I (chilli) couldn't figure out -// a nice way to convert a char* to a wchar_t* -#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ - if (C10_UNLIKELY(!(cond))) { \ - (void)(_wassert( \ - _CRT_WIDE(#cond), \ - _CRT_WIDE(__FILE__), \ - static_cast(__LINE__)), \ - 0); \ - } -#define SYCL_KERNEL_ASSERT(cond) \ - if (C10_UNLIKELY(!(cond))) { \ - (void)(_wassert( \ - _CRT_WIDE(#cond), \ - _CRT_WIDE(__FILE__), \ - static_cast(__LINE__)), \ - 0); \ - } -#else // __APPLE__, _MSC_VER -#if defined(NDEBUG) -extern "C" { -#if defined(__SYCL_DEVICE_ONLY__) -extern SYCL_EXTERNAL void __assert_fail( - const char* expr, - const char* file, - unsigned int line, - const char* func); -#else // __SYCL_DEVICE_ONLY__ -#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__))) -// CUDA supports __assert_fail function which are common for both device -// and host side code. -__host__ __device__ -#endif - - // This forward declaration matching the declaration of __assert_fail - // exactly how it is in glibc in case parts of the program are compiled with - // different NDEBUG settings. Otherwise we might get 'ambiguous declaration' - // error. Note: On ROCm - this declaration serves for host side compilation. - void - __assert_fail( - const char* assertion, - const char* file, - unsigned int line, - const char* function) noexcept __attribute__((__noreturn__)); - -#endif // __SYCL_DEVICE_ONLY__ -} -#endif // NDEBUG -// ROCm disables kernel assert by default for performance considerations. -// Though ROCm supports __assert_fail, it uses kernel printf which has -// a non-negligible performance impact even if the assert condition is -// never triggered. 
We choose to use abort() instead which will still -// terminate the application but without a more useful error message. -#if !defined(C10_USE_ROCM_KERNEL_ASSERT) and defined(USE_ROCM) -#define CUDA_KERNEL_ASSERT(cond) \ - if C10_UNLIKELY (!(cond)) { \ - abort(); \ - } -#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ - if C10_UNLIKELY (!(cond)) { \ - abort(); \ - } -#define SYCL_KERNEL_ASSERT(cond) \ - if C10_UNLIKELY (!(cond)) { \ - abort(); \ - } -#else -#define CUDA_KERNEL_ASSERT(cond) \ - if (C10_UNLIKELY(!(cond))) { \ - __assert_fail( \ - #cond, __FILE__, static_cast(__LINE__), __func__); \ - } -#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ - if (C10_UNLIKELY(!(cond))) { \ - __assert_fail( \ - msg, __FILE__, static_cast(__LINE__), __func__); \ - } -#define SYCL_KERNEL_ASSERT(cond) \ - if (C10_UNLIKELY(!(cond))) { \ - __assert_fail( \ - #cond, __FILE__, static_cast(__LINE__), __func__); \ - } -#endif // C10_USE_ROCM_KERNEL_ASSERT and USE_ROCM -#endif // __APPLE__ - -#ifdef __APPLE__ -#include -#endif - -#if defined(__ANDROID__) -#define C10_ANDROID 1 -#define C10_MOBILE 1 -#elif ( \ - defined(__APPLE__) && \ - (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) -#define C10_IOS 1 -#define C10_MOBILE 1 -#endif // ANDROID / IOS - -#if defined(C10_MOBILE) && C10_MOBILE -#define C10_ALWAYS_INLINE_UNLESS_MOBILE inline -#else -#define C10_ALWAYS_INLINE_UNLESS_MOBILE C10_ALWAYS_INLINE -#endif - -#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) -#define CONSTEXPR_EXCEPT_WIN_CUDA constexpr -#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA constexpr - -#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ - static constexpr const char field[] = val; -#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) -#endif // !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) - -#ifndef HAS_DEMANGLE -#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) -#define HAS_DEMANGLE 0 -#elif defined(__APPLE__) && \ - (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE) -#define HAS_DEMANGLE 0 -#else -#define HAS_DEMANGLE 1 -#endif -#endif // HAS_DEMANGLE - -#define _C10_PRAGMA__(string) _Pragma(#string) -#define _C10_PRAGMA_(string) _C10_PRAGMA__(string) - -#ifdef __clang__ -#define C10_CLANG_DIAGNOSTIC_PUSH() _Pragma("clang diagnostic push") -#define C10_CLANG_DIAGNOSTIC_POP() _Pragma("clang diagnostic pop") -#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) \ - _C10_PRAGMA_(clang diagnostic ignored flag) -#define C10_CLANG_HAS_WARNING(flag) __has_warning(flag) -#else -#define C10_CLANG_DIAGNOSTIC_PUSH() -#define C10_CLANG_DIAGNOSTIC_POP() -#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) -#define C10_CLANG_HAS_WARNING(flag) 0 -#endif - -#ifdef __clang__ - -#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \ - _C10_PRAGMA_(clang diagnostic push) \ - _C10_PRAGMA_(clang diagnostic ignored "-Wunknown-warning-option") \ - _C10_PRAGMA_(clang diagnostic ignored warning) - -#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(clang diagnostic pop) - -#elif __GNUC__ - -#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \ - _C10_PRAGMA_(GCC diagnostic push) \ - _C10_PRAGMA_(GCC diagnostic ignored "-Wpragmas") \ - _C10_PRAGMA_(GCC diagnostic ignored warning) - -#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(GCC diagnostic pop) - -#else - -#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) -#define C10_DIAGNOSTIC_POP() - -#endif - -// This macro is used to find older C++ compilers -// that don't support move optimization for return values. 
- -#if (defined(__GNUC__) && __GNUC__ < 13) || \ - (defined(__clang_major__) && __clang_major__ < 13) -#define C10_RETURN_MOVE_IF_OLD_COMPILER 1 -#else -#define C10_RETURN_MOVE_IF_OLD_COMPILER 0 -#endif - -#endif // C10_MACROS_MACROS_H_ +#include From c7df2c576a747e30cea334240e96462a5e4d1b3a Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Fri, 18 Jul 2025 14:09:49 -0700 Subject: [PATCH 3/5] Update on "bump pt core pin to 0718" Differential Revision: [D78579692](https://our.internmc.facebook.com/intern/diff/D78579692/) [ghstack-poisoned] --- .../c10/torch/headeronly/macros/Export.h | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h b/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h index 183aeab5634..8dd25419efb 100644 --- a/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h +++ b/runtime/core/portable_type/c10/torch/headeronly/macros/Export.h @@ -1,5 +1,12 @@ #pragma once +#ifndef C10_MACROS_EXPORT_H_ +#define C10_MACROS_EXPORT_H_ + +#ifndef C10_USING_CUSTOM_GENERATED_MACROS +#include +#endif // C10_USING_CUSTOM_GENERATED_MACROS + /* Header file to define the common scaffolding for exported symbols. * * Export is by itself a quite tricky situation to deal with, and if you are @@ -85,3 +92,62 @@ #else #define C10_API C10_IMPORT #endif + +// This one is being used by libtorch.so +#ifdef CAFFE2_BUILD_MAIN_LIB +#define TORCH_API C10_EXPORT +#else +#define TORCH_API C10_IMPORT +#endif + +// You may be wondering why we have TORCH_CUDA_CPP_API and TORCH_CUDA_CU_API +// belonging to the same library instead of just one TORCH_CUDA_API. Well, it +// can indeed just be one TORCH_CUDA_API (and used to be)! TORCH_CUDA_CPP_API +// and TORCH_CUDA_CU_API are artifacts of when we needed a split build to +// avoid relocation marker linking errors. The context is as follows: +// +// Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we +// tried to compile PyTorch for CUDA 11.1, which ran into relocation marker +// issues when linking big binaries. +// (https://github.com/pytorch/pytorch/issues/39968) We had two choices: +// (1) Stop supporting so many GPU architectures +// (2) Do something else +// We chose #2 and decided to split the behemoth that was torch_cuda into two +// smaller libraries, one with most of the core kernel functions (torch_cuda_cu) +// and the other that had..well..everything else (torch_cuda_cpp). The idea was +// this: instead of linking our static libraries (like the hefty +// libcudnn_static.a) with another huge library, torch_cuda, and run into pesky +// relocation marker issues, we could link our static libraries to a smaller +// part of torch_cuda (torch_cuda_cpp) and avoid the issues. 
+
+// libtorch_cuda.so (where torch_cuda_cu and torch_cuda_cpp are a part of the
+// same api)
+#ifdef TORCH_CUDA_BUILD_MAIN_LIB
+#define TORCH_CUDA_CPP_API C10_EXPORT
+#define TORCH_CUDA_CU_API C10_EXPORT
+#else
+#define TORCH_CUDA_CPP_API C10_IMPORT
+#define TORCH_CUDA_CU_API C10_IMPORT
+#endif
+
+#if defined(TORCH_HIP_BUILD_MAIN_LIB)
+#define TORCH_HIP_CPP_API C10_EXPORT
+#define TORCH_HIP_API C10_EXPORT
+#else
+#define TORCH_HIP_CPP_API C10_IMPORT
+#define TORCH_HIP_API C10_IMPORT
+#endif
+
+#if defined(TORCH_XPU_BUILD_MAIN_LIB)
+#define TORCH_XPU_API C10_EXPORT
+#else
+#define TORCH_XPU_API C10_IMPORT
+#endif
+
+// Enums only need to be exported on windows for non-CUDA files
+#if defined(_WIN32) && defined(__CUDACC__)
+#define C10_API_ENUM C10_API
+#else
+#define C10_API_ENUM
+#endif
+#endif // C10_MACROS_EXPORT_H_

From c05b0cf671dbecbd329a47fe9cab7db271e27233 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Fri, 18 Jul 2025 14:14:49 -0700
Subject: [PATCH 4/5] Update on "bump pt core pin to 0718"

Differential Revision: [D78579692](https://our.internmc.facebook.com/intern/diff/D78579692/)

[ghstack-poisoned]

From 447fde39d56ffc0430b0cc5650e2115fbe3cadf0 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Fri, 18 Jul 2025 15:44:57 -0700
Subject: [PATCH 5/5] Update on "bump pt core pin to 0718"

Differential Revision: [D78579692](https://our.internmc.facebook.com/intern/diff/D78579692/)

[ghstack-poisoned]
---
 .ci/docker/ci_commit_pins/pytorch.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
index a56cae9d85a..cdf993d3eb1 100644
--- a/.ci/docker/ci_commit_pins/pytorch.txt
+++ b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-744d29186e941d5fe8cb141de5bb9b89d6d77351
+89d842fec5229fff0df5342b2db121368d51e717
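For readers unfamiliar with the export/import scheme that the Export.h changes above rely on, here is a minimal, self-contained sketch of the general pattern in isolation. It is hypothetical: the names mylib_export.h, MYLIB_BUILD_MAIN_LIB, MYLIB_API, and mylib_answer do not appear in this patch and stand in for C10_EXPORT/C10_IMPORT and TORCH_API-style macros; it assumes the conventional setup where only the library's own build defines the *_BUILD_MAIN_LIB-style macro.

// mylib_export.h -- hypothetical sketch of the dllexport/visibility pattern
// that macros like C10_EXPORT / C10_IMPORT / TORCH_API implement.
#pragma once

#if defined(_WIN32)
// On Windows, symbols must be explicitly exported from the DLL that defines
// them and imported everywhere else.
#define MYLIB_EXPORT __declspec(dllexport)
#define MYLIB_IMPORT __declspec(dllimport)
#else
// On gcc/clang, both sides simply request default (public) visibility.
#define MYLIB_EXPORT __attribute__((__visibility__("default")))
#define MYLIB_IMPORT __attribute__((__visibility__("default")))
#endif

// The library's own build defines MYLIB_BUILD_MAIN_LIB (e.g. via a
// -DMYLIB_BUILD_MAIN_LIB compile flag), so its public declarations expand to
// the export form; consumers of the installed header see the import form.
#ifdef MYLIB_BUILD_MAIN_LIB
#define MYLIB_API MYLIB_EXPORT
#else
#define MYLIB_API MYLIB_IMPORT
#endif

// A public entry point annotated with the macro; its definition lives in one
// of the library's own translation units.
MYLIB_API int mylib_answer();

A consumer would simply include this header and call mylib_answer(); only the library's own translation units are compiled with the MYLIB_BUILD_MAIN_LIB definition, which is the same build-side/consumer-side split selected by CAFFE2_BUILD_MAIN_LIB and the TORCH_*_BUILD_MAIN_LIB macros in the headers touched by this patch.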