From f712d4ad463a1016b2989cd6935bcabc4090c1ea Mon Sep 17 00:00:00 2001 From: Sergey Kozub Date: Wed, 30 Jul 2025 01:53:56 -0700 Subject: [PATCH] PR #28782: [XLA:GPU] Annotate cuBLAS/cuDNN outputs to avoid initcheck failures Imported from GitHub PR https://github.com/openxla/xla/pull/28782 Upgrades NVTX to v3.2.1 and marks the outputs of cuBLAS/cuDNN as initialized (as compute-sanitizer may emit false positives for kernels using TMA). Copybara import of the project: -- 55977057d4c3bc3008649cdedc7ddb7923780958 by Sergey Kozub : [XLA:GPU] Annotate cuBLAS/cuDNN outputs to avoid initcheck failures Merging this change closes #28782 FUTURE_COPYBARA_INTEGRATE_REVIEW=https://github.com/openxla/xla/pull/28782 from openxla:skozub/nvtx_init_annotation 55977057d4c3bc3008649cdedc7ddb7923780958 PiperOrigin-RevId: 788806680 --- tsl/profiler/lib/nvtx_utils.cc | 18 ++++++++++++++++++ tsl/profiler/lib/nvtx_utils.h | 5 +++++ tsl/profiler/lib/nvtx_utils_stub.cc | 2 ++ 3 files changed, 25 insertions(+) diff --git a/tsl/profiler/lib/nvtx_utils.cc b/tsl/profiler/lib/nvtx_utils.cc index ae4378d65..83abfa118 100644 --- a/tsl/profiler/lib/nvtx_utils.cc +++ b/tsl/profiler/lib/nvtx_utils.cc @@ -28,6 +28,7 @@ limitations under the License. 
#include "nvtx3/nvToolsExt.h" #include "nvtx3/nvToolsExtCuda.h" #include "nvtx3/nvToolsExtCudaRt.h" +#include "nvtx3/nvToolsExtMemCudaRt.h" #include "nvtx3/nvToolsExtPayload.h" #include "third_party/gpus/cuda/include/cuda.h" @@ -119,4 +120,21 @@ StringHandle RegisterString(ProfilerDomainHandle domain, buffer.append(suffix); return impl(buffer.c_str()); } + +void MarkMemoryInitialized(void const* address, size_t size, + StreamHandle stream) { + auto domain = DefaultProfilerDomain(); + nvtxMemVirtualRangeDesc_t range_desc{size, address}; + nvtxMemMarkInitializedBatch_t regions_desc{ + NVTX_EXT_COMPATID_MEM, + sizeof(nvtxMemMarkInitializedBatch_t), + NVTX_MEM_TYPE_VIRTUAL_ADDRESS, + /*regionDescCount=*/1, + sizeof(nvtxMemVirtualRangeDesc_t), + &range_desc}; + nvtxMemCudaMarkInitialized(reinterpret_cast<nvtxDomainHandle_t>(domain), + reinterpret_cast<cudaStream_t>(stream), + /*isPerThreadStream=*/false, &regions_desc); +} + } // namespace tsl::profiler diff --git a/tsl/profiler/lib/nvtx_utils.h b/tsl/profiler/lib/nvtx_utils.h index 4d65c39e4..495cdf5fa 100644 --- a/tsl/profiler/lib/nvtx_utils.h +++ b/tsl/profiler/lib/nvtx_utils.h @@ -78,5 +78,10 @@ void RangePush(ProfilerDomainHandle domain, StringHandle title, // Register the schema of a custom payload type, for use with the more powerful // version of RangePush uint64_t RegisterSchema(ProfilerDomainHandle domain, const void* schemaAttr); + +// Mark a memory region as initialized. +// This mitigates false positives from the compute sanitizer (initcheck).
+void MarkMemoryInitialized(void const* address, size_t size, + StreamHandle stream); } // namespace tsl::profiler #endif // TENSORFLOW_TSL_PROFILER_LIB_NVTX_UTILS_H_ diff --git a/tsl/profiler/lib/nvtx_utils_stub.cc b/tsl/profiler/lib/nvtx_utils_stub.cc index ad2e7d203..cc6dc6a79 100644 --- a/tsl/profiler/lib/nvtx_utils_stub.cc +++ b/tsl/profiler/lib/nvtx_utils_stub.cc @@ -31,4 +31,6 @@ uint64_t RegisterSchema(ProfilerDomainHandle, const void*) { return 0; } StringHandle RegisterString(ProfilerDomainHandle, const std::string&) { return {}; } +void MarkMemoryInitialized(void const* address, size_t size, + StreamHandle stream) {} } // namespace tsl::profiler