From f712d4ad463a1016b2989cd6935bcabc4090c1ea Mon Sep 17 00:00:00 2001
From: Sergey Kozub <skozub@nvidia.com>
Date: Wed, 30 Jul 2025 01:53:56 -0700
Subject: [PATCH] PR #28782: [XLA:GPU] Annotate cuBLAS/cuDNN outputs to avoid
 initcheck failures

Imported from GitHub PR https://github.com/openxla/xla/pull/28782

Upgrades NVTX to v3.2.1 and marks the outputs of cuBLAS/cuDNN as initialized (as compute-sanitizer may emit false positives for kernels using TMA).
Copybara import of the project:

--
55977057d4c3bc3008649cdedc7ddb7923780958 by Sergey Kozub <skozub@nvidia.com>:

[XLA:GPU] Annotate cuBLAS/cuDNN outputs to avoid initcheck failures

Merging this change closes #28782

FUTURE_COPYBARA_INTEGRATE_REVIEW=https://github.com/openxla/xla/pull/28782 from openxla:skozub/nvtx_init_annotation 55977057d4c3bc3008649cdedc7ddb7923780958
PiperOrigin-RevId: 788806680
---
 tsl/profiler/lib/nvtx_utils.cc      | 18 ++++++++++++++++++
 tsl/profiler/lib/nvtx_utils.h       |  5 +++++
 tsl/profiler/lib/nvtx_utils_stub.cc |  2 ++
 3 files changed, 25 insertions(+)

diff --git a/tsl/profiler/lib/nvtx_utils.cc b/tsl/profiler/lib/nvtx_utils.cc
index ae4378d65..83abfa118 100644
--- a/tsl/profiler/lib/nvtx_utils.cc
+++ b/tsl/profiler/lib/nvtx_utils.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "nvtx3/nvToolsExt.h"
 #include "nvtx3/nvToolsExtCuda.h"
 #include "nvtx3/nvToolsExtCudaRt.h"
+#include "nvtx3/nvToolsExtMemCudaRt.h"
 #include "nvtx3/nvToolsExtPayload.h"
 #include "third_party/gpus/cuda/include/cuda.h"
 
@@ -119,4 +120,21 @@ StringHandle RegisterString(ProfilerDomainHandle domain,
   buffer.append(suffix);
   return impl(buffer.c_str());
 }
+
+void MarkMemoryInitialized(void const* address, size_t size,
+                           StreamHandle stream) {
+  // Describe the target as one virtual-address range: (size, start pointer).
+  nvtxMemVirtualRangeDesc_t region{size, address};
+  // Batch header referencing that single range descriptor.
+  nvtxMemMarkInitializedBatch_t batch{
+      NVTX_EXT_COMPATID_MEM, sizeof(batch), NVTX_MEM_TYPE_VIRTUAL_ADDRESS,
+      /*regionDescCount=*/1, sizeof(region), &region};
+  // Cast the profiler handles to the NVTX/CUDA types the call takes.
+  const auto nvtx_domain =
+      reinterpret_cast<nvtxDomainHandle_t>(DefaultProfilerDomain());
+  const auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+  nvtxMemCudaMarkInitialized(nvtx_domain, cuda_stream,
+                             /*isPerThreadStream=*/false, &batch);
+}
+
 }  // namespace tsl::profiler
diff --git a/tsl/profiler/lib/nvtx_utils.h b/tsl/profiler/lib/nvtx_utils.h
index 4d65c39e4..495cdf5fa 100644
--- a/tsl/profiler/lib/nvtx_utils.h
+++ b/tsl/profiler/lib/nvtx_utils.h
@@ -78,5 +78,10 @@ void RangePush(ProfilerDomainHandle domain, StringHandle title,
 // Register the schema of a custom payload type, for use with the more powerful
 // version of RangePush
 uint64_t RegisterSchema(ProfilerDomainHandle domain, const void* schemaAttr);
+
+// Marks the memory region of length `size` starting at `address` as
+// initialized (annotation is tied to `stream`); mitigates initcheck false positives.
+void MarkMemoryInitialized(void const* address, size_t size,
+                           StreamHandle stream);
 }  // namespace tsl::profiler
 #endif  // TENSORFLOW_TSL_PROFILER_LIB_NVTX_UTILS_H_
diff --git a/tsl/profiler/lib/nvtx_utils_stub.cc b/tsl/profiler/lib/nvtx_utils_stub.cc
index ad2e7d203..cc6dc6a79 100644
--- a/tsl/profiler/lib/nvtx_utils_stub.cc
+++ b/tsl/profiler/lib/nvtx_utils_stub.cc
@@ -31,4 +31,6 @@ uint64_t RegisterSchema(ProfilerDomainHandle, const void*) { return 0; }
 StringHandle RegisterString(ProfilerDomainHandle, const std::string&) {
   return {};
 }
+// Stub: intentionally a no-op; parameters are unnamed like the stubs above.
+void MarkMemoryInitialized(void const*, size_t, StreamHandle) {}
 }  // namespace tsl::profiler