diff --git a/build.zig b/build.zig index c436149..0b54918 100644 --- a/build.zig +++ b/build.zig @@ -51,8 +51,8 @@ pub fn build(b: *std.Build) void { if (options.callstack > 0) { translate_c.defineCMacro("TRACY_USE_CALLSTACK", ""); var callstack_buffer: [64]u8 = undefined; - const callstack = std.fmt.bufPrintIntToSlice(&callstack_buffer, @as(u32, options.callstack), 10, .lower, .{}); - translate_c.defineCMacro("TRACY_CALLSTACK", callstack); + const callstack_str_len = std.fmt.printInt(&callstack_buffer, @as(u32, options.callstack), 10, .lower, .{}); + translate_c.defineCMacro("TRACY_CALLSTACK", callstack_buffer[0..callstack_str_len]); } const ztracy = b.addModule("root", .{ @@ -63,30 +63,29 @@ pub fn build(b: *std.Build) void { }); ztracy.addImport("c", translate_c.createModule()); - const tracy = if (options.shared) blk: { - const lib = b.addSharedLibrary(.{ - .name = "tracy", + const tracy = b.addLibrary(.{ + .name = "tracy", + .linkage = if (options.shared) .dynamic else .static, + .root_module = b.createModule(.{ .target = target, .optimize = optimize, - }); - lib.root_module.addCMacro("TRACY_EXPORTS", ""); - break :blk lib; - } else b.addStaticLibrary(.{ - .name = "tracy", - .target = target, - .optimize = optimize, + }), }); + if (options.shared) { + tracy.root_module.addCMacro("TRACY_EXPORTS", ""); + } + tracy.addIncludePath(b.path("libs/tracy/tracy")); tracy.addCSourceFile(.{ .file = b.path("libs/tracy/TracyClient.cpp"), .flags = &.{ - if (options.enable_ztracy) "-DTRACY_ENABLE" else "", - if (options.enable_fibers) "-DTRACY_FIBERS" else "", "-fno-sanitize=undefined", }, }); + if (options.enable_ztracy) tracy.root_module.addCMacro("TRACY_ENABLE", ""); + if (options.enable_fibers) tracy.root_module.addCMacro("TRACY_FIBERS", ""); if (options.on_demand) tracy.root_module.addCMacro("TRACY_ON_DEMAND", ""); tracy.linkLibC(); @@ -115,9 +114,11 @@ pub fn build(b: *std.Build) void { const tests = b.addTest(.{ .name = "ztracy-tests", - .root_source_file = b.path("src/ztracy.zig"), - .target = target, - .optimize = optimize, + .root_module = b.createModule(.{ + .root_source_file = b.path("src/ztracy.zig"), + .target = target, + .optimize = optimize, + }), }); tests.linkLibrary(tracy); b.installArtifact(tests); diff --git a/build.zig.zon b/build.zig.zon index f6a2206..3bf3ffe 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -1,8 +1,8 @@ .{ .name = .ztracy, .fingerprint = 0xf803f1f7ab5272cc, - .version = "0.14.0-dev", - .minimum_zig_version = "0.14.0", + .version = "0.15.0-dev", + .minimum_zig_version = "0.15.0-dev.1230+cf9db9c7b", .paths = .{ "build.zig", "build.zig.zon", diff --git a/libs/tracy/TracyClient.cpp b/libs/tracy/TracyClient.cpp index 6224f48..e9a0184 100644 --- a/libs/tracy/TracyClient.cpp +++ b/libs/tracy/TracyClient.cpp @@ -32,6 +32,10 @@ #include "client/TracyOverride.cpp" #include "client/TracyKCore.cpp" +#ifdef TRACY_ROCPROF +# include "client/TracyRocprof.cpp" +#endif + #if defined(TRACY_HAS_CALLSTACK) # if TRACY_HAS_CALLSTACK == 2 || TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 # include "libbacktrace/alloc.cpp" diff --git a/libs/tracy/client/TracyCallstack.cpp b/libs/tracy/client/TracyCallstack.cpp index 946a197..bd32906 100644 --- a/libs/tracy/client/TracyCallstack.cpp +++ b/libs/tracy/client/TracyCallstack.cpp @@ -282,7 +282,12 @@ extern "C" t_SymFromInlineContext _SymFromInlineContext = 0; t_SymGetLineFromInlineContext _SymGetLineFromInlineContext = 0; - TRACY_API ___tracy_t_RtlWalkFrameChain ___tracy_RtlWalkFrameChain = 0; + typedef 
unsigned long (__stdcall *___tracy_t_RtlWalkFrameChain)( void**, unsigned long, unsigned long ); + ___tracy_t_RtlWalkFrameChain ___tracy_RtlWalkFrameChainPtr = nullptr; + TRACY_API unsigned long ___tracy_RtlWalkFrameChain( void** callers, unsigned long count, unsigned long flags) + { + return ___tracy_RtlWalkFrameChainPtr(callers, count, flags); + } } struct ModuleCache @@ -307,7 +312,7 @@ size_t s_krnlCacheCnt; void InitCallstackCritical() { - ___tracy_RtlWalkFrameChain = (___tracy_t_RtlWalkFrameChain)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlWalkFrameChain" ); + ___tracy_RtlWalkFrameChainPtr = (___tracy_t_RtlWalkFrameChain)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlWalkFrameChain" ); } void DbgHelpInit() diff --git a/libs/tracy/client/TracyCallstack.h b/libs/tracy/client/TracyCallstack.h index 2c7ecad..1aca729 100644 --- a/libs/tracy/client/TracyCallstack.h +++ b/libs/tracy/client/TracyCallstack.h @@ -8,8 +8,8 @@ # endif # if defined _WIN32 -# include "../common/TracyUwp.hpp" -# ifndef TRACY_UWP +# include "../common/TracyWinFamily.hpp" +# if !defined TRACY_WIN32_NO_DESKTOP # define TRACY_HAS_CALLSTACK 1 # endif # elif defined __ANDROID__ diff --git a/libs/tracy/client/TracyCallstack.hpp b/libs/tracy/client/TracyCallstack.hpp index fdc9345..1d8cd65 100644 --- a/libs/tracy/client/TracyCallstack.hpp +++ b/libs/tracy/client/TracyCallstack.hpp @@ -9,7 +9,8 @@ namespace tracy { -static tracy_force_inline void* Callstack( int /*depth*/ ) { return nullptr; } +static constexpr bool has_callstack() { return false; } +static tracy_force_inline void* Callstack( int32_t /*depth*/ ) { return nullptr; } } #else @@ -38,6 +39,8 @@ static tracy_force_inline void* Callstack( int /*depth*/ ) { return nullptr; } namespace tracy { +static constexpr bool has_callstack() { return true; } + struct CallstackSymbolData { const char* file; @@ -79,11 +82,10 @@ debuginfod_client* GetDebuginfodClient(); extern "C" { - typedef unsigned long (__stdcall *___tracy_t_RtlWalkFrameChain)( void**, unsigned long, unsigned long ); - TRACY_API extern ___tracy_t_RtlWalkFrameChain ___tracy_RtlWalkFrameChain; + TRACY_API unsigned long ___tracy_RtlWalkFrameChain( void**, unsigned long, unsigned long ); } -static tracy_force_inline void* Callstack( int depth ) +static tracy_force_inline void* Callstack( int32_t depth ) { assert( depth >= 1 && depth < 63 ); auto trace = (uintptr_t*)tracy_malloc( ( 1 + depth ) * sizeof( uintptr_t ) ); @@ -112,7 +114,7 @@ static _Unwind_Reason_Code tracy_unwind_callback( struct _Unwind_Context* ctx, v return _URC_NO_REASON; } -static tracy_force_inline void* Callstack( int depth ) +static tracy_force_inline void* Callstack( int32_t depth ) { assert( depth >= 1 && depth < 63 ); @@ -127,7 +129,7 @@ static tracy_force_inline void* Callstack( int depth ) #elif TRACY_HAS_CALLSTACK == 3 || TRACY_HAS_CALLSTACK == 4 || TRACY_HAS_CALLSTACK == 6 -static tracy_force_inline void* Callstack( int depth ) +static tracy_force_inline void* Callstack( int32_t depth ) { assert( depth >= 1 ); diff --git a/libs/tracy/client/TracyLock.hpp b/libs/tracy/client/TracyLock.hpp index d12a3c1..e00b344 100644 --- a/libs/tracy/client/TracyLock.hpp +++ b/libs/tracy/client/TracyLock.hpp @@ -219,8 +219,9 @@ class Lockable m_ctx.CustomName( name, size ); } -private: T m_lockable; + +private: LockableCtx m_ctx; }; @@ -535,8 +536,9 @@ class SharedLockable m_ctx.CustomName( name, size ); } -private: T m_lockable; + +private: SharedLockableCtx m_ctx; }; diff --git a/libs/tracy/client/TracyProfiler.cpp 
b/libs/tracy/client/TracyProfiler.cpp index fa93043..e1b9d50 100644 --- a/libs/tracy/client/TracyProfiler.cpp +++ b/libs/tracy/client/TracyProfiler.cpp @@ -9,7 +9,10 @@ # include # include # include -# include "../common/TracyUwp.hpp" +# include "../common/TracyWinFamily.hpp" +# ifndef _MSC_VER +# include +# endif #else # include # include @@ -81,6 +84,10 @@ #include "TracySysTrace.hpp" #include "../tracy/TracyC.h" +#if defined TRACY_MANUAL_LIFETIME && !defined(TRACY_DELAYED_INIT) +# error "TRACY_MANUAL_LIFETIME requires enabled TRACY_DELAYED_INIT" +#endif + #ifdef TRACY_PORT # ifndef TRACY_DATA_PORT # define TRACY_DATA_PORT TRACY_PORT @@ -106,9 +113,12 @@ # include extern "C" typedef LONG (WINAPI *t_RtlGetVersion)( PRTL_OSVERSIONINFOW ); extern "C" typedef BOOL (WINAPI *t_GetLogicalProcessorInformationEx)( LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD ); +extern "C" typedef char* (WINAPI *t_WineGetVersion)(); +extern "C" typedef char* (WINAPI *t_WineGetBuildId)(); #else # include # include +# include #endif #if defined __linux__ # include @@ -317,7 +327,13 @@ static inline void CpuId( uint32_t* regs, uint32_t leaf ) static void InitFailure( const char* msg ) { -#if defined _WIN32 +#if defined TRACY_GDK + const char* format = "Tracy Profiler initialization failure: %s\n"; + const int length = snprintf( nullptr, 0, format, msg ); + char* buffer = (char*)alloca( length + 1 ); + snprintf( buffer, length + 1, format, msg ); + OutputDebugStringA( buffer ); +#elif defined _WIN32 bool hasConsole = false; bool reopen = false; const auto attached = AttachConsole( ATTACH_PARENT_PROCESS ); @@ -500,7 +516,7 @@ static const char* GetHostInfo() static char buf[1024]; auto ptr = buf; #if defined _WIN32 -# ifdef TRACY_UWP +# if defined TRACY_WIN32_NO_DESKTOP auto GetVersion = &::GetVersionEx; # else auto GetVersion = (t_RtlGetVersion)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "RtlGetVersion" ); @@ -521,7 +537,16 @@ static const char* GetHostInfo() # ifdef __MINGW32__ ptr += sprintf( ptr, "OS: Windows %i.%i.%i (MingW)\n", (int)ver.dwMajorVersion, (int)ver.dwMinorVersion, (int)ver.dwBuildNumber ); # else - ptr += sprintf( ptr, "OS: Windows %lu.%lu.%lu\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); + auto WineGetVersion = (t_WineGetVersion)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "wine_get_version" ); + auto WineGetBuildId = (t_WineGetBuildId)GetProcAddress( GetModuleHandleA( "ntdll.dll" ), "wine_get_build_id" ); + if( WineGetVersion && WineGetBuildId ) + { + ptr += sprintf( ptr, "OS: Windows %lu.%lu.%lu (Wine %s [%s])\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber, WineGetVersion(), WineGetBuildId() ); + } + else + { + ptr += sprintf( ptr, "OS: Windows %lu.%lu.%lu\n", ver.dwMajorVersion, ver.dwMinorVersion, ver.dwBuildNumber ); + } # endif } #elif defined __linux__ @@ -574,7 +599,7 @@ static const char* GetHostInfo() char hostname[512]; gethostname( hostname, 512 ); -# ifdef TRACY_UWP +# if defined TRACY_WIN32_NO_DESKTOP const char* user = ""; # else DWORD userSz = UNLEN+1; @@ -785,7 +810,7 @@ static BroadcastMessage& GetBroadcastMessage( const char* procname, size_t pnsz, return msg; } -#if defined _WIN32 && !defined TRACY_UWP && !defined TRACY_NO_CRASH_HANDLER +#if defined _WIN32 && !defined TRACY_WIN32_NO_DESKTOP && !defined TRACY_NO_CRASH_HANDLER static DWORD s_profilerThreadId = 0; static DWORD s_symbolThreadId = 0; static char s_crashText[1024]; @@ -893,6 +918,13 @@ LONG WINAPI CrashFilter( PEXCEPTION_POINTERS pExp ) } 
#endif +#if defined _WIN32 && !defined _MSC_VER +LONG WINAPI CrashFilterExecute( PEXCEPTION_POINTERS pExp ) +{ + return EXCEPTION_EXECUTE_HANDLER; +} +#endif + static Profiler* s_instance = nullptr; static Thread* s_thread; #ifndef TRACY_NO_FRAME_IMAGE @@ -1139,6 +1171,38 @@ static void CrashHandler( int signal, siginfo_t* info, void* /*ucontext*/ ) } #endif +#ifdef TRACY_HAS_SYSTEM_TRACING +static void StartSystemTracing( int64_t& samplingPeriod ) +{ + assert( s_sysTraceThread == nullptr ); + + // use TRACY_NO_SYS_TRACE=1 to force disabling sys tracing (even if available in the underlying system) + // as it can have significant impact on the size of the traces + const char* noSysTrace = GetEnvVar( "TRACY_NO_SYS_TRACE" ); + const bool disableSystrace = (noSysTrace && noSysTrace[0] == '1'); + if( disableSystrace ) + { + TracyDebug("TRACY: Sys Trace was disabled by 'TRACY_NO_SYS_TRACE=1'\n"); + } + else if( SysTraceStart( samplingPeriod ) ) + { + s_sysTraceThread = (Thread*)tracy_malloc( sizeof( Thread ) ); + new(s_sysTraceThread) Thread( SysTraceWorker, nullptr ); + std::this_thread::sleep_for( std::chrono::milliseconds( 1 ) ); + } +} + +static void StopSystemTracing() +{ + if( s_sysTraceThread ) + { + SysTraceStop(); + s_sysTraceThread->~Thread(); + tracy_free( s_sysTraceThread ); + s_sysTraceThread = nullptr; + } +} +#endif enum { QueuePrealloc = 256 * 1024 }; @@ -1378,6 +1442,8 @@ TRACY_API LuaZoneState& GetLuaZoneState() { return s_luaZoneState; } TRACY_API bool ProfilerAvailable() { return s_instance != nullptr; } TRACY_API bool ProfilerAllocatorAvailable() { return !RpThreadShutdown; } +constexpr static size_t SafeSendBufferSize = 65536; + Profiler::Profiler() : m_timeBegin( 0 ) , m_mainThread( detail::GetThreadHandleImpl() ) @@ -1451,6 +1517,21 @@ Profiler::Profiler() m_userPort = atoi( userPort ); } + m_safeSendBuffer = (char*)tracy_malloc( SafeSendBufferSize ); + +#ifndef _WIN32 + pipe(m_pipe); +# if defined __APPLE__ || defined BSD + // FreeBSD/XNU don't have F_SETPIPE_SZ, so use the default + m_pipeBufSize = 16384; +# else + m_pipeBufSize = (int)(ptrdiff_t)SafeSendBufferSize; + while( fcntl( m_pipe[0], F_SETPIPE_SZ, m_pipeBufSize ) < 0 && errno == EPERM ) m_pipeBufSize /= 2; // too big; reduce + m_pipeBufSize = fcntl( m_pipe[0], F_GETPIPE_SZ ); +# endif + fcntl( m_pipe[1], F_SETFL, O_NONBLOCK ); +#endif + #if !defined(TRACY_DELAYED_INIT) || !defined(TRACY_MANUAL_LIFETIME) SpawnWorkerThreads(); #endif @@ -1475,8 +1556,10 @@ void Profiler::InstallCrashHandler() sigaction( SIGABRT, &crashHandler, &m_prevSignal.abrt ); #endif -#if defined _WIN32 && !defined TRACY_UWP && !defined TRACY_NO_CRASH_HANDLER - m_exceptionHandler = AddVectoredExceptionHandler( 1, CrashFilter ); +#if defined _WIN32 && !defined TRACY_WIN32_NO_DESKTOP && !defined TRACY_NO_CRASH_HANDLER + // We cannot use Vectored Exception handling because it catches application-wide frame-based SEH blocks. We only + // want to catch unhandled exceptions. 
+ m_prevHandler = reinterpret_cast( SetUnhandledExceptionFilter( CrashFilter ) ); #endif #ifndef TRACY_NO_CRASH_HANDLER @@ -1487,20 +1570,29 @@ void Profiler::InstallCrashHandler() void Profiler::RemoveCrashHandler() { -#if defined _WIN32 && !defined TRACY_UWP - if( m_crashHandlerInstalled ) RemoveVectoredExceptionHandler( m_exceptionHandler ); +#if defined _WIN32 && !defined TRACY_WIN32_NO_DESKTOP && !defined TRACY_NO_CRASH_HANDLER + if( m_crashHandlerInstalled ) + { + auto prev = SetUnhandledExceptionFilter( (LPTOP_LEVEL_EXCEPTION_FILTER)m_prevHandler ); + if( prev != CrashFilter ) SetUnhandledExceptionFilter( prev ); // A different exception filter was installed over ours => put it back + } #endif #if defined __linux__ && !defined TRACY_NO_CRASH_HANDLER if( m_crashHandlerInstalled ) { - sigaction( TRACY_CRASH_SIGNAL, &m_prevSignal.pwr, nullptr ); - sigaction( SIGILL, &m_prevSignal.ill, nullptr ); - sigaction( SIGFPE, &m_prevSignal.fpe, nullptr ); - sigaction( SIGSEGV, &m_prevSignal.segv, nullptr ); - sigaction( SIGPIPE, &m_prevSignal.pipe, nullptr ); - sigaction( SIGBUS, &m_prevSignal.bus, nullptr ); - sigaction( SIGABRT, &m_prevSignal.abrt, nullptr ); + auto restore = []( int signum, struct sigaction* prev ) { + struct sigaction old; + sigaction( signum, prev, &old ); + if( old.sa_sigaction != CrashHandler ) sigaction( signum, &old, nullptr ); // A different signal handler was installed over ours => put it back + }; + restore( TRACY_CRASH_SIGNAL, &m_prevSignal.pwr ); + restore( SIGILL, &m_prevSignal.ill ); + restore( SIGFPE, &m_prevSignal.fpe ); + restore( SIGSEGV, &m_prevSignal.segv ); + restore( SIGPIPE, &m_prevSignal.pipe ); + restore( SIGBUS, &m_prevSignal.bus ); + restore( SIGABRT, &m_prevSignal.abrt ); } #endif m_crashHandlerInstalled = false; @@ -1509,20 +1601,7 @@ void Profiler::RemoveCrashHandler() void Profiler::SpawnWorkerThreads() { #ifdef TRACY_HAS_SYSTEM_TRACING - // use TRACY_NO_SYS_TRACE=1 to force disabling sys tracing (even if available in the underlying system) - // as it can have significant impact on the size of the traces - const char* noSysTrace = GetEnvVar( "TRACY_NO_SYS_TRACE" ); - const bool disableSystrace = (noSysTrace && noSysTrace[0] == '1'); - if( disableSystrace ) - { - TracyDebug("TRACY: Sys Trace was disabled by 'TRACY_NO_SYS_TRACE=1'\n"); - } - else if( SysTraceStart( m_samplingPeriod ) ) - { - s_sysTraceThread = (Thread*)tracy_malloc( sizeof( Thread ) ); - new(s_sysTraceThread) Thread( SysTraceWorker, nullptr ); - std::this_thread::sleep_for( std::chrono::milliseconds( 1 ) ); - } + StartSystemTracing( m_samplingPeriod ); #endif s_thread = (Thread*)tracy_malloc( sizeof( Thread ) ); @@ -1538,7 +1617,7 @@ void Profiler::SpawnWorkerThreads() new(s_symbolThread) Thread( LaunchSymbolWorker, this ); #endif -#if defined _WIN32 && !defined TRACY_UWP && !defined TRACY_NO_CRASH_HANDLER +#if defined _WIN32 && !defined TRACY_WIN32_NO_DESKTOP && !defined TRACY_NO_CRASH_HANDLER s_profilerThreadId = GetThreadId( s_thread->Handle() ); # ifdef TRACY_HAS_CALLSTACK s_symbolThreadId = GetThreadId( s_symbolThread->Handle() ); @@ -1559,12 +1638,7 @@ Profiler::~Profiler() RemoveCrashHandler(); #ifdef TRACY_HAS_SYSTEM_TRACING - if( s_sysTraceThread ) - { - SysTraceStop(); - s_sysTraceThread->~Thread(); - tracy_free( s_sysTraceThread ); - } + StopSystemTracing(); #endif #ifdef TRACY_HAS_CALLSTACK @@ -1589,6 +1663,12 @@ Profiler::~Profiler() tracy_free( m_kcore ); #endif +#ifndef _WIN32 + close( m_pipe[0] ); + close( m_pipe[1] ); +#endif + tracy_free( m_safeSendBuffer ); + 
tracy_free( m_lz4Buf ); tracy_free( m_buffer ); LZ4_freeStream( (LZ4_stream_t*)m_stream ); @@ -1711,7 +1791,6 @@ void Profiler::Worker() MemWrite( &welcome.timerMul, m_timerMul ); MemWrite( &welcome.initBegin, GetInitTime() ); MemWrite( &welcome.initEnd, m_timeBegin.load( std::memory_order_relaxed ) ); - MemWrite( &welcome.delay, m_delay ); MemWrite( &welcome.resolution, m_resolution ); MemWrite( &welcome.epoch, m_epoch ); MemWrite( &welcome.exectime, m_exectime ); @@ -1951,7 +2030,6 @@ void Profiler::Worker() } else if( status == DequeueStatus::QueueEmpty && serialStatus == DequeueStatus::QueueEmpty ) { - if( ShouldExit() ) break; if( m_bufferOffset != m_bufferStart ) { if( !CommitData() ) break; @@ -1982,7 +2060,7 @@ void Profiler::Worker() connActive = HandleServerQuery(); if( !connActive ) break; } - if( !connActive ) break; + if( !connActive || ShouldExit() ) break; } if( ShouldExit() ) break; @@ -2048,7 +2126,13 @@ void Profiler::Worker() while( s_symbolThreadGone.load() == false ) { YieldThread(); } #endif - // Client is exiting. Send items remaining in queues. + // Client is exiting. +#ifdef TRACY_HAS_SYSTEM_TRACING + // Stop filling queues with new data. + StopSystemTracing(); +#endif + + // Send items remaining in queues. for(;;) { const auto status = Dequeue( token ); @@ -2299,6 +2383,10 @@ static void FreeAssociatedMemory( const QueueItem& item ) tracy_free( (void*)ptr ); break; #endif + case QueueType::GpuAnnotationName: + ptr = MemRead( &item.gpuAnnotationNameFat.ptr ); + tracy_free( (void*)ptr ); + break; #ifdef TRACY_ON_DEMAND case QueueType::MessageAppInfo: case QueueType::GpuContextName: @@ -2514,6 +2602,12 @@ Profiler::DequeueStatus Profiler::Dequeue( moodycamel::ConsumerToken& token ) tracy_free_fast( (void*)ptr ); #endif break; + case QueueType::GpuAnnotationName: + ptr = MemRead( &item->gpuAnnotationNameFat.ptr ); + size = MemRead( &item->gpuAnnotationNameFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free_fast( (void*)ptr ); + break; case QueueType::PlotDataInt: case QueueType::PlotDataFloat: case QueueType::PlotDataDouble: @@ -2816,6 +2910,15 @@ Profiler::DequeueStatus Profiler::DequeueSerial() MemWrite( &item->memFree.time, dt ); break; } + case QueueType::MemDiscard: + case QueueType::MemDiscardCallstack: + { + int64_t t = MemRead( &item->memDiscard.time ); + int64_t dt = t - refSerial; + refSerial = t; + MemWrite( &item->memDiscard.time, dt ); + break; + } case QueueType::GpuZoneBeginSerial: case QueueType::GpuZoneBeginCallstackSerial: { @@ -2863,6 +2966,14 @@ Profiler::DequeueStatus Profiler::DequeueSerial() #endif break; } + case QueueType::GpuAnnotationName: + { + ptr = MemRead( &item->gpuAnnotationNameFat.ptr ); + uint16_t size = MemRead( &item->gpuAnnotationNameFat.size ); + SendSingleString( (const char*)ptr, size ); + tracy_free_fast( (void*)ptr ); + break; + } #ifdef TRACY_FIBERS case QueueType::ZoneBegin: case QueueType::ZoneBeginCallstack: @@ -3052,6 +3163,66 @@ bool Profiler::CommitData() return ret; } +char* Profiler::SafeCopyProlog( const char* data, size_t size ) +{ + bool success = true; + char* buf = m_safeSendBuffer; +#ifndef NDEBUG + assert( !m_inUse.exchange(true) ); +#endif + + if( size > SafeSendBufferSize ) buf = (char*)tracy_malloc( size ); + +#ifdef _WIN32 +# ifdef _MSC_VER + __try + { + memcpy( buf, data, size ); + } + __except( 1 /*EXCEPTION_EXECUTE_HANDLER*/ ) + { + success = false; + } +# else + memcpy( buf, data, size ); +# endif +#else + // Send through the pipe to ensure safe reads + for( size_t offset = 0; offset 
!= size; /*in loop*/ ) + { + size_t sendsize = size - offset; + ssize_t result1, result2; + while( ( result1 = write( m_pipe[1], data + offset, sendsize ) ) < 0 && errno == EINTR ) { /* retry */ } + if( result1 < 0 ) + { + success = false; + break; + } + while( ( result2 = read( m_pipe[0], buf + offset, result1 ) ) < 0 && errno == EINTR ) { /* retry */ } + if( result2 != result1 ) + { + success = false; + break; + } + offset += result1; + } +#endif + + if( success ) return buf; + + SafeCopyEpilog( buf ); + return nullptr; +} + +void Profiler::SafeCopyEpilog( char* buf ) +{ + if( buf != m_safeSendBuffer ) tracy_free( buf ); + +#ifndef NDEBUG + m_inUse.store( false ); +#endif +} + bool Profiler::SendData( const char* data, size_t len ) { const lz4sz_t lz4sz = LZ4_compress_fast_continue( (LZ4_stream_t*)m_stream, data, m_lz4Buf + sizeof( lz4sz_t ), (int)len, LZ4Size, 1 ); @@ -3682,43 +3853,6 @@ void Profiler::CalibrateDelay() if( dti > 0 && dti < mindiff ) mindiff = dti; } m_resolution = mindiff; - -#ifdef TRACY_DELAYED_INIT - m_delay = m_resolution; -#else - constexpr int Events = Iterations * 2; // start + end - static_assert( Events < QueuePrealloc, "Delay calibration loop will allocate memory in queue" ); - - static const tracy::SourceLocationData __tracy_source_location { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; - const auto t0 = GetTime(); - for( int i=0; izoneBegin.time, Profiler::GetTime() ); - MemWrite( &item->zoneBegin.srcloc, (uint64_t)&__tracy_source_location ); - TracyLfqCommit; - } - { - TracyLfqPrepare( QueueType::ZoneEnd ); - MemWrite( &item->zoneEnd.time, GetTime() ); - TracyLfqCommit; - } - } - const auto t1 = GetTime(); - const auto dt = t1 - t0; - m_delay = dt / Events; - - moodycamel::ConsumerToken token( GetQueue() ); - int left = Events; - while( left != 0 ) - { - const auto sz = GetQueue().try_dequeue_bulk_single( token, [](const uint64_t&){}, [](QueueItem* item, size_t sz){} ); - assert( sz > 0 ); - left -= (int)sz; - } - assert( GetQueue().size_approx() == 0 ); -#endif } void Profiler::ReportTopology() @@ -3733,24 +3867,55 @@ void Profiler::ReportTopology() }; #if defined _WIN32 -# ifdef TRACY_UWP +# if defined TRACY_WIN32_NO_DESKTOP t_GetLogicalProcessorInformationEx _GetLogicalProcessorInformationEx = &::GetLogicalProcessorInformationEx; # else t_GetLogicalProcessorInformationEx _GetLogicalProcessorInformationEx = (t_GetLogicalProcessorInformationEx)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetLogicalProcessorInformationEx" ); # endif if( !_GetLogicalProcessorInformationEx ) return; + SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* packageInfo = nullptr; + SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* dieInfo = nullptr; + SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* coreInfo = nullptr; + DWORD psz = 0; _GetLogicalProcessorInformationEx( RelationProcessorPackage, nullptr, &psz ); - auto packageInfo = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)tracy_malloc( psz ); - auto res = _GetLogicalProcessorInformationEx( RelationProcessorPackage, packageInfo, &psz ); - assert( res ); + if( GetLastError() == ERROR_INSUFFICIENT_BUFFER ) + { + packageInfo = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)tracy_malloc( psz ); + auto res = _GetLogicalProcessorInformationEx( RelationProcessorPackage, packageInfo, &psz ); + assert( res ); + } + else + { + psz = 0; + } + + DWORD dsz = 0; + _GetLogicalProcessorInformationEx( RelationProcessorDie, nullptr, &dsz ); + if( GetLastError() == ERROR_INSUFFICIENT_BUFFER ) + { + dieInfo = 
(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)tracy_malloc( dsz ); + auto res = _GetLogicalProcessorInformationEx( RelationProcessorDie, dieInfo, &dsz ); + assert( res ); + } + else + { + dsz = 0; + } DWORD csz = 0; _GetLogicalProcessorInformationEx( RelationProcessorCore, nullptr, &csz ); - auto coreInfo = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)tracy_malloc( csz ); - res = _GetLogicalProcessorInformationEx( RelationProcessorCore, coreInfo, &csz ); - assert( res ); + if( GetLastError() == ERROR_INSUFFICIENT_BUFFER ) + { + coreInfo = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)tracy_malloc( csz ); + auto res = _GetLogicalProcessorInformationEx( RelationProcessorCore, coreInfo, &csz ); + assert( res ); + } + else + { + csz = 0; + } SYSTEM_INFO sysinfo; GetSystemInfo( &sysinfo ); @@ -3778,6 +3943,24 @@ void Profiler::ReportTopology() idx++; } + idx = 0; + ptr = dieInfo; + while( (char*)ptr < ((char*)dieInfo) + dsz ) + { + assert( ptr->Relationship == RelationProcessorDie ); + // FIXME account for GroupCount + auto mask = ptr->Processor.GroupMask[0].Mask; + int core = 0; + while( mask != 0 ) + { + if( mask & 1 ) cpuData[core].die = idx; + core++; + mask >>= 1; + } + ptr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)(((char*)ptr) + ptr->Size); + idx++; + } + idx = 0; ptr = coreInfo; while( (char*)ptr < ((char*)coreInfo) + csz ) @@ -3838,12 +4021,26 @@ void Profiler::ReportTopology() fclose( f ); cpuData[i].package = uint32_t( atoi( buf ) ); cpuData[i].thread = i; + sprintf( path, "%s%i/topology/core_id", basePath, i ); f = fopen( path, "rb" ); - read = fread( buf, 1, 1024, f ); - buf[read] = '\0'; - fclose( f ); - cpuData[i].core = uint32_t( atoi( buf ) ); + if( f ) + { + read = fread( buf, 1, 1024, f ); + buf[read] = '\0'; + fclose( f ); + cpuData[i].core = uint32_t( atoi( buf ) ); + } + + sprintf( path, "%s%i/topology/die_id", basePath, i ); + f = fopen( path, "rb" ); + if( f ) + { + read = fread( buf, 1, 1024, f ); + buf[read] = '\0'; + fclose( f ); + cpuData[i].die = uint32_t( atoi( buf ) ); + } } for( int i=0; i 0 && tracy::has_callstack() ) { - TracyQueuePrepareC( tracy::QueueType::ZoneBeginCallstack ); - tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); - tracy::MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); - TracyQueueCommitC( zoneBeginThread ); + tracy::GetProfiler().SendCallstack( depth ); + zoneQueue = tracy::QueueType::ZoneBeginCallstack; } + TracyQueuePrepareC( zoneQueue ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); + TracyQueueCommitC( zoneBeginThread ); + return ctx; } -TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc( uint64_t srcloc, int active ) +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc( uint64_t srcloc, int32_t active ) { ___tracy_c_zone_context ctx; #ifdef TRACY_ON_DEMAND @@ -4160,7 +4360,7 @@ TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc( uint64_t srcloc, int act return ctx; } -TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc_callstack( uint64_t srcloc, int depth, int active ) +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc_callstack( uint64_t srcloc, int32_t depth, int32_t active ) { ___tracy_c_zone_context ctx; #ifdef TRACY_ON_DEMAND @@ -4183,13 +4383,17 @@ TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc_callstack( uint64_t srclo TracyQueueCommitC( zoneValidationThread ); } #endif - tracy::GetProfiler().SendCallstack( depth ); + auto zoneQueue = tracy::QueueType::ZoneBeginAllocSrcLoc; + if( depth > 0 && 
tracy::has_callstack() ) { - TracyQueuePrepareC( tracy::QueueType::ZoneBeginAllocSrcLocCallstack ); - tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); - tracy::MemWrite( &item->zoneBegin.srcloc, srcloc ); - TracyQueueCommitC( zoneBeginThread ); + tracy::GetProfiler().SendCallstack( depth ); + zoneQueue = tracy::QueueType::ZoneBeginAllocSrcLocCallstack; } + TracyQueuePrepareC( zoneQueue ); + tracy::MemWrite( &item->zoneBegin.time, tracy::Profiler::GetTime() ); + tracy::MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommitC( zoneBeginThread ); + return ctx; } @@ -4287,26 +4491,78 @@ TRACY_API void ___tracy_emit_zone_value( TracyCZoneCtx ctx, uint64_t value ) } } -TRACY_API void ___tracy_emit_memory_alloc( const void* ptr, size_t size, int secure ) { tracy::Profiler::MemAlloc( ptr, size, secure != 0 ); } -TRACY_API void ___tracy_emit_memory_alloc_callstack( const void* ptr, size_t size, int depth, int secure ) { tracy::Profiler::MemAllocCallstack( ptr, size, depth, secure != 0 ); } -TRACY_API void ___tracy_emit_memory_free( const void* ptr, int secure ) { tracy::Profiler::MemFree( ptr, secure != 0 ); } -TRACY_API void ___tracy_emit_memory_free_callstack( const void* ptr, int depth, int secure ) { tracy::Profiler::MemFreeCallstack( ptr, depth, secure != 0 ); } -TRACY_API void ___tracy_emit_memory_alloc_named( const void* ptr, size_t size, int secure, const char* name ) { tracy::Profiler::MemAllocNamed( ptr, size, secure != 0, name ); } -TRACY_API void ___tracy_emit_memory_alloc_callstack_named( const void* ptr, size_t size, int depth, int secure, const char* name ) { tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, secure != 0, name ); } -TRACY_API void ___tracy_emit_memory_free_named( const void* ptr, int secure, const char* name ) { tracy::Profiler::MemFreeNamed( ptr, secure != 0, name ); } -TRACY_API void ___tracy_emit_memory_free_callstack_named( const void* ptr, int depth, int secure, const char* name ) { tracy::Profiler::MemFreeCallstackNamed( ptr, depth, secure != 0, name ); } +TRACY_API void ___tracy_emit_memory_alloc( const void* ptr, size_t size, int32_t secure ) { tracy::Profiler::MemAlloc( ptr, size, secure != 0 ); } +TRACY_API void ___tracy_emit_memory_alloc_callstack( const void* ptr, size_t size, int32_t depth, int32_t secure ) +{ + if( depth > 0 && tracy::has_callstack() ) + { + tracy::Profiler::MemAllocCallstack( ptr, size, depth, secure != 0 ); + } + else + { + tracy::Profiler::MemAlloc( ptr, size, secure != 0 ); + } +} +TRACY_API void ___tracy_emit_memory_free( const void* ptr, int32_t secure ) { tracy::Profiler::MemFree( ptr, secure != 0 ); } +TRACY_API void ___tracy_emit_memory_free_callstack( const void* ptr, int32_t depth, int32_t secure ) +{ + if( depth > 0 && tracy::has_callstack() ) + { + tracy::Profiler::MemFreeCallstack( ptr, depth, secure != 0 ); + } + else + { + tracy::Profiler::MemFree( ptr, secure != 0 ); + } +} +TRACY_API void ___tracy_emit_memory_discard( const char* name, int32_t secure ) { tracy::Profiler::MemDiscard( name, secure != 0 ); } +TRACY_API void ___tracy_emit_memory_discard_callstack( const char* name, int32_t secure, int32_t depth ) +{ + if( depth > 0 && tracy::has_callstack() ) + { + tracy::Profiler::MemDiscardCallstack( name, secure != 0, depth ); + } + else + { + tracy::Profiler::MemDiscard( name, secure != 0 ); + } +} +TRACY_API void ___tracy_emit_memory_alloc_named( const void* ptr, size_t size, int32_t secure, const char* name ) { tracy::Profiler::MemAllocNamed( ptr, size, secure != 0, name ); } 
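// Usage sketch (illustrative only, not part of this patch): because the callstack variants
// now fall back to the plain events when depth <= 0 or callstack support is compiled out,
// an application-side allocator hook can pass a depth unconditionally. The wrapper names
// below (profiled_malloc/profiled_free) are hypothetical.
//
//     #include <stdlib.h>
//     #include <tracy/TracyC.h>
//
//     void* profiled_malloc( size_t size )
//     {
//         void* ptr = malloc( size );
//         ___tracy_emit_memory_alloc_callstack( ptr, size, 10, 0 ); // degrades to a plain alloc event when callstacks are unavailable
//         return ptr;
//     }
//
//     void profiled_free( void* ptr )
//     {
//         ___tracy_emit_memory_free_callstack( ptr, 10, 0 );
//         free( ptr );
//     }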
+TRACY_API void ___tracy_emit_memory_alloc_callstack_named( const void* ptr, size_t size, int32_t depth, int32_t secure, const char* name ) +{ + if( depth > 0 && tracy::has_callstack() ) + { + tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, secure != 0, name ); + } + else + { + tracy::Profiler::MemAllocNamed( ptr, size, secure != 0, name ); + } +} +TRACY_API void ___tracy_emit_memory_free_named( const void* ptr, int32_t secure, const char* name ) { tracy::Profiler::MemFreeNamed( ptr, secure != 0, name ); } +TRACY_API void ___tracy_emit_memory_free_callstack_named( const void* ptr, int32_t depth, int32_t secure, const char* name ) +{ + if( depth > 0 && tracy::has_callstack() ) + { + tracy::Profiler::MemFreeCallstackNamed( ptr, depth, secure != 0, name ); + } + else + { + tracy::Profiler::MemFreeNamed( ptr, secure != 0, name ); + } +} TRACY_API void ___tracy_emit_frame_mark( const char* name ) { tracy::Profiler::SendFrameMark( name ); } TRACY_API void ___tracy_emit_frame_mark_start( const char* name ) { tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgStart ); } TRACY_API void ___tracy_emit_frame_mark_end( const char* name ) { tracy::Profiler::SendFrameMark( name, tracy::QueueType::FrameMarkMsgEnd ); } -TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_t h, uint8_t offset, int flip ) { tracy::Profiler::SendFrameImage( image, w, h, offset, flip ); } +TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_t h, uint8_t offset, int32_t flip ) { tracy::Profiler::SendFrameImage( image, w, h, offset, flip != 0 ); } TRACY_API void ___tracy_emit_plot( const char* name, double val ) { tracy::Profiler::PlotData( name, val ); } TRACY_API void ___tracy_emit_plot_float( const char* name, float val ) { tracy::Profiler::PlotData( name, val ); } TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val ) { tracy::Profiler::PlotData( name, val ); } -TRACY_API void ___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color ) { tracy::Profiler::ConfigurePlot( name, tracy::PlotFormatType(type), step, fill, color ); } -TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ) { tracy::Profiler::Message( txt, size, callstack ); } -TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ) { tracy::Profiler::Message( txt, callstack ); } -TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, size, color, callstack ); } -TRACY_API void ___tracy_emit_messageLC( const char* txt, uint32_t color, int callstack ) { tracy::Profiler::MessageColor( txt, color, callstack ); } +TRACY_API void ___tracy_emit_plot_config( const char* name, int32_t type, int32_t step, int32_t fill, uint32_t color ) { tracy::Profiler::ConfigurePlot( name, tracy::PlotFormatType(type), step != 0, fill != 0, color ); } +TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int32_t callstack_depth ) { tracy::Profiler::Message( txt, size, callstack_depth ); } +TRACY_API void ___tracy_emit_messageL( const char* txt, int32_t callstack_depth ) { tracy::Profiler::Message( txt, callstack_depth ); } +TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int32_t callstack_depth ) { tracy::Profiler::MessageColor( txt, size, color, callstack_depth ); } +TRACY_API void ___tracy_emit_messageLC( const char* txt, uint32_t color, int32_t callstack_depth ) { 
tracy::Profiler::MessageColor( txt, color, callstack_depth ); }
 TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ) { tracy::Profiler::MessageAppInfo( txt, size ); }
 TRACY_API uint64_t ___tracy_alloc_srcloc( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, uint32_t color )
 {
@@ -4604,7 +4860,7 @@ TRACY_API void ___tracy_terminate_lockable_ctx( struct __tracy_lockable_context_data* lockdata )
     tracy::tracy_free((void*)lockdata);
 }

-TRACY_API int ___tracy_before_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata )
+TRACY_API int32_t ___tracy_before_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata )
 {
 #ifdef TRACY_ON_DEMAND
     bool queue = false;
@@ -4616,7 +4872,7 @@ TRACY_API int ___tracy_before_lock_lockable_ctx( struct __tracy_lockable_context
         if( active != connected ) lockdata->m_active.store( connected, std::memory_order_relaxed );
         if( connected ) queue = true;
     }
-    if( !queue ) return false;
+    if( !queue ) return static_cast<int32_t>(false);
 #endif

     auto item = tracy::Profiler::QueueSerial();
@@ -4625,7 +4881,7 @@ TRACY_API int ___tracy_before_lock_lockable_ctx( struct __tracy_lockable_context
     tracy::MemWrite( &item->lockWait.id, lockdata->m_id );
     tracy::MemWrite( &item->lockWait.time, tracy::Profiler::GetTime() );
     tracy::Profiler::QueueSerialFinish();
-    return true;
+    return static_cast<int32_t>(true);
 }

 TRACY_API void ___tracy_after_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata )
@@ -4657,7 +4913,7 @@ TRACY_API void ___tracy_after_unlock_lockable_ctx( struct __tracy_lockable_conte
     tracy::Profiler::QueueSerialFinish();
 }

-TRACY_API void ___tracy_after_try_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata, int acquired )
+TRACY_API void ___tracy_after_try_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata, int32_t acquired )
 {
 #ifdef TRACY_ON_DEMAND
     if( !acquired ) return;
@@ -4722,9 +4978,9 @@ TRACY_API void ___tracy_custom_name_lockable_ctx( struct __tracy_lockable_contex
     tracy::Profiler::QueueSerialFinish();
 }

-TRACY_API int ___tracy_connected( void )
+TRACY_API int32_t ___tracy_connected( void )
 {
-    return tracy::GetProfiler().IsConnected();
+    return static_cast<int32_t>( tracy::GetProfiler().IsConnected() );
 }

 #ifdef TRACY_FIBERS
@@ -4732,7 +4988,7 @@ TRACY_API void ___tracy_fiber_enter( const char* fiber ){ tracy::Profiler::Enter
 TRACY_API void ___tracy_fiber_leave( void ){ tracy::Profiler::LeaveFiber(); }
 #endif

-# ifdef TRACY_MANUAL_LIFETIME
+# if defined TRACY_MANUAL_LIFETIME && defined TRACY_DELAYED_INIT
 TRACY_API void ___tracy_startup_profiler( void )
 {
     tracy::StartupProfiler();
@@ -4743,9 +4999,9 @@ TRACY_API void ___tracy_shutdown_profiler( void )
     tracy::ShutdownProfiler();
 }

-TRACY_API int ___tracy_profiler_started( void )
+TRACY_API int32_t ___tracy_profiler_started( void )
 {
-    return tracy::s_isProfilerStarted.load( std::memory_order_seq_cst );
+    return static_cast<int32_t>( tracy::s_isProfilerStarted.load( std::memory_order_seq_cst ) );
 }
 # endif
diff --git a/libs/tracy/client/TracyProfiler.hpp b/libs/tracy/client/TracyProfiler.hpp
index 46f11f3..e773f5e 100644
--- a/libs/tracy/client/TracyProfiler.hpp
+++ b/libs/tracy/client/TracyProfiler.hpp
@@ -114,11 +114,11 @@ struct LuaZoneState

 #define TracyLfqPrepare( _type ) \
-    moodycamel::ConcurrentQueueDefaultTraits::index_t __magic; \
-    auto __token = GetToken(); \
+    tracy::moodycamel::ConcurrentQueueDefaultTraits::index_t __magic; \
+    auto __token = tracy::GetToken(); \
     auto& __tail = __token->get_tail_index(); \
     auto item =
__token->enqueue_begin( __magic ); \ - MemWrite( &item->hdr.type, _type ); + tracy::MemWrite( &item->hdr.type, _type ); #define TracyLfqCommit \ __tail.store( __magic + 1, std::memory_order_release ); @@ -136,11 +136,11 @@ struct LuaZoneState #ifdef TRACY_FIBERS # define TracyQueuePrepare( _type ) \ - auto item = Profiler::QueueSerial(); \ - MemWrite( &item->hdr.type, _type ); + auto item = tracy::Profiler::QueueSerial(); \ + tracy::MemWrite( &item->hdr.type, _type ); # define TracyQueueCommit( _name ) \ - MemWrite( &item->_name.thread, GetThreadHandle() ); \ - Profiler::QueueSerialFinish(); + tracy::MemWrite( &item->_name.thread, tracy::GetThreadHandle() ); \ + tracy::Profiler::QueueSerialFinish(); # define TracyQueuePrepareC( _type ) \ auto item = tracy::Profiler::QueueSerial(); \ tracy::MemWrite( &item->hdr.type, _type ); @@ -387,58 +387,58 @@ class Profiler TracyLfqCommit; } - static tracy_force_inline void Message( const char* txt, size_t size, int callstack ) + static tracy_force_inline void Message( const char* txt, size_t size, int32_t callstack_depth ) { assert( size < (std::numeric_limits::max)() ); #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif - if( callstack != 0 ) + if( callstack_depth != 0 && has_callstack() ) { - tracy::GetProfiler().SendCallstack( callstack ); + tracy::GetProfiler().SendCallstack( callstack_depth ); } auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); - TracyQueuePrepare( callstack == 0 ? QueueType::Message : QueueType::MessageCallstack ); + TracyQueuePrepare( callstack_depth == 0 ? QueueType::Message : QueueType::MessageCallstack ); MemWrite( &item->messageFat.time, GetTime() ); MemWrite( &item->messageFat.text, (uint64_t)ptr ); MemWrite( &item->messageFat.size, (uint16_t)size ); TracyQueueCommit( messageFatThread ); } - static tracy_force_inline void Message( const char* txt, int callstack ) + static tracy_force_inline void Message( const char* txt, int32_t callstack_depth ) { #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif - if( callstack != 0 ) + if( callstack_depth != 0 && has_callstack() ) { - tracy::GetProfiler().SendCallstack( callstack ); + tracy::GetProfiler().SendCallstack( callstack_depth ); } - TracyQueuePrepare( callstack == 0 ? QueueType::MessageLiteral : QueueType::MessageLiteralCallstack ); + TracyQueuePrepare( callstack_depth == 0 ? QueueType::MessageLiteral : QueueType::MessageLiteralCallstack ); MemWrite( &item->messageLiteral.time, GetTime() ); MemWrite( &item->messageLiteral.text, (uint64_t)txt ); TracyQueueCommit( messageLiteralThread ); } - static tracy_force_inline void MessageColor( const char* txt, size_t size, uint32_t color, int callstack ) + static tracy_force_inline void MessageColor( const char* txt, size_t size, uint32_t color, int32_t callstack_depth ) { assert( size < (std::numeric_limits::max)() ); #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif - if( callstack != 0 ) + if( callstack_depth != 0 && has_callstack() ) { - tracy::GetProfiler().SendCallstack( callstack ); + tracy::GetProfiler().SendCallstack( callstack_depth ); } auto ptr = (char*)tracy_malloc( size ); memcpy( ptr, txt, size ); - TracyQueuePrepare( callstack == 0 ? QueueType::MessageColor : QueueType::MessageColorCallstack ); + TracyQueuePrepare( callstack_depth == 0 ? 
QueueType::MessageColor : QueueType::MessageColorCallstack ); MemWrite( &item->messageColorFat.time, GetTime() ); MemWrite( &item->messageColorFat.text, (uint64_t)ptr ); MemWrite( &item->messageColorFat.b, uint8_t( ( color ) & 0xFF ) ); @@ -448,17 +448,17 @@ class Profiler TracyQueueCommit( messageColorFatThread ); } - static tracy_force_inline void MessageColor( const char* txt, uint32_t color, int callstack ) + static tracy_force_inline void MessageColor( const char* txt, uint32_t color, int32_t callstack_depth ) { #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) return; #endif - if( callstack != 0 ) + if( callstack_depth != 0 && has_callstack() ) { - tracy::GetProfiler().SendCallstack( callstack ); + tracy::GetProfiler().SendCallstack( callstack_depth ); } - TracyQueuePrepare( callstack == 0 ? QueueType::MessageLiteralColor : QueueType::MessageLiteralColorCallstack ); + TracyQueuePrepare( callstack_depth == 0 ? QueueType::MessageLiteralColor : QueueType::MessageLiteralColorCallstack ); MemWrite( &item->messageColorLiteral.time, GetTime() ); MemWrite( &item->messageColorLiteral.text, (uint64_t)txt ); MemWrite( &item->messageColorLiteral.b, uint8_t( ( color ) & 0xFF ) ); @@ -510,29 +510,31 @@ class Profiler GetProfiler().m_serialLock.unlock(); } - static tracy_force_inline void MemAllocCallstack( const void* ptr, size_t size, int depth, bool secure ) + static tracy_force_inline void MemAllocCallstack( const void* ptr, size_t size, int32_t depth, bool secure ) { if( secure && !ProfilerAvailable() ) return; -#ifdef TRACY_HAS_CALLSTACK - auto& profiler = GetProfiler(); + if( depth > 0 && has_callstack() ) + { + auto& profiler = GetProfiler(); # ifdef TRACY_ON_DEMAND - if( !profiler.IsConnected() ) return; + if( !profiler.IsConnected() ) return; # endif - const auto thread = GetThreadHandle(); + const auto thread = GetThreadHandle(); - auto callstack = Callstack( depth ); + auto callstack = Callstack( depth ); - profiler.m_serialLock.lock(); - SendCallstackSerial( callstack ); - SendMemAlloc( QueueType::MemAllocCallstack, thread, ptr, size ); - profiler.m_serialLock.unlock(); -#else - static_cast(depth); // unused - MemAlloc( ptr, size, secure ); -#endif + profiler.m_serialLock.lock(); + SendCallstackSerial( callstack ); + SendMemAlloc( QueueType::MemAllocCallstack, thread, ptr, size ); + profiler.m_serialLock.unlock(); + } + else + { + MemAlloc( ptr, size, secure ); + } } - static tracy_force_inline void MemFreeCallstack( const void* ptr, int depth, bool secure ) + static tracy_force_inline void MemFreeCallstack( const void* ptr, int32_t depth, bool secure ) { if( secure && !ProfilerAvailable() ) return; if( !ProfilerAllocatorAvailable() ) @@ -540,23 +542,25 @@ class Profiler MemFree( ptr, secure ); return; } -#ifdef TRACY_HAS_CALLSTACK - auto& profiler = GetProfiler(); + if( depth > 0 && has_callstack() ) + { + auto& profiler = GetProfiler(); # ifdef TRACY_ON_DEMAND - if( !profiler.IsConnected() ) return; + if( !profiler.IsConnected() ) return; # endif - const auto thread = GetThreadHandle(); + const auto thread = GetThreadHandle(); - auto callstack = Callstack( depth ); + auto callstack = Callstack( depth ); - profiler.m_serialLock.lock(); - SendCallstackSerial( callstack ); - SendMemFree( QueueType::MemFreeCallstack, thread, ptr ); - profiler.m_serialLock.unlock(); -#else - static_cast(depth); // unused - MemFree( ptr, secure ); -#endif + profiler.m_serialLock.lock(); + SendCallstackSerial( callstack ); + SendMemFree( QueueType::MemFreeCallstack, thread, ptr ); + 
profiler.m_serialLock.unlock(); + } + else + { + MemFree( ptr, secure ); + } } static tracy_force_inline void MemAllocNamed( const void* ptr, size_t size, bool secure, const char* name ) @@ -587,62 +591,101 @@ class Profiler GetProfiler().m_serialLock.unlock(); } - static tracy_force_inline void MemAllocCallstackNamed( const void* ptr, size_t size, int depth, bool secure, const char* name ) + static tracy_force_inline void MemAllocCallstackNamed( const void* ptr, size_t size, int32_t depth, bool secure, const char* name ) { if( secure && !ProfilerAvailable() ) return; -#ifdef TRACY_HAS_CALLSTACK - auto& profiler = GetProfiler(); + if( depth > 0 && has_callstack() ) + { + auto& profiler = GetProfiler(); # ifdef TRACY_ON_DEMAND - if( !profiler.IsConnected() ) return; + if( !profiler.IsConnected() ) return; # endif - const auto thread = GetThreadHandle(); + const auto thread = GetThreadHandle(); - auto callstack = Callstack( depth ); + auto callstack = Callstack( depth ); - profiler.m_serialLock.lock(); - SendCallstackSerial( callstack ); - SendMemName( name ); - SendMemAlloc( QueueType::MemAllocCallstackNamed, thread, ptr, size ); - profiler.m_serialLock.unlock(); -#else - static_cast(depth); // unused - MemAllocNamed( ptr, size, secure, name ); -#endif + profiler.m_serialLock.lock(); + SendCallstackSerial( callstack ); + SendMemName( name ); + SendMemAlloc( QueueType::MemAllocCallstackNamed, thread, ptr, size ); + profiler.m_serialLock.unlock(); + } + else + { + MemAllocNamed( ptr, size, secure, name ); + } } - static tracy_force_inline void MemFreeCallstackNamed( const void* ptr, int depth, bool secure, const char* name ) + static tracy_force_inline void MemFreeCallstackNamed( const void* ptr, int32_t depth, bool secure, const char* name ) { if( secure && !ProfilerAvailable() ) return; -#ifdef TRACY_HAS_CALLSTACK - auto& profiler = GetProfiler(); + if( depth > 0 && has_callstack() ) + { + auto& profiler = GetProfiler(); # ifdef TRACY_ON_DEMAND - if( !profiler.IsConnected() ) return; + if( !profiler.IsConnected() ) return; # endif - const auto thread = GetThreadHandle(); + const auto thread = GetThreadHandle(); - auto callstack = Callstack( depth ); + auto callstack = Callstack( depth ); - profiler.m_serialLock.lock(); - SendCallstackSerial( callstack ); - SendMemName( name ); - SendMemFree( QueueType::MemFreeCallstackNamed, thread, ptr ); - profiler.m_serialLock.unlock(); -#else - static_cast(depth); // unused - MemFreeNamed( ptr, secure, name ); -#endif + profiler.m_serialLock.lock(); + SendCallstackSerial( callstack ); + SendMemName( name ); + SendMemFree( QueueType::MemFreeCallstackNamed, thread, ptr ); + profiler.m_serialLock.unlock(); + } + else + { + MemFreeNamed( ptr, secure, name ); + } } - static tracy_force_inline void SendCallstack( int depth ) + static tracy_force_inline void MemDiscard( const char* name, bool secure ) { -#ifdef TRACY_HAS_CALLSTACK - auto ptr = Callstack( depth ); - TracyQueuePrepare( QueueType::Callstack ); - MemWrite( &item->callstackFat.ptr, (uint64_t)ptr ); - TracyQueueCommit( callstackFatThread ); -#else - static_cast(depth); // unused + if( secure && !ProfilerAvailable() ) return; +#ifdef TRACY_ON_DEMAND + if( !GetProfiler().IsConnected() ) return; #endif + const auto thread = GetThreadHandle(); + + GetProfiler().m_serialLock.lock(); + SendMemDiscard( QueueType::MemDiscard, thread, name ); + GetProfiler().m_serialLock.unlock(); + } + + static tracy_force_inline void MemDiscardCallstack( const char* name, bool secure, int32_t depth ) + { + if( secure && 
!ProfilerAvailable() ) return;
+        if( depth > 0 && has_callstack() )
+        {
+# ifdef TRACY_ON_DEMAND
+            if( !GetProfiler().IsConnected() ) return;
+# endif
+            const auto thread = GetThreadHandle();
+
+            auto callstack = Callstack( depth );
+
+            GetProfiler().m_serialLock.lock();
+            SendCallstackSerial( callstack );
+            SendMemDiscard( QueueType::MemDiscard, thread, name );
+            GetProfiler().m_serialLock.unlock();
+        }
+        else
+        {
+            MemDiscard( name, secure );
+        }
+    }
+
+    static tracy_force_inline void SendCallstack( int32_t depth )
+    {
+        if( depth > 0 && has_callstack() )
+        {
+            auto ptr = Callstack( depth );
+            TracyQueuePrepare( QueueType::Callstack );
+            MemWrite( &item->callstackFat.ptr, (uint64_t)ptr );
+            TracyQueueCommit( callstackFatThread );
+        }
     }

     static tracy_force_inline void ParameterRegister( ParameterCallback cb, void* data )
@@ -692,7 +735,7 @@ class Profiler
     }
 #endif

-    void SendCallstack( int depth, const char* skipBefore );
+    void SendCallstack( int32_t depth, const char* skipBefore );
     static void CutCallstack( void* callstack, const char* skipBefore );

     static bool ShouldExit();
@@ -800,7 +843,7 @@ class Profiler

     void InstallCrashHandler();
     void RemoveCrashHandler();
-
+
     void ClearQueues( tracy::moodycamel::ConsumerToken& token );
     void ClearSerial();
     DequeueStatus Dequeue( tracy::moodycamel::ConsumerToken& token );
@@ -833,6 +876,21 @@ class Profiler
         m_bufferOffset += int( len );
     }

+    char* SafeCopyProlog( const char* p, size_t size );
+    void SafeCopyEpilog( char* buf );
+
+    template<typename Callable> // must be void( const char* buf, size_t size )
+    bool WithSafeCopy( const char* p, size_t size, Callable&& callable )
+    {
+        if( char* buf = SafeCopyProlog( p, size ) )
+        {
+            callable( buf, size );
+            SafeCopyEpilog( buf );
+            return true;
+        }
+        return false;
+    }
+
     bool SendData( const char* data, size_t len );
     void SendLongString( uint64_t ptr, const char* str, size_t len, QueueType type );
     void SendSourceLocation( uint64_t ptr );
@@ -862,14 +920,13 @@ class Profiler

     static tracy_force_inline void SendCallstackSerial( void* ptr )
     {
-#ifdef TRACY_HAS_CALLSTACK
-        auto item = GetProfiler().m_serialQueue.prepare_next();
-        MemWrite( &item->hdr.type, QueueType::CallstackSerial );
-        MemWrite( &item->callstackFat.ptr, (uint64_t)ptr );
-        GetProfiler().m_serialQueue.commit_next();
-#else
-        static_cast<void>(ptr); // unused
-#endif
+        if( has_callstack() )
+        {
+            auto item = GetProfiler().m_serialQueue.prepare_next();
+            MemWrite( &item->hdr.type, QueueType::CallstackSerial );
+            MemWrite( &item->callstackFat.ptr, (uint64_t)ptr );
+            GetProfiler().m_serialQueue.commit_next();
+        }
     }

     static tracy_force_inline void SendMemAlloc( QueueType type, const uint32_t thread, const void* ptr, size_t size )
@@ -907,6 +964,18 @@ class Profiler
         GetProfiler().m_serialQueue.commit_next();
     }

+    static tracy_force_inline void SendMemDiscard( QueueType type, const uint32_t thread, const char* name )
+    {
+        assert( type == QueueType::MemDiscard || type == QueueType::MemDiscardCallstack );
+
+        auto item = GetProfiler().m_serialQueue.prepare_next();
+        MemWrite( &item->hdr.type, type );
+        MemWrite( &item->memDiscard.time, GetTime() );
+        MemWrite( &item->memDiscard.thread, thread );
+        MemWrite( &item->memDiscard.name, (uint64_t)name );
+        GetProfiler().m_serialQueue.commit_next();
+    }
+
     static tracy_force_inline void SendMemName( const char* name )
     {
         assert( name );
@@ -922,7 +991,6 @@ class Profiler

     double m_timerMul;
     uint64_t m_resolution;
-    uint64_t m_delay;
     std::atomic<int64_t> m_timeBegin;
     uint32_t m_mainThread;
     uint64_t m_epoch, m_exectime;
@@ -990,9 +1058,19 @@
class Profiler char* m_queryData; char* m_queryDataPtr; +#ifndef NDEBUG + // m_safeSendBuffer and m_pipe should only be used by the Tracy Profiler thread; this ensures that in debug builds. + std::atomic_bool m_inUse{ false }; +#endif + char* m_safeSendBuffer; + #if defined _WIN32 - void* m_exceptionHandler; + void* m_prevHandler; +#else + int m_pipe[2]; + int m_pipeBufSize; #endif + #ifdef __linux__ struct { struct sigaction pwr, ill, fpe, segv, pipe, bus, abrt; diff --git a/libs/tracy/client/TracyRocprof.cpp b/libs/tracy/client/TracyRocprof.cpp new file mode 100644 index 0000000..370e42e --- /dev/null +++ b/libs/tracy/client/TracyRocprof.cpp @@ -0,0 +1,556 @@ +#include "../server/tracy_robin_hood.h" +#include "TracyProfiler.hpp" +#include "TracyThread.hpp" +#include "tracy/TracyC.h" +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define ROCPROFILER_CALL( result, msg ) \ + { \ + rocprofiler_status_t CHECKSTATUS = result; \ + if( CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS ) \ + { \ + std::string status_msg = rocprofiler_get_status_string( CHECKSTATUS ); \ + std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg << " failed with error code " \ + << CHECKSTATUS << ": " << status_msg << std::endl; \ + std::stringstream errmsg{}; \ + errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] " << msg " failure (" << status_msg \ + << ")"; \ + throw std::runtime_error( errmsg.str() ); \ + } \ + } + +namespace +{ + +using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; + +struct DispatchData +{ + int64_t launch_start; + int64_t launch_end; + uint32_t thread_id; + uint16_t query_id; +}; + +struct ToolData +{ + uint32_t version; + const char* runtime_version; + uint32_t priority; + rocprofiler_client_id_t client_id; + uint8_t context_id; + bool init; + uint64_t query_id; + int64_t previous_cpu_time; + tracy::unordered_map client_kernels; + tracy::unordered_map dispatch_data; + tracy::unordered_set counter_names = { "SQ_WAVES", "GL2C_MISS", "GL2C_HIT" }; + std::unique_ptr cal_thread; + std::mutex mut{}; +}; + +using namespace tracy; + +rocprofiler_context_id_t& get_client_ctx() +{ + static rocprofiler_context_id_t ctx{ 0 }; + return ctx; +} + +const char* CTX_NAME = "rocprofv3"; + +uint8_t gpu_context_allocate( ToolData* data ) +{ + + timespec ts; + clock_gettime( CLOCK_BOOTTIME, &ts ); + uint64_t cpu_timestamp = Profiler::GetTime(); + uint64_t gpu_timestamp = ( (uint64_t)ts.tv_sec * 1000000000 ) + ts.tv_nsec; + float timestamp_period = 1.0f; + data->previous_cpu_time = cpu_timestamp; + + // Allocate the process-unique GPU context ID. There's a max of 255 available; + // if we are recreating devices a lot we may exceed that. Don't do that, or + // wrap around and get weird (but probably still usable) numbers. + uint8_t context_id = tracy::GetGpuCtxCounter().fetch_add( 1, std::memory_order_relaxed ); + if( context_id >= 255 ) + { + context_id %= 255; + } + + uint8_t context_flags = 0; +#ifdef TRACY_ROCPROF_CALIBRATION + // Tell tracy we'll be passing calibrated timestamps and not to mess with + // the times. We'll periodically send GpuCalibration events in case the + // times drift. 
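// A minimal sketch of what such a periodic re-calibration could look like, assuming Tracy's
// GpuCalibration queue event (gpuTime/cpuTime/cpuDelta/context fields) and a hypothetical
// send_calibration() helper driven by cal_thread; the actual implementation is not shown in this hunk:
//
//     static void send_calibration( ToolData* data, uint8_t context_id )
//     {
//         timespec ts;
//         clock_gettime( CLOCK_BOOTTIME, &ts );
//         int64_t cpu_time = tracy::Profiler::GetTime();
//         int64_t gpu_time = int64_t( ts.tv_sec ) * 1000000000 + ts.tv_nsec;
//         int64_t cpu_delta = cpu_time - data->previous_cpu_time;
//         data->previous_cpu_time = cpu_time;
//
//         auto* item = tracy::Profiler::QueueSerial();
//         tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuCalibration );
//         tracy::MemWrite( &item->gpuCalibration.gpuTime, gpu_time );
//         tracy::MemWrite( &item->gpuCalibration.cpuTime, cpu_time );
//         tracy::MemWrite( &item->gpuCalibration.cpuDelta, cpu_delta );
//         tracy::MemWrite( &item->gpuCalibration.context, context_id );
//         tracy::Profiler::QueueSerialFinish();
//     }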
+ context_flags |= tracy::GpuContextCalibration; +#endif + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuNewContext ); + tracy::MemWrite( &item->gpuNewContext.cpuTime, cpu_timestamp ); + tracy::MemWrite( &item->gpuNewContext.gpuTime, gpu_timestamp ); + memset( &item->gpuNewContext.thread, 0, sizeof( item->gpuNewContext.thread ) ); + tracy::MemWrite( &item->gpuNewContext.period, timestamp_period ); + tracy::MemWrite( &item->gpuNewContext.context, context_id ); + tracy::MemWrite( &item->gpuNewContext.flags, context_flags ); + tracy::MemWrite( &item->gpuNewContext.type, tracy::GpuContextType::Rocprof ); + tracy::Profiler::QueueSerialFinish(); + } + + // Send the name of the context along. + // NOTE: Tracy will unconditionally free the name so we must clone it here. + // Since internally Tracy will use its own rpmalloc implementation we must + // make sure we allocate from the same source. + size_t name_length = strlen( CTX_NAME ); + char* cloned_name = (char*)tracy::tracy_malloc( name_length ); + memcpy( cloned_name, CTX_NAME, name_length ); + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuContextName ); + tracy::MemWrite( &item->gpuContextNameFat.context, context_id ); + tracy::MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)cloned_name ); + tracy::MemWrite( &item->gpuContextNameFat.size, name_length ); + tracy::Profiler::QueueSerialFinish(); + } + + return context_id; +} + +uint64_t kernel_src_loc( ToolData* data, uint64_t kernel_id ) +{ + uint64_t src_loc = 0; + auto _lk = std::unique_lock{ data->mut }; + rocprofiler_kernel_id_t kid = kernel_id; + if( data->client_kernels.count( kid ) ) + { + auto& sym_data = data->client_kernels[kid]; + const char* name = sym_data.kernel_name; + size_t name_len = strlen( name ); + uint32_t line = 0; + src_loc = tracy::Profiler::AllocSourceLocation( line, NULL, 0, name, name_len, NULL, 0 ); + } + return src_loc; +} + +void record_interval( ToolData* data, rocprofiler_timestamp_t start_timestamp, rocprofiler_timestamp_t end_timestamp, + uint64_t src_loc, rocprofiler_dispatch_id_t dispatch_id ) +{ + + uint16_t query_id = 0; + uint8_t context_id = data->context_id; + + { + auto _lk = std::unique_lock{ data->mut }; + query_id = data->query_id; + data->query_id++; + if( dispatch_id != UINT64_MAX ) + { + DispatchData& dispatch_data = data->dispatch_data[dispatch_id]; + dispatch_data.query_id = query_id; + dispatch_data.thread_id = tracy::GetThreadHandle(); + } + } + + uint64_t cpu_start_time = 0, cpu_end_time = 0; + if( dispatch_id == UINT64_MAX ) + { + cpu_start_time = tracy::Profiler::GetTime(); + cpu_end_time = tracy::Profiler::GetTime(); + } + else + { + auto _lk = std::unique_lock{ data->mut }; + DispatchData& dispatch_data = data->dispatch_data[dispatch_id]; + cpu_start_time = dispatch_data.launch_start; + cpu_end_time = dispatch_data.launch_end; + } + + if( src_loc != 0 ) + { + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginAllocSrcLocSerial ); + tracy::MemWrite( &item->gpuZoneBegin.cpuTime, cpu_start_time ); + tracy::MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)src_loc ); + tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() ); + tracy::MemWrite( &item->gpuZoneBegin.queryId, query_id ); + tracy::MemWrite( &item->gpuZoneBegin.context, context_id ); + tracy::Profiler::QueueSerialFinish(); + } + } + else + { + static const 
___tracy_source_location_data src_loc = { NULL, NULL, NULL, 0, 0 };
+        {
+            auto* item = tracy::Profiler::QueueSerial();
+            tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneBeginSerial );
+            tracy::MemWrite( &item->gpuZoneBegin.cpuTime, cpu_start_time );
+            tracy::MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)&src_loc );
+            tracy::MemWrite( &item->gpuZoneBegin.thread, tracy::GetThreadHandle() );
+            tracy::MemWrite( &item->gpuZoneBegin.queryId, query_id );
+            tracy::MemWrite( &item->gpuZoneBegin.context, context_id );
+            tracy::Profiler::QueueSerialFinish();
+        }
+    }
+
+    {
+        auto* item = tracy::Profiler::QueueSerial();
+        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuTime );
+        tracy::MemWrite( &item->gpuTime.gpuTime, start_timestamp );
+        tracy::MemWrite( &item->gpuTime.queryId, query_id );
+        tracy::MemWrite( &item->gpuTime.context, context_id );
+        tracy::Profiler::QueueSerialFinish();
+    }
+
+    {
+        auto* item = tracy::Profiler::QueueSerial();
+        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneEndSerial );
+        tracy::MemWrite( &item->gpuZoneEnd.cpuTime, cpu_end_time );
+        tracy::MemWrite( &item->gpuZoneEnd.thread, tracy::GetThreadHandle() );
+        tracy::MemWrite( &item->gpuZoneEnd.queryId, query_id );
+        tracy::MemWrite( &item->gpuZoneEnd.context, context_id );
+        tracy::Profiler::QueueSerialFinish();
+    }
+
+    {
+        auto* item = tracy::Profiler::QueueSerial();
+        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuTime );
+        tracy::MemWrite( &item->gpuTime.gpuTime, end_timestamp );
+        tracy::MemWrite( &item->gpuTime.queryId, query_id );
+        tracy::MemWrite( &item->gpuTime.context, context_id );
+        tracy::Profiler::QueueSerialFinish();
+    }
+}
+
+void record_callback( rocprofiler_dispatch_counting_service_data_t dispatch_data,
+                      rocprofiler_record_counter_t* record_data, size_t record_count,
+                      rocprofiler_user_data_t /*user_data*/, void* callback_data )
+{
+    assert( callback_data != nullptr );
+    ToolData* data = static_cast<ToolData*>( callback_data );
+    if( !data->init ) return;
+
+    std::unordered_map sums;
+    for( size_t i = 0; i < record_count; ++i )
+    {
+        auto _counter_id = rocprofiler_counter_id_t{};
+        ROCPROFILER_CALL( rocprofiler_query_record_counter_id( record_data[i].id, &_counter_id ),
+                          "query record counter id" );
+        sums[_counter_id.handle] += record_data[i].counter_value;
+    }
+
+    uint16_t query_id = 0;
+    uint32_t thread_id = 0;
+    {
+        auto _lk = std::unique_lock{ data->mut };
+        // An assumption is made here that the counter values are supplied after the dispatch
+        // complete callback.
+        assert( data->dispatch_data.count( dispatch_data.dispatch_info.dispatch_id ) );
+        DispatchData& ddata = data->dispatch_data[dispatch_data.dispatch_info.dispatch_id];
+        query_id = ddata.query_id;
+        thread_id = ddata.thread_id;
+    }
+
+    for( auto& p : sums )
+    {
+        auto* item = tracy::Profiler::QueueSerial();
+        tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuZoneAnnotation );
+        tracy::MemWrite( &item->zoneAnnotation.noteId, p.first );
+        tracy::MemWrite( &item->zoneAnnotation.queryId, query_id );
+        tracy::MemWrite( &item->zoneAnnotation.thread, thread_id );
+        tracy::MemWrite( &item->zoneAnnotation.value, p.second );
+        tracy::MemWrite( &item->zoneAnnotation.context, data->context_id );
+        tracy::Profiler::QueueSerialFinish();
+    }
+}
+
+/**
+ * Callback from rocprofiler when a kernel dispatch is enqueued into the HSA queue.
+ * rocprofiler_counter_config_id_t* is an out parameter used to specify what counters to collect
+ * for this dispatch (dispatch_packet).
+ */
+void dispatch_callback( rocprofiler_dispatch_counting_service_data_t dispatch_data,
+                        rocprofiler_profile_config_id_t* config, rocprofiler_user_data_t* /*user_data*/,
+                        void* callback_data )
+{
+    assert( callback_data != nullptr );
+    ToolData* data = static_cast<ToolData*>( callback_data );
+    if( !data->init ) return;
+
+    /**
+     * This simple example uses the same profile counter set for all agents.
+     * We store this in a cache to prevent constructing many identical profile counter
+     * sets. We first check the cache to see if we have already constructed a counter
+     * set for the agent. If we have, return it. Otherwise, construct a new profile counter
+     * set.
+     */
+    static std::shared_mutex m_mutex = {};
+    static std::unordered_map profile_cache = {};
+
+    auto search_cache = [&]()
+    {
+        if( auto pos = profile_cache.find( dispatch_data.dispatch_info.agent_id.handle ); pos != profile_cache.end() )
+        {
+            *config = pos->second;
+            return true;
+        }
+        return false;
+    };
+
+    {
+        auto rlock = std::shared_lock{ m_mutex };
+        if( search_cache() ) return;
+    }
+
+    auto wlock = std::unique_lock{ m_mutex };
+    if( search_cache() ) return;
+
+    // GPU Counter IDs
+    std::vector gpu_counters;
+
+    // Iterate through the agents and get the counters available on that agent
+    ROCPROFILER_CALL(
+        rocprofiler_iterate_agent_supported_counters(
+            dispatch_data.dispatch_info.agent_id,
+            []( rocprofiler_agent_id_t, rocprofiler_counter_id_t* counters, size_t num_counters, void* user_data )
+            {
+                std::vector* vec =
+                    static_cast*>( user_data );
+                for( size_t i = 0; i < num_counters; i++ )
+                {
+                    vec->push_back( counters[i] );
+                }
+                return ROCPROFILER_STATUS_SUCCESS;
+            },
+            static_cast( &gpu_counters ) ),
+        "Could not fetch supported counters" );
+
+    std::vector collect_counters;
+    collect_counters.reserve( data->counter_names.size() );
+    // Look for the counters contained in counters_to_collect in gpu_counters
+    for( auto& counter : gpu_counters )
+    {
+        rocprofiler_counter_info_v0_t info;
+        ROCPROFILER_CALL(
+            rocprofiler_query_counter_info( counter, ROCPROFILER_COUNTER_INFO_VERSION_0, static_cast( &info ) ),
+            "Could not query info" );
+        if( data->counter_names.count( std::string( info.name ) ) > 0 )
+        {
+            collect_counters.push_back( counter );
+
+            size_t name_length = strlen( info.name );
+            char* cloned_name = (char*)tracy::tracy_malloc( name_length );
+            memcpy( cloned_name, info.name, name_length );
+            {
+                auto* item = tracy::Profiler::QueueSerial();
+                tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuAnnotationName );
+                tracy::MemWrite( &item->gpuAnnotationNameFat.context, data->context_id );
+                tracy::MemWrite( &item->gpuAnnotationNameFat.noteId, counter.handle );
+                tracy::MemWrite( &item->gpuAnnotationNameFat.ptr, (uint64_t)cloned_name );
+                tracy::MemWrite( &item->gpuAnnotationNameFat.size, name_length );
+                tracy::Profiler::QueueSerialFinish();
+            }
+        }
+    }
+
+    // Create a collection profile for the counters
+    rocprofiler_profile_config_id_t profile = { .handle = 0 };
+    ROCPROFILER_CALL( rocprofiler_create_profile_config( dispatch_data.dispatch_info.agent_id, collect_counters.data(),
+                                                         collect_counters.size(), &profile ),
+                      "Could not construct profile cfg" );
+
+    profile_cache.emplace( dispatch_data.dispatch_info.agent_id.handle, profile );
+    // Return the profile to collect those counters for this dispatch
+    *config = profile;
+}
+
+void tool_callback_tracing_callback( rocprofiler_callback_tracing_record_t record, rocprofiler_user_data_t* user_data,
+                                     void* callback_data )
+{
+    assert( callback_data != nullptr );
+
ToolData* data = static_cast( callback_data ); + if( !data->init ) return; + + if( record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER ) + { + auto* sym_data = static_cast( record.payload ); + + if( record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD ) + { + auto _lk = std::unique_lock{ data->mut }; + data->client_kernels.emplace( sym_data->kernel_id, *sym_data ); + } + else if( record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD ) + { + auto _lk = std::unique_lock{ data->mut }; + data->client_kernels.erase( sym_data->kernel_id ); + } + } + else if( record.kind == ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH ) + { + auto* rdata = static_cast( record.payload ); + if( record.operation == ROCPROFILER_KERNEL_DISPATCH_ENQUEUE ) + { + if( record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER ) + { + auto _lk = std::unique_lock{ data->mut }; + data->dispatch_data[rdata->dispatch_info.dispatch_id].launch_start = tracy::Profiler::GetTime(); + } + else if( record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT ) + { + auto _lk = std::unique_lock{ data->mut }; + data->dispatch_data[rdata->dispatch_info.dispatch_id].launch_end = tracy::Profiler::GetTime(); + } + } + else if( record.operation == ROCPROFILER_KERNEL_DISPATCH_COMPLETE ) + { + uint64_t src_loc = kernel_src_loc( data, rdata->dispatch_info.kernel_id ); + record_interval( data, rdata->start_timestamp, rdata->end_timestamp, src_loc, + rdata->dispatch_info.dispatch_id ); + } + } + else if( record.kind == ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY && + record.operation != ROCPROFILER_MEMORY_COPY_NONE && record.phase == ROCPROFILER_CALLBACK_PHASE_EXIT ) + { + auto* rdata = static_cast( record.payload ); + const char* name = nullptr; + switch( record.operation ) + { + case ROCPROFILER_MEMORY_COPY_DEVICE_TO_DEVICE: + name = "DeviceToDeviceCopy"; + break; + case ROCPROFILER_MEMORY_COPY_DEVICE_TO_HOST: + name = "DeviceToHostCopy"; + break; + case ROCPROFILER_MEMORY_COPY_HOST_TO_DEVICE: + name = "HostToDeviceCopy"; + break; + case ROCPROFILER_MEMORY_COPY_HOST_TO_HOST: + name = "HostToHostCopy"; + break; + } + size_t name_len = strlen( name ); + uint64_t src_loc = tracy::Profiler::AllocSourceLocation( 0, NULL, 0, name, name_len, NULL, 0 ); + record_interval( data, rdata->start_timestamp, rdata->end_timestamp, src_loc, UINT64_MAX ); + } +} + +void calibration_thread( void* ptr ) +{ + while( !TracyIsStarted ) + ; + ToolData* data = static_cast( ptr ); + data->context_id = gpu_context_allocate( data ); + const char* user_counters = GetEnvVar( "TRACY_ROCPROF_COUNTERS" ); + if( user_counters ) + { + data->counter_names.clear(); + std::stringstream ss( user_counters ); + std::string counter; + while( std::getline( ss, counter, ',' ) ) data->counter_names.insert( counter ); + } + data->init = true; + +#ifdef TRACY_ROCPROF_CALIBRATION + while( data->init ) + { + sleep( 1 ); + + timespec ts; + // HSA performs a linear interpolation of GPU time to CLOCK_BOOTTIME. However, this is + // subject to network time updates and can drift relative to tracy's clock. 
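+        // Illustrative note (editor's sketch, not upstream text): each resync pairs a fresh
+        // CPU/GPU reading with the CPU time elapsed since the previous sample. For example,
+        // if the previous sample was taken at cpu = 1'000'000 ns and this pass reads
+        // cpu = 2'000'000 ns, then cpuDelta = 1'000'000 ns, and the gpuTime sent alongside
+        // tells the server where the GPU clock ended up over that same interval.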
+ clock_gettime( CLOCK_BOOTTIME, &ts ); + int64_t cpu_timestamp = Profiler::GetTime(); + int64_t gpu_timestamp = ts.tv_nsec + ts.tv_sec * 1e9L; + + if( cpu_timestamp > data->previous_cpu_time ) + { + auto* item = tracy::Profiler::QueueSerial(); + tracy::MemWrite( &item->hdr.type, tracy::QueueType::GpuCalibration ); + tracy::MemWrite( &item->gpuCalibration.gpuTime, gpu_timestamp ); + tracy::MemWrite( &item->gpuCalibration.cpuTime, cpu_timestamp ); + tracy::MemWrite( &item->gpuCalibration.cpuDelta, cpu_timestamp - data->previous_cpu_time ); + tracy::MemWrite( &item->gpuCalibration.context, data->context_id ); + tracy::Profiler::QueueSerialFinish(); + data->previous_cpu_time = cpu_timestamp; + } + } +#endif +} + +int tool_init( rocprofiler_client_finalize_t fini_func, void* user_data ) +{ + ToolData* data = static_cast( user_data ); + data->cal_thread = std::make_unique( calibration_thread, data ); + + ROCPROFILER_CALL( rocprofiler_create_context( &get_client_ctx() ), "context creation failed" ); + + ROCPROFILER_CALL( rocprofiler_configure_callback_dispatch_counting_service( get_client_ctx(), dispatch_callback, + user_data, record_callback, user_data ), + "Could not setup counting service" ); + + rocprofiler_tracing_operation_t ops[] = { ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER }; + ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service( get_client_ctx(), + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, ops, 1, + tool_callback_tracing_callback, user_data ), + "callback tracing service failed to configure" ); + + rocprofiler_tracing_operation_t ops2[] = { ROCPROFILER_KERNEL_DISPATCH_COMPLETE, + ROCPROFILER_KERNEL_DISPATCH_ENQUEUE }; + ROCPROFILER_CALL( + rocprofiler_configure_callback_tracing_service( get_client_ctx(), ROCPROFILER_CALLBACK_TRACING_KERNEL_DISPATCH, + ops2, 2, tool_callback_tracing_callback, user_data ), + "callback tracing service failed to configure" ); + + ROCPROFILER_CALL( rocprofiler_configure_callback_tracing_service( get_client_ctx(), + ROCPROFILER_CALLBACK_TRACING_MEMORY_COPY, nullptr, + 0, tool_callback_tracing_callback, user_data ), + "callback tracing service failed to configure" ); + + ROCPROFILER_CALL( rocprofiler_start_context( get_client_ctx() ), "start context" ); + return 0; +} + +void tool_fini( void* tool_data_v ) +{ + rocprofiler_stop_context( get_client_ctx() ); + + ToolData* data = static_cast( tool_data_v ); + data->init = false; + data->cal_thread.reset(); +} +} + +extern "C" +{ + rocprofiler_tool_configure_result_t* rocprofiler_configure( uint32_t version, const char* runtime_version, + uint32_t priority, rocprofiler_client_id_t* client_id ) + { + // If not the first tool to register, indicate that the tool doesn't want to do anything + if( priority > 0 ) return nullptr; + + // (optional) Provide a name for this tool to rocprofiler + client_id->name = "Tracy"; + + // (optional) create configure data + static ToolData data = ToolData{ version, runtime_version, priority, *client_id, 0, false, 0, 0 }; + + // construct configure result + static auto cfg = rocprofiler_tool_configure_result_t{ sizeof( rocprofiler_tool_configure_result_t ), + &tool_init, &tool_fini, static_cast( &data ) }; + + return &cfg; + } +} diff --git a/libs/tracy/client/TracyScoped.hpp b/libs/tracy/client/TracyScoped.hpp index 8e81c99..7f9256d 100644 --- a/libs/tracy/client/TracyScoped.hpp +++ b/libs/tracy/client/TracyScoped.hpp @@ -10,6 +10,7 @@ #include "../common/TracyAlign.hpp" #include "../common/TracyAlloc.hpp" #include "TracyProfiler.hpp" +#include 
"TracyCallstack.hpp" namespace tracy { @@ -22,7 +23,7 @@ class ScopedZone ScopedZone& operator=( const ScopedZone& ) = delete; ScopedZone& operator=( ScopedZone&& ) = delete; - tracy_force_inline ScopedZone( const SourceLocationData* srcloc, bool is_active = true ) + tracy_force_inline ScopedZone( const SourceLocationData* srcloc, int32_t depth = -1, bool is_active = true ) #ifdef TRACY_ON_DEMAND : m_active( is_active && GetProfiler().IsConnected() ) #else @@ -33,13 +34,19 @@ class ScopedZone #ifdef TRACY_ON_DEMAND m_connectionId = GetProfiler().ConnectionId(); #endif - TracyQueuePrepare( QueueType::ZoneBegin ); + auto zoneQueue = QueueType::ZoneBegin; + if( depth > 0 && has_callstack() ) + { + GetProfiler().SendCallstack( depth ); + zoneQueue = QueueType::ZoneBeginCallstack; + } + TracyQueuePrepare( zoneQueue ); MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); TracyQueueCommit( zoneBeginThread ); } - tracy_force_inline ScopedZone( const SourceLocationData* srcloc, int depth, bool is_active = true ) + tracy_force_inline ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, uint32_t color, int32_t depth = -1, bool is_active = true ) #ifdef TRACY_ON_DEMAND : m_active( is_active && GetProfiler().IsConnected() ) #else @@ -50,55 +57,21 @@ class ScopedZone #ifdef TRACY_ON_DEMAND m_connectionId = GetProfiler().ConnectionId(); #endif - GetProfiler().SendCallstack( depth ); - - TracyQueuePrepare( QueueType::ZoneBeginCallstack ); - MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); - MemWrite( &item->zoneBegin.srcloc, (uint64_t)srcloc ); - TracyQueueCommit( zoneBeginThread ); - } - - tracy_force_inline ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, uint32_t color, bool is_active = true ) -#ifdef TRACY_ON_DEMAND - : m_active( is_active && GetProfiler().IsConnected() ) -#else - : m_active( is_active ) -#endif - { - if( !m_active ) return; -#ifdef TRACY_ON_DEMAND - m_connectionId = GetProfiler().ConnectionId(); -#endif - TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLoc ); - const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz, color ); - MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); - MemWrite( &item->zoneBegin.srcloc, srcloc ); - TracyQueueCommit( zoneBeginThread ); - } - - tracy_force_inline ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool is_active = true ) : ScopedZone( line, source, sourceSz, function, functionSz, name, nameSz, static_cast(0), is_active ) {} - - tracy_force_inline ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, uint32_t color, int depth, bool is_active = true ) -#ifdef TRACY_ON_DEMAND - : m_active( is_active && GetProfiler().IsConnected() ) -#else - : m_active( is_active ) -#endif - { - if( !m_active ) return; -#ifdef TRACY_ON_DEMAND - m_connectionId = GetProfiler().ConnectionId(); -#endif - GetProfiler().SendCallstack( depth ); - - TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLocCallstack ); - const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz, color ); + auto zoneQueue = QueueType::ZoneBeginAllocSrcLoc; + if( depth > 0 && 
has_callstack() ) + { + GetProfiler().SendCallstack( depth ); + zoneQueue = QueueType::ZoneBeginAllocSrcLocCallstack; + } + TracyQueuePrepare( zoneQueue ); + const auto srcloc = + Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz, color ); MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); MemWrite( &item->zoneBegin.srcloc, srcloc ); TracyQueueCommit( zoneBeginThread ); } - tracy_force_inline ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool is_active = true ) : ScopedZone( line, source, sourceSz, function, functionSz, name, nameSz, 0, depth, is_active ) {} + tracy_force_inline ScopedZone( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int32_t depth, bool is_active = true ) : ScopedZone( line, source, sourceSz, function, functionSz, name, nameSz, 0, depth, is_active ) {} tracy_force_inline ~ScopedZone() { diff --git a/libs/tracy/client/TracySysPower.cpp b/libs/tracy/client/TracySysPower.cpp index bd5939d..6ad1d64 100644 --- a/libs/tracy/client/TracySysPower.cpp +++ b/libs/tracy/client/TracySysPower.cpp @@ -85,7 +85,7 @@ void SysPower::ScanDirectory( const char* path, int parent ) FILE* f = fopen( tmp, "r" ); if( f ) { - fscanf( f, "%" PRIu64, &maxRange ); + (void)fscanf( f, "%" PRIu64, &maxRange ); fclose( f ); } } diff --git a/libs/tracy/client/TracySysTime.cpp b/libs/tracy/client/TracySysTime.cpp index b690a91..cf7dd9b 100644 --- a/libs/tracy/client/TracySysTime.cpp +++ b/libs/tracy/client/TracySysTime.cpp @@ -4,6 +4,7 @@ # if defined _WIN32 # include +# include "../common/TracyWinFamily.hpp" # elif defined __linux__ # include # include @@ -27,13 +28,24 @@ static inline uint64_t ConvertTime( const FILETIME& t ) void SysTime::ReadTimes() { - FILETIME idleTime; FILETIME kernelTime; FILETIME userTime; +# if defined TRACY_GDK + FILETIME creationTime; + FILETIME exitTime; + + GetProcessTimes( GetCurrentProcess(), &creationTime, &exitTime, &kernelTime, &userTime ); + + idle = 0; +# else + FILETIME idleTime; + GetSystemTimes( &idleTime, &kernelTime, &userTime ); idle = ConvertTime( idleTime ); +# endif + const auto kernel = ConvertTime( kernelTime ); const auto user = ConvertTime( userTime ); used = kernel + user; diff --git a/libs/tracy/client/TracySysTrace.cpp b/libs/tracy/client/TracySysTrace.cpp index 0fd1d0a..8e7f613 100644 --- a/libs/tracy/client/TracySysTrace.cpp +++ b/libs/tracy/client/TracySysTrace.cpp @@ -173,8 +173,11 @@ void WINAPI EventRecordCallback( PEVENT_RECORD record ) MemWrite( &item->contextSwitch.oldThread, cswitch->oldThreadId ); MemWrite( &item->contextSwitch.newThread, cswitch->newThreadId ); MemWrite( &item->contextSwitch.cpu, record->BufferContext.ProcessorNumber ); - MemWrite( &item->contextSwitch.reason, cswitch->oldThreadWaitReason ); - MemWrite( &item->contextSwitch.state, cswitch->oldThreadState ); + MemWrite( &item->contextSwitch.oldThreadWaitReason, cswitch->oldThreadWaitReason ); + MemWrite( &item->contextSwitch.oldThreadState, cswitch->oldThreadState ); + MemWrite( &item->contextSwitch.newThreadPriority, cswitch->newThreadPriority ); + MemWrite( &item->contextSwitch.oldThreadPriority, cswitch->oldThreadPriority ); + MemWrite( &item->contextSwitch.previousCState, cswitch->previousCState ); TracyLfqCommit; } else if( hdr.EventDescriptor.Opcode == 50 ) @@ -183,7 +186,10 @@ void WINAPI EventRecordCallback( PEVENT_RECORD record ) 
TracyLfqPrepare( QueueType::ThreadWakeup ); MemWrite( &item->threadWakeup.time, hdr.TimeStamp.QuadPart ); + MemWrite( &item->threadWakeup.cpu, record->BufferContext.ProcessorNumber ); MemWrite( &item->threadWakeup.thread, rt->threadId ); + MemWrite( &item->threadWakeup.adjustReason, rt->adjustReason ); + MemWrite( &item->threadWakeup.adjustIncrement, rt->adjustIncrement ); TracyLfqCommit; } else if( hdr.EventDescriptor.Opcode == 1 || hdr.EventDescriptor.Opcode == 3 ) @@ -498,11 +504,11 @@ void SysTraceGetExternalName( uint64_t thread, const char*& threadName, const ch if( _GetThreadDescription ) { PWSTR tmp; - _GetThreadDescription( hnd, &tmp ); - char buf[256]; - if( tmp ) + if ( SUCCEEDED( _GetThreadDescription( hnd, &tmp ) ) ) { + char buf[256]; auto ret = wcstombs( buf, tmp, 256 ); + LocalFree(tmp); if( ret != 0 ) { threadName = CopyString( buf, ret ); @@ -678,7 +684,7 @@ enum TraceEventId EventBranchMiss, EventVsync, EventContextSwitch, - EventWakeup, + EventWaking, }; static void ProbePreciseIp( perf_event_attr& pe, unsigned long long config0, unsigned long long config1, pid_t pid ) @@ -767,16 +773,16 @@ bool SysTraceStart( int64_t& samplingPeriod ) TracyDebug( "perf_event_paranoid: %i\n", paranoidLevel ); #endif - int switchId = -1, wakeupId = -1, vsyncId = -1; + int switchId = -1, wakingId = -1, vsyncId = -1; const auto switchIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_switch/id" ); if( switchIdStr ) switchId = atoi( switchIdStr ); - const auto wakeupIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_wakeup/id" ); - if( wakeupIdStr ) wakeupId = atoi( wakeupIdStr ); + const auto wakingIdStr = ReadFile( "/sys/kernel/debug/tracing/events/sched/sched_waking/id" ); + if( wakingIdStr ) wakingId = atoi( wakingIdStr ); const auto vsyncIdStr = ReadFile( "/sys/kernel/debug/tracing/events/drm/drm_vblank_event/id" ); if( vsyncIdStr ) vsyncId = atoi( vsyncIdStr ); TracyDebug( "sched_switch id: %i\n", switchId ); - TracyDebug( "sched_wakeup id: %i\n", wakeupId ); + TracyDebug( "sched_waking id: %i\n", wakingId ); TracyDebug( "drm_vblank_event id: %i\n", vsyncId ); #ifdef TRACY_NO_SAMPLING @@ -831,7 +837,7 @@ bool SysTraceStart( int64_t& samplingPeriod ) 2 + // CPU cycles + instructions retired 2 + // cache reference + miss 2 + // branch retired + miss - 2 + // context switches + wakeups + 2 + // context switches + waking ups 1 // vsync ); s_ring = (RingBuffer*)tracy_malloc( sizeof( RingBuffer ) * maxNumBuffers ); @@ -1076,18 +1082,31 @@ bool SysTraceStart( int64_t& samplingPeriod ) } } - if( wakeupId != -1 ) + if( wakingId != -1 ) { - pe.config = wakeupId; - pe.config &= ~PERF_SAMPLE_CALLCHAIN; + pe = {}; + pe.type = PERF_TYPE_TRACEPOINT; + pe.size = sizeof( perf_event_attr ); + pe.sample_period = 1; + pe.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_RAW; + // Coult ask for callstack here + //pe.sample_type |= PERF_SAMPLE_CALLCHAIN; + pe.disabled = 1; + pe.inherit = 1; + pe.config = wakingId; + pe.read_format = 0; +#if !defined TRACY_HW_TIMER || !( defined __i386 || defined _M_IX86 || defined __x86_64__ || defined _M_X64 ) + pe.use_clockid = 1; + pe.clockid = CLOCK_MONOTONIC_RAW; +#endif - TracyDebug( "Setup wakeup capture\n" ); + TracyDebug( "Setup waking up capture\n" ); for( int i=0; i 0 ) { + // Find the earliest event from the active buffers int sel = -1; int selPos; int64_t t0 = std::numeric_limits::max(); @@ -1369,6 +1389,7 @@ void SysTraceWorker( void* ptr ) } } } + // Found any event if( sel >= 0 ) { auto& ring = ringArray[ctxBufferIdx + sel]; @@ -1384,10 
+1405,10 @@ void SysTraceWorker( void* ptr ) const auto rid = ring.GetId(); if( rid == EventContextSwitch ) { - // Layout: - // u64 time - // u64 cnt - // u64 ip[cnt] + // Layout: See /sys/kernel/debug/tracing/events/sched/sched_switch/format + // u64 time // PERF_SAMPLE_TIME + // u64 cnt // PERF_SAMPLE_CALLCHAIN + // u64 ip[cnt] // PERF_SAMPLE_CALLCHAIN // u32 size // u8 data[size] // Data (not ABI stable, but has not changed since it was added, in 2009): @@ -1408,35 +1429,43 @@ void SysTraceWorker( void* ptr ) const auto traceOffset = offset; offset += sizeof( uint64_t ) * cnt + sizeof( uint32_t ) + 8 + 16; - uint32_t prev_pid, next_pid; + uint32_t prev_pid, prev_prio; + uint32_t next_pid, next_prio; long prev_state; ring.Read( &prev_pid, offset, sizeof( uint32_t ) ); - offset += sizeof( uint32_t ) + sizeof( uint32_t ); + offset += sizeof( uint32_t ); + ring.Read( &prev_prio, offset, sizeof( uint32_t ) ); + offset += sizeof( uint32_t ); ring.Read( &prev_state, offset, sizeof( long ) ); offset += sizeof( long ) + 16; ring.Read( &next_pid, offset, sizeof( uint32_t ) ); - - uint8_t reason = 100; - uint8_t state; - - if( prev_state & 0x0001 ) state = 104; - else if( prev_state & 0x0002 ) state = 101; - else if( prev_state & 0x0004 ) state = 105; - else if( prev_state & 0x0008 ) state = 106; - else if( prev_state & 0x0010 ) state = 108; - else if( prev_state & 0x0020 ) state = 109; - else if( prev_state & 0x0040 ) state = 110; - else if( prev_state & 0x0080 ) state = 102; - else state = 103; + offset += sizeof( uint32_t ); + ring.Read( &next_prio, offset, sizeof( uint32_t ) ); + + uint8_t oldThreadWaitReason = 100; + uint8_t oldThreadState; + + if( prev_state & 0x0001 ) oldThreadState = 104; + else if( prev_state & 0x0002 ) oldThreadState = 101; + else if( prev_state & 0x0004 ) oldThreadState = 105; + else if( prev_state & 0x0008 ) oldThreadState = 106; + else if( prev_state & 0x0010 ) oldThreadState = 108; + else if( prev_state & 0x0020 ) oldThreadState = 109; + else if( prev_state & 0x0040 ) oldThreadState = 110; + else if( prev_state & 0x0080 ) oldThreadState = 102; + else oldThreadState = 103; TracyLfqPrepare( QueueType::ContextSwitch ); MemWrite( &item->contextSwitch.time, t0 ); MemWrite( &item->contextSwitch.oldThread, prev_pid ); MemWrite( &item->contextSwitch.newThread, next_pid ); MemWrite( &item->contextSwitch.cpu, uint8_t( ring.GetCpu() ) ); - MemWrite( &item->contextSwitch.reason, reason ); - MemWrite( &item->contextSwitch.state, state ); + MemWrite( &item->contextSwitch.oldThreadWaitReason, oldThreadWaitReason ); + MemWrite( &item->contextSwitch.oldThreadState, oldThreadState ); + MemWrite( &item->contextSwitch.previousCState, uint8_t( 0 ) ); + MemWrite( &item->contextSwitch.newThreadPriority, int8_t( next_prio ) ); + MemWrite( &item->contextSwitch.oldThreadPriority, int8_t( prev_prio ) ); TracyLfqCommit; if( cnt > 0 && prev_pid != 0 && CurrentProcOwnsThread( prev_pid ) ) @@ -1450,27 +1479,33 @@ void SysTraceWorker( void* ptr ) TracyLfqCommit; } } - else if( rid == EventWakeup ) + else if( rid == EventWaking) { + // See /sys/kernel/debug/tracing/events/sched/sched_waking/format // Layout: - // u64 time + // u64 time // PERF_SAMPLE_TIME // u32 size // u8 data[size] // Data: // u8 hdr[8] // u8 comm[16] // u32 pid - // u32 prio - // u64 target_cpu - - offset += sizeof( perf_event_header ) + sizeof( uint64_t ) + sizeof( uint32_t ) + 8 + 16; - + // i32 prio + // i32 target_cpu + const uint32_t dataOffset = sizeof( perf_event_header ) + sizeof( uint64_t ) + sizeof( uint32_t ); + offset 
+= dataOffset + 8 + 16; uint32_t pid; ring.Read( &pid, offset, sizeof( uint32_t ) ); - + TracyLfqPrepare( QueueType::ThreadWakeup ); MemWrite( &item->threadWakeup.time, t0 ); MemWrite( &item->threadWakeup.thread, pid ); + MemWrite( &item->threadWakeup.cpu, (uint8_t)ring.GetCpu() ); + + int8_t adjustReason = -1; // Does not exist on Linux + int8_t adjustIncrement = 0; // Should perhaps store the new prio? + MemWrite( &item->threadWakeup.adjustReason, adjustReason ); + MemWrite( &item->threadWakeup.adjustIncrement, adjustIncrement ); TracyLfqCommit; } else diff --git a/libs/tracy/client/TracySysTrace.hpp b/libs/tracy/client/TracySysTrace.hpp index 8c663cd..2a28e8b 100644 --- a/libs/tracy/client/TracySysTrace.hpp +++ b/libs/tracy/client/TracySysTrace.hpp @@ -2,8 +2,8 @@ #define __TRACYSYSTRACE_HPP__ #if !defined TRACY_NO_SYSTEM_TRACING && ( defined _WIN32 || defined __linux__ ) -# include "../common/TracyUwp.hpp" -# ifndef TRACY_UWP +# include "../common/TracyWinFamily.hpp" +# if !defined TRACY_WIN32_NO_DESKTOP # define TRACY_HAS_SYSTEM_TRACING # endif #endif diff --git a/libs/tracy/client/tracy_rpmalloc.cpp b/libs/tracy/client/tracy_rpmalloc.cpp index 4a0d0b4..c43b8ca 100644 --- a/libs/tracy/client/tracy_rpmalloc.cpp +++ b/libs/tracy/client/tracy_rpmalloc.cpp @@ -690,7 +690,9 @@ static pthread_key_t _memory_thread_heap; # define _Thread_local __declspec(thread) # define TLS_MODEL # else -# ifndef __HAIKU__ +# if defined(__ANDROID__) && __ANDROID_API__ >= 29 && defined(__NDK_MAJOR__) && __NDK_MAJOR__ >= 26 +# define TLS_MODEL __attribute__((tls_model("local-dynamic"))) +# elif !defined(__HAIKU__) # define TLS_MODEL __attribute__((tls_model("initial-exec"))) # else # define TLS_MODEL @@ -2778,7 +2780,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_huge_pages = 1; } -#if PLATFORM_WINDOWS +#if PLATFORM_WINDOWS && !defined TRACY_GDK if (_memory_config.enable_huge_pages) { HANDLE token = 0; size_t large_page_minimum = GetLargePageMinimum(); diff --git a/libs/tracy/common/TracyProtocol.hpp b/libs/tracy/common/TracyProtocol.hpp index 5412458..ff38686 100644 --- a/libs/tracy/common/TracyProtocol.hpp +++ b/libs/tracy/common/TracyProtocol.hpp @@ -9,7 +9,7 @@ namespace tracy constexpr unsigned Lz4CompressBound( unsigned isize ) { return isize + ( isize / 255 ) + 16; } -enum : uint32_t { ProtocolVersion = 69 }; +enum : uint32_t { ProtocolVersion = 76 }; enum : uint16_t { BroadcastVersion = 3 }; using lz4sz_t = uint32_t; @@ -95,7 +95,6 @@ struct WelcomeMessage double timerMul; int64_t initBegin; int64_t initEnd; - uint64_t delay; uint64_t resolution; uint64_t epoch; uint64_t exectime; diff --git a/libs/tracy/common/TracyQueue.hpp b/libs/tracy/common/TracyQueue.hpp index affbd67..765c83c 100644 --- a/libs/tracy/common/TracyQueue.hpp +++ b/libs/tracy/common/TracyQueue.hpp @@ -42,6 +42,8 @@ enum class QueueType : uint8_t MemAllocCallstackNamed, MemFreeCallstack, MemFreeCallstackNamed, + MemDiscard, + MemDiscardCallstack, GpuZoneBegin, GpuZoneBeginCallstack, GpuZoneBeginAllocSrcLoc, @@ -59,6 +61,7 @@ enum class QueueType : uint8_t ThreadWakeup, GpuTime, GpuContextName, + GpuAnnotationName, CallstackFrameSize, SymbolInformation, ExternalNameMetadata, @@ -109,6 +112,7 @@ enum class QueueType : uint8_t SecondStringData, MemNamePayload, ThreadGroupHint, + GpuZoneAnnotation, StringData, ThreadName, PlotName, @@ -329,7 +333,7 @@ struct QueuePlotDataInt : public QueuePlotDataBase int64_t val; }; -struct QueuePlotDataFloat : public QueuePlotDataBase +struct QueuePlotDataFloat : public 
QueuePlotDataBase { float val; }; @@ -401,7 +405,11 @@ enum class GpuContextType : uint8_t Vulkan, OpenCL, Direct3D12, - Direct3D11 + Direct3D11, + Metal, + Custom, + CUDA, + Rocprof }; enum GpuContextFlags : uint8_t @@ -441,6 +449,15 @@ struct QueueGpuZoneEnd uint8_t context; }; +struct QueueGpuZoneAnnotation +{ + int64_t noteId; + double value; + uint32_t thread; + uint16_t queryId; + uint8_t context; +}; + struct QueueGpuTime { int64_t gpuTime; @@ -462,7 +479,7 @@ struct QueueGpuTimeSync int64_t cpuTime; uint8_t context; }; - + struct QueueGpuContextName { uint8_t context; @@ -474,6 +491,18 @@ struct QueueGpuContextNameFat : public QueueGpuContextName uint16_t size; }; +struct QueueGpuAnnotationName +{ + int64_t noteId; + uint8_t context; +}; + +struct QueueGpuAnnotationNameFat : public QueueGpuAnnotationName +{ + uint64_t ptr; + uint16_t size; +}; + struct QueueMemNamePayload { uint64_t name; @@ -500,6 +529,13 @@ struct QueueMemFree uint64_t ptr; }; +struct QueueMemDiscard +{ + int64_t time; + uint32_t thread; + uint64_t name; +}; + struct QueueCallstackFat { uint64_t ptr; @@ -593,14 +629,20 @@ struct QueueContextSwitch uint32_t oldThread; uint32_t newThread; uint8_t cpu; - uint8_t reason; - uint8_t state; + uint8_t oldThreadWaitReason; + uint8_t oldThreadState; + uint8_t previousCState; + int8_t newThreadPriority; + int8_t oldThreadPriority; }; struct QueueThreadWakeup { int64_t time; uint32_t thread; + uint8_t cpu; + int8_t adjustReason; + int8_t adjustIncrement; }; struct QueueTidToPid @@ -738,8 +780,11 @@ struct QueueItem QueueGpuTimeSync gpuTimeSync; QueueGpuContextName gpuContextName; QueueGpuContextNameFat gpuContextNameFat; + QueueGpuAnnotationName gpuAnnotationName; + QueueGpuAnnotationNameFat gpuAnnotationNameFat; QueueMemAlloc memAlloc; QueueMemFree memFree; + QueueMemDiscard memDiscard; QueueMemNamePayload memName; QueueThreadGroupHint threadGroupHint; QueueCallstackFat callstackFat; @@ -770,6 +815,7 @@ struct QueueItem QueueSourceCodeNotAvailable sourceCodeNotAvailable; QueueFiberEnter fiberEnter; QueueFiberLeave fiberLeave; + QueueGpuZoneAnnotation zoneAnnotation; }; }; #pragma pack( pop ) @@ -811,6 +857,8 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueMemAlloc ), // callstack, named sizeof( QueueHeader ) + sizeof( QueueMemFree ), // callstack sizeof( QueueHeader ) + sizeof( QueueMemFree ), // callstack, named + sizeof( QueueHeader ) + sizeof( QueueMemDiscard ), + sizeof( QueueHeader ) + sizeof( QueueMemDiscard ), // callstack sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), sizeof( QueueHeader ) + sizeof( QueueGpuZoneBegin ), // callstack sizeof( QueueHeader ) + sizeof( QueueGpuZoneBeginLean ),// allocated source location @@ -828,6 +876,7 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ) + sizeof( QueueThreadWakeup ), sizeof( QueueHeader ) + sizeof( QueueGpuTime ), sizeof( QueueHeader ) + sizeof( QueueGpuContextName ), + sizeof( QueueHeader ) + sizeof( QueueGpuAnnotationName ), sizeof( QueueHeader ) + sizeof( QueueCallstackFrameSize ), sizeof( QueueHeader ) + sizeof( QueueSymbolInformation ), sizeof( QueueHeader ), // ExternalNameMetadata - not for wire transfer @@ -879,6 +928,7 @@ static constexpr size_t QueueDataSize[] = { sizeof( QueueHeader ), // second string data sizeof( QueueHeader ) + sizeof( QueueMemNamePayload ), sizeof( QueueHeader ) + sizeof( QueueThreadGroupHint ), + sizeof( QueueHeader ) + sizeof( QueueGpuZoneAnnotation ), // GPU zone annotation // keep all QueueStringTransfer below sizeof( 
QueueHeader ) + sizeof( QueueStringTransfer ), // string data sizeof( QueueHeader ) + sizeof( QueueStringTransfer ), // thread name diff --git a/libs/tracy/common/TracySystem.cpp b/libs/tracy/common/TracySystem.cpp index d51f5d6..7696ca3 100644 --- a/libs/tracy/common/TracySystem.cpp +++ b/libs/tracy/common/TracySystem.cpp @@ -10,7 +10,7 @@ # endif # include # include -# include "TracyUwp.hpp" +# include "TracyWinFamily.hpp" #else # include # include @@ -26,7 +26,9 @@ # include #elif defined __FreeBSD__ # include -#elif defined __NetBSD__ || defined __DragonFly__ +#elif defined __NetBSD__ +# include +#elif defined __DragonFly__ # include #elif defined __QNX__ # include @@ -135,7 +137,7 @@ TRACY_API void SetThreadName( const char* name ) TRACY_API void SetThreadNameWithHint( const char* name, int32_t groupHint ) { #if defined _WIN32 -# ifdef TRACY_UWP +# if defined TRACY_WIN32_NO_DESKTOP static auto _SetThreadDescription = &::SetThreadDescription; # else static auto _SetThreadDescription = (t_SetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "SetThreadDescription" ); @@ -244,7 +246,7 @@ TRACY_API const char* GetThreadName( uint32_t id ) #endif #if defined _WIN32 -# ifdef TRACY_UWP +# if defined TRACY_WIN32_NO_DESKTOP static auto _GetThreadDescription = &::GetThreadDescription; # else static auto _GetThreadDescription = (t_GetThreadDescription)GetProcAddress( GetModuleHandleA( "kernel32.dll" ), "GetThreadDescription" ); diff --git a/libs/tracy/common/TracyUwp.hpp b/libs/tracy/common/TracyUwp.hpp deleted file mode 100644 index 7dce96b..0000000 --- a/libs/tracy/common/TracyUwp.hpp +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef __TRACYUWP_HPP__ -#define __TRACYUWP_HPP__ - -#ifdef _WIN32 -# include -# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) && !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -# define TRACY_UWP -# endif -#endif - -#endif diff --git a/libs/tracy/common/TracyVersion.hpp b/libs/tracy/common/TracyVersion.hpp index 0905ef9..7d704c5 100644 --- a/libs/tracy/common/TracyVersion.hpp +++ b/libs/tracy/common/TracyVersion.hpp @@ -6,8 +6,8 @@ namespace tracy namespace Version { enum { Major = 0 }; -enum { Minor = 11 }; -enum { Patch = 1 }; +enum { Minor = 12 }; +enum { Patch = 4 }; } } diff --git a/libs/tracy/common/TracyWinFamily.hpp b/libs/tracy/common/TracyWinFamily.hpp new file mode 100644 index 0000000..b601455 --- /dev/null +++ b/libs/tracy/common/TracyWinFamily.hpp @@ -0,0 +1,16 @@ +#ifndef __TRACYWINFAMILY_HPP__ +#define __TRACYWINFAMILY_HPP__ + +#ifdef _WIN32 +# include +# if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +# define TRACY_WIN32_NO_DESKTOP +# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_GAMES) +# define TRACY_GDK +# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) +# define TRACY_UWP +# endif +# endif +#endif + +#endif diff --git a/libs/tracy/libbacktrace/dwarf.cpp b/libs/tracy/libbacktrace/dwarf.cpp index b6d681a..52fa8a8 100644 --- a/libs/tracy/libbacktrace/dwarf.cpp +++ b/libs/tracy/libbacktrace/dwarf.cpp @@ -725,8 +725,8 @@ struct dwarf_data struct dwarf_data *next; /* The data for .gnu_debugaltlink. */ struct dwarf_data *altlink; - /* The base address for this file. */ - uintptr_t base_address; +/* The base address mapping for this file. */ + struct libbacktrace_base_address base_address; /* A sorted list of address ranges. */ struct unit_addrs *addrs; /* Number of address ranges in list. 
*/ @@ -1947,8 +1947,9 @@ update_pcrange (const struct attr* attr, const struct attr_val* val, static int add_low_high_range (struct backtrace_state *state, const struct dwarf_sections *dwarf_sections, - uintptr_t base_address, int is_bigendian, - struct unit *u, const struct pcrange *pcrange, + struct libbacktrace_base_address base_address, + int is_bigendian, struct unit *u, + const struct pcrange *pcrange, int (*add_range) (struct backtrace_state *state, void *rdata, uintptr_t lowpc, uintptr_t highpc, @@ -1983,8 +1984,8 @@ add_low_high_range (struct backtrace_state *state, /* Add in the base address of the module when recording PC values, so that we can look up the PC directly. */ - lowpc += base_address; - highpc += base_address; + lowpc = libbacktrace_add_base (lowpc, base_address); + highpc = libbacktrace_add_base (highpc, base_address); return add_range (state, rdata, lowpc, highpc, error_callback, data, vec); } @@ -1996,7 +1997,7 @@ static int add_ranges_from_ranges ( struct backtrace_state *state, const struct dwarf_sections *dwarf_sections, - uintptr_t base_address, int is_bigendian, + struct libbacktrace_base_address base_address, int is_bigendian, struct unit *u, uintptr_t base, const struct pcrange *pcrange, int (*add_range) (struct backtrace_state *state, void *rdata, @@ -2042,10 +2043,11 @@ add_ranges_from_ranges ( base = (uintptr_t) high; else { - if (!add_range (state, rdata, - (uintptr_t) low + base + base_address, - (uintptr_t) high + base + base_address, - error_callback, data, vec)) + uintptr_t rl, rh; + + rl = libbacktrace_add_base ((uintptr_t) low + base, base_address); + rh = libbacktrace_add_base ((uintptr_t) high + base, base_address); + if (!add_range (state, rdata, rl, rh, error_callback, data, vec)) return 0; } } @@ -2063,7 +2065,7 @@ static int add_ranges_from_rnglists ( struct backtrace_state *state, const struct dwarf_sections *dwarf_sections, - uintptr_t base_address, int is_bigendian, + struct libbacktrace_base_address base_address, int is_bigendian, struct unit *u, uintptr_t base, const struct pcrange *pcrange, int (*add_range) (struct backtrace_state *state, void *rdata, @@ -2146,9 +2148,10 @@ add_ranges_from_rnglists ( u->addrsize, is_bigendian, index, error_callback, data, &high)) return 0; - if (!add_range (state, rdata, low + base_address, - high + base_address, error_callback, data, - vec)) + if (!add_range (state, rdata, + libbacktrace_add_base (low, base_address), + libbacktrace_add_base (high, base_address), + error_callback, data, vec)) return 0; } break; @@ -2165,7 +2168,7 @@ add_ranges_from_rnglists ( error_callback, data, &low)) return 0; length = read_uleb128 (&rnglists_buf); - low += base_address; + low = libbacktrace_add_base (low, base_address); if (!add_range (state, rdata, low, low + length, error_callback, data, vec)) return 0; @@ -2179,8 +2182,9 @@ add_ranges_from_rnglists ( low = read_uleb128 (&rnglists_buf); high = read_uleb128 (&rnglists_buf); - if (!add_range (state, rdata, low + base + base_address, - high + base + base_address, + if (!add_range (state, rdata, + libbacktrace_add_base (low + base, base_address), + libbacktrace_add_base (high + base, base_address), error_callback, data, vec)) return 0; } @@ -2197,9 +2201,10 @@ add_ranges_from_rnglists ( low = (uintptr_t) read_address (&rnglists_buf, u->addrsize); high = (uintptr_t) read_address (&rnglists_buf, u->addrsize); - if (!add_range (state, rdata, low + base_address, - high + base_address, error_callback, data, - vec)) + if (!add_range (state, rdata, + libbacktrace_add_base 
(low, base_address), + libbacktrace_add_base (high, base_address), + error_callback, data, vec)) return 0; } break; @@ -2211,7 +2216,7 @@ add_ranges_from_rnglists ( low = (uintptr_t) read_address (&rnglists_buf, u->addrsize); length = (uintptr_t) read_uleb128 (&rnglists_buf); - low += base_address; + low = libbacktrace_add_base (low, base_address); if (!add_range (state, rdata, low, low + length, error_callback, data, vec)) return 0; @@ -2239,7 +2244,7 @@ add_ranges_from_rnglists ( static int add_ranges (struct backtrace_state *state, const struct dwarf_sections *dwarf_sections, - uintptr_t base_address, int is_bigendian, + struct libbacktrace_base_address base_address, int is_bigendian, struct unit *u, uintptr_t base, const struct pcrange *pcrange, int (*add_range) (struct backtrace_state *state, void *rdata, uintptr_t lowpc, uintptr_t highpc, @@ -2275,7 +2280,8 @@ add_ranges (struct backtrace_state *state, read, 0 if there is some error. */ static int -find_address_ranges (struct backtrace_state *state, uintptr_t base_address, +find_address_ranges (struct backtrace_state *state, + struct libbacktrace_base_address base_address, struct dwarf_buf *unit_buf, const struct dwarf_sections *dwarf_sections, int is_bigendian, struct dwarf_data *altlink, @@ -2430,7 +2436,8 @@ find_address_ranges (struct backtrace_state *state, uintptr_t base_address, on success, 0 on failure. */ static int -build_address_map (struct backtrace_state *state, uintptr_t base_address, +build_address_map (struct backtrace_state *state, + struct libbacktrace_base_address base_address, const struct dwarf_sections *dwarf_sections, int is_bigendian, struct dwarf_data *altlink, backtrace_error_callback error_callback, void *data, @@ -2649,7 +2656,7 @@ add_line (struct backtrace_state *state, struct dwarf_data *ddata, /* Add in the base address here, so that we can look up the PC directly. 
*/ - ln->pc = pc + ddata->base_address; + ln->pc = libbacktrace_add_base (pc, ddata->base_address); ln->filename = filename; ln->lineno = lineno; @@ -4329,7 +4336,7 @@ dwarf_fileline (struct backtrace_state *state, uintptr_t pc, static struct dwarf_data * build_dwarf_data (struct backtrace_state *state, - uintptr_t base_address, + struct libbacktrace_base_address base_address, const struct dwarf_sections *dwarf_sections, int is_bigendian, struct dwarf_data *altlink, @@ -4387,7 +4394,7 @@ build_dwarf_data (struct backtrace_state *state, int backtrace_dwarf_add (struct backtrace_state *state, - uintptr_t base_address, + struct libbacktrace_base_address base_address, const struct dwarf_sections *dwarf_sections, int is_bigendian, struct dwarf_data *fileline_altlink, diff --git a/libs/tracy/libbacktrace/elf.cpp b/libs/tracy/libbacktrace/elf.cpp index e88a33b..ffe8d70 100644 --- a/libs/tracy/libbacktrace/elf.cpp +++ b/libs/tracy/libbacktrace/elf.cpp @@ -643,7 +643,7 @@ elf_symbol_search (const void *vkey, const void *ventry) static int elf_initialize_syminfo (struct backtrace_state *state, - uintptr_t base_address, + struct libbacktrace_base_address base_address, const unsigned char *symtab_data, size_t symtab_size, const unsigned char *strtab, size_t strtab_size, backtrace_error_callback error_callback, @@ -709,7 +709,8 @@ elf_initialize_syminfo (struct backtrace_state *state, = *(const b_elf_addr *) (opd->data + (sym->st_value - opd->addr)); else elf_symbols[j].address = sym->st_value; - elf_symbols[j].address += base_address; + elf_symbols[j].address = + libbacktrace_add_base (elf_symbols[j].address, base_address); elf_symbols[j].size = sym->st_size; ++j; } @@ -1200,14 +1201,7 @@ elf_fetch_bits_backward (const unsigned char **ppin, val = *pval; if (unlikely (pin <= pinend)) - { - if (bits == 0) - { - elf_uncompress_failed (); - return 0; - } - return 1; - } + return 1; pin -= 4; @@ -5712,10 +5706,10 @@ elf_uncompress_lzma_block (const unsigned char *compressed, /* Block header CRC. */ computed_crc = elf_crc32 (0, compressed + block_header_offset, block_header_size - 4); - stream_crc = (compressed[off] - | (compressed[off + 1] << 8) - | (compressed[off + 2] << 16) - | (compressed[off + 3] << 24)); + stream_crc = ((uint32_t)compressed[off] + | ((uint32_t)compressed[off + 1] << 8) + | ((uint32_t)compressed[off + 2] << 16) + | ((uint32_t)compressed[off + 3] << 24)); if (unlikely (computed_crc != stream_crc)) { elf_uncompress_failed (); @@ -6222,10 +6216,10 @@ elf_uncompress_lzma_block (const unsigned char *compressed, return 0; } computed_crc = elf_crc32 (0, uncompressed, uncompressed_offset); - stream_crc = (compressed[off] - | (compressed[off + 1] << 8) - | (compressed[off + 2] << 16) - | (compressed[off + 3] << 24)); + stream_crc = ((uint32_t)compressed[off] + | ((uint32_t)compressed[off + 1] << 8) + | ((uint32_t)compressed[off + 2] << 16) + | ((uint32_t)compressed[off + 3] << 24)); if (computed_crc != stream_crc) { elf_uncompress_failed (); @@ -6325,10 +6319,10 @@ elf_uncompress_lzma (struct backtrace_state *state, /* Next comes a CRC of the stream flags. 
*/ computed_crc = elf_crc32 (0, compressed + 6, 2); - stream_crc = (compressed[8] - | (compressed[9] << 8) - | (compressed[10] << 16) - | (compressed[11] << 24)); + stream_crc = ((uint32_t)compressed[8] + | ((uint32_t)compressed[9] << 8) + | ((uint32_t)compressed[10] << 16) + | ((uint32_t)compressed[11] << 24)); if (unlikely (computed_crc != stream_crc)) { elf_uncompress_failed (); @@ -6369,10 +6363,10 @@ elf_uncompress_lzma (struct backtrace_state *state, /* Before that is a footer CRC. */ computed_crc = elf_crc32 (0, compressed + offset, 6); - stream_crc = (compressed[offset - 4] - | (compressed[offset - 3] << 8) - | (compressed[offset - 2] << 16) - | (compressed[offset - 1] << 24)); + stream_crc = ((uint32_t)compressed[offset - 4] + | ((uint32_t)compressed[offset - 3] << 8) + | ((uint32_t)compressed[offset - 2] << 16) + | ((uint32_t)compressed[offset - 1] << 24)); if (unlikely (computed_crc != stream_crc)) { elf_uncompress_failed (); @@ -6428,10 +6422,10 @@ elf_uncompress_lzma (struct backtrace_state *state, /* Next is a CRC of the index. */ computed_crc = elf_crc32 (0, compressed + index_offset, offset - index_offset); - stream_crc = (compressed[offset] - | (compressed[offset + 1] << 8) - | (compressed[offset + 2] << 16) - | (compressed[offset + 3] << 24)); + stream_crc = ((uint32_t)compressed[offset] + | ((uint32_t)compressed[offset + 1] << 8) + | ((uint32_t)compressed[offset + 2] << 16) + | ((uint32_t)compressed[offset + 3] << 24)); if (unlikely (computed_crc != stream_crc)) { elf_uncompress_failed (); @@ -6524,7 +6518,8 @@ backtrace_uncompress_lzma (struct backtrace_state *state, static int elf_add (struct backtrace_state *state, const char *filename, int descriptor, const unsigned char *memory, size_t memory_size, - uintptr_t base_address, struct elf_ppc64_opd_data *caller_opd, + struct libbacktrace_base_address base_address, + struct elf_ppc64_opd_data *caller_opd, backtrace_error_callback error_callback, void *data, fileline *fileline_fn, int *found_sym, int *found_dwarf, struct dwarf_data **fileline_entry, int exe, int debuginfo, @@ -6867,7 +6862,8 @@ elf_add (struct backtrace_state *state, const char *filename, int descriptor, } } - if (!gnu_debugdata_view_valid + if (!debuginfo + && !gnu_debugdata_view_valid && strcmp (name, ".gnu_debugdata") == 0) { if (!elf_get_view (state, descriptor, memory, memory_size, @@ -7425,6 +7421,7 @@ phdr_callback (struct PhdrIterate *info, void *pdata) const char *filename; int descriptor; int does_not_exist; + struct libbacktrace_base_address base_address; fileline elf_fileline_fn; int found_dwarf; @@ -7454,7 +7451,8 @@ phdr_callback (struct PhdrIterate *info, void *pdata) return 0; } - if (elf_add (pd->state, filename, descriptor, NULL, 0, info->dlpi_addr, NULL, + base_address.m = info->dlpi_addr; + if (elf_add (pd->state, filename, descriptor, NULL, 0, base_address, NULL, pd->error_callback, pd->data, &elf_fileline_fn, pd->found_sym, &found_dwarf, NULL, 0, 0, NULL, 0)) { @@ -7543,11 +7541,21 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, fileline elf_fileline_fn = elf_nodebug; struct phdr_data pd; - ret = elf_add (state, filename, descriptor, NULL, 0, 0, NULL, error_callback, - data, &elf_fileline_fn, &found_sym, &found_dwarf, NULL, 1, 0, - NULL, 0); - if (!ret) - return 0; + + /* When using fdpic we must use dl_iterate_phdr for all modules, including + the main executable, so that we can get the right base address + mapping. 
*/ + if (!libbacktrace_using_fdpic ()) + { + struct libbacktrace_base_address zero_base_address; + + memset (&zero_base_address, 0, sizeof zero_base_address); + ret = elf_add (state, filename, descriptor, NULL, 0, zero_base_address, + NULL, error_callback, data, &elf_fileline_fn, &found_sym, + &found_dwarf, NULL, 1, 0, NULL, 0); + if (!ret) + return 0; + } pd.state = state; pd.error_callback = error_callback; diff --git a/libs/tracy/libbacktrace/internal.hpp b/libs/tracy/libbacktrace/internal.hpp index fea298f..2139597 100644 --- a/libs/tracy/libbacktrace/internal.hpp +++ b/libs/tracy/libbacktrace/internal.hpp @@ -333,10 +333,44 @@ struct dwarf_sections struct dwarf_data; +/* The load address mapping. */ + +#if defined(__FDPIC__) && defined(HAVE_DL_ITERATE_PHDR) && (defined(HAVE_LINK_H) || defined(HAVE_SYS_LINK_H)) + +#ifdef HAVE_LINK_H + #include +#endif +#ifdef HAVE_SYS_LINK_H + #include +#endif + +#define libbacktrace_using_fdpic() (1) + +struct libbacktrace_base_address +{ + struct elf32_fdpic_loadaddr m; +}; + +#define libbacktrace_add_base(pc, base) \ + ((uintptr_t) (__RELOC_POINTER ((pc), (base).m))) + +#else /* not _FDPIC__ */ + +#define libbacktrace_using_fdpic() (0) + +struct libbacktrace_base_address +{ + uintptr_t m; +}; + +#define libbacktrace_add_base(pc, base) ((pc) + (base).m) + +#endif /* not _FDPIC__ */ + /* Add file/line information for a DWARF module. */ extern int backtrace_dwarf_add (struct backtrace_state *state, - uintptr_t base_address, + struct libbacktrace_base_address base_address, const struct dwarf_sections *dwarf_sections, int is_bigendian, struct dwarf_data *fileline_altlink, diff --git a/libs/tracy/libbacktrace/macho.cpp b/libs/tracy/libbacktrace/macho.cpp index 6cccdab..b9f0845 100644 --- a/libs/tracy/libbacktrace/macho.cpp +++ b/libs/tracy/libbacktrace/macho.cpp @@ -274,12 +274,14 @@ struct macho_nlist_64 /* Value found in nlist n_type field. */ -#define MACH_O_N_EXT 0x01 /* Extern symbol */ +#define MACH_O_N_STAB 0xe0 /* Stabs debugging symbol */ +#define MACH_O_N_TYPE 0x0e /* Mask for type bits */ + +/* Values found after masking with MACH_O_N_TYPE. */ +#define MACH_O_N_UNDF 0x00 /* Undefined symbol */ #define MACH_O_N_ABS 0x02 /* Absolute symbol */ -#define MACH_O_N_SECT 0x0e /* Defined in section */ +#define MACH_O_N_SECT 0x0e /* Defined in section from n_sect field */ -#define MACH_O_N_TYPE 0x0e /* Mask for type bits */ -#define MACH_O_N_STAB 0xe0 /* Stabs debugging symbol */ /* Information we keep for a Mach-O symbol. */ @@ -316,8 +318,9 @@ static const char * const dwarf_section_names[DEBUG_MAX] = /* Forward declaration. */ static int macho_add (struct backtrace_state *, const char *, int, off_t, - const unsigned char *, uintptr_t, int, - backtrace_error_callback, void *, fileline *, int *); + const unsigned char *, struct libbacktrace_base_address, + int, backtrace_error_callback, void *, fileline *, + int *); /* A dummy callback function used when we can't find any debug info. 
*/ @@ -495,10 +498,10 @@ macho_defined_symbol (uint8_t type) { if ((type & MACH_O_N_STAB) != 0) return 0; - if ((type & MACH_O_N_EXT) != 0) - return 0; switch (type & MACH_O_N_TYPE) { + case MACH_O_N_UNDF: + return 0; case MACH_O_N_ABS: return 1; case MACH_O_N_SECT: @@ -512,7 +515,7 @@ macho_defined_symbol (uint8_t type) static int macho_add_symtab (struct backtrace_state *state, int descriptor, - uintptr_t base_address, int is_64, + struct libbacktrace_base_address base_address, int is_64, off_t symoff, unsigned int nsyms, off_t stroff, unsigned int strsize, backtrace_error_callback error_callback, void *data) @@ -627,7 +630,7 @@ macho_add_symtab (struct backtrace_state *state, int descriptor, if (name[0] == '_') ++name; macho_symbols[j].name = name; - macho_symbols[j].address = value + base_address; + macho_symbols[j].address = libbacktrace_add_base (value, base_address); ++j; } @@ -760,7 +763,8 @@ macho_syminfo (struct backtrace_state *state, uintptr_t addr, static int macho_add_fat (struct backtrace_state *state, const char *filename, int descriptor, int swapped, off_t offset, - const unsigned char *match_uuid, uintptr_t base_address, + const unsigned char *match_uuid, + struct libbacktrace_base_address base_address, int skip_symtab, uint32_t nfat_arch, int is_64, backtrace_error_callback error_callback, void *data, fileline *fileline_fn, int *found_sym) @@ -862,7 +866,8 @@ macho_add_fat (struct backtrace_state *state, const char *filename, static int macho_add_dsym (struct backtrace_state *state, const char *filename, - uintptr_t base_address, const unsigned char *uuid, + struct libbacktrace_base_address base_address, + const unsigned char *uuid, backtrace_error_callback error_callback, void *data, fileline* fileline_fn) { @@ -980,7 +985,7 @@ macho_add_dsym (struct backtrace_state *state, const char *filename, static int macho_add (struct backtrace_state *state, const char *filename, int descriptor, off_t offset, const unsigned char *match_uuid, - uintptr_t base_address, int skip_symtab, + struct libbacktrace_base_address base_address, int skip_symtab, backtrace_error_callback error_callback, void *data, fileline *fileline_fn, int *found_sym) { @@ -1242,7 +1247,7 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, c = _dyld_image_count (); for (i = 0; i < c; ++i) { - uintptr_t base_address; + struct libbacktrace_base_address base_address; const char *name; int d; fileline mff; @@ -1266,7 +1271,7 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, continue; } - base_address = _dyld_get_image_vmaddr_slide (i); + base_address.m = _dyld_get_image_vmaddr_slide (i); mff = macho_nodebug; if (!macho_add (state, name, d, 0, NULL, base_address, 0, @@ -1321,10 +1326,12 @@ backtrace_initialize (struct backtrace_state *state, const char *filename, void *data, fileline *fileline_fn) { fileline macho_fileline_fn; + struct libbacktrace_base_address zero_base_address; int found_sym; macho_fileline_fn = macho_nodebug; - if (!macho_add (state, filename, descriptor, 0, NULL, 0, 0, + memset (&zero_base_address, 0, sizeof zero_base_address); + if (!macho_add (state, filename, descriptor, 0, NULL, zero_base_address, 0, error_callback, data, &macho_fileline_fn, &found_sym)) return 0; diff --git a/libs/tracy/tracy/Tracy.hpp b/libs/tracy/tracy/Tracy.hpp index e75d02c..98957f6 100644 --- a/libs/tracy/tracy/Tracy.hpp +++ b/libs/tracy/tracy/Tracy.hpp @@ -13,7 +13,7 @@ #endif #ifndef TracyLine -# define TracyLine __LINE__ +# define TracyLine 
TracyConcat(__LINE__,U) // MSVC Edit and continue __LINE__ is non-constant. See https://developercommunity.visualstudio.com/t/-line-cannot-be-used-as-an-argument-for-constexpr/195665 #endif #ifndef TRACY_ENABLE @@ -75,8 +75,10 @@ #define TracyAlloc(x,y) #define TracyFree(x) +#define TracyMemoryDiscard(x) #define TracySecureAlloc(x,y) #define TracySecureFree(x) +#define TracySecureMemoryDiscard(x) #define TracyAllocN(x,y,z) #define TracyFreeN(x,y) @@ -98,8 +100,10 @@ #define TracyAllocS(x,y,z) #define TracyFreeS(x,y) +#define TracyMemoryDiscardS(x,y) #define TracySecureAllocS(x,y,z) #define TracySecureFreeS(x,y) +#define TracySecureMemoryDiscardS(x,y) #define TracyAllocNS(x,y,z,w) #define TracyFreeNS(x,y,z) @@ -130,27 +134,20 @@ #include "../client/TracyProfiler.hpp" #include "../client/TracyScoped.hpp" +#ifndef TRACY_CALLSTACK +#define TRACY_CALLSTACK 0 +#endif + #define TracyNoop tracy::ProfilerAvailable() -#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK -# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) -# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) -# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) -# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) +#define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) +#define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) +#define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) +#define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ) -# define ZoneTransient( varname, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), nullptr, 
0, TRACY_CALLSTACK, active ) -# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), TRACY_CALLSTACK, active ) -# define ZoneTransientNC( varname, name, color, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), color, TRACY_CALLSTACK, active ) -#else -# define ZoneNamed( varname, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), active ) -# define ZoneNamedN( varname, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), active ) -# define ZoneNamedC( varname, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), active ) -# define ZoneNamedNC( varname, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), active ) - -# define ZoneTransient( varname, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), nullptr, 0, active ) -# define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), active ) -# define ZoneTransientNC( varname, name, color, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), color, active ) -#endif +#define ZoneTransient( varname, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), nullptr, 0, TRACY_CALLSTACK, active ) +#define ZoneTransientN( varname, name, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), TRACY_CALLSTACK, active ) +#define ZoneTransientNC( varname, name, color, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), color, TRACY_CALLSTACK, active ) #define ZoneScoped ZoneNamed( ___tracy_scoped_zone, true ) #define ZoneScopedN( name ) ZoneNamedN( ___tracy_scoped_zone, name, true ) @@ -185,7 +182,7 @@ #define TracySharedLockableN( type, varname, desc ) tracy::SharedLockable varname { [] () -> const tracy::SourceLocationData* { static constexpr tracy::SourceLocationData srcloc { nullptr, desc, TracyFile, TracyLine, 0 }; return &srcloc; }() } #define LockableBase( type ) tracy::Lockable #define SharedLockableBase( type ) tracy::SharedLockable -#define LockMark( varname ) static constexpr tracy::SourceLocationData __tracy_lock_location_##varname { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; varname.Mark( &__tracy_lock_location_##varname ) +#define LockMark( varname ) 
static constexpr tracy::SourceLocationData TracyConcat(__tracy_lock_location_,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; varname.Mark( &TracyConcat(__tracy_lock_location_,TracyLine) ) #define LockableName( varname, txt, size ) varname.CustomName( txt, size ) #define TracyPlot( name, val ) tracy::Profiler::PlotData( name, val ) @@ -193,95 +190,52 @@ #define TracyAppInfo( txt, size ) tracy::Profiler::MessageAppInfo( txt, size ) -#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK -# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, TRACY_CALLSTACK ) -# define TracyMessageL( txt ) tracy::Profiler::Message( txt, TRACY_CALLSTACK ) -# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, TRACY_CALLSTACK ) -# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, TRACY_CALLSTACK ) - -# define TracyAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, false ) -# define TracyFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, false ) -# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, true ) -# define TracySecureFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, true ) - -# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, false, name ) -# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, false, name ) -# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, true, name ) -# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, true, name ) -#else -# define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, 0 ) -# define TracyMessageL( txt ) tracy::Profiler::Message( txt, 0 ) -# define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, 0 ) -# define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, 0 ) - -# define TracyAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, false ) -# define TracyFree( ptr ) tracy::Profiler::MemFree( ptr, false ) -# define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAlloc( ptr, size, true ) -# define TracySecureFree( ptr ) tracy::Profiler::MemFree( ptr, true ) - -# define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, false, name ) -# define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, false, name ) -# define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocNamed( ptr, size, true, name ) -# define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeNamed( ptr, true, name ) -#endif - -#ifdef TRACY_HAS_CALLSTACK -# define ZoneNamedS( varname, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) -# define ZoneNamedNS( varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) -# define ZoneNamedCS( varname, color, depth, active ) static constexpr tracy::SourceLocationData 
TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) -# define ZoneNamedNCS( varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) - -# define ZoneTransientS( varname, depth, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), nullptr, 0, depth, active ) -# define ZoneTransientNS( varname, name, depth, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), depth, active ) - -# define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true ) -# define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true ) -# define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true ) -# define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color, depth, true ) - -# define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, false ) -# define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, false ) -# define TracySecureAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, true ) -# define TracySecureFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, true ) - -# define TracyAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, false, name ) -# define TracyFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, false, name ) -# define TracySecureAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, true, name ) -# define TracySecureFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, true, name ) - -# define TracyMessageS( txt, size, depth ) tracy::Profiler::Message( txt, size, depth ) -# define TracyMessageLS( txt, depth ) tracy::Profiler::Message( txt, depth ) -# define TracyMessageCS( txt, size, color, depth ) tracy::Profiler::MessageColor( txt, size, color, depth ) -# define TracyMessageLCS( txt, color, depth ) tracy::Profiler::MessageColor( txt, color, depth ) -#else -# define ZoneNamedS( varname, depth, active ) ZoneNamed( varname, active ) -# define ZoneNamedNS( varname, name, depth, active ) ZoneNamedN( varname, name, active ) -# define ZoneNamedCS( varname, color, depth, active ) ZoneNamedC( varname, color, active ) -# define ZoneNamedNCS( varname, name, color, depth, active ) ZoneNamedNC( varname, name, color, active ) - -# define ZoneTransientS( varname, depth, active ) ZoneTransient( varname, active ) -# define ZoneTransientNS( varname, name, depth, active ) ZoneTransientN( varname, name, active ) - -# define ZoneScopedS( depth ) ZoneScoped -# define ZoneScopedNS( name, depth ) ZoneScopedN( name ) -# define ZoneScopedCS( color, depth ) ZoneScopedC( color ) -# define ZoneScopedNCS( name, color, depth ) ZoneScopedNC( name, color ) - -# define TracyAllocS( ptr, size, depth ) TracyAlloc( ptr, size ) -# define TracyFreeS( ptr, depth ) TracyFree( ptr ) -# define TracySecureAllocS( ptr, size, depth ) TracySecureAlloc( ptr, size ) -# define TracySecureFreeS( 
ptr, depth ) TracySecureFree( ptr ) - -# define TracyAllocNS( ptr, size, depth, name ) TracyAllocN( ptr, size, name ) -# define TracyFreeNS( ptr, depth, name ) TracyFreeN( ptr, name ) -# define TracySecureAllocNS( ptr, size, depth, name ) TracySecureAllocN( ptr, size, name ) -# define TracySecureFreeNS( ptr, depth, name ) TracySecureFreeN( ptr, name ) - -# define TracyMessageS( txt, size, depth ) TracyMessage( txt, size ) -# define TracyMessageLS( txt, depth ) TracyMessageL( txt ) -# define TracyMessageCS( txt, size, color, depth ) TracyMessageC( txt, size, color ) -# define TracyMessageLCS( txt, color, depth ) TracyMessageLC( txt, color ) -#endif +#define TracyMessage( txt, size ) tracy::Profiler::Message( txt, size, TRACY_CALLSTACK ) +#define TracyMessageL( txt ) tracy::Profiler::Message( txt, TRACY_CALLSTACK ) +#define TracyMessageC( txt, size, color ) tracy::Profiler::MessageColor( txt, size, color, TRACY_CALLSTACK ) +#define TracyMessageLC( txt, color ) tracy::Profiler::MessageColor( txt, color, TRACY_CALLSTACK ) + +#define TracyAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, false ) +#define TracyFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, false ) +#define TracySecureAlloc( ptr, size ) tracy::Profiler::MemAllocCallstack( ptr, size, TRACY_CALLSTACK, true ) +#define TracySecureFree( ptr ) tracy::Profiler::MemFreeCallstack( ptr, TRACY_CALLSTACK, true ) + +#define TracyAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, false, name ) +#define TracyFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, false, name ) +#define TracyMemoryDiscard( name ) tracy::Profiler::MemDiscardCallstack( name, false, TRACY_CALLSTACK ) +#define TracySecureAllocN( ptr, size, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, TRACY_CALLSTACK, true, name ) +#define TracySecureFreeN( ptr, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, TRACY_CALLSTACK, true, name ) +#define TracySecureMemoryDiscard( name ) tracy::Profiler::MemDiscardCallstack( name, true, TRACY_CALLSTACK ) + +#define ZoneNamedS( varname, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) +#define ZoneNamedNS( varname, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) +#define ZoneNamedCS( varname, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { nullptr, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) +#define ZoneNamedNCS( varname, name, color, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::ScopedZone varname( &TracyConcat(__tracy_source_location,TracyLine), depth, active ) + +#define ZoneTransientS( varname, depth, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), nullptr, 0, depth, active ) +#define ZoneTransientNS( varname, 
name, depth, active ) tracy::ScopedZone varname( TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), depth, active ) + +#define ZoneScopedS( depth ) ZoneNamedS( ___tracy_scoped_zone, depth, true ) +#define ZoneScopedNS( name, depth ) ZoneNamedNS( ___tracy_scoped_zone, name, depth, true ) +#define ZoneScopedCS( color, depth ) ZoneNamedCS( ___tracy_scoped_zone, color, depth, true ) +#define ZoneScopedNCS( name, color, depth ) ZoneNamedNCS( ___tracy_scoped_zone, name, color, depth, true ) + +#define TracyAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, false ) +#define TracyFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, false ) +#define TracySecureAllocS( ptr, size, depth ) tracy::Profiler::MemAllocCallstack( ptr, size, depth, true ) +#define TracySecureFreeS( ptr, depth ) tracy::Profiler::MemFreeCallstack( ptr, depth, true ) + +#define TracyAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, false, name ) +#define TracyFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, false, name ) +#define TracyMemoryDiscardS( name, depth ) tracy::Profiler::MemDiscardCallstack( name, false, depth ) +#define TracySecureAllocNS( ptr, size, depth, name ) tracy::Profiler::MemAllocCallstackNamed( ptr, size, depth, true, name ) +#define TracySecureFreeNS( ptr, depth, name ) tracy::Profiler::MemFreeCallstackNamed( ptr, depth, true, name ) +#define TracySecureMemoryDiscardS( name, depth ) tracy::Profiler::MemDiscardCallstack( name, true, depth ) + +#define TracyMessageS( txt, size, depth ) tracy::Profiler::Message( txt, size, depth ) +#define TracyMessageLS( txt, depth ) tracy::Profiler::Message( txt, depth ) +#define TracyMessageCS( txt, size, color, depth ) tracy::Profiler::MessageColor( txt, size, color, depth ) +#define TracyMessageLCS( txt, color, depth ) tracy::Profiler::MessageColor( txt, color, depth ) #define TracySourceCallbackRegister( cb, data ) tracy::Profiler::SourceCallbackRegister( cb, data ) #define TracyParameterRegister( cb, data ) tracy::Profiler::ParameterRegister( cb, data ) diff --git a/libs/tracy/tracy/TracyC.h b/libs/tracy/tracy/TracyC.h index 8b447be..1b1373e 100644 --- a/libs/tracy/tracy/TracyC.h +++ b/libs/tracy/tracy/TracyC.h @@ -4,7 +4,6 @@ #include #include -#include "../client/TracyCallstack.h" #include "../common/TracyApi.h" #ifdef __cplusplus @@ -53,8 +52,10 @@ typedef const void* TracyCLockCtx; #define TracyCAlloc(x,y) #define TracyCFree(x) +#define TracyCMemoryDiscard(x) #define TracyCSecureAlloc(x,y) #define TracyCSecureFree(x) +#define TracyCSecureMemoryDiscard(x) #define TracyCAllocN(x,y,z) #define TracyCFreeN(x,y) @@ -85,8 +86,10 @@ typedef const void* TracyCLockCtx; #define TracyCAllocS(x,y,z) #define TracyCFreeS(x,y) +#define TracyCMemoryDiscardS(x,y) #define TracyCSecureAllocS(x,y,z) #define TracyCSecureFreeS(x,y) +#define TracyCSecureMemoryDiscardS(x,y) #define TracyCAllocNS(x,y,z,w) #define TracyCFreeNS(x,y,z) @@ -137,7 +140,7 @@ struct ___tracy_source_location_data struct ___tracy_c_zone_context { uint32_t id; - int active; + int32_t active; }; struct ___tracy_gpu_time_data @@ -155,7 +158,7 @@ struct ___tracy_gpu_zone_begin_data { struct ___tracy_gpu_zone_begin_callstack_data { uint64_t srcloc; - int depth; + int32_t depth; uint16_t queryId; uint8_t context; }; @@ -201,7 +204,7 @@ typedef struct __tracy_lockable_context_data* TracyCLockCtx; #ifdef TRACY_MANUAL_LIFETIME TRACY_API void 
___tracy_startup_profiler(void); TRACY_API void ___tracy_shutdown_profiler(void); -TRACY_API int ___tracy_profiler_started(void); +TRACY_API int32_t ___tracy_profiler_started(void); # define TracyCIsStarted ___tracy_profiler_started() #else @@ -211,10 +214,10 @@ TRACY_API int ___tracy_profiler_started(void); TRACY_API uint64_t ___tracy_alloc_srcloc( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, uint32_t color ); TRACY_API uint64_t ___tracy_alloc_srcloc_name( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, uint32_t color ); -TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin( const struct ___tracy_source_location_data* srcloc, int active ); -TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_callstack( const struct ___tracy_source_location_data* srcloc, int depth, int active ); -TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc( uint64_t srcloc, int active ); -TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc_callstack( uint64_t srcloc, int depth, int active ); +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin( const struct ___tracy_source_location_data* srcloc, int32_t active ); +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_callstack( const struct ___tracy_source_location_data* srcloc, int32_t depth, int32_t active ); +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc( uint64_t srcloc, int32_t active ); +TRACY_API TracyCZoneCtx ___tracy_emit_zone_begin_alloc_callstack( uint64_t srcloc, int32_t depth, int32_t active ); TRACY_API void ___tracy_emit_zone_end( TracyCZoneCtx ctx ); TRACY_API void ___tracy_emit_zone_text( TracyCZoneCtx ctx, const char* txt, size_t size ); TRACY_API void ___tracy_emit_zone_name( TracyCZoneCtx ctx, const char* txt, size_t size ); @@ -243,20 +246,17 @@ TRACY_API void ___tracy_emit_gpu_context_name_serial( const struct ___tracy_gpu_ TRACY_API void ___tracy_emit_gpu_calibration_serial( const struct ___tracy_gpu_calibration_data ); TRACY_API void ___tracy_emit_gpu_time_sync_serial( const struct ___tracy_gpu_time_sync_data ); -TRACY_API int ___tracy_connected(void); +TRACY_API int32_t ___tracy_connected(void); -#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK -# define TracyCZone( ctx, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); -# define TracyCZoneN( ctx, name, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); -# define TracyCZoneC( ctx, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); -# define TracyCZoneNC( ctx, name, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( 
&TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); -#else -# define TracyCZone( ctx, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,TracyLine), active ); -# define TracyCZoneN( ctx, name, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,TracyLine), active ); -# define TracyCZoneC( ctx, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,TracyLine), active ); -# define TracyCZoneNC( ctx, name, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin( &TracyConcat(__tracy_source_location,TracyLine), active ); +#ifndef TRACY_CALLSTACK +#define TRACY_CALLSTACK 0 #endif +#define TracyCZone( ctx, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); +#define TracyCZoneN( ctx, name, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); +#define TracyCZoneC( ctx, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); +#define TracyCZoneNC( ctx, name, color, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), TRACY_CALLSTACK, active ); + #define TracyCZoneEnd( ctx ) ___tracy_emit_zone_end( ctx ); #define TracyCZoneText( ctx, txt, size ) ___tracy_emit_zone_text( ctx, txt, size ); @@ -265,57 +265,44 @@ TRACY_API int ___tracy_connected(void); #define TracyCZoneValue( ctx, value ) ___tracy_emit_zone_value( ctx, value ); -TRACY_API void ___tracy_emit_memory_alloc( const void* ptr, size_t size, int secure ); -TRACY_API void ___tracy_emit_memory_alloc_callstack( const void* ptr, size_t size, int depth, int secure ); -TRACY_API void ___tracy_emit_memory_free( const void* ptr, int secure ); -TRACY_API void ___tracy_emit_memory_free_callstack( const void* ptr, int depth, int secure ); -TRACY_API void ___tracy_emit_memory_alloc_named( const void* ptr, size_t size, int secure, const char* name ); -TRACY_API void ___tracy_emit_memory_alloc_callstack_named( const void* ptr, size_t size, int depth, int 
secure, const char* name ); -TRACY_API void ___tracy_emit_memory_free_named( const void* ptr, int secure, const char* name ); -TRACY_API void ___tracy_emit_memory_free_callstack_named( const void* ptr, int depth, int secure, const char* name ); - -TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int callstack ); -TRACY_API void ___tracy_emit_messageL( const char* txt, int callstack ); -TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int callstack ); -TRACY_API void ___tracy_emit_messageLC( const char* txt, uint32_t color, int callstack ); - -#if defined TRACY_HAS_CALLSTACK && defined TRACY_CALLSTACK -# define TracyCAlloc( ptr, size ) ___tracy_emit_memory_alloc_callstack( ptr, size, TRACY_CALLSTACK, 0 ) -# define TracyCFree( ptr ) ___tracy_emit_memory_free_callstack( ptr, TRACY_CALLSTACK, 0 ) -# define TracyCSecureAlloc( ptr, size ) ___tracy_emit_memory_alloc_callstack( ptr, size, TRACY_CALLSTACK, 1 ) -# define TracyCSecureFree( ptr ) ___tracy_emit_memory_free_callstack( ptr, TRACY_CALLSTACK, 1 ) - -# define TracyCAllocN( ptr, size, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, TRACY_CALLSTACK, 0, name ) -# define TracyCFreeN( ptr, name ) ___tracy_emit_memory_free_callstack_named( ptr, TRACY_CALLSTACK, 0, name ) -# define TracyCSecureAllocN( ptr, size, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, TRACY_CALLSTACK, 1, name ) -# define TracyCSecureFreeN( ptr, name ) ___tracy_emit_memory_free_callstack_named( ptr, TRACY_CALLSTACK, 1, name ) - -# define TracyCMessage( txt, size ) ___tracy_emit_message( txt, size, TRACY_CALLSTACK ); -# define TracyCMessageL( txt ) ___tracy_emit_messageL( txt, TRACY_CALLSTACK ); -# define TracyCMessageC( txt, size, color ) ___tracy_emit_messageC( txt, size, color, TRACY_CALLSTACK ); -# define TracyCMessageLC( txt, color ) ___tracy_emit_messageLC( txt, color, TRACY_CALLSTACK ); -#else -# define TracyCAlloc( ptr, size ) ___tracy_emit_memory_alloc( ptr, size, 0 ); -# define TracyCFree( ptr ) ___tracy_emit_memory_free( ptr, 0 ); -# define TracyCSecureAlloc( ptr, size ) ___tracy_emit_memory_alloc( ptr, size, 1 ); -# define TracyCSecureFree( ptr ) ___tracy_emit_memory_free( ptr, 1 ); - -# define TracyCAllocN( ptr, size, name ) ___tracy_emit_memory_alloc_named( ptr, size, 0, name ); -# define TracyCFreeN( ptr, name ) ___tracy_emit_memory_free_named( ptr, 0, name ); -# define TracyCSecureAllocN( ptr, size, name ) ___tracy_emit_memory_alloc_named( ptr, size, 1, name ); -# define TracyCSecureFreeN( ptr, name ) ___tracy_emit_memory_free_named( ptr, 1, name ); - -# define TracyCMessage( txt, size ) ___tracy_emit_message( txt, size, 0 ); -# define TracyCMessageL( txt ) ___tracy_emit_messageL( txt, 0 ); -# define TracyCMessageC( txt, size, color ) ___tracy_emit_messageC( txt, size, color, 0 ); -# define TracyCMessageLC( txt, color ) ___tracy_emit_messageLC( txt, color, 0 ); -#endif +TRACY_API void ___tracy_emit_memory_alloc( const void* ptr, size_t size, int32_t secure ); +TRACY_API void ___tracy_emit_memory_alloc_callstack( const void* ptr, size_t size, int32_t depth, int32_t secure ); +TRACY_API void ___tracy_emit_memory_free( const void* ptr, int32_t secure ); +TRACY_API void ___tracy_emit_memory_free_callstack( const void* ptr, int32_t depth, int32_t secure ); +TRACY_API void ___tracy_emit_memory_alloc_named( const void* ptr, size_t size, int32_t secure, const char* name ); +TRACY_API void ___tracy_emit_memory_alloc_callstack_named( const void* ptr, size_t size, int32_t depth, int32_t 
secure, const char* name ); +TRACY_API void ___tracy_emit_memory_free_named( const void* ptr, int32_t secure, const char* name ); +TRACY_API void ___tracy_emit_memory_free_callstack_named( const void* ptr, int32_t depth, int32_t secure, const char* name ); +TRACY_API void ___tracy_emit_memory_discard( const char* name, int32_t secure ); +TRACY_API void ___tracy_emit_memory_discard_callstack( const char* name, int32_t secure, int32_t depth ); + +TRACY_API void ___tracy_emit_message( const char* txt, size_t size, int32_t callstack_depth ); +TRACY_API void ___tracy_emit_messageL( const char* txt, int32_t callstack_depth ); +TRACY_API void ___tracy_emit_messageC( const char* txt, size_t size, uint32_t color, int32_t callstack_depth ); +TRACY_API void ___tracy_emit_messageLC( const char* txt, uint32_t color, int32_t callstack_depth ); + +#define TracyCAlloc( ptr, size ) ___tracy_emit_memory_alloc_callstack( ptr, size, TRACY_CALLSTACK, 0 ) +#define TracyCFree( ptr ) ___tracy_emit_memory_free_callstack( ptr, TRACY_CALLSTACK, 0 ) +#define TracyCMemoryDiscard( name ) ___tracy_emit_memory_discard_callstack( name, 0, TRACY_CALLSTACK ); +#define TracyCSecureAlloc( ptr, size ) ___tracy_emit_memory_alloc_callstack( ptr, size, TRACY_CALLSTACK, 1 ) +#define TracyCSecureFree( ptr ) ___tracy_emit_memory_free_callstack( ptr, TRACY_CALLSTACK, 1 ) +#define TracyCSecureMemoryDiscard( name ) ___tracy_emit_memory_discard_callstack( name, 1, TRACY_CALLSTACK ); + +#define TracyCAllocN( ptr, size, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, TRACY_CALLSTACK, 0, name ) +#define TracyCFreeN( ptr, name ) ___tracy_emit_memory_free_callstack_named( ptr, TRACY_CALLSTACK, 0, name ) +#define TracyCSecureAllocN( ptr, size, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, TRACY_CALLSTACK, 1, name ) +#define TracyCSecureFreeN( ptr, name ) ___tracy_emit_memory_free_callstack_named( ptr, TRACY_CALLSTACK, 1, name ) + +#define TracyCMessage( txt, size ) ___tracy_emit_message( txt, size, TRACY_CALLSTACK ); +#define TracyCMessageL( txt ) ___tracy_emit_messageL( txt, TRACY_CALLSTACK ); +#define TracyCMessageC( txt, size, color ) ___tracy_emit_messageC( txt, size, color, TRACY_CALLSTACK ); +#define TracyCMessageLC( txt, color ) ___tracy_emit_messageLC( txt, color, TRACY_CALLSTACK ); TRACY_API void ___tracy_emit_frame_mark( const char* name ); TRACY_API void ___tracy_emit_frame_mark_start( const char* name ); TRACY_API void ___tracy_emit_frame_mark_end( const char* name ); -TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_t h, uint8_t offset, int flip ); +TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_t h, uint8_t offset, int32_t flip ); #define TracyCFrameMark ___tracy_emit_frame_mark( 0 ); #define TracyCFrameMarkNamed( name ) ___tracy_emit_frame_mark( name ); @@ -327,7 +314,7 @@ TRACY_API void ___tracy_emit_frame_image( const void* image, uint16_t w, uint16_ TRACY_API void ___tracy_emit_plot( const char* name, double val ); TRACY_API void ___tracy_emit_plot_float( const char* name, float val ); TRACY_API void ___tracy_emit_plot_int( const char* name, int64_t val ); -TRACY_API void ___tracy_emit_plot_config( const char* name, int type, int step, int fill, uint32_t color ); +TRACY_API void ___tracy_emit_plot_config( const char* name, int32_t type, int32_t step, int32_t fill, uint32_t color ); TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ); #define TracyCPlot( name, val ) ___tracy_emit_plot( name, val ); @@ -337,55 
+324,35 @@ TRACY_API void ___tracy_emit_message_appinfo( const char* txt, size_t size ); #define TracyCAppInfo( txt, size ) ___tracy_emit_message_appinfo( txt, size ); -#ifdef TRACY_HAS_CALLSTACK -# define TracyCZoneS( ctx, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); -# define TracyCZoneNS( ctx, name, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); -# define TracyCZoneCS( ctx, color, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); -# define TracyCZoneNCS( ctx, name, color, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); +#define TracyCZoneS( ctx, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); +#define TracyCZoneNS( ctx, name, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, 0 }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); +#define TracyCZoneCS( ctx, color, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { NULL, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); +#define TracyCZoneNCS( ctx, name, color, depth, active ) static const struct ___tracy_source_location_data TracyConcat(__tracy_source_location,TracyLine) = { name, __func__, TracyFile, (uint32_t)TracyLine, color }; TracyCZoneCtx ctx = ___tracy_emit_zone_begin_callstack( &TracyConcat(__tracy_source_location,TracyLine), depth, active ); -# define TracyCAllocS( ptr, size, depth ) ___tracy_emit_memory_alloc_callstack( ptr, size, depth, 0 ) -# define TracyCFreeS( ptr, depth ) ___tracy_emit_memory_free_callstack( ptr, depth, 0 ) -# define TracyCSecureAllocS( ptr, size, depth ) ___tracy_emit_memory_alloc_callstack( ptr, size, depth, 1 ) -# define TracyCSecureFreeS( ptr, depth ) ___tracy_emit_memory_free_callstack( ptr, depth, 1 ) +#define TracyCAllocS( ptr, size, depth ) ___tracy_emit_memory_alloc_callstack( ptr, size, depth, 0 ) +#define TracyCFreeS( ptr, depth ) ___tracy_emit_memory_free_callstack( ptr, depth, 0 ) +#define TracyCMemoryDiscardS( name, depth ) ___tracy_emit_memory_discard_callstack( name, 0, depth ) +#define TracyCSecureAllocS( ptr, 
size, depth ) ___tracy_emit_memory_alloc_callstack( ptr, size, depth, 1 ) +#define TracyCSecureFreeS( ptr, depth ) ___tracy_emit_memory_free_callstack( ptr, depth, 1 ) +#define TracyCSecureMemoryDiscardS( name, depth ) ___tracy_emit_memory_discard_callstack( name, 1, depth ) -# define TracyCAllocNS( ptr, size, depth, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, depth, 0, name ) -# define TracyCFreeNS( ptr, depth, name ) ___tracy_emit_memory_free_callstack_named( ptr, depth, 0, name ) -# define TracyCSecureAllocNS( ptr, size, depth, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, depth, 1, name ) -# define TracyCSecureFreeNS( ptr, depth, name ) ___tracy_emit_memory_free_callstack_named( ptr, depth, 1, name ) +#define TracyCAllocNS( ptr, size, depth, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, depth, 0, name ) +#define TracyCFreeNS( ptr, depth, name ) ___tracy_emit_memory_free_callstack_named( ptr, depth, 0, name ) +#define TracyCSecureAllocNS( ptr, size, depth, name ) ___tracy_emit_memory_alloc_callstack_named( ptr, size, depth, 1, name ) +#define TracyCSecureFreeNS( ptr, depth, name ) ___tracy_emit_memory_free_callstack_named( ptr, depth, 1, name ) -# define TracyCMessageS( txt, size, depth ) ___tracy_emit_message( txt, size, depth ); -# define TracyCMessageLS( txt, depth ) ___tracy_emit_messageL( txt, depth ); -# define TracyCMessageCS( txt, size, color, depth ) ___tracy_emit_messageC( txt, size, color, depth ); -# define TracyCMessageLCS( txt, color, depth ) ___tracy_emit_messageLC( txt, color, depth ); -#else -# define TracyCZoneS( ctx, depth, active ) TracyCZone( ctx, active ) -# define TracyCZoneNS( ctx, name, depth, active ) TracyCZoneN( ctx, name, active ) -# define TracyCZoneCS( ctx, color, depth, active ) TracyCZoneC( ctx, color, active ) -# define TracyCZoneNCS( ctx, name, color, depth, active ) TracyCZoneNC( ctx, name, color, active ) - -# define TracyCAllocS( ptr, size, depth ) TracyCAlloc( ptr, size ) -# define TracyCFreeS( ptr, depth ) TracyCFree( ptr ) -# define TracyCSecureAllocS( ptr, size, depth ) TracyCSecureAlloc( ptr, size ) -# define TracyCSecureFreeS( ptr, depth ) TracyCSecureFree( ptr ) - -# define TracyCAllocNS( ptr, size, depth, name ) TracyCAllocN( ptr, size, name ) -# define TracyCFreeNS( ptr, depth, name ) TracyCFreeN( ptr, name ) -# define TracyCSecureAllocNS( ptr, size, depth, name ) TracyCSecureAllocN( ptr, size, name ) -# define TracyCSecureFreeNS( ptr, depth, name ) TracyCSecureFreeN( ptr, name ) - -# define TracyCMessageS( txt, size, depth ) TracyCMessage( txt, size ) -# define TracyCMessageLS( txt, depth ) TracyCMessageL( txt ) -# define TracyCMessageCS( txt, size, color, depth ) TracyCMessageC( txt, size, color ) -# define TracyCMessageLCS( txt, color, depth ) TracyCMessageLC( txt, color ) -#endif +#define TracyCMessageS( txt, size, depth ) ___tracy_emit_message( txt, size, depth ); +#define TracyCMessageLS( txt, depth ) ___tracy_emit_messageL( txt, depth ); +#define TracyCMessageCS( txt, size, color, depth ) ___tracy_emit_messageC( txt, size, color, depth ); +#define TracyCMessageLCS( txt, color, depth ) ___tracy_emit_messageLC( txt, color, depth ); TRACY_API struct __tracy_lockable_context_data* ___tracy_announce_lockable_ctx( const struct ___tracy_source_location_data* srcloc ); TRACY_API void ___tracy_terminate_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); -TRACY_API int ___tracy_before_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); +TRACY_API int32_t 
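// Illustrative usage sketch (editorial, not part of the vendored sources): with
// TRACY_CALLSTACK now defaulting to 0, the TracyC zone, message and memory macros
// in this header all route through the ___tracy_emit_*_callstack entry points and
// pass the depth explicitly, so existing call sites should keep compiling
// unchanged. Hypothetical example:
//
//   void work(void)
//   {
//       TracyCZoneN(zone, "work", 1);
//       TracyCMessageL("doing work");
//       void* buffer = malloc(256);
//       TracyCAlloc(buffer, 256);
//       TracyCFree(buffer);
//       free(buffer);
//       TracyCZoneEnd(zone);
//   }
//
// With depth 0 the runtime is expected to behave like the old non-callstack
// variants; defining TRACY_CALLSTACK to a positive depth before including
// TracyC.h makes the same macros collect call stacks of that depth.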
___tracy_before_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); TRACY_API void ___tracy_after_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); TRACY_API void ___tracy_after_unlock_lockable_ctx( struct __tracy_lockable_context_data* lockdata ); -TRACY_API void ___tracy_after_try_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata, int acquired ); +TRACY_API void ___tracy_after_try_lock_lockable_ctx( struct __tracy_lockable_context_data* lockdata, int32_t acquired ); TRACY_API void ___tracy_mark_lockable_ctx( struct __tracy_lockable_context_data* lockdata, const struct ___tracy_source_location_data* srcloc ); TRACY_API void ___tracy_custom_name_lockable_ctx( struct __tracy_lockable_context_data* lockdata, const char* name, size_t nameSz ); diff --git a/libs/tracy/tracy/TracyCUDA.hpp b/libs/tracy/tracy/TracyCUDA.hpp new file mode 100644 index 0000000..40ff55d --- /dev/null +++ b/libs/tracy/tracy/TracyCUDA.hpp @@ -0,0 +1,1325 @@ +#ifndef __TRACYCUDA_HPP__ +#define __TRACYCUDA_HPP__ + +#ifndef TRACY_ENABLE + +#define TracyCUDAContext() nullptr +#define TracyCUDAContextDestroy(ctx) +#define TracyCUDAContextName(ctx, name, size) + +#define TracyCUDAStartProfiling(ctx) +#define TracyCUDAStopProfiling(ctx) + +#define TracyCUDACollect(ctx) + +#else +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef _MSC_VER +#include +#endif + +#include + +#ifndef UNREFERENCED +#define UNREFERENCED(x) (void)x +#endif//UNREFERENCED + +#ifndef TRACY_CUDA_CALIBRATED_CONTEXT +#define TRACY_CUDA_CALIBRATED_CONTEXT (1) +#endif//TRACY_CUDA_CALIBRATED_CONTEXT + +#ifndef TRACY_CUDA_ENABLE_COLLECTOR_THREAD +#define TRACY_CUDA_ENABLE_COLLECTOR_THREAD (1) +#endif//TRACY_CUDA_ENABLE_COLLECTOR_THREAD + +#ifndef TRACY_CUDA_ENABLE_CUDA_CALL_STATS +#define TRACY_CUDA_ENABLE_CUDA_CALL_STATS (0) +#endif//TRACY_CUDA_ENABLE_CUDA_CALL_STATS + +namespace { + +// TODO(marcos): wrap these in structs for better type safety +using CUptiTimestamp = uint64_t; +using TracyTimestamp = int64_t; + +struct IncrementalRegression { + using float_t = double; + struct Parameters { + float_t slope, intercept; + }; + + int n = 0; + float_t x_mean = 0; + float_t y_mean = 0; + float_t x_svar = 0; + float_t y_svar = 0; + float_t xy_scov = 0; + + auto parameters() const { + float_t slope = xy_scov / x_svar; + float_t intercept = y_mean - slope * x_mean; + return Parameters{ slope, intercept }; + } + + auto orthogonal() const { + // NOTE(marcos): orthogonal regression is Deming regression with delta = 1 + float_t delta = float_t(1); // delta = 1 -> orthogonal regression + float_t k = y_svar - delta * x_svar; + float_t slope = (k + sqrt(k * k + 4 * delta * xy_scov * xy_scov)) / (2 * xy_scov); + float_t intercept = y_mean - slope * x_mean; + return Parameters{ slope, intercept }; + } + + void addSample(float_t x, float_t y) { + ++n; + float_t x_mean_prev = x_mean; + float_t y_mean_prev = y_mean; + x_mean += (x - x_mean) / n; + y_mean += (y - y_mean) / n; + x_svar += (x - x_mean_prev) * (x - x_mean); + y_svar += (y - y_mean_prev) * (y - y_mean); + xy_scov += (x - x_mean_prev) * (y - y_mean); + } +}; + +tracy_force_inline TracyTimestamp tracyGetTimestamp() { + return tracy::Profiler::GetTime(); +} + +auto& getCachedRegressionParameters() { + // WARN(marcos): in theory, these linear regression parameters would be loaded/stored atomically; + // in practice, however, it should not matter so long as the loads/stores are not "sliced" 
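// Editorial note with a worked example (not part of the vendored sources): the
// IncrementalRegression helper above keeps Welford-style running means, variances
// and covariance, so slope = xy_scov / x_svar and intercept = y_mean - slope * x_mean
// can be read out at any time. With invented samples (1000, 5000), (2000, 7000),
// (3000, 9000):
//
//   IncrementalRegression model;
//   model.addSample(1000.0, 5000.0);
//   model.addSample(2000.0, 7000.0);
//   model.addSample(3000.0, 9000.0);
//   auto p = model.parameters();                     // p.slope == 2, p.intercept == 3000
//   double y_hat = p.slope * 2500.0 + p.intercept;   // == 8000
//
// tracyFromCUpti() below applies exactly this y_hat = slope * x + intercept mapping,
// using the parameters cached here, to convert CUPTI timestamps into Tracy
// timestamps (Recalibrate() later prefers the orthogonal() fit, but the read-out
// pattern is the same).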
+ static IncrementalRegression::Parameters cached; + return cached; +} + +TracyTimestamp tracyFromCUpti(CUptiTimestamp cuptiTime) { + // NOTE(marcos): linear regression estimate + // y_hat = slope * x + intercept | X: CUptiTimestamp, Y: TracyTimestamp + auto [slope, intercept] = getCachedRegressionParameters(); + double y_hat = slope * cuptiTime + intercept; + TracyTimestamp tracyTime = TracyTimestamp(y_hat); + assert(tracyTime >= 0); + return tracyTime; +} + +template +tracy_force_inline void tracyMemWrite(T& where,U what) { + static_assert(std::is_same_v, "tracy::MemWrite: type mismatch."); + tracy::MemWrite(&where, what); +} + +void* tracyMalloc(size_t bytes) { + return tracy::tracy_malloc(bytes); +} + +void tracyFree(void* ptr) { + tracy::tracy_free(ptr); +} + +void tracyZoneBegin(TracyTimestamp time, tracy::SourceLocationData* srcLoc) { + using namespace tracy; + TracyQueuePrepare(QueueType::ZoneBegin); + tracyMemWrite(item->zoneBegin.time, time); + tracyMemWrite(item->zoneBegin.srcloc, (uint64_t)srcLoc); + TracyQueueCommit(zoneBeginThread); +} + +void tracyZoneEnd(TracyTimestamp time) { + using namespace tracy; + TracyQueuePrepare(QueueType::ZoneEnd); + tracyMemWrite(item->zoneEnd.time, time); + TracyQueueCommit(zoneEndThread); +} + +void tracyPlot(const char* name, float value, TracyTimestamp time) { + using namespace tracy; + TracyLfqPrepare(QueueType::PlotDataFloat); + tracyMemWrite(item->plotDataFloat.name, (uint64_t)name); + tracyMemWrite(item->plotDataFloat.time, time); + tracyMemWrite(item->plotDataFloat.val, value); + TracyLfqCommit; +} + +void tracyPlot(const char* name, float value, CUptiTimestamp time) { + tracyPlot(name, value, tracyFromCUpti(time)); +} + +void tracyPlotActivity(const char* name, TracyTimestamp start, TracyTimestamp end, float value = 1.0f, float baseline = 0.0f) { + tracyPlot(name, baseline, start); + tracyPlot(name, value, start + 3); + tracyPlot(name, value, end - 3); + tracyPlot(name, baseline, end); +} + +void tracyPlotActivity(const char* name, CUptiTimestamp start, CUptiTimestamp end, float value = 1.0f, float baseline = 0.0f) { + tracyPlotActivity(name, tracyFromCUpti(start), tracyFromCUpti(end), value, baseline); +} + +void tracyPlotBlip(const char* name, TracyTimestamp time, float value = 1.0f, float baseline = 0.0f) { + tracyPlot(name, baseline, time - 3); + tracyPlot(name, value, time); + tracyPlot(name, baseline, time + 3); +} + +void tracyPlotBlip(const char* name, CUptiTimestamp time, float value = 1.0f, float baseline = 0.0f) { + tracyPlotBlip(name, tracyFromCUpti(time), value, baseline); +} + +void tracyEmitMemAlloc(const char* name, const void* ptr, size_t size, TracyTimestamp time) { + using namespace tracy; + const auto thread = GetThreadHandle(); + + auto item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::MemNamePayload); + tracyMemWrite(item->memName.name, (uint64_t)name); + Profiler::QueueSerialFinish(); + + item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::MemAllocNamed); + tracyMemWrite(item->memAlloc.time, time); + tracyMemWrite(item->memAlloc.thread, thread); + tracyMemWrite(item->memAlloc.ptr, (uint64_t)ptr); + + if (compile_time_condition::value) + { + memcpy(&item->memAlloc.size, &size, 4); + memset(&item->memAlloc.size + 4, 0, 2); + } + else + { + assert(sizeof(size) == 8); + memcpy(&item->memAlloc.size, &size, 4); + memcpy(((char *)&item->memAlloc.size) + 4, ((char *)&size) + 4, 2); + } + Profiler::QueueSerialFinish(); +} + +void tracyEmitMemFree(const char* name, const void* 
ptr, TracyTimestamp time) { + using namespace tracy; + const auto thread = GetThreadHandle(); + + auto item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::MemNamePayload); + tracyMemWrite(item->memName.name, (uint64_t)name); + Profiler::QueueSerialFinish(); + + item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::MemFreeNamed); + tracyMemWrite(item->memFree.time, time); + tracyMemWrite(item->memFree.thread, thread); + tracyMemWrite(item->memFree.ptr, (uint64_t)ptr); + Profiler::QueueSerialFinish(); +} + +void tracyEmitMemAlloc(const char* name, const void* ptr, size_t size, CUptiTimestamp cuptiTime) { + tracyEmitMemAlloc(name, ptr, size, tracyFromCUpti(cuptiTime)); +} + +void tracyEmitMemFree(const char* name, const void* ptr, CUptiTimestamp cuptiTime) { + tracyEmitMemFree(name, ptr, tracyFromCUpti(cuptiTime)); +} + +void tracyAnnounceGpuTimestamp(TracyTimestamp apiStart, TracyTimestamp apiEnd, + uint16_t queryId, uint8_t gpuContextId, + const tracy::SourceLocationData* sourceLocation, uint32_t threadId) { + using namespace tracy; + + auto item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuZoneBeginSerial); + tracyMemWrite(item->gpuZoneBegin.cpuTime, apiStart); + tracyMemWrite(item->gpuZoneBegin.srcloc, (uint64_t)sourceLocation); + tracyMemWrite(item->gpuZoneBegin.thread, threadId); + tracyMemWrite(item->gpuZoneBegin.queryId, uint16_t(queryId+0)); + tracyMemWrite(item->gpuZoneBegin.context, gpuContextId); + Profiler::QueueSerialFinish(); + + item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuZoneEndSerial); + tracyMemWrite(item->gpuZoneEnd.cpuTime, apiEnd); + tracyMemWrite(item->gpuZoneEnd.thread, threadId); + tracyMemWrite(item->gpuZoneEnd.queryId, uint16_t(queryId+1)); + tracyMemWrite(item->gpuZoneEnd.context, gpuContextId); + Profiler::QueueSerialFinish(); +} + +void tracySubmitGpuTimestamp(CUptiTimestamp gpuStart, CUptiTimestamp gpuEnd, + uint16_t queryId, uint8_t gpuContextId) { + using namespace tracy; + + auto item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuTime); + tracyMemWrite(item->gpuTime.gpuTime, (int64_t)gpuStart); + tracyMemWrite(item->gpuTime.queryId, uint16_t(queryId+0)); + tracyMemWrite(item->gpuTime.context, gpuContextId); + Profiler::QueueSerialFinish(); + + item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuTime); + tracyMemWrite(item->gpuTime.gpuTime, (int64_t)gpuEnd); + tracyMemWrite(item->gpuTime.queryId, uint16_t(queryId+1)); + tracyMemWrite(item->gpuTime.context, gpuContextId); + Profiler::QueueSerialFinish(); +} + +#define CUPTI_API_CALL(call) CUptiCallChecked(call, #call, __FILE__, __LINE__) + +#define DRIVER_API_CALL(call) cudaDriverCallChecked(call, #call, __FILE__, __LINE__) + +CUptiResult CUptiCallChecked(CUptiResult result, const char* call, const char* file, int line) noexcept { + if (result == CUPTI_SUCCESS) + return result; + const char* resultMsg = ""; + CUPTI_API_CALL(cuptiGetResultString(result, &resultMsg)); // maybe not a good idea to recurse here... + fprintf(stderr, "ERROR:\t%s:%d:\n\tfunction '%s' failed with error '%s'.\n", file, line, call, resultMsg); + //assert(result == CUPTI_SUCCESS); + return result; +} + +CUresult cudaDriverCallChecked(CUresult result, const char* call, const char* file, int line) noexcept { + if (result == CUDA_SUCCESS) + return result; + const char* resultMsg = ""; + DRIVER_API_CALL(cuGetErrorString(result, &resultMsg)); // maybe not a good idea to recurse here... 
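// Illustrative call sites (editorial sketch, not part of the vendored sources):
// the CUPTI_API_CALL / DRIVER_API_CALL wrappers above log the failing expression
// together with file and line, then hand the original status back to the caller,
// e.g.
//
//   uint64_t ts = 0;
//   CUPTI_API_CALL(cuptiGetTimestamp(&ts));      // logs on CUPTI_ERROR_*, still returns the status
//
//   CUcontext current = nullptr;
//   DRIVER_API_CALL(cuCtxGetCurrent(&current));  // same pattern for CUDA driver API calls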
+ fprintf(stderr, "ERROR:\t%s:%d:\n\tfunction '%s' failed with error '%s'.\n", file, line, call, resultMsg); + //assert(result == CUDA_SUCCESS); + return result; +} + +template +struct ConcurrentHashMap { + static constexpr bool instrument = false; + auto acquire_read_lock() { + if (m.try_lock_shared()) + return std::shared_lock(m, std::adopt_lock); + ZoneNamedC(rwlock, tracy::Color::Tomato, instrument); + return std::shared_lock(m); + } + auto acquire_write_lock() { + if (m.try_lock()) + return std::unique_lock(m, std::adopt_lock); + ZoneNamedC(wxlock, tracy::Color::Tomato, instrument); + return std::unique_lock(m); + } + std::unordered_map mapping; + std::shared_mutex m; + auto& operator[](TKey key) { + { + auto lock = acquire_read_lock(); + auto it = mapping.find(key); + if (it != mapping.end()) { + return it->second; + } + } + return emplace(key, TValue{}).first->second; + } + auto find(TKey key) { + ZoneNamed(find, instrument); + auto lock = acquire_read_lock(); + return mapping.find(key); + } + auto fetch(TKey key, TValue& value) { + ZoneNamed(fetch, instrument); + auto it = mapping.find(key); + if (it != mapping.end()) { + value = it->second; + return true; + } + return false; + } + auto end() { + ZoneNamed(end, instrument); + auto lock = acquire_read_lock(); + return mapping.end(); + } + template + auto emplace(TKey key, Args&&... args) { + ZoneNamed(emplace, instrument); + auto lock = acquire_write_lock(); + return mapping.emplace(std::forward(key), std::forward(args)...); + } + auto erase(TKey key) { + ZoneNamed(erase, instrument); + auto lock = acquire_write_lock(); + return mapping.erase(key); + } +}; + +#if TRACY_CUDA_ENABLE_CUDA_CALL_STATS +struct ProfilerStats { + static constexpr bool instrument = false; + + ConcurrentHashMap> apiCallCount; + + void update(CUpti_CallbackDomain domain, CUpti_CallbackId cbid) { + ZoneNamed(update, instrument); + uint32_t key = (domain << 24) | (cbid & 0x00'FFFFFF); + auto it = apiCallCount.find(key); + if (it == apiCallCount.end()) { + it = apiCallCount.emplace(key, 0).first; + } + it->second.fetch_add(1, std::memory_order::memory_order_relaxed); + } +}; +#endif + +// StringTable: string memoization/interning +struct StringTable { + static constexpr bool instrument = false; + + // TODO(marcos): this could be just a "ConcurrentHashSet" + ConcurrentHashMap table; + + ~StringTable() { /* TODO(marcos): free string copy */ } + + std::string_view operator[](std::string_view str) { + ZoneNamedN(lookup, "StringTable::lookup", instrument); + std::string_view memoized; + if (!table.fetch(str, memoized)) { + ZoneNamedN(lookup, "StringTable::insert", instrument); + char* copy = (char*)tracyMalloc(str.size() + 1); + strncpy(copy, str.data(), str.size()); + copy[str.size()] = '\0'; + std::string_view value (copy, str.size()); + auto [it, inserted] = table.emplace(value, value); + if (!inserted) { + // another thread inserted it while we were trying to: cleanup + tracyFree(copy); + } + memoized = it->second; + } + assert(str == memoized); + return memoized; + } +}; + +struct SourceLocationMap { + static constexpr bool instrument = false; + + // NOTE(marcos): the address of an unordered_map value may become invalid + // later on (e.g., during a rehash), so mapping to a pointer is necessary + ConcurrentHashMap locations; + + ~SourceLocationMap() { /* TODO(marcos): free SourceLocationData* entries */ } + + tracy::SourceLocationData* retrieve(std::string_view function) { + ZoneNamed(retrieve, instrument); + tracy::SourceLocationData* pSrcLoc = nullptr; + 
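// Editorial note (not part of the vendored sources): the ConcurrentHashMap,
// StringTable and SourceLocationMap helpers above exist so that CUPTI callback
// threads can share interned strings and heap-allocated SourceLocationData whose
// addresses stay stable across rehashes. Hypothetical use of the interning
// behaviour:
//
//   StringTable strings;
//   std::string_view a = strings["cuLaunchKernel"];
//   std::string_view b = strings["cuLaunchKernel"];
//   // a.data() == b.data(): the second lookup returns the copy interned by the
//   // first, so the pointer can be used as a long-lived key or srcloc name.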
locations.fetch(function, pSrcLoc); + return pSrcLoc; + } + + tracy::SourceLocationData* add(std::string_view function, std::string_view file, int line, uint32_t color=0) { + ZoneNamed(emplace, instrument); + assert(*function.end() == '\0'); + assert(*file.end() == '\0'); + void* bytes = tracyMalloc(sizeof(tracy::SourceLocationData)); + auto pSrcLoc = new(bytes)tracy::SourceLocationData{ function.data(), TracyFunction, file.data(), (uint32_t)line, color }; + auto [it, inserted] = locations.emplace(function, pSrcLoc); + if (!inserted) { + // another thread inserted it while we were trying to: cleanup + tracyFree(pSrcLoc); // POD: no destructor to call + } + assert(it->second != nullptr); + return it->second; + } +}; + +struct SourceLocationLUT { + static constexpr bool instrument = false; + + ~SourceLocationLUT() { /* no action needed: no dynamic allocation */ } + + tracy::SourceLocationData runtime [CUpti_runtime_api_trace_cbid::CUPTI_RUNTIME_TRACE_CBID_SIZE] = {}; + tracy::SourceLocationData driver [CUpti_driver_api_trace_cbid::CUPTI_DRIVER_TRACE_CBID_SIZE] = {}; + + tracy::SourceLocationData* retrieve(CUpti_CallbackDomain domain, CUpti_CallbackId cbid, CUpti_CallbackData* apiInfo) { + ZoneNamed(retrieve, instrument); + tracy::SourceLocationData* pSrcLoc = nullptr; + switch (domain) { + case CUPTI_CB_DOMAIN_RUNTIME_API : + if ((cbid > 0) && (cbid < CUPTI_RUNTIME_TRACE_CBID_SIZE)) { + pSrcLoc = &runtime[cbid]; + } + break; + case CUPTI_CB_DOMAIN_DRIVER_API : + if ((cbid > 0) && (cbid < CUPTI_DRIVER_TRACE_CBID_SIZE)) { + pSrcLoc = &driver[cbid]; + } + break; + default: + break; + } + if (pSrcLoc->name == nullptr) { + const char* function = apiInfo->functionName ? apiInfo->functionName : "cuda???"; + // cuptiGetCallbackName includes the "version suffix" of the function/cbid + //CUPTI_API_CALL(cuptiGetCallbackName(domain, cbid, &function)); + *pSrcLoc = tracy::SourceLocationData{ function, TracyFunction, TracyFile, TracyLine, 0 }; + } + return pSrcLoc; + } +}; + +uint32_t tracyTimelineId(uint32_t contextId, uint32_t streamId) { + // 0xA7C5 = 42,949 => 42,949 * 100,000 = 4,294,900,000 + // 4,294,900,000 + 65,535 = 4,294,965,535 < 4,294,967,295 (max uint32) + assert(contextId <= 0xA7C5); + assert((streamId == CUPTI_INVALID_STREAM_ID) || (streamId < 0xFFFF)); + uint32_t packed = (contextId * 100'000) + (streamId & 0x0000'FFFF); + return packed; +} + +} // unnamed/anonymous namespace + +namespace tracy +{ + class CUDACtx + { + public: + static CUDACtx* Create() { + auto& s = Singleton::Get(); + std::unique_lock lock (s.m); + if (s.ref_count == 0) { + assert(s.ctx == nullptr); + s.ctx = new CUDACtx(s.ctx_id); + s.ref_count += 1; + s.ctx_id = s.ctx->m_tracyGpuContext; + } + return s.ctx; + } + + static void Destroy(CUDACtx* ctx) { + auto& s = Singleton::Get(); + std::unique_lock lock(s.m); + assert(ctx == s.ctx); + s.ref_count -= 1; + if (s.ref_count == 0) { + delete s.ctx; + s.ctx = nullptr; + } + } + + void Collect() + { + ZoneScoped; + CUPTI::FlushActivity(); + } + + void printStats() + { + #if TRACY_CUDA_ENABLE_CUDA_CALL_STATS + fprintf(stdout, "\nCUDA API stats:\n"); + { + struct Stats { CUpti_CallbackDomain domain; CUpti_CallbackId cbid; int count; }; + std::vector sorted; + for (auto&& api : stats.apiCallCount.mapping) { + auto domain = CUpti_CallbackDomain(api.first >> 24); + auto cbid = CUpti_CallbackId(api.first & 0x00'FFFFFF); + int count = api.second; + sorted.emplace_back(Stats{ domain, cbid, count }); + } + std::sort(sorted.begin(), sorted.end(), [](const Stats& x, const Stats& y) { 
return x.count > y.count; }); + for (auto&& api : sorted) { + const char* function = ""; + CUPTI_API_CALL(cuptiGetCallbackName(api.domain, api.cbid, &function)); + printf("- %s : %d\n", function, api.count); + } + } + #endif + } + + void StartProfiling() + { + ZoneScoped; + CUPTI::BeginInstrumentation(this); + } + + void StopProfiling() + { + ZoneScoped; + CUPTI::EndInstrumentation(); + printStats(); + } + + void Name(const char *name, uint16_t len) + { + auto ptr = (char*)tracyMalloc(len); + memcpy(ptr, name, len); + + auto item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuContextName); + tracyMemWrite(item->gpuContextNameFat.context, m_tracyGpuContext); + tracyMemWrite(item->gpuContextNameFat.ptr, (uint64_t)ptr); + tracyMemWrite(item->gpuContextNameFat.size, len); + SubmitQueueItem(item); + } + + tracy_force_inline void SubmitQueueItem(tracy::QueueItem *item) + { +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + + static void QueryTimestamps(TracyTimestamp& tTracy, CUptiTimestamp& tCUpti) { + TracyTimestamp tTracy1 = tracyGetTimestamp(); + CUPTI_API_CALL(cuptiGetTimestamp(&tCUpti)); + TracyTimestamp tTracy2 = tracyGetTimestamp(); + // NOTE(marcos): giving more weight to 'tTracy2' + tTracy = (3*tTracy1 + 5*tTracy2) / 8; + } + + // NOTE(marcos): recalibration is 'static' since Tracy and CUPTI timestamps + // are "global" across all contexts; that said, each Tracy GPU context needs + // its own GpuCalibration message, but for now there's just a singleton context. + void Recalibrate() { + ZoneScoped; + // NOTE(marcos): only one thread should do the calibration, but there's + // no good reason to block threads that also trying to do the same + static std::mutex m; + if (!m.try_lock()) + return; + std::unique_lock lock (m, std::adopt_lock); + ZoneNamedNC(zone, "tracy::CUDACtx::Recalibrate[effective]", tracy::Color::Goldenrod, true); + TracyTimestamp tTracy; + CUptiTimestamp tCUpti; + QueryTimestamps(tTracy, tCUpti); + #if TRACY_CUDA_CALIBRATED_CONTEXT + static CUptiTimestamp prevCUptiTime = tCUpti; + int64_t deltaTicksCUpti = tCUpti - prevCUptiTime; + if (deltaTicksCUpti > 0) { + prevCUptiTime = tCUpti; + auto* item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuCalibration); + tracyMemWrite(item->gpuCalibration.gpuTime, (int64_t)tCUpti); + tracyMemWrite(item->gpuCalibration.cpuTime, tTracy); + tracyMemWrite(item->gpuCalibration.cpuDelta, deltaTicksCUpti); + tracyMemWrite(item->gpuCalibration.context, m_tracyGpuContext); + Profiler::QueueSerialFinish(); + } + #endif + // NOTE(marcos): update linear regression incrementally, which will refine + // the estimation of Tracy timestamps (Y) from CUpti timestamps (X) + static IncrementalRegression model; + model.addSample(double(tCUpti), double(tTracy)); + // NOTE(marcos): using orthogonal regression because the independet variable + // (X: CUpti timestamps) measurements are also imprecise + getCachedRegressionParameters() = model.orthogonal(); + } + + protected: + void EmitGpuZone(TracyTimestamp apiStart, TracyTimestamp apiEnd, + CUptiTimestamp gpuStart, CUptiTimestamp gpuEnd, + const tracy::SourceLocationData* pSrcLoc, + uint32_t cudaContextId, uint32_t cudaStreamId) { + //uint32_t timelineId = tracy::GetThreadHandle(); + uint32_t timelineId = tracyTimelineId(cudaContextId, cudaStreamId); + uint16_t queryId = m_queryIdGen.fetch_add(2); + tracyAnnounceGpuTimestamp(apiStart, apiEnd, queryId, m_tracyGpuContext, pSrcLoc, timelineId); + 
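As a side note on Recalibrate() above: the IncrementalRegression type itself is not shown in this hunk. A rough sketch of the underlying idea, refining a mapping from CUPTI timestamps (x) to Tracy timestamps (y) one sample at a time, could look like the following; it uses ordinary least squares for brevity, whereas the comment above asks for an orthogonal fit because x is noisy too, and all names are hypothetical.

// Accumulate sufficient statistics for y = a*x + b and refine the estimate
// with every new (x, y) timestamp pair.
struct RunningFit {
    double n = 0, sx = 0, sy = 0, sxx = 0, sxy = 0;

    void addSample(double x, double y) {
        n += 1; sx += x; sy += y; sxx += x * x; sxy += x * y;
    }
    bool estimate(double& a, double& b) const {
        const double denom = n * sxx - sx * sx;
        if (n < 2 || denom == 0.0) return false; // need at least two distinct x values
        a = (n * sxy - sx * sy) / denom;         // slope: CUPTI ticks -> Tracy ticks
        b = (sy - a * sx) / n;                   // intercept
        return true;
    }
};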
tracySubmitGpuTimestamp(gpuStart, gpuEnd, queryId, m_tracyGpuContext); + } + + void OnEventsProcessed() { + Recalibrate(); + } + + struct CUPTI { + static void CUPTIAPI OnBufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) + { + ZoneScoped; + // TODO(marcos): avoid malloc and instead suballocate from a large circular buffer; + // according to the CUPTI documentation: "To minimize profiling overhead the client + // should return as quickly as possible from these callbacks." + *size = 1 * 1024*1024; // 1MB + *buffer = (uint8_t*)tracyMalloc(*size); + assert(*buffer != nullptr); + FlushActivityAsync(); + } + + static void CUPTIAPI OnBufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size, size_t validSize) + { + // CUDA 6.0 onwards: all buffers from this callback are "global" buffers + // (i.e. there is no context/stream specific buffer; ctx is always NULL) + ZoneScoped; + tracy::SetThreadName("NVIDIA CUPTI Worker"); + CUptiResult status; + CUpti_Activity* record = nullptr; + while ((status = cuptiActivityGetNextRecord(buffer, validSize, &record)) == CUPTI_SUCCESS) { + DoProcessDeviceEvent(record); + } + if (status != CUPTI_ERROR_MAX_LIMIT_REACHED) { + CUptiCallChecked(status, "cuptiActivityGetNextRecord", TracyFile, TracyLine); + } + size_t dropped = 0; + CUPTI_API_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); + assert(dropped == 0); + tracyFree(buffer); + PersistentState::Get().profilerHost->OnEventsProcessed(); + } + + // correlationID -> [CPU start time, CPU end time, CUPTI start time] + using CorrelationID = uint32_t; + struct APICallInfo { TracyTimestamp start = 0, end = 0; CUptiTimestamp cupti = CUPTI_TIMESTAMP_UNKNOWN; CUDACtx* host = nullptr; }; + + static void CUPTIAPI OnCallbackAPI( + void* userdata, + CUpti_CallbackDomain domain, + CUpti_CallbackId cbid, + const void* cbdata) + { + static constexpr bool instrument = false; + + TracyTimestamp apiCallStartTime = tracyGetTimestamp(); + CUDACtx* profilerHost = (CUDACtx*)userdata; + + switch (domain) { + case CUPTI_CB_DOMAIN_RUNTIME_API: + case CUPTI_CB_DOMAIN_DRIVER_API: + break; + case CUPTI_CB_DOMAIN_RESOURCE: { + // match 'callbackId' with CUpti_CallbackIdResource + // interpret 'cbdata' as CUpti_ResourceData, + // or as CUpti_ModuleResourceData, + // or as CUpti_GraphData, + // or as CUpti_StreamAttrData, + // or as ... (what else?) + return; + } + case CUPTI_CB_DOMAIN_SYNCHRONIZE: { + // match 'callbackId' with CUpti_CallbackIdSync + // interpret 'cbdata' as CUpti_SynchronizeData + return; + } + case CUPTI_CB_DOMAIN_STATE: { + // match 'callbackId' with CUpti_CallbackIdState + // interpret 'cbdata' as CUpti_StateData + return; + } + case CUPTI_CB_DOMAIN_NVTX: { + // match 'callbackId' with CUpti_nvtx_api_trace_cbid + // interpret 'cbdata' as CUpti_NvtxData + return; + } + case CUPTI_CB_DOMAIN_FORCE_INT: + // NOTE(marcos): the "FORCE_INT" values in CUPTI enums exist only to + // force the enum to have a specific representation (signed 32bits) + case CUPTI_CB_DOMAIN_INVALID: + default: + // TODO(marcos): unexpected error! + return; + } + + // if we reached this point, then we are in the (runtime or driver) API domain + CUpti_CallbackData* apiInfo = (CUpti_CallbackData*)cbdata; + + // Emit the Tracy 'ZoneBegin' message upon entering the API call + // TODO(marcos): a RAII object could be useful here... 
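On the RAII TODO above: one possible shape for such a helper, purely hypothetical and built on the file-local tracyZoneBegin()/tracyZoneEnd()/tracyGetTimestamp() wrappers already used in this callback. Note that the ENTER and EXIT sites are two separate callback invocations, so a guard like this would only suit call sites where both ends of the zone live in a single scope.

// Hypothetical sketch, not part of the patch: pairs the zone begin/end
// wrappers so the end cannot be skipped by an early return.
struct ScopedCudaApiZone {
    explicit ScopedCudaApiZone(const tracy::SourceLocationData* srcloc) {
        tracyZoneBegin(tracyGetTimestamp(), srcloc);
    }
    ~ScopedCudaApiZone() {
        tracyZoneEnd(tracyGetTimestamp());
    }
    ScopedCudaApiZone(const ScopedCudaApiZone&) = delete;            // non-copyable
    ScopedCudaApiZone& operator=(const ScopedCudaApiZone&) = delete;
};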
+ if (apiInfo->callbackSite == CUPTI_API_ENTER) { + #if TRACY_CUDA_ENABLE_CUDA_CALL_STATS + ctx->stats.update(domain, cbid); + #endif + + auto& cudaCallSourceLocation = PersistentState::Get().cudaCallSourceLocation; + auto pSrcLoc = cudaCallSourceLocation.retrieve(domain, cbid, apiInfo); + + // HACK(marcos): the SourceLocationLUT::retrieve zone (above) should + // not be emitted before its enclosing zone (below) actually begins, + // so we delay the beginning of the enclosing zone to "unstack" them + if (SourceLocationLUT::instrument) + apiCallStartTime = tracyGetTimestamp(); + tracyZoneBegin(apiCallStartTime, pSrcLoc); + } + + if (apiInfo->callbackSite == CUPTI_API_ENTER) { + ZoneNamedN(enter, "tracy::CUDACtx::OnCUptiCallback[enter]", instrument); + // Track API calls that generate device activity: + bool trackDeviceActivity = false; + CUstream hStream = nullptr; + if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) { + #define GET_STREAM_FUNC(Params, field) [](CUpti_CallbackData* api) { return ((Params*)api->functionParams)->field; } + #define NON_STREAM_FUNC() [](CUpti_CallbackData*) { return cudaStream_t(nullptr); } + static std::unordered_map cbidRuntimeTrackers = { + // Runtime: Kernel + { CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000, GET_STREAM_FUNC(cudaLaunchKernel_v7000_params, stream) }, + { CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000, GET_STREAM_FUNC(cudaLaunchKernel_ptsz_v7000_params, stream) }, + { CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060, GET_STREAM_FUNC(cudaLaunchKernelExC_v11060_params, config->stream) }, + { CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060, GET_STREAM_FUNC(cudaLaunchKernelExC_ptsz_v11060_params, config->stream) }, + // Runtime: Memory + { CUPTI_RUNTIME_TRACE_CBID_cudaMalloc_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaFree_v3020, NON_STREAM_FUNC() }, + // Runtime: Memcpy + { CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020, GET_STREAM_FUNC(cudaMemcpyAsync_v3020_params, stream) }, + // Runtime: Memset + { CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020, GET_STREAM_FUNC(cudaMemsetAsync_v3020_params, stream) }, + // Runtime: Synchronization + { CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaEventQuery_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_v3020, NON_STREAM_FUNC() }, + { CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020, NON_STREAM_FUNC() }, + }; + #undef NON_STREAM_FUNC + #undef GET_STREAM_FUNC + auto it = cbidRuntimeTrackers.find(CUpti_runtime_api_trace_cbid(cbid)); + if (it != cbidRuntimeTrackers.end()) { + trackDeviceActivity = true; + hStream = (CUstream)it->second(apiInfo); + } + } + if (domain == CUPTI_CB_DOMAIN_DRIVER_API) { + #define GET_STREAM_FUNC(Params, field) [](CUpti_CallbackData* api) { return ((Params*)api->functionParams)->field; } + #define NON_STREAM_FUNC() [](CUpti_CallbackData*) { return CUstream(nullptr); } + static std::unordered_map cbidDriverTrackers = { + // Driver: Kernel + { CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel, GET_STREAM_FUNC(cuLaunchKernel_params, hStream) }, + { CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz, GET_STREAM_FUNC(cuLaunchKernel_ptsz_params, hStream)} , + { CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx, GET_STREAM_FUNC(cuLaunchKernelEx_params, config->hStream) }, + { 
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz, GET_STREAM_FUNC(cuLaunchKernelEx_params, config->hStream) }, + }; + #undef NON_STREAM_FUNC + #undef GET_STREAM_FUNC + auto it = cbidDriverTrackers.find(CUpti_driver_api_trace_cbid(cbid)); + if (it != cbidDriverTrackers.end()) { + trackDeviceActivity = true; + hStream = it->second(apiInfo); + } + } + if (trackDeviceActivity) { + // NOTE(marcos): we should NOT track if the stream is being captured + CUstreamCaptureStatus status = {}; + DRIVER_API_CALL(cuStreamIsCapturing(hStream, &status)); + trackDeviceActivity = !(status == CU_STREAM_CAPTURE_STATUS_ACTIVE); + } + if (trackDeviceActivity) { + CUptiTimestamp tgpu; + // TODO(marcos): do a "reverse-estimate" to obtain CUpti time from Tracy time instead? + CUPTI_API_CALL(cuptiGetTimestamp(&tgpu)); + auto& cudaCallSiteInfo = PersistentState::Get().cudaCallSiteInfo; + cudaCallSiteInfo.emplace(apiInfo->correlationId, APICallInfo{ apiCallStartTime, apiCallStartTime, tgpu, profilerHost }); + } + auto& entryFlags = *apiInfo->correlationData; + assert(entryFlags == 0); + entryFlags |= trackDeviceActivity ? 0x8000 : 0; + } + + if (apiInfo->callbackSite == CUPTI_API_EXIT) { + APICallInfo* pApiInterval = [](CUpti_CallbackData* apiInfo) { + ZoneNamedN(exit, "tracy::CUDACtx::OnCUptiCallback[exit]", instrument); + auto entryFlags = *apiInfo->correlationData; + bool trackDeviceActivity = (entryFlags & 0x8000) != 0; + if (trackDeviceActivity) { + auto& cudaCallSiteInfo = PersistentState::Get().cudaCallSiteInfo; + auto it = cudaCallSiteInfo.find(apiInfo->correlationId); + if (it != cudaCallSiteInfo.end()) { + // WARN(marcos): leaking the address of a hash-map value could spell trouble + return &it->second; + } + } + // NOTE(marcos): this can happen if the GPU activity completes + // before the CUDA function that enqueued it returns (e.g., sync) + static APICallInfo sentinel; + return &sentinel; + }(apiInfo); + pApiInterval->end = tracyGetTimestamp(); + tracyZoneEnd(pApiInterval->end); + } + } + + static bool matchActivityToAPICall(uint32_t correlationId, APICallInfo& apiCallInfo) { + static constexpr bool instrument = false; + ZoneNamed(match, instrument); + auto& cudaCallSiteInfo = PersistentState::Get().cudaCallSiteInfo; + if (!cudaCallSiteInfo.fetch(correlationId, apiCallInfo)) { + return false; + } + cudaCallSiteInfo.erase(correlationId); + assert(apiCallInfo.host != nullptr); + return true; + } + + static void matchError(uint32_t correlationId, const char* kind) { + char msg [128]; + snprintf(msg, sizeof(msg), "ERROR: device activity '%s' has no matching CUDA API call (id=%u).", kind, correlationId); + TracyMessageC(msg, strlen(msg), tracy::Color::Tomato); + } + + static std::string extractActualName(char** name){ + //If name does not start with number, return empty string + if (!isdigit(**name)) + { + return std::string(); + } + // Assuming name starts with number followed by actual name + std::string actualName; + char* currStr = *name; + int num = 0; + while (*currStr >= '0' && *currStr <= '9') + { + num = num * 10 + (*currStr - '0'); + currStr++; + } + + // Return the string start at currStr ends at num + actualName = std::string(currStr, num); + // check if actualName starts with _GLOBAL__N__ + if (actualName.rfind("_GLOBAL__N__", 0) == 0) + { + // _GLOBAL__N__ with an id stands for anonymous namespace + actualName = std::string("(anonymous_namespace)"); + } + + *name = currStr + num; + return actualName; + } + + static std::string extractActualNameNested(const char* demangledName) + { + 
ZoneNamedN(demangle, "demangle_kernel", false); + //If name does not start with _Z, return a new std::string with original name + if (demangledName[0] != '_' || demangledName[1] != 'Z') + { + return std::string(demangledName); + } + std::string actualName; + char* currStr = (char*)demangledName + 2; + + if (*currStr == 'N') + { + currStr++; + // extract actual name from nested name + std::string nestedName = extractActualName(&currStr); + actualName += nestedName; + while (1) + { + //Loop until nested name is empty + nestedName = extractActualName(&currStr); + if (nestedName.empty()) + { + break; + } + actualName += "::" + nestedName; + } + } else + { + actualName = extractActualName(&currStr); + } + return actualName; + } + + static tracy::SourceLocationData* getKernelSourceLocation(const char* kernelName) + { + auto& kernelSrcLoc = PersistentState::Get().kernelSrcLoc; + std::string_view demangledName; + #ifndef _MSC_VER + // TODO(marcos): extractActualNameNested is the main bottleneck right now; + // we need a specialized StringTable mapping from "peristent" kernel names + // (const char*/uintptr_t) to memoized, lazily initialized demangled names + auto& demangledNameTable = PersistentState::Get().demangledNameTable; + std::string demangled = extractActualNameNested(kernelName); + demangledName = demangledNameTable[demangled]; + #else + demangledName = kernelName; + #endif + auto pSrcLoc = kernelSrcLoc.retrieve(demangledName); + if (pSrcLoc == nullptr) { + pSrcLoc = kernelSrcLoc.add(demangledName, TracyFile, TracyLine); + } + return pSrcLoc; + } + + static void DoProcessDeviceEvent(CUpti_Activity *record) + { + static constexpr bool instrument = false; + ZoneNamed(activity, instrument); + + switch (record->kind) + { + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: + { + ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[kernel]", instrument); + CUpti_ActivityKernel9* kernel9 = (CUpti_ActivityKernel9*) record; + APICallInfo apiCall; + if (!matchActivityToAPICall(kernel9->correlationId, apiCall)) { + return matchError(kernel9->correlationId, "KERNEL"); + } + apiCall.host->EmitGpuZone(apiCall.start, apiCall.end, kernel9->start, kernel9->end, getKernelSourceLocation(kernel9->name), kernel9->contextId, kernel9->streamId); + auto latency_ms = (kernel9->start - apiCall.cupti) / 1'000'000.0; + tracyPlotBlip("Kernel Latency (ms)", kernel9->start, latency_ms); + break; + } + + case CUPTI_ACTIVITY_KIND_MEMCPY: + { + ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[memcpy]", instrument); + CUpti_ActivityMemcpy5* memcpy5 = (CUpti_ActivityMemcpy5*) record; + APICallInfo apiCall; + if (!matchActivityToAPICall(memcpy5->correlationId, apiCall)) { + return matchError(memcpy5->correlationId, "MEMCPY"); + } + static constexpr tracy::SourceLocationData TracyCUPTISrcLocDeviceMemcpy { "CUDA::memcpy", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Blue }; + apiCall.host->EmitGpuZone(apiCall.start, apiCall.end, memcpy5->start, memcpy5->end, &TracyCUPTISrcLocDeviceMemcpy, memcpy5->contextId, memcpy5->streamId); + static constexpr const char* graph_name = "CUDA Memory Copy"; + tracyEmitMemAlloc(graph_name, (void*)(uintptr_t)memcpy5->correlationId, memcpy5->bytes, memcpy5->start); + tracyEmitMemFree (graph_name, (void*)(uintptr_t)memcpy5->correlationId, memcpy5->end); + break; + } + + case CUPTI_ACTIVITY_KIND_MEMSET: + { + ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[memset]", instrument); + CUpti_ActivityMemset4* memset4 = (CUpti_ActivityMemset4*) record; + APICallInfo apiCall; 
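A worked example of the length-prefixed name format handled by extractActualName()/extractActualNameNested() above (Itanium-style nested names such as _ZN3foo3bar6kernelE...): each component is a decimal length followed by that many characters. The helper below is a standalone illustration of just that rule, with illustrative names; it is not the patch's parser.

#include <cctype>
#include <cstdio>
#include <string>

// "<decimal length><chars>" -> component string, e.g. "3foo" -> "foo".
static std::string takeComponent(const char*& p) {
    if (!std::isdigit((unsigned char)*p)) return {};
    int len = 0;
    while (std::isdigit((unsigned char)*p)) len = len * 10 + (*p++ - '0');
    std::string out(p, len);
    p += len;
    return out;
}

int main() {
    const char* components = "3foo3bar6kernel"; // the components of _ZN3foo3bar6kernelE...
    std::string joined;
    for (std::string c = takeComponent(components); !c.empty(); c = takeComponent(components))
        joined += joined.empty() ? c : "::" + c;
    std::printf("%s\n", joined.c_str());        // prints "foo::bar::kernel"
}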
+ if (!matchActivityToAPICall(memset4->correlationId, apiCall)) { + return matchError(memset4->correlationId, "MEMSET"); + } + static constexpr tracy::SourceLocationData TracyCUPTISrcLocDeviceMemset { "CUDA::memset", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Blue }; + apiCall.host->EmitGpuZone(apiCall.start, apiCall.end, memset4->start, memset4->end, &TracyCUPTISrcLocDeviceMemset, memset4->contextId, memset4->streamId); + static constexpr const char* graph_name = "CUDA Memory Set"; + tracyEmitMemAlloc(graph_name, (void*)(uintptr_t)memset4->correlationId, memset4->bytes, memset4->start); + tracyEmitMemFree (graph_name, (void*)(uintptr_t)memset4->correlationId, memset4->end); + break; + } + + case CUPTI_ACTIVITY_KIND_SYNCHRONIZATION: + { + ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[sync]", instrument); + CUpti_ActivitySynchronization* synchronization = (CUpti_ActivitySynchronization*) record; + APICallInfo apiCall; + if (!matchActivityToAPICall(synchronization->correlationId, apiCall)) { + return matchError(synchronization->correlationId, "SYNCHRONIZATION"); + } + // NOTE(marcos): synchronization can happen at different levels/objects: + // a. on the entire context : cuCtxSynchronize() -> timeline(ctx,0) + // b. on a specific stream : cuStreamSynchronize() -> timeline(ctx,stream) + // c. on a specific event : cuEventSynchronize() -> timeline(ctx,0xffff) + static constexpr tracy::SourceLocationData TracyCUPTISrcLocContextSynchronization { "CUDA::Context::sync", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Magenta }; + auto* pSrcLoc = &TracyCUPTISrcLocContextSynchronization; + uint32_t cudaContextId = synchronization->contextId; + uint32_t cudaStreamId = 0; + if (synchronization->streamId != CUPTI_SYNCHRONIZATION_INVALID_VALUE) { + static constexpr tracy::SourceLocationData TracyCUPTISrcLocStreamSynchronization{ "CUDA::Stream::sync", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Magenta3 }; + pSrcLoc = &TracyCUPTISrcLocStreamSynchronization; + cudaStreamId = synchronization->streamId; + } + if (synchronization->cudaEventId != CUPTI_SYNCHRONIZATION_INVALID_VALUE) { + static constexpr tracy::SourceLocationData TracyCUPTISrcLocEventSynchronization{ "CUDA::Event::sync", TracyFunction, TracyFile, (uint32_t)TracyLine, tracy::Color::Magenta4 }; + pSrcLoc = &TracyCUPTISrcLocEventSynchronization; + cudaStreamId = 0xFFFFFFFF; + // TODO(marcos): CUpti_ActivitySynchronization2 introduces a new + // field 'cudaEventSyncId' which complements 'cudaEventId' + } + apiCall.host->EmitGpuZone(apiCall.start, apiCall.end, synchronization->start, synchronization->end, pSrcLoc, cudaContextId, cudaStreamId); + static constexpr const char* graph_name = "CUDA Synchronization"; + tracyEmitMemAlloc(graph_name, (void*)(uintptr_t)synchronization->correlationId, 1, synchronization->start); + tracyEmitMemFree (graph_name, (void*)(uintptr_t)synchronization->correlationId, synchronization->end); + break; + } + case CUPTI_ACTIVITY_KIND_MEMORY2: + { + ZoneNamedN(kernel, "tracy::CUDACtx::DoProcessDeviceEvent[malloc/free]", instrument); + CUpti_ActivityMemory3* memory3 = (CUpti_ActivityMemory3*)record; + APICallInfo apiCall; + if (!matchActivityToAPICall(memory3->correlationId, apiCall)) { + return matchError(memory3->correlationId, "MEMORY"); + } + static constexpr const char* graph_name = "CUDA Memory Allocation"; + if (memory3->memoryOperationType == CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_ALLOCATION){ + auto& memAllocAddress = PersistentState::Get().memAllocAddress; + 
memAllocAddress[memory3->address] = 1; + tracyEmitMemAlloc(graph_name, (void*)memory3->address, memory3->bytes, memory3->timestamp); + } + else if (memory3->memoryOperationType == CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_RELEASE){ + auto& memAllocAddress = PersistentState::Get().memAllocAddress; + int dontCare; + if (!memAllocAddress.fetch(memory3->address, dontCare)){ + // Note(Frank): This is a hack to handle the case where the memory allocation + // corresponding to this memory release is not found. + // This can happen when the memory is allocated when profiling is not enabled. + matchError(memory3->correlationId, "MEMORY/RELEASE"); + tracyEmitMemAlloc(graph_name, (void*)memory3->address, memory3->bytes, memory3->timestamp); + } else { + memAllocAddress.erase(memory3->address); + } + tracyEmitMemFree(graph_name, (void*)memory3->address, memory3->timestamp); + } + break; + } + case CUPTI_ACTIVITY_KIND_CUDA_EVENT : + { + // NOTE(marcos): a byproduct of CUPTI_ACTIVITY_KIND_SYNCHRONIZATION + // (I think this is related to cudaEvent*() API calls) + CUpti_ActivityCudaEvent2* event = (CUpti_ActivityCudaEvent2*)record; + UNREFERENCED(event); + break; + } + default: + { + char buffer[64]; + snprintf(buffer, sizeof(buffer), "Unknown activity record (kind is %d)", record->kind); + TracyMessageC(buffer, strlen(buffer), tracy::Color::Crimson); + break; + } + } + } + + static constexpr CUpti_CallbackDomain domains[] = { + CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_CB_DOMAIN_DRIVER_API, + //CUPTI_CB_DOMAIN_RESOURCE, + //CUPTI_CB_DOMAIN_SYNCHRONIZE, + //CUPTI_CB_DOMAIN_NVTX, + //CUPTI_CB_DOMAIN_STATE + }; + + static constexpr CUpti_ActivityKind activities[] = { + //CUPTI_ACTIVITY_KIND_KERNEL, // mutually exclusive with CONCURRENT_KERNEL + CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL, + CUPTI_ACTIVITY_KIND_MEMCPY, + CUPTI_ACTIVITY_KIND_MEMSET, + CUPTI_ACTIVITY_KIND_SYNCHRONIZATION, + CUPTI_ACTIVITY_KIND_MEMORY2, + //CUPTI_ACTIVITY_KIND_MEMCPY2, + //CUPTI_ACTIVITY_KIND_OVERHEAD, + //CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API, + //CUPTI_ACTIVITY_KIND_RUNTIME, + //CUPTI_ACTIVITY_KIND_DRIVER, + }; + + static void BeginInstrumentation(CUDACtx* profilerHost) { + auto& currentProfilerHost = PersistentState::Get().profilerHost; + if (currentProfilerHost != nullptr) { + return; + } + currentProfilerHost = profilerHost; + + // NOTE(frank): full-stop synchronization to ensure we only handle + // CUDA API calls and device activities that happen past this point + cudaDeviceSynchronize(); + + auto& subscriber = PersistentState::Get().subscriber; + CUPTI_API_CALL(cuptiSubscribe(&subscriber, CUPTI::OnCallbackAPI, profilerHost)); + CUPTI_API_CALL(cuptiActivityRegisterCallbacks(CUPTI::OnBufferRequested, CUPTI::OnBufferCompleted)); + for (auto domain : domains) { + CUPTI_API_CALL(cuptiEnableDomain(uint32_t(true), subscriber, domain)); + } + for (auto activity : activities) { + CUPTI_API_CALL(cuptiActivityEnable(activity)); + } + + #if TRACY_CUDA_ENABLE_COLLECTOR_THREAD + auto& collector = PersistentState::Get().collector; + collector.period = 160; + collector.signal.notify_one(); + #endif + } + + static void EndInstrumentation() { + auto& currentProfilerHost = PersistentState::Get().profilerHost; + if (currentProfilerHost == nullptr) { + return; + } + + // NOTE(frank): full-stop synchronization to ensure we catch + // and drain all the activities that have been tracked up to now.
+ cudaDeviceSynchronize(); + + FlushActivity(); + + auto& subscriber = PersistentState::Get().subscriber; + for (auto activity : activities) { + CUPTI_API_CALL(cuptiActivityDisable(activity)); + } + for (auto domain : domains) { + CUPTI_API_CALL(cuptiEnableDomain(uint32_t(false), subscriber, domain)); + } + // TODO(marcos): is here a counterpart for 'cuptiActivityRegisterCallbacks()'? + CUPTI_API_CALL(cuptiUnsubscribe(subscriber)); + + #if TRACY_CUDA_ENABLE_COLLECTOR_THREAD + auto& collector = PersistentState::Get().collector; + collector.period = ~uint32_t(0); + collector.signal.notify_one(); + #endif + + currentProfilerHost = nullptr; + } + + static void FlushActivity() + { + // NOTE(marcos): only one thread should do the collection at any given time, + // but there's no reason to block threads that are also trying to do the same + static std::mutex m; + if (!m.try_lock()) + return; + std::unique_lock lock (m, std::adopt_lock); + ZoneNamedNC(zone, "cuptiActivityFlushAll", tracy::Color::Red4, true); + CUPTI_API_CALL(cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_NONE)); + } + + #if TRACY_CUDA_ENABLE_COLLECTOR_THREAD + // WARN(marcos): technically, CUPTI already offers async flushing of + // activity records through cuptiActivityFlushPeriod(), but I haven't + // had much luck getting reliable, consistent delivery with it... + struct Collector { + std::atomic running = true; + volatile uint32_t period = ~uint32_t(0); + std::mutex mtx; + std::condition_variable signal; + std::thread thread = std::thread( + [this]() { + tracy::SetThreadName("Tracy CUDA Collector"); + atexit([]() { + auto& collector = CUPTI::PersistentState::Get().collector; + collector.running = false; + collector.signal.notify_one(); + collector.thread.join(); + }); + while (running) { + { + std::unique_lock lock(mtx); + signal.wait_for(lock, std::chrono::milliseconds(period)); + } + FlushActivity(); + } + } + ); + }; + #endif + + static void FlushActivityAsync() + { + #if TRACY_CUDA_ENABLE_COLLECTOR_THREAD + ZoneScoped; + auto& collector = PersistentState::Get().collector; + collector.signal.notify_one(); + #endif + } + + struct PersistentState { + // NOTE(marcos): these objects must remain in memory past the application + // returning from main() because the Tracy client worker thread may still + // be responding to string/source-location requests from the server + SourceLocationMap kernelSrcLoc; + StringTable demangledNameTable; + SourceLocationLUT cudaCallSourceLocation; + + // NOTE(marcos): these objects do not need to persist, but their relative + // footprint is trivial enough that we don't care if we let them leak + ConcurrentHashMap cudaCallSiteInfo; + ConcurrentHashMap memAllocAddress; + CUpti_SubscriberHandle subscriber = {}; + CUDACtx* profilerHost = nullptr; + + Collector collector; + + static PersistentState& Get() { + static PersistentState& persistent = *(new PersistentState()); + return persistent; + } + }; + + }; + + CUDACtx(uint8_t gpuContextID = 255) + { + ZoneScoped; + + if (gpuContextID != 255) { + m_tracyGpuContext = gpuContextID; + return; + } + + m_tracyGpuContext = GetGpuCtxCounter().fetch_add(1, std::memory_order_relaxed); + assert(m_tracyGpuContext != 255); + + TracyTimestamp tTracy; + CUptiTimestamp tCUpti; + QueryTimestamps(tTracy, tCUpti); + + // Announce to Tracy about a new GPU context/timeline: + auto item = Profiler::QueueSerial(); + tracyMemWrite(item->hdr.type, QueueType::GpuNewContext); + tracyMemWrite(item->gpuNewContext.cpuTime, tTracy); + tracyMemWrite(item->gpuNewContext.gpuTime, 
(int64_t)tCUpti); // TODO: Be more careful about this cast + tracyMemWrite(item->gpuNewContext.thread, (uint32_t)0); + tracyMemWrite(item->gpuNewContext.period, 1.0f); + tracyMemWrite(item->gpuNewContext.type, GpuContextType::CUDA); + tracyMemWrite(item->gpuNewContext.context, m_tracyGpuContext); + #if TRACY_CUDA_CALIBRATED_CONTEXT + tracyMemWrite(item->gpuNewContext.flags, GpuContextCalibration); + #else + tracyMemWrite(item->gpuNewContext.flags, tracy::GpuContextFlags(0)); + #endif + Profiler::QueueSerialFinish(); + + constexpr const char* tracyCtxName = "CUDA GPU/Device Activity"; + this->Name(tracyCtxName, uint16_t(strlen(tracyCtxName))); + + // NOTE(marcos): a few rounds of calibration amortized over 1 second + // in order to get a meaningful linear regression estimator + Recalibrate(); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + Recalibrate(); + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + Recalibrate(); + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + Recalibrate(); + std::this_thread::sleep_for(std::chrono::milliseconds(400)); + Recalibrate(); + } + + ~CUDACtx() + { + ZoneScoped; + } + + struct Singleton { + CUDACtx* ctx = nullptr; + std::mutex m; + int ref_count = 0; + uint8_t ctx_id = 255; + static Singleton& Get() { + static Singleton singleton; + return singleton; + } + }; + + #if TRACY_CUDA_ENABLE_CUDA_CALL_STATS + ProfilerStats stats = {}; + #endif + + uint8_t m_tracyGpuContext = 255; + static constexpr size_t cacheline = 64; + alignas(cacheline) std::atomic m_queryIdGen = 0; + }; + +} + +#define TracyCUDAContext() tracy::CUDACtx::Create() +#define TracyCUDAContextDestroy(ctx) tracy::CUDACtx::Destroy(ctx) +#define TracyCUDAContextName(ctx, name, size) ctx->Name(name, size) + +#define TracyCUDAStartProfiling(ctx) ctx->StartProfiling() +#define TracyCUDAStopProfiling(ctx) ctx->StopProfiling() + +#define TracyCUDACollect(ctx) ctx->Collect() + +#endif + +#endif \ No newline at end of file diff --git a/libs/tracy/tracy/TracyD3D11.hpp b/libs/tracy/tracy/TracyD3D11.hpp index 3ed151b..acab383 100644 --- a/libs/tracy/tracy/TracyD3D11.hpp +++ b/libs/tracy/tracy/TracyD3D11.hpp @@ -95,6 +95,10 @@ class D3D11Ctx int64_t tcpu0 = Profiler::GetTime(); WaitForQuery(m_disjointQuery); + // NOTE: one would expect that by waiting for the enclosing disjoint query to finish, + // all timestamp queries within would also be readily available, but that does not + // seem to be the case here...
See https://github.com/wolfpld/tracy/issues/947 + WaitForQuery(m_queries[0]); int64_t tcpu1 = Profiler::GetTime(); D3D11_QUERY_DATA_TIMESTAMP_DISJOINT disjoint = { }; @@ -109,7 +113,7 @@ class D3D11Ctx UINT64 timestamp = 0; if (m_immediateDevCtx->GetData(m_queries[0], ×tamp, sizeof(timestamp), 0) != S_OK) - continue; // this should never happen, since the enclosing disjoint query succeeded + continue; // this should never happen (we waited for the query to finish above) tcpu = tcpu0 + (tcpu1 - tcpu0) * 1 / 2; tgpu = timestamp * (1000000000 / disjoint.Frequency); @@ -307,13 +311,21 @@ class D3D11ZoneScope WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast(srcloc)); } - tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, int depth, bool active ) + tracy_force_inline D3D11ZoneScope( D3D11Ctx* ctx, const SourceLocationData* srcloc, int32_t depth, bool active ) : D3D11ZoneScope(ctx, active) { if( !m_active ) return; - auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); - WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast(srcloc)); + if( depth > 0 && has_callstack() ) + { + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginCallstackSerial, reinterpret_cast(srcloc)); + } + else + { + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast(srcloc)); + } } tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, bool active) @@ -327,15 +339,23 @@ class D3D11ZoneScope WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation); } - tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool active) + tracy_force_inline D3D11ZoneScope(D3D11Ctx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int32_t depth, bool active) : D3D11ZoneScope(ctx, active) { if( !m_active ) return; const auto sourceLocation = Profiler::AllocSourceLocation(line, source, sourceSz, function, functionSz, name, nameSz); - auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); - WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation); + if ( depth > 0 && has_callstack() ) + { + auto* item = Profiler::QueueSerialCallstack(Callstack(depth)); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial, sourceLocation); + } + else + { + auto* item = Profiler::QueueSerial(); + WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation); + } } tracy_force_inline ~D3D11ZoneScope() diff --git a/libs/tracy/tracy/TracyD3D12.hpp b/libs/tracy/tracy/TracyD3D12.hpp index 4156793..d36253d 100644 --- a/libs/tracy/tracy/TracyD3D12.hpp +++ b/libs/tracy/tracy/TracyD3D12.hpp @@ -385,7 +385,7 @@ namespace tracy WriteQueueItem(item, QueueType::GpuZoneBeginSerial, reinterpret_cast(srcLocation)); } - tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, int depth, bool active) + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, ID3D12GraphicsCommandList* cmdList, const SourceLocationData* srcLocation, int32_t depth, bool active) : D3D12ZoneScope(ctx, cmdList, 
active) { if (!m_active) return; @@ -405,7 +405,7 @@ namespace tracy WriteQueueItem(item, QueueType::GpuZoneBeginAllocSrcLocSerial, sourceLocation); } - tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, int depth, bool active) + tracy_force_inline D3D12ZoneScope(D3D12QueueCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, ID3D12GraphicsCommandList* cmdList, int32_t depth, bool active) : D3D12ZoneScope(ctx, cmdList, active) { if (!m_active) return; diff --git a/libs/tracy/tracy/TracyLua.hpp b/libs/tracy/tracy/TracyLua.hpp index 51dead5..f0c5c40 100644 --- a/libs/tracy/tracy/TracyLua.hpp +++ b/libs/tracy/tracy/TracyLua.hpp @@ -120,6 +120,8 @@ static inline void LuaRemove( char* script ) } } +static inline void LuaHook( lua_State* L, lua_Debug* ar ) {} + } #else @@ -143,6 +145,13 @@ TRACY_API LuaZoneState& GetLuaZoneState(); namespace detail { +static inline void LuaShortenSrc( char* dst, const char* src ) +{ + size_t l = std::min( (size_t)255, strlen( src ) ); + memcpy( dst, src, l ); + dst[l] = 0; +} + #ifdef TRACY_HAS_CALLSTACK static tracy_force_inline void SendLuaCallstack( lua_State* L, uint32_t depth ) { @@ -188,13 +197,6 @@ static tracy_force_inline void SendLuaCallstack( lua_State* L, uint32_t depth ) TracyQueueCommit( callstackAllocFatThread ); } -static inline void LuaShortenSrc( char* dst, const char* src ) -{ - size_t l = std::min( (size_t)255, strlen( src ) ); - memcpy( dst, src, l ); - dst[l] = 0; -} - static inline int LuaZoneBeginS( lua_State* L ) { #ifdef TRACY_ON_DEMAND @@ -439,6 +441,44 @@ static inline void LuaRegister( lua_State* L ) static inline void LuaRemove( char* script ) {} +static inline void LuaHook( lua_State* L, lua_Debug* ar ) +{ + if ( ar->event == LUA_HOOKCALL ) + { +#ifdef TRACY_ON_DEMAND + const auto zoneCnt = GetLuaZoneState().counter++; + if ( zoneCnt != 0 && !GetLuaZoneState().active ) return; + GetLuaZoneState().active = GetProfiler().IsConnected(); + if ( !GetLuaZoneState().active ) return; +#endif + lua_getinfo( L, "Snl", ar ); + + char src[256]; + detail::LuaShortenSrc( src, ar->short_src ); + + const auto srcloc = Profiler::AllocSourceLocation( ar->currentline, src, ar->name ? ar->name : ar->short_src ); + TracyQueuePrepare( QueueType::ZoneBeginAllocSrcLoc ); + MemWrite( &item->zoneBegin.time, Profiler::GetTime() ); + MemWrite( &item->zoneBegin.srcloc, srcloc ); + TracyQueueCommit( zoneBeginThread ); + } + else if (ar->event == LUA_HOOKRET) { +#ifdef TRACY_ON_DEMAND + assert( GetLuaZoneState().counter != 0 ); + GetLuaZoneState().counter--; + if ( !GetLuaZoneState().active ) return; + if ( !GetProfiler().IsConnected() ) + { + GetLuaZoneState().active = false; + return; + } +#endif + TracyQueuePrepare( QueueType::ZoneEnd ); + MemWrite( &item->zoneEnd.time, Profiler::GetTime() ); + TracyQueueCommit( zoneEndThread ); + } +} + } #endif diff --git a/libs/tracy/tracy/TracyMetal.hmm b/libs/tracy/tracy/TracyMetal.hmm new file mode 100644 index 0000000..a4b4cb5 --- /dev/null +++ b/libs/tracy/tracy/TracyMetal.hmm @@ -0,0 +1,644 @@ +#ifndef __TRACYMETAL_HMM__ +#define __TRACYMETAL_HMM__ + +/* This file implements a Metal API back-end for Tracy (it has only been tested on Apple + Silicon devices, but it should also work on Intel-based Macs and older iOS devices). 
+ The Metal back-end in Tracy operates differently than other GPU back-ends like Vulkan, + Direct3D and OpenGL. Specifically, TracyMetalZone() must be placed around the site where + a command encoder is created. This is because not all hardware supports timestamps at + command granularity, and can only provide timestamps around an entire command encoder. + This accommodates for all tiers of hardware; in the future, variants of TracyMetalZone() + will be added to support the habitual command-level granularity of Tracy GPU back-ends. + Metal also imposes a few restrictions that make the process of requesting and collecting + queries more complicated in Tracy: + a) timestamp query buffers are limited to 4096 queries (32KB, where each query is 8 bytes) + b) when a timestamp query buffer is created, Metal initializes all timestamps with zeroes, + and there's no way to reset them back to zero after timestamps get resolved; the only + way to clear the timestamps is by allocating a new timestamp query buffer + c) if a command encoder records no commands and its corresponding command buffer ends up + committed to the command queue, Metal will "optimize-away" the encoder along with any + timestamp queries associated with it (the timestamp will remain as zero and will never + get resolved) + Because of the limitations above, two timestamp buffers are managed internally. Once one + of the buffers fills up with requests, the second buffer can start serving new requests. + Once all requests in a buffer get resolved and collected, the entire buffer is discarded + and a new one allocated for future requests. (Proper cycling through a ring buffer would + require bookkeeping and completion handlers to collect only the known complete queries.) + In the current implementation, there is potential for a race condition when the buffer is + discarded and reallocated. In practice, the race condition will never materialize so long + as TracyMetalCollect() is called frequently to keep the amount of unresolved queries low. + Finally, there's a timeout mechanism during timestamp collection to detect "empty" command + encoders and ensure progress. +*/ + +#ifndef TRACY_ENABLE + +#define TracyMetalContext(device) nullptr +#define TracyMetalDestroy(ctx) +#define TracyMetalContextName(ctx, name, size) + +#define TracyMetalZone(ctx, encoderDesc, name) +#define TracyMetalZoneC(ctx, encoderDesc, name, color) +#define TracyMetalNamedZone(ctx, varname, encoderDesc, name, active) +#define TracyMetalNamedZoneC(ctx, varname, encoderDesc, name, color, active) + +#define TracyMetalCollect(ctx) + +namespace tracy +{ +class MetalZoneScope {}; +} + +using TracyMetalCtx = void; + +#else + +#if not __has_feature(objc_arc) +#error TracyMetal requires ARC to be enabled. +#endif + +#include +#include +#include + +#include "Tracy.hpp" +#include "../client/TracyProfiler.hpp" +#include "../client/TracyCallstack.hpp" +#include "../common/TracyAlign.hpp" +#include "../common/TracyAlloc.hpp" + +// ok to import if in obj-c code +#import + +#define TRACY_METAL_VA_ARGS(...) , ##__VA_ARGS__ + +#define TracyMetalPanic(ret, msg, ...) 
do { \ + char buffer [1024]; \ + snprintf(buffer, sizeof(buffer), "TracyMetal: " msg TRACY_METAL_VA_ARGS(__VA_ARGS__)); \ + TracyMessageC(buffer, strlen(buffer), tracy::Color::OrangeRed); \ + fprintf(stderr, "%s\n", buffer); \ + ret; \ + } while(false); + +#ifndef TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT +#define TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT 0.200f +#endif//TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT + +#ifndef TRACY_METAL_DEBUG_MASK +#define TRACY_METAL_DEBUG_MASK (0) +#endif//TRACY_METAL_DEBUG_MASK + +#if TRACY_METAL_DEBUG_MASK + #define TracyMetalDebugMasked(mask, ...) if constexpr (mask & TRACY_METAL_DEBUG_MASK) { __VA_ARGS__; } +#else + #define TracyMetalDebugMasked(mask, ...) +#endif + +#if TRACY_METAL_DEBUG_MASK & (1 << 1) + #define TracyMetalDebug_0b00010(...) __VA_ARGS__; +#else + #define TracyMetalDebug_0b00010(...) +#endif + +#if TRACY_METAL_DEBUG_MASK & (1 << 4) + #define TracyMetalDebug_0b10000(...) __VA_ARGS__; +#else + #define TracyMetalDebug_0b10000(...) +#endif + +#ifndef TracyMetalDebugZoneScopeWireTap +#define TracyMetalDebugZoneScopeWireTap +#endif//TracyMetalDebugZoneScopeWireTap + +namespace tracy +{ + +class MetalCtx +{ + friend class MetalZoneScope; + + enum { MaxQueries = 4 * 1024 }; // Metal: between 8 and 32768 _BYTES_... + +public: + static MetalCtx* Create(id device) + { + ZoneScopedNC("tracy::MetalCtx::Create", Color::Red4); + auto ctx = static_cast(tracy_malloc(sizeof(MetalCtx))); + new (ctx) MetalCtx(device); + if (ctx->m_contextId == 255) + { + TracyMetalPanic({assert(false);} return nullptr, "ERROR: unable to create context."); + Destroy(ctx); + } + return ctx; + } + + static void Destroy(MetalCtx* ctx) + { + ZoneScopedNC("tracy::MetalCtx::Destroy", Color::Red4); + ctx->~MetalCtx(); + tracy_free(ctx); + } + + void Name( const char* name, uint16_t len ) + { + auto ptr = (char*)tracy_malloc( len ); + memcpy( ptr, name, len ); + + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuContextName ); + MemWrite( &item->gpuContextNameFat.context, m_contextId ); + MemWrite( &item->gpuContextNameFat.ptr, (uint64_t)ptr ); + MemWrite( &item->gpuContextNameFat.size, len ); + SubmitQueueItem(item); + } + + bool Collect() + { + ZoneScopedNC("tracy::MetalCtx::Collect", Color::Red4); + +#ifdef TRACY_ON_DEMAND + if (!GetProfiler().IsConnected()) + { + return true; + } +#endif + + // Only one thread is allowed to collect timestamps at any given time + // but there's no need to block contending threads + if (!m_collectionMutex.try_lock()) + { + return true; + } + + std::unique_lock lock (m_collectionMutex, std::adopt_lock); + + uintptr_t begin = m_previousCheckpoint.load(); + uintptr_t latestCheckpoint = m_queryCounter.load(); // TODO: MTLEvent? MTLFence?; + TracyMetalDebugMasked(1<<3, ZoneValue(begin)); + TracyMetalDebugMasked(1<<3, ZoneValue(latestCheckpoint)); + + uint32_t count = RingCount(begin, latestCheckpoint); + if (count == 0) // no pending timestamp queries + { + //uintptr_t nextCheckpoint = m_queryCounter.load(); + //if (nextCheckpoint != latestCheckpoint) + //{ + // // TODO: signal event / fence now? 
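To make the two-buffer rotation described in the header comment above concrete, here is a tiny standalone illustration of the index arithmetic used by the collection code that follows (MaxQueries matches the constant defined earlier; the helper names are illustrative, not part of the file):

#include <cstdint>
#include <cstdio>

constexpr uint32_t MaxQueries = 4 * 1024;

// Query ids grow monotonically; the low bits select a slot in the current
// sample buffer and (id / MaxQueries) % 2 alternates between the two buffers,
// so a fully collected buffer can be dropped and reallocated while the other
// one keeps serving new requests.
static uint32_t ringIndex(uint64_t id) { return uint32_t(id % MaxQueries); }
static uint32_t bufferIdx(uint64_t id) { return uint32_t((id / MaxQueries) % 2); }

int main() {
    for (uint64_t id : { 0ull, 4094ull, 4096ull, 8191ull, 8192ull })
        std::printf("id=%llu -> buffer %u, slot %u\n",
                    (unsigned long long)id, bufferIdx(id), ringIndex(id));
    // id=0    -> buffer 0, slot 0
    // id=4094 -> buffer 0, slot 4094
    // id=4096 -> buffer 1, slot 0
    // id=8191 -> buffer 1, slot 4095
    // id=8192 -> buffer 0, slot 0
}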
+ //} + return true; + } + + // resolve up until the ring buffer boundary and let a subsequenty call + // to Collect handle the wrap-around + bool reallocateBuffer = false; + if (RingIndex(begin) + count >= RingSize()) + { + count = RingSize() - RingIndex(begin); + reallocateBuffer = true; + } + TracyMetalDebugMasked(1<<3, ZoneValue(count)); + + auto buffer_idx = (begin / MaxQueries) % 2; + auto counterSampleBuffer = m_counterSampleBuffers[buffer_idx]; + + if (count >= RingSize()) + { + TracyMetalPanic(return false, "Collect: FULL! too many pending timestamp queries. [%llu, %llu] (%u)", begin, latestCheckpoint, count); + } + + TracyMetalDebugMasked(1<<3, TracyMetalPanic(, "Collect: [%llu, %llu] :: (%u)", begin, latestCheckpoint, count)); + + NSRange range = NSMakeRange(RingIndex(begin), count); + NSData* data = [counterSampleBuffer resolveCounterRange:range]; + NSUInteger numResolvedTimestamps = data.length / sizeof(MTLCounterResultTimestamp); + MTLCounterResultTimestamp* timestamps = (MTLCounterResultTimestamp *)(data.bytes); + if (timestamps == nil) + { + TracyMetalPanic(return false, "Collect: unable to resolve timestamps."); + } + + if (numResolvedTimestamps != count) + { + TracyMetalPanic(, "Collect: numResolvedTimestamps != count : %u != %u", (uint32_t)numResolvedTimestamps, count); + } + + int resolved = 0; + for (auto i = 0; i < numResolvedTimestamps; i += 2) + { + TracyMetalDebug_0b10000( ZoneScopedN("tracy::MetalCtx::Collect::[i]") ); + MTLTimestamp t_start = timestamps[i+0].timestamp; + MTLTimestamp t_end = timestamps[i+1].timestamp; + uint32_t k = RingIndex(begin + i); + TracyMetalDebugMasked(1<<4, TracyMetalPanic(, "Collect: timestamp[%u] = %llu | timestamp[%u] = %llu | diff = %llu\n", k, t_start, k+1, t_end, (t_end - t_start))); + if ((t_start == MTLCounterErrorValue) || (t_end == MTLCounterErrorValue)) + { + TracyMetalPanic(, "Collect: invalid timestamp (MTLCounterErrorValue) at %u.", k); + break; + } + // Metal will initialize timestamp buffer with zeroes; encountering a zero-value + // timestamp means that the timestamp has not been written and resolved yet + if ((t_start == 0) || (t_end == 0)) + { + auto checkTime = std::chrono::high_resolution_clock::now(); + auto requestTime = m_timestampRequestTime[k]; + auto ms_in_flight = std::chrono::duration(checkTime-requestTime).count()*1000.0f; + TracyMetalDebugMasked(1<<4, TracyMetalPanic(, "Collect: invalid timestamp (zero) at %u [%.0fms in flight].", k, ms_in_flight)); + const float timeout_ms = TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT * 1000.0f; + if (ms_in_flight < timeout_ms) + break; + TracyMetalDebug_0b10000( ZoneScopedN("tracy::MetalCtx::Collect::Drop") ); + TracyMetalPanic(, "Collect: giving up on timestamp at %u [%.0fms in flight].", k, ms_in_flight); + t_start = m_mostRecentTimestamp + 5; + t_end = t_start + 5; + } + TracyMetalDebugMasked(1<<2, TracyFreeN((void*)(uintptr_t)(k+0), "TracyMetalGpuZone")); + TracyMetalDebugMasked(1<<2, TracyFreeN((void*)(uintptr_t)(k+1), "TracyMetalGpuZone")); + { + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, static_cast(t_start)); + MemWrite(&item->gpuTime.queryId, static_cast(k)); + MemWrite(&item->gpuTime.context, m_contextId); + Profiler::QueueSerialFinish(); + } + { + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuTime); + MemWrite(&item->gpuTime.gpuTime, static_cast(t_end)); + MemWrite(&item->gpuTime.queryId, static_cast(k+1)); + MemWrite(&item->gpuTime.context, m_contextId); 
+ Profiler::QueueSerialFinish(); + } + m_mostRecentTimestamp = (t_end > m_mostRecentTimestamp) ? t_end : m_mostRecentTimestamp; + TracyMetalDebugMasked(1<<1, TracyFreeN((void*)(uintptr_t)k, "TracyMetalTimestampQueryId")); + resolved += 2; + } + TracyMetalDebugMasked(1<<3, ZoneValue(RingCount(begin, m_previousCheckpoint.load()))); + + m_previousCheckpoint += resolved; + + // Check whether the timestamp buffer has been fully resolved/collected: + // WARN: there's technically a race condition here: NextQuery() may reference the + // buffer that is being released instead of the new one. In practice, this should + // never happen so long as Collect is called frequently enough to prevent pending + // timestamp query requests from piling up too quickly. + if ((resolved == count) && (m_previousCheckpoint.load() % MaxQueries) == 0) + { + m_counterSampleBuffers[buffer_idx] = NewTimestampSampleBuffer(m_device, MaxQueries); + } + + //RecalibrateClocks(); // to account for drift + + return true; + } + +private: + MetalCtx(id device) + : m_device(device) + { + TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "MTLCounterErrorValue = 0x%llx", MTLCounterErrorValue)); + TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "MTLCounterDontSample = 0x%llx", MTLCounterDontSample)); + + if (m_device == nil) + { + TracyMetalPanic({assert(false);} return, "device is nil."); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary]) + { + TracyMetalPanic({assert(false);} return, "ERROR: timestamp sampling at pipeline stage boundary is not supported."); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDrawBoundary]) + { + TracyMetalDebugMasked(1<<0, fprintf(stderr, "WARNING: timestamp sampling at draw call boundary is not supported.\n")); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtBlitBoundary]) + { + TracyMetalDebugMasked(1<<0, fprintf(stderr, "WARNING: timestamp sampling at blit boundary is not supported.\n")); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtDispatchBoundary]) + { + TracyMetalDebugMasked(1<<0, fprintf(stderr, "WARNING: timestamp sampling at compute dispatch boundary is not supported.\n")); + } + if (![m_device supportsCounterSampling:MTLCounterSamplingPointAtTileDispatchBoundary]) + { + TracyMetalDebugMasked(1<<0, fprintf(stderr, "WARNING: timestamp sampling at tile dispatch boundary is not supported.\n")); + } + + m_counterSampleBuffers[0] = NewTimestampSampleBuffer(m_device, MaxQueries); + m_counterSampleBuffers[1] = NewTimestampSampleBuffer(m_device, MaxQueries); + + m_timestampRequestTime.resize(MaxQueries); + + MTLTimestamp cpuTimestamp = 0; + MTLTimestamp gpuTimestamp = 0; + [m_device sampleTimestamps:&cpuTimestamp gpuTimestamp:&gpuTimestamp]; + m_mostRecentTimestamp = gpuTimestamp; + TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Metal): %llu", cpuTimestamp)); + TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "Calibration: GPU timestamp (Metal): %llu", gpuTimestamp)); + + cpuTimestamp = Profiler::GetTime(); + TracyMetalDebugMasked(1<<0, TracyMetalPanic(, "Calibration: CPU timestamp (Tracy): %llu", cpuTimestamp)); + + float period = 1.0f; + + m_contextId = GetGpuCtxCounter().fetch_add(1); + + auto* item = Profiler::QueueSerial(); + MemWrite(&item->hdr.type, QueueType::GpuNewContext); + MemWrite(&item->gpuNewContext.cpuTime, int64_t(cpuTimestamp)); + MemWrite(&item->gpuNewContext.gpuTime, int64_t(gpuTimestamp)); + MemWrite(&item->gpuNewContext.thread, uint32_t(0)); // TODO: why not 
GetThreadHandle()? + MemWrite(&item->gpuNewContext.period, period); + MemWrite(&item->gpuNewContext.context, m_contextId); + //MemWrite(&item->gpuNewContext.flags, GpuContextCalibration); + MemWrite(&item->gpuNewContext.flags, GpuContextFlags(0)); + MemWrite(&item->gpuNewContext.type, GpuContextType::Metal); + SubmitQueueItem(item); + } + + ~MetalCtx() + { + // collect the last remnants of Metal GPU activity... + // TODO: add a timeout to this loop? + while (m_previousCheckpoint.load() != m_queryCounter.load()) + Collect(); + } + + tracy_force_inline void SubmitQueueItem(QueueItem* item) + { +#ifdef TRACY_ON_DEMAND + GetProfiler().DeferItem(*item); +#endif + Profiler::QueueSerialFinish(); + } + + tracy_force_inline uint32_t RingIndex(uintptr_t index) + { + index %= MaxQueries; + return static_cast(index); + } + + tracy_force_inline uint32_t RingCount(uintptr_t begin, uintptr_t end) + { + // wrap-around safe: all unsigned + uintptr_t count = end - begin; + return static_cast(count); + } + + tracy_force_inline uint32_t RingSize() const + { + return MaxQueries; + } + + struct Query { id buffer; uint32_t idx; }; + + tracy_force_inline Query NextQuery() + { + TracyMetalDebug_0b00010( ZoneScopedNC("Tracy::MetalCtx::NextQuery", tracy::Color::LightCoral) ); + auto id = m_queryCounter.fetch_add(2); + TracyMetalDebug_0b00010( ZoneValue(id) ); + auto count = RingCount(m_previousCheckpoint, id); + if (count >= MaxQueries) + { + // TODO: return a proper (hidden) "sentinel" query + Query sentinel = Query{ m_counterSampleBuffers[1], MaxQueries-2 }; + TracyMetalPanic( + return sentinel, + "NextQueryId: FULL! too many pending timestamp queries. Consider calling TracyMetalCollect() more frequently. [%llu, %llu] (%u)", + m_previousCheckpoint.load(), id, count + ); + } + uint32_t buffer_idx = (id / MaxQueries) % 2; + TracyMetalDebug_0b00010( ZoneValue(buffer_idx) ); + auto buffer = m_counterSampleBuffers[buffer_idx]; + if (buffer == nil) + TracyMetalPanic(, "NextQueryId: sample buffer is nil! 
(id=%llu)", id); + uint32_t idx = RingIndex(id); + TracyMetalDebug_0b00010( ZoneValue(idx) ); + TracyMetalDebug_0b00010( TracyAllocN((void*)(uintptr_t)idx, 2, "TracyMetalTimestampQueryId") ); + m_timestampRequestTime[idx] = std::chrono::high_resolution_clock::now(); + return Query{ buffer, idx }; + } + + tracy_force_inline uint8_t GetContextId() const + { + return m_contextId; + } + + static id NewTimestampSampleBuffer(id device, size_t count) + { + ZoneScopedN("tracy::MetalCtx::NewTimestampSampleBuffer"); + + id timestampCounterSet = nil; + for (id counterSet in device.counterSets) + { + if ([counterSet.name isEqualToString:MTLCommonCounterSetTimestamp]) + { + timestampCounterSet = counterSet; + break; + } + } + if (timestampCounterSet == nil) + { + TracyMetalPanic({assert(false);} return nil, "ERROR: timestamp counters are not supported on the platform."); + } + + MTLCounterSampleBufferDescriptor* sampleDescriptor = [[MTLCounterSampleBufferDescriptor alloc] init]; + sampleDescriptor.counterSet = timestampCounterSet; + sampleDescriptor.sampleCount = MaxQueries; + sampleDescriptor.storageMode = MTLStorageModeShared; + sampleDescriptor.label = @"TracyMetalTimestampPool"; + + NSError* error = nil; + id counterSampleBuffer = [device newCounterSampleBufferWithDescriptor:sampleDescriptor error:&error]; + if (error != nil) + { + //NSLog(@"%@ | %@", error.localizedDescription, error.localizedFailureReason); + TracyMetalPanic({assert(false);} return nil, + "ERROR: unable to create sample buffer for timestamp counters : %s | %s", + [error.localizedDescription cString], [error.localizedFailureReason cString]); + } + + return counterSampleBuffer; + } + + uint8_t m_contextId = 255; + + id m_device = nil; + id m_counterSampleBuffers [2] = {}; + + using atomic_counter = std::atomic; + static_assert(atomic_counter::is_always_lock_free); + atomic_counter m_queryCounter = 0; + + atomic_counter m_previousCheckpoint = 0; + MTLTimestamp m_mostRecentTimestamp = 0; + + std::vector m_timestampRequestTime; + + std::mutex m_collectionMutex; +}; + +class MetalZoneScope +{ +public: + tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLComputePassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if ( !m_active ) return; + if (desc == nil) TracyMetalPanic({assert(false);} return, "compute pass descriptor is nil."); + m_ctx = ctx; + + auto& query = m_query = ctx->NextQuery(); + + desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; + desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = query.idx+0; + desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = query.idx+1; + + SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); + } + + tracy_force_inline MetalZoneScope( MetalCtx* ctx, MTLBlitPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if ( !m_active ) return; + if (desc == nil) TracyMetalPanic({assert(false); }return, "blit pass descriptor is nil."); + m_ctx = ctx; + + auto& query = m_query = ctx->NextQuery(); + + desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; + desc.sampleBufferAttachments[0].startOfEncoderSampleIndex = query.idx+0; + desc.sampleBufferAttachments[0].endOfEncoderSampleIndex = query.idx+1; + + SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); + } + + tracy_force_inline MetalZoneScope( 
MetalCtx* ctx, MTLRenderPassDescriptor* desc, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if ( !m_active ) return; + if (desc == nil) TracyMetalPanic({assert(false);} return, "render pass descriptor is nil."); + m_ctx = ctx; + + auto& query = m_query = ctx->NextQuery(); + + desc.sampleBufferAttachments[0].sampleBuffer = query.buffer; + desc.sampleBufferAttachments[0].startOfVertexSampleIndex = query.idx+0; + desc.sampleBufferAttachments[0].endOfVertexSampleIndex = MTLCounterDontSample; + desc.sampleBufferAttachments[0].startOfFragmentSampleIndex = MTLCounterDontSample; + desc.sampleBufferAttachments[0].endOfFragmentSampleIndex = query.idx+1; + + SubmitZoneBeginGpu(ctx, query.idx + 0, srcloc); + } + + /* TODO: implement this constructor interfarce for "command-level" profiling, if the device supports it + tracy_force_inline MetalZoneScope( MetalCtx* ctx, id cmdEncoder, const SourceLocationData* srcloc, bool is_active ) +#ifdef TRACY_ON_DEMAND + : m_active( is_active && GetProfiler().IsConnected() ) +#else + : m_active( is_active ) +#endif + { + if( !m_active ) return; + m_ctx = ctx; + m_cmdEncoder = cmdEncoder; + + auto& query = m_query = ctx->NextQueryId(); + + [m_cmdEncoder sampleCountersInBuffer:m_ctx->m_counterSampleBuffer atSampleIndex:query.idx withBarrier:YES]; + + SubmitZoneBeginGpu(ctx, query.idx, srcloc); + } + */ + + tracy_force_inline ~MetalZoneScope() + { + if( !m_active ) return; + + SubmitZoneEndGpu(m_ctx, m_query.idx + 1); + } + + TracyMetalDebugZoneScopeWireTap; + +private: + const bool m_active; + + MetalCtx* m_ctx; + + /* TODO: declare it for "command-level" profiling + id m_cmdEncoder; + */ + + static void SubmitZoneBeginGpu(MetalCtx* ctx, uint32_t queryId, const SourceLocationData* srcloc) + { + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); + MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); + MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneBegin.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneBegin.context, ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + + TracyMetalDebugMasked(1<<2, TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone")); + } + + static void SubmitZoneEndGpu(MetalCtx* ctx, uint32_t queryId) + { + auto* item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneEndSerial ); + MemWrite( &item->gpuZoneEnd.cpuTime, Profiler::GetTime() ); + MemWrite( &item->gpuZoneEnd.thread, GetThreadHandle() ); + MemWrite( &item->gpuZoneEnd.queryId, uint16_t( queryId ) ); + MemWrite( &item->gpuZoneEnd.context, ctx->GetContextId() ); + Profiler::QueueSerialFinish(); + + TracyMetalDebugMasked(1<<2, TracyAllocN((void*)(uintptr_t)queryId, 1, "TracyMetalGpuZone")); + } + + MetalCtx::Query m_query = {}; +}; + +} + +using TracyMetalCtx = tracy::MetalCtx; + +#define TracyMetalContext(device) tracy::MetalCtx::Create(device) +#define TracyMetalDestroy(ctx) tracy::MetalCtx::Destroy(ctx) +#define TracyMetalContextName(ctx, name, size) ctx->Name(name, size) + +#define TracyMetalZone( ctx, encoderDesc, name ) TracyMetalNamedZone( ctx, ___tracy_gpu_zone, encoderDesc, name, true ) +#define TracyMetalZoneC( ctx, encoderDesc, name, color ) TracyMetalNamedZoneC( ctx, ___tracy_gpu_zone, encoderDesc, name, color, true ) +#define TracyMetalNamedZone( 
ctx, varname, encoderDesc, name, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::MetalZoneScope varname( ctx, encoderDesc, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); +#define TracyMetalNamedZoneC( ctx, varname, encoderDesc, name, color, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, color }; tracy::MetalZoneScope varname( ctx, encoderDesc, &TracyConcat(__tracy_gpu_source_location,TracyLine), active ); + +#define TracyMetalCollect( ctx ) ctx->Collect(); + + + +#undef TracyMetalDebug_ZoneScopeWireTap +#undef TracyMetalDebug_0b00010 +#undef TracyMetalDebug_0b10000 +#undef TracyMetalDebugMasked +#undef TRACY_METAL_DEBUG_MASK +#undef TRACY_METAL_TIMESTAMP_COLLECT_TIMEOUT +#undef TracyMetalPanic +#undef TRACY_METAL_VA_ARGS + +#endif + +#endif//__TRACYMETAL_HMM__ diff --git a/libs/tracy/tracy/TracyOpenCL.hpp b/libs/tracy/tracy/TracyOpenCL.hpp index 20d0a7c..ede5c46 100644 --- a/libs/tracy/tracy/TracyOpenCL.hpp +++ b/libs/tracy/tracy/TracyOpenCL.hpp @@ -255,7 +255,7 @@ namespace tracy { Profiler::QueueSerialFinish(); } - tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, int depth, bool is_active) + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, const SourceLocationData* srcLoc, int32_t depth, bool is_active) #ifdef TRACY_ON_DEMAND : m_active(is_active&& GetProfiler().IsConnected()) #else @@ -304,7 +304,7 @@ namespace tracy { Profiler::QueueSerialFinish(); } - tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool is_active) + tracy_force_inline OpenCLCtxScope(OpenCLCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int32_t depth, bool is_active) #ifdef TRACY_ON_DEMAND : m_active(is_active && GetProfiler().IsConnected()) #else diff --git a/libs/tracy/tracy/TracyOpenGL.hpp b/libs/tracy/tracy/TracyOpenGL.hpp index 3bdadcc..30abd4f 100644 --- a/libs/tracy/tracy/TracyOpenGL.hpp +++ b/libs/tracy/tracy/TracyOpenGL.hpp @@ -25,7 +25,7 @@ class GpuCtxScope { public: GpuCtxScope( const SourceLocationData*, bool ) {} - GpuCtxScope( const SourceLocationData*, int, bool ) {} + GpuCtxScope( const SourceLocationData*, int32_t, bool ) {} }; } @@ -222,7 +222,7 @@ class GpuCtxScope TracyLfqCommit; } - tracy_force_inline GpuCtxScope( const SourceLocationData* srcloc, int depth, bool is_active ) + tracy_force_inline GpuCtxScope( const SourceLocationData* srcloc, int32_t depth, bool is_active ) #ifdef TRACY_ON_DEMAND : m_active( is_active && GetProfiler().IsConnected() ) #else @@ -271,7 +271,7 @@ class GpuCtxScope TracyLfqCommit; } - tracy_force_inline GpuCtxScope( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int depth, bool is_active ) + tracy_force_inline GpuCtxScope( uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, int32_t depth, bool is_active ) #ifdef TRACY_ON_DEMAND : m_active( is_active && GetProfiler().IsConnected() ) #else diff --git a/libs/tracy/tracy/TracyVulkan.hpp b/libs/tracy/tracy/TracyVulkan.hpp index c34b718..429f299 100644 --- 
a/libs/tracy/tracy/TracyVulkan.hpp +++ b/libs/tracy/tracy/TracyVulkan.hpp @@ -16,6 +16,7 @@ #define TracyVkZoneC(c,x,y,z) #define TracyVkZoneTransient(c,x,y,z,w) #define TracyVkCollect(c,x) +#define TracyVkCollectHost(c) #define TracyVkNamedZoneS(c,x,y,z,w,a) #define TracyVkNamedZoneCS(c,x,y,z,w,v,a) @@ -256,7 +257,9 @@ class VkCtx #ifdef TRACY_ON_DEMAND if( !GetProfiler().IsConnected() ) { - VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) ); + cmdbuf ? + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, 0, m_queryCount ) ) : + VK_FUNCTION_WRAPPER( vkResetQueryPool( m_device, m_query, 0, m_queryCount ) ); m_tail = head; m_oldCnt = 0; int64_t tgpu; @@ -265,7 +268,7 @@ class VkCtx } #endif assert( head > m_tail ); - + const unsigned int wrappedTail = (unsigned int)( m_tail % m_queryCount ); unsigned int cnt; @@ -325,7 +328,9 @@ class VkCtx } } - VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, wrappedTail, cnt ) ); + cmdbuf ? + VK_FUNCTION_WRAPPER( vkCmdResetQueryPool( cmdbuf, m_query, wrappedTail, cnt ) ) : + VK_FUNCTION_WRAPPER( vkResetQueryPool( m_device, m_query, wrappedTail, cnt ) ); m_tail += cnt; } @@ -531,7 +536,7 @@ class VkCtxScope Profiler::QueueSerialFinish(); } - tracy_force_inline VkCtxScope( VkCtx* ctx, const SourceLocationData* srcloc, VkCommandBuffer cmdbuf, int depth, bool is_active ) + tracy_force_inline VkCtxScope( VkCtx* ctx, const SourceLocationData* srcloc, VkCommandBuffer cmdbuf, int32_t depth, bool is_active ) #ifdef TRACY_ON_DEMAND : m_active( is_active && GetProfiler().IsConnected() ) #else @@ -545,8 +550,17 @@ class VkCtxScope const auto queryId = ctx->NextQueryId(); CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); - auto item = Profiler::QueueSerialCallstack( Callstack( depth ) ); - MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial ); + QueueItem *item; + if( depth > 0 && has_callstack() ) + { + item = Profiler::QueueSerialCallstack( Callstack( depth ) ); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginCallstackSerial ); + } + else + { + item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginSerial ); + } MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); MemWrite( &item->gpuZoneBegin.srcloc, (uint64_t)srcloc ); MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); @@ -580,7 +594,7 @@ class VkCtxScope Profiler::QueueSerialFinish(); } - tracy_force_inline VkCtxScope( VkCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, VkCommandBuffer cmdbuf, int depth, bool is_active ) + tracy_force_inline VkCtxScope( VkCtx* ctx, uint32_t line, const char* source, size_t sourceSz, const char* function, size_t functionSz, const char* name, size_t nameSz, VkCommandBuffer cmdbuf, int32_t depth, bool is_active ) #ifdef TRACY_ON_DEMAND : m_active( is_active && GetProfiler().IsConnected() ) #else @@ -595,8 +609,17 @@ class VkCtxScope CONTEXT_VK_FUNCTION_WRAPPER( vkCmdWriteTimestamp( cmdbuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, ctx->m_query, queryId ) ); const auto srcloc = Profiler::AllocSourceLocation( line, source, sourceSz, function, functionSz, name, nameSz ); - auto item = Profiler::QueueSerialCallstack( Callstack( depth ) ); - MemWrite( &item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial ); + QueueItem *item; + if( depth > 0 && has_callstack() ) + { + item = Profiler::QueueSerialCallstack( 
Callstack( depth ) ); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocCallstackSerial ); + } + else + { + item = Profiler::QueueSerial(); + MemWrite( &item->hdr.type, QueueType::GpuZoneBeginAllocSrcLocSerial ); + } MemWrite( &item->gpuZoneBegin.cpuTime, Profiler::GetTime() ); MemWrite( &item->gpuZoneBegin.srcloc, srcloc ); MemWrite( &item->gpuZoneBegin.thread, GetThreadHandle() ); @@ -703,6 +726,7 @@ using TracyVkCtx = tracy::VkCtx*; # define TracyVkZoneTransient( ctx, varname, cmdbuf, name, active ) tracy::VkCtxScope varname( ctx, TracyLine, TracyFile, strlen( TracyFile ), TracyFunction, strlen( TracyFunction ), name, strlen( name ), cmdbuf, active ); #endif #define TracyVkCollect( ctx, cmdbuf ) ctx->Collect( cmdbuf ); +#define TracyVkCollectHost( ctx ) ctx->Collect( VK_NULL_HANDLE ); #ifdef TRACY_HAS_CALLSTACK # define TracyVkNamedZoneS( ctx, varname, cmdbuf, name, depth, active ) static constexpr tracy::SourceLocationData TracyConcat(__tracy_gpu_source_location,TracyLine) { name, TracyFunction, TracyFile, (uint32_t)TracyLine, 0 }; tracy::VkCtxScope varname( ctx, &TracyConcat(__tracy_gpu_source_location,TracyLine), cmdbuf, depth, active ); diff --git a/src/impl.zig b/src/impl.zig new file mode 100644 index 0000000..810fc6a --- /dev/null +++ b/src/impl.zig @@ -0,0 +1,395 @@ +const std = @import("std"); +const Src = std.builtin.SourceLocation; + +const c = @import("c"); +const has_callstack_support = @hasDecl(c, "TRACY_HAS_CALLSTACK") and @hasDecl(c, "TRACY_CALLSTACK"); +const callstack_depth: c_int = if (has_callstack_support) c.TRACY_CALLSTACK else 0; + +const debug_verify_stack_order = false; +threadlocal var stack_depth: if (debug_verify_stack_order) usize else u0 = 0; + +pub const ZoneCtx = struct { + _zone: c.___tracy_c_zone_context, + _token: if (debug_verify_stack_order) usize else void, + + pub inline fn Text(self: ZoneCtx, text: []const u8) void { + if (debug_verify_stack_order) { + if (stack_depth != self._token) { + std.debug.panic( + "Error: expected Value() at stack depth {} but was {}\n", + .{ self._token, stack_depth }, + ); + } + } + c.___tracy_emit_zone_text(self._zone, text.ptr, text.len); + } + pub inline fn Name(self: ZoneCtx, name: []const u8) void { + if (debug_verify_stack_order) { + if (stack_depth != self._token) { + std.debug.panic( + "Error: expected Value() at stack depth {} but was {}\n", + .{ self._token, stack_depth }, + ); + } + } + c.___tracy_emit_zone_name(self._zone, name.ptr, name.len); + } + pub inline fn Value(self: ZoneCtx, val: u64) void { + if (debug_verify_stack_order) { + if (stack_depth != self._token) { + std.debug.panic( + "Error: expected Value() at stack depth {} but was {}\n", + .{ self._token, stack_depth }, + ); + } + } + c.___tracy_emit_zone_value(self._zone, val); + } + pub inline fn End(self: ZoneCtx) void { + if (debug_verify_stack_order) { + if (stack_depth != self._token) { + std.debug.panic( + "Error: expected End() at stack depth {} but was {}\n", + .{ self._token, stack_depth }, + ); + } + stack_depth -= 1; + } + c.___tracy_emit_zone_end(self._zone); + } +}; + +inline fn initZone(comptime src: Src, name: ?[*:0]const u8, color: u32, depth: c_int) ZoneCtx { + // Tracy uses pointer identity to identify contexts. + // The `src` parameter being comptime ensures that + // each zone gets its own unique global location for this + // struct. + const static = struct { + var loc: c.___tracy_source_location_data = undefined; + + // Ensure that a unique struct type is generated for each unique `src`. 
See + // https://github.com/ziglang/zig/issues/18816 + comptime { + // https://github.com/ziglang/zig/issues/19274 + _ = @sizeOf(@TypeOf(src)); + } + }; + static.loc = .{ + .name = name, + .function = src.fn_name.ptr, + .file = src.file.ptr, + .line = src.line, + .color = color, + }; + + const zone = if (has_callstack_support) + c.___tracy_emit_zone_begin_callstack(&static.loc, depth, 1) + else + c.___tracy_emit_zone_begin(&static.loc, 1); + + if (debug_verify_stack_order) { + stack_depth += 1; + return ZoneCtx{ ._zone = zone, ._token = stack_depth }; + } else { + return ZoneCtx{ ._zone = zone, ._token = {} }; + } +} + +pub inline fn SetThreadName(name: [*:0]const u8) void { + c.___tracy_set_thread_name(name); +} + +pub inline fn Zone(comptime src: Src) ZoneCtx { + return initZone(src, null, 0, callstack_depth); +} +pub inline fn ZoneN(comptime src: Src, name: [*:0]const u8) ZoneCtx { + return initZone(src, name, 0, callstack_depth); +} +pub inline fn ZoneC(comptime src: Src, color: u32) ZoneCtx { + return initZone(src, null, color, callstack_depth); +} +pub inline fn ZoneNC(comptime src: Src, name: [*:0]const u8, color: u32) ZoneCtx { + return initZone(src, name, color, callstack_depth); +} +pub inline fn ZoneS(comptime src: Src, depth: i32) ZoneCtx { + return initZone(src, null, 0, depth); +} +pub inline fn ZoneNS(comptime src: Src, name: [*:0]const u8, depth: i32) ZoneCtx { + return initZone(src, name, 0, depth); +} +pub inline fn ZoneCS(comptime src: Src, color: u32, depth: i32) ZoneCtx { + return initZone(src, null, color, depth); +} +pub inline fn ZoneNCS(comptime src: Src, name: [*:0]const u8, color: u32, depth: i32) ZoneCtx { + return initZone(src, name, color, depth); +} + +pub inline fn Alloc(ptr: ?*const anyopaque, size: usize) void { + if (has_callstack_support) { + c.___tracy_emit_memory_alloc_callstack(ptr, size, callstack_depth, 0); + } else { + c.___tracy_emit_memory_alloc(ptr, size, 0); + } +} +pub inline fn Free(ptr: ?*const anyopaque) void { + if (has_callstack_support) { + c.___tracy_emit_memory_free_callstack(ptr, callstack_depth, 0); + } else { + c.___tracy_emit_memory_free(ptr, 0); + } +} +pub inline fn SecureAlloc(ptr: ?*const anyopaque, size: usize) void { + if (has_callstack_support) { + c.___tracy_emit_memory_alloc_callstack(ptr, size, callstack_depth, 1); + } else { + c.___tracy_emit_memory_alloc(ptr, size, 1); + } +} +pub inline fn SecureFree(ptr: ?*const anyopaque) void { + if (has_callstack_support) { + c.___tracy_emit_memory_free_callstack(ptr, callstack_depth, 1); + } else { + c.___tracy_emit_memory_free(ptr, 1); + } +} +pub inline fn AllocS(ptr: ?*const anyopaque, size: usize, depth: c_int) void { + if (has_callstack_support) { + c.___tracy_emit_memory_alloc_callstack(ptr, size, depth, 0); + } else { + c.___tracy_emit_memory_alloc(ptr, size, 0); + } +} +pub inline fn FreeS(ptr: ?*const anyopaque, depth: c_int) void { + if (has_callstack_support) { + c.___tracy_emit_memory_free_callstack(ptr, depth, 0); + } else { + c.___tracy_emit_memory_free(ptr, 0); + } +} +pub inline fn SecureAllocS(ptr: ?*const anyopaque, size: usize, depth: c_int) void { + if (has_callstack_support) { + c.___tracy_emit_memory_alloc_callstack(ptr, size, depth, 1); + } else { + c.___tracy_emit_memory_alloc(ptr, size, 1); + } +} +pub inline fn SecureFreeS(ptr: ?*const anyopaque, depth: c_int) void { + if (has_callstack_support) { + c.___tracy_emit_memory_free_callstack(ptr, depth, 1); + } else { + c.___tracy_emit_memory_free(ptr, 1); + } +} + +pub inline fn AllocN(ptr: ?*const anyopaque, 
size: usize, name: [*:0]const u8) void { + if (has_callstack_support) { + c.___tracy_emit_memory_alloc_callstack_named(ptr, size, callstack_depth, 0, name); + } else { + c.___tracy_emit_memory_alloc_named(ptr, size, 0, name); + } +} +pub inline fn FreeN(ptr: ?*const anyopaque, name: [*:0]const u8) void { + if (has_callstack_support) { + c.___tracy_emit_memory_free_callstack_named(ptr, callstack_depth, 0, name); + } else { + c.___tracy_emit_memory_free_named(ptr, 0, name); + } +} +pub inline fn SecureAllocN(ptr: ?*const anyopaque, size: usize, name: [*:0]const u8) void { + if (has_callstack_support) { + c.___tracy_emit_memory_alloc_callstack_named(ptr, size, callstack_depth, 1, name); + } else { + c.___tracy_emit_memory_alloc_named(ptr, size, 1, name); + } +} +pub inline fn SecureFreeN(ptr: ?*const anyopaque, name: [*:0]const u8) void { + if (has_callstack_support) { + c.___tracy_emit_memory_free_callstack_named(ptr, callstack_depth, 1, name); + } else { + c.___tracy_emit_memory_free_named(ptr, 1, name); + } +} +pub inline fn AllocNS(ptr: ?*const anyopaque, size: usize, depth: c_int, name: [*:0]const u8) void { + if (has_callstack_support) { + c.___tracy_emit_memory_alloc_callstack_named(ptr, size, depth, 0, name); + } else { + c.___tracy_emit_memory_alloc_named(ptr, size, 0, name); + } +} +pub inline fn FreeNS(ptr: ?*const anyopaque, depth: c_int, name: [*:0]const u8) void { + if (has_callstack_support) { + c.___tracy_emit_memory_free_callstack_named(ptr, depth, 0, name); + } else { + c.___tracy_emit_memory_free_named(ptr, 0, name); + } +} +pub inline fn SecureAllocNS(ptr: ?*const anyopaque, size: usize, depth: c_int, name: [*:0]const u8) void { + if (has_callstack_support) { + c.___tracy_emit_memory_alloc_callstack_named(ptr, size, depth, 1, name); + } else { + c.___tracy_emit_memory_alloc_named(ptr, size, 1, name); + } +} +pub inline fn SecureFreeNS(ptr: ?*const anyopaque, depth: c_int, name: [*:0]const u8) void { + if (has_callstack_support) { + c.___tracy_emit_memory_free_callstack_named(ptr, depth, 1, name); + } else { + c.___tracy_emit_memory_free_named(ptr, 1, name); + } +} + +pub inline fn Message(text: []const u8) void { + c.___tracy_emit_message(text.ptr, text.len, callstack_depth); +} +pub inline fn MessageL(text: [*:0]const u8, color: u32) void { + c.___tracy_emit_messageL(text, color, callstack_depth); +} +pub inline fn MessageC(text: []const u8, color: u32) void { + c.___tracy_emit_messageC(text.ptr, text.len, color, callstack_depth); +} +pub inline fn MessageLC(text: [*:0]const u8, color: u32) void { + c.___tracy_emit_messageLC(text, color, callstack_depth); +} +pub inline fn MessageS(text: []const u8, depth: c_int) void { + const inner_depth: c_int = if (has_callstack_support) depth else 0; + c.___tracy_emit_message(text.ptr, text.len, inner_depth); +} +pub inline fn MessageLS(text: [*:0]const u8, depth: c_int) void { + const inner_depth: c_int = if (has_callstack_support) depth else 0; + c.___tracy_emit_messageL(text, inner_depth); +} +pub inline fn MessageCS(text: []const u8, color: u32, depth: c_int) void { + const inner_depth: c_int = if (has_callstack_support) depth else 0; + c.___tracy_emit_messageC(text.ptr, text.len, color, inner_depth); +} +pub inline fn MessageLCS(text: [*:0]const u8, color: u32, depth: c_int) void { + const inner_depth: c_int = if (has_callstack_support) depth else 0; + c.___tracy_emit_messageLC(text, color, inner_depth); +} + +pub inline fn FrameMark() void { + c.___tracy_emit_frame_mark(null); +} +pub inline fn FrameMarkNamed(name: [*:0]const 
u8) void { + c.___tracy_emit_frame_mark(name); +} +pub inline fn FrameMarkStart(name: [*:0]const u8) void { + c.___tracy_emit_frame_mark_start(name); +} +pub inline fn FrameMarkEnd(name: [*:0]const u8) void { + c.___tracy_emit_frame_mark_end(name); +} +pub inline fn FrameImage(image: ?*const anyopaque, width: u16, height: u16, offset: u8, flip: c_int) void { + c.___tracy_emit_frame_image(image, width, height, offset, flip); +} + +pub inline fn FiberEnter(name: [*:0]const u8) void { + c.___tracy_fiber_enter(name); +} +pub inline fn FiberLeave() void { + c.___tracy_fiber_leave(); +} + +pub inline fn PlotF(name: [*:0]const u8, val: f64) void { + c.___tracy_emit_plot(name, val); +} +pub inline fn PlotU(name: [*:0]const u8, val: u64) void { + c.___tracy_emit_plot(name, @as(f64, @floatFromInt(val))); +} +pub inline fn PlotI(name: [*:0]const u8, val: i64) void { + c.___tracy_emit_plot(name, @as(f64, @floatFromInt(val))); +} +pub inline fn AppInfo(text: []const u8) void { + c.___tracy_emit_message_appinfo(text.ptr, text.len); +} + +pub const TracyAllocator = struct { + child_allocator: std.mem.Allocator, + + pub fn init(child_allocator: std.mem.Allocator) TracyAllocator { + return .{ + .child_allocator = child_allocator, + }; + } + + pub fn allocator(self: *TracyAllocator) std.mem.Allocator { + return .{ + .ptr = self, + .vtable = &.{ + .alloc = alloc, + .resize = resize, + .remap = remap, + .free = free, + }, + }; + } + + fn alloc( + ctx: *anyopaque, + len: usize, + alignment: std.mem.Alignment, + ra: usize, + ) ?[*]u8 { + const self: *TracyAllocator = @ptrCast(@alignCast(ctx)); + const result = self.child_allocator.rawAlloc(len, alignment, ra); + if (result) |addr| { + Alloc(addr, len); + } else { + var buffer: [128]u8 = undefined; + const msg = std.fmt.bufPrint(&buffer, "alloc failed requesting {d}", .{len}) catch return result; + Message(msg); + } + return result; + } + + fn resize( + ctx: *anyopaque, + buf: []u8, + alignment: std.mem.Alignment, + new_len: usize, + ra: usize, + ) bool { + const self: *TracyAllocator = @ptrCast(@alignCast(ctx)); + const result = self.child_allocator.rawResize(buf, alignment, new_len, ra); + if (result) { + Free(buf.ptr); + Alloc(buf.ptr, new_len); + } else { + var buffer: [128]u8 = undefined; + const msg = std.fmt.bufPrint(&buffer, "resize failed requesting {d} -> {d}", .{ buf.len, new_len }) catch return result; + Message(msg); + } + return result; + } + + fn remap( + ctx: *anyopaque, + buf: []u8, + alignment: std.mem.Alignment, + new_len: usize, + ra: usize, + ) ?[*]u8 { + const self: *TracyAllocator = @ptrCast(@alignCast(ctx)); + const result = self.child_allocator.rawRemap(buf, alignment, new_len, ra); + if (result) |data| { + Free(buf.ptr); + Alloc(data, new_len); + } else { + var buffer: [128]u8 = undefined; + const msg = std.fmt.bufPrint(&buffer, "remap failed requesting {d} -> {d}", .{ buf.len, new_len }) catch return result; + Message(msg); + } + return result; + } + + fn free( + ctx: *anyopaque, + buf: []u8, + alignment: std.mem.Alignment, + ra: usize, + ) void { + const self: *TracyAllocator = @ptrCast(@alignCast(ctx)); + self.child_allocator.rawFree(buf, alignment, ra); + Free(buf.ptr); + } +}; diff --git a/src/stub.zig b/src/stub.zig new file mode 100644 index 0000000..7beb8db --- /dev/null +++ b/src/stub.zig @@ -0,0 +1,229 @@ +const std = @import("std"); +const Src = std.builtin.SourceLocation; + +pub const ZoneCtx = struct { + pub inline fn Text(self: ZoneCtx, text: []const u8) void { + _ = self; + _ = text; + } + pub inline fn Name(self: 
ZoneCtx, name: []const u8) void { + _ = self; + _ = name; + } + pub inline fn Value(self: ZoneCtx, value: u64) void { + _ = self; + _ = value; + } + pub inline fn End(self: ZoneCtx) void { + _ = self; + } +}; + +pub inline fn SetThreadName(name: [*:0]const u8) void { + _ = name; +} + +pub inline fn Zone(comptime src: Src) ZoneCtx { + _ = src; + return .{}; +} +pub inline fn ZoneN(comptime src: Src, name: [*:0]const u8) ZoneCtx { + _ = src; + _ = name; + return .{}; +} +pub inline fn ZoneC(comptime src: Src, color: u32) ZoneCtx { + _ = src; + _ = color; + return .{}; +} +pub inline fn ZoneNC(comptime src: Src, name: [*:0]const u8, color: u32) ZoneCtx { + _ = src; + _ = name; + _ = color; + return .{}; +} +pub inline fn ZoneS(comptime src: Src, depth: i32) ZoneCtx { + _ = src; + _ = depth; + return .{}; +} +pub inline fn ZoneNS(comptime src: Src, name: [*:0]const u8, depth: i32) ZoneCtx { + _ = src; + _ = name; + _ = depth; + return .{}; +} +pub inline fn ZoneCS(comptime src: Src, color: u32, depth: i32) ZoneCtx { + _ = src; + _ = color; + _ = depth; + return .{}; +} +pub inline fn ZoneNCS(comptime src: Src, name: [*:0]const u8, color: u32, depth: i32) ZoneCtx { + _ = src; + _ = name; + _ = color; + _ = depth; + return .{}; +} + +pub inline fn Alloc(ptr: ?*const anyopaque, size: usize) void { + _ = ptr; + _ = size; +} +pub inline fn Free(ptr: ?*const anyopaque) void { + _ = ptr; +} +pub inline fn SecureAlloc(ptr: ?*const anyopaque, size: usize) void { + _ = ptr; + _ = size; +} +pub inline fn SecureFree(ptr: ?*const anyopaque) void { + _ = ptr; +} +pub inline fn AllocS(ptr: ?*const anyopaque, size: usize, depth: c_int) void { + _ = ptr; + _ = size; + _ = depth; +} +pub inline fn FreeS(ptr: ?*const anyopaque, depth: c_int) void { + _ = ptr; + _ = depth; +} +pub inline fn SecureAllocS(ptr: ?*const anyopaque, size: usize, depth: c_int) void { + _ = ptr; + _ = size; + _ = depth; +} +pub inline fn SecureFreeS(ptr: ?*const anyopaque, depth: c_int) void { + _ = ptr; + _ = depth; +} + +pub inline fn AllocN(ptr: ?*const anyopaque, size: usize, name: [*:0]const u8) void { + _ = ptr; + _ = size; + _ = name; +} +pub inline fn FreeN(ptr: ?*const anyopaque, name: [*:0]const u8) void { + _ = ptr; + _ = name; +} +pub inline fn SecureAllocN(ptr: ?*const anyopaque, size: usize, name: [*:0]const u8) void { + _ = ptr; + _ = size; + _ = name; +} +pub inline fn SecureFreeN(ptr: ?*const anyopaque, name: [*:0]const u8) void { + _ = ptr; + _ = name; +} +pub inline fn AllocNS(ptr: ?*const anyopaque, size: usize, depth: c_int, name: [*:0]const u8) void { + _ = ptr; + _ = size; + _ = depth; + _ = name; +} +pub inline fn FreeNS(ptr: ?*const anyopaque, depth: c_int, name: [*:0]const u8) void { + _ = ptr; + _ = depth; + _ = name; +} +pub inline fn SecureAllocNS(ptr: ?*const anyopaque, size: usize, depth: c_int, name: [*:0]const u8) void { + _ = ptr; + _ = size; + _ = depth; + _ = name; +} +pub inline fn SecureFreeNS(ptr: ?*const anyopaque, depth: c_int, name: [*:0]const u8) void { + _ = ptr; + _ = depth; + _ = name; +} + +pub inline fn Message(text: []const u8) void { + _ = text; +} +pub inline fn MessageL(text: [*:0]const u8) void { + _ = text; +} +pub inline fn MessageC(text: []const u8, color: u32) void { + _ = text; + _ = color; +} +pub inline fn MessageLC(text: [*:0]const u8, color: u32) void { + _ = text; + _ = color; +} +pub inline fn MessageS(text: []const u8, depth: c_int) void { + _ = text; + _ = depth; +} +pub inline fn MessageLS(text: [*:0]const u8, depth: c_int) void { + _ = text; + _ = depth; +} +pub inline 
fn MessageCS(text: []const u8, color: u32, depth: c_int) void { + _ = text; + _ = color; + _ = depth; +} +pub inline fn MessageLCS(text: [*:0]const u8, color: u32, depth: c_int) void { + _ = text; + _ = color; + _ = depth; +} + +pub inline fn FrameMark() void {} +pub inline fn FrameMarkNamed(name: [*:0]const u8) void { + _ = name; +} +pub inline fn FrameMarkStart(name: [*:0]const u8) void { + _ = name; +} +pub inline fn FrameMarkEnd(name: [*:0]const u8) void { + _ = name; +} +pub inline fn FrameImage(image: ?*const anyopaque, width: u16, height: u16, offset: u8, flip: c_int) void { + _ = image; + _ = width; + _ = height; + _ = offset; + _ = flip; +} + +pub inline fn FiberEnter(name: [*:0]const u8) void { + _ = name; +} +pub inline fn FiberLeave() void {} + +pub inline fn PlotF(name: [*:0]const u8, val: f64) void { + _ = name; + _ = val; +} +pub inline fn PlotU(name: [*:0]const u8, val: u64) void { + _ = name; + _ = val; +} +pub inline fn PlotI(name: [*:0]const u8, val: i64) void { + _ = name; + _ = val; +} +pub inline fn AppInfo(text: []const u8) void { + _ = text; +} + +pub const TracyAllocator = struct { + child_allocator: std.mem.Allocator, + + pub fn init(child_allocator: std.mem.Allocator) TracyAllocator { + return .{ + .child_allocator = child_allocator, + }; + } + + pub fn allocator(self: *TracyAllocator) std.mem.Allocator { + return self.child_allocator; + } +}; diff --git a/src/ztracy.zig b/src/ztracy.zig index aa83f66..79ee7c9 100644 --- a/src/ztracy.zig +++ b/src/ztracy.zig @@ -1,6 +1,5 @@ const std = @import("std"); const builtin = @import("builtin"); -const Src = std.builtin.SourceLocation; // check for a decl named tracy_enabled in root or build_options pub const enabled = blk: { @@ -18,632 +17,67 @@ pub const enabled = blk: { break :blk build_enable orelse false; }; -const debug_verify_stack_order = false; - -pub usingnamespace if (enabled) tracy_full else tracy_stub; - -const tracy_stub = struct { - pub const ZoneCtx = struct { - pub inline fn Text(self: ZoneCtx, text: []const u8) void { - _ = self; - _ = text; - } - pub inline fn Name(self: ZoneCtx, name: []const u8) void { - _ = self; - _ = name; - } - pub inline fn Value(self: ZoneCtx, value: u64) void { - _ = self; - _ = value; - } - pub inline fn End(self: ZoneCtx) void { - _ = self; - } - }; - - pub inline fn SetThreadName(name: [*:0]const u8) void { - _ = name; - } - - pub inline fn Zone(comptime src: Src) ZoneCtx { - _ = src; - return .{}; - } - pub inline fn ZoneN(comptime src: Src, name: [*:0]const u8) ZoneCtx { - _ = src; - _ = name; - return .{}; - } - pub inline fn ZoneC(comptime src: Src, color: u32) ZoneCtx { - _ = src; - _ = color; - return .{}; - } - pub inline fn ZoneNC(comptime src: Src, name: [*:0]const u8, color: u32) ZoneCtx { - _ = src; - _ = name; - _ = color; - return .{}; - } - pub inline fn ZoneS(comptime src: Src, depth: i32) ZoneCtx { - _ = src; - _ = depth; - return .{}; - } - pub inline fn ZoneNS(comptime src: Src, name: [*:0]const u8, depth: i32) ZoneCtx { - _ = src; - _ = name; - _ = depth; - return .{}; - } - pub inline fn ZoneCS(comptime src: Src, color: u32, depth: i32) ZoneCtx { - _ = src; - _ = color; - _ = depth; - return .{}; - } - pub inline fn ZoneNCS(comptime src: Src, name: [*:0]const u8, color: u32, depth: i32) ZoneCtx { - _ = src; - _ = name; - _ = color; - _ = depth; - return .{}; - } - - pub inline fn Alloc(ptr: ?*const anyopaque, size: usize) void { - _ = ptr; - _ = size; - } - pub inline fn Free(ptr: ?*const anyopaque) void { - _ = ptr; - } - pub inline fn SecureAlloc(ptr: 
?*const anyopaque, size: usize) void { - _ = ptr; - _ = size; - } - pub inline fn SecureFree(ptr: ?*const anyopaque) void { - _ = ptr; - } - pub inline fn AllocS(ptr: ?*const anyopaque, size: usize, depth: c_int) void { - _ = ptr; - _ = size; - _ = depth; - } - pub inline fn FreeS(ptr: ?*const anyopaque, depth: c_int) void { - _ = ptr; - _ = depth; - } - pub inline fn SecureAllocS(ptr: ?*const anyopaque, size: usize, depth: c_int) void { - _ = ptr; - _ = size; - _ = depth; - } - pub inline fn SecureFreeS(ptr: ?*const anyopaque, depth: c_int) void { - _ = ptr; - _ = depth; - } - - pub inline fn AllocN(ptr: ?*const anyopaque, size: usize, name: [*:0]const u8) void { - _ = ptr; - _ = size; - _ = name; - } - pub inline fn FreeN(ptr: ?*const anyopaque, name: [*:0]const u8) void { - _ = ptr; - _ = name; - } - pub inline fn SecureAllocN(ptr: ?*const anyopaque, size: usize, name: [*:0]const u8) void { - _ = ptr; - _ = size; - _ = name; - } - pub inline fn SecureFreeN(ptr: ?*const anyopaque, name: [*:0]const u8) void { - _ = ptr; - _ = name; - } - pub inline fn AllocNS(ptr: ?*const anyopaque, size: usize, depth: c_int, name: [*:0]const u8) void { - _ = ptr; - _ = size; - _ = depth; - _ = name; - } - pub inline fn FreeNS(ptr: ?*const anyopaque, depth: c_int, name: [*:0]const u8) void { - _ = ptr; - _ = depth; - _ = name; - } - pub inline fn SecureAllocNS(ptr: ?*const anyopaque, size: usize, depth: c_int, name: [*:0]const u8) void { - _ = ptr; - _ = size; - _ = depth; - _ = name; - } - pub inline fn SecureFreeNS(ptr: ?*const anyopaque, depth: c_int, name: [*:0]const u8) void { - _ = ptr; - _ = depth; - _ = name; - } - - pub inline fn Message(text: []const u8) void { - _ = text; - } - pub inline fn MessageL(text: [*:0]const u8) void { - _ = text; - } - pub inline fn MessageC(text: []const u8, color: u32) void { - _ = text; - _ = color; - } - pub inline fn MessageLC(text: [*:0]const u8, color: u32) void { - _ = text; - _ = color; - } - pub inline fn MessageS(text: []const u8, depth: c_int) void { - _ = text; - _ = depth; - } - pub inline fn MessageLS(text: [*:0]const u8, depth: c_int) void { - _ = text; - _ = depth; - } - pub inline fn MessageCS(text: []const u8, color: u32, depth: c_int) void { - _ = text; - _ = color; - _ = depth; - } - pub inline fn MessageLCS(text: [*:0]const u8, color: u32, depth: c_int) void { - _ = text; - _ = color; - _ = depth; - } - - pub inline fn FrameMark() void {} - pub inline fn FrameMarkNamed(name: [*:0]const u8) void { - _ = name; - } - pub inline fn FrameMarkStart(name: [*:0]const u8) void { - _ = name; - } - pub inline fn FrameMarkEnd(name: [*:0]const u8) void { - _ = name; - } - pub inline fn FrameImage(image: ?*const anyopaque, width: u16, height: u16, offset: u8, flip: c_int) void { - _ = image; - _ = width; - _ = height; - _ = offset; - _ = flip; - } - - pub inline fn FiberEnter(name: [*:0]const u8) void { - _ = name; - } - pub inline fn FiberLeave() void {} - - pub inline fn PlotF(name: [*:0]const u8, val: f64) void { - _ = name; - _ = val; - } - pub inline fn PlotU(name: [*:0]const u8, val: u64) void { - _ = name; - _ = val; - } - pub inline fn PlotI(name: [*:0]const u8, val: i64) void { - _ = name; - _ = val; - } - pub inline fn AppInfo(text: []const u8) void { - _ = text; - } - - pub const TracyAllocator = struct { - child_allocator: std.mem.Allocator, - - pub fn init(child_allocator: std.mem.Allocator) TracyAllocator { - return .{ - .child_allocator = child_allocator, - }; - } - - pub fn allocator(self: *TracyAllocator) std.mem.Allocator { - return 
self.child_allocator; - } - }; -}; - -const tracy_full = struct { - const c = @import("c"); - const has_callstack_support = @hasDecl(c, "TRACY_HAS_CALLSTACK") and @hasDecl(c, "TRACY_CALLSTACK"); - const callstack_depth: c_int = if (has_callstack_support) c.TRACY_CALLSTACK else 0; - - threadlocal var stack_depth: if (debug_verify_stack_order) usize else u0 = 0; - - pub const ZoneCtx = struct { - _zone: c.___tracy_c_zone_context, - _token: if (debug_verify_stack_order) usize else void, - - pub inline fn Text(self: ZoneCtx, text: []const u8) void { - if (debug_verify_stack_order) { - if (stack_depth != self._token) { - std.debug.panic( - "Error: expected Value() at stack depth {} but was {}\n", - .{ self._token, stack_depth }, - ); - } - } - c.___tracy_emit_zone_text(self._zone, text.ptr, text.len); - } - pub inline fn Name(self: ZoneCtx, name: []const u8) void { - if (debug_verify_stack_order) { - if (stack_depth != self._token) { - std.debug.panic( - "Error: expected Value() at stack depth {} but was {}\n", - .{ self._token, stack_depth }, - ); - } - } - c.___tracy_emit_zone_name(self._zone, name.ptr, name.len); - } - pub inline fn Value(self: ZoneCtx, val: u64) void { - if (debug_verify_stack_order) { - if (stack_depth != self._token) { - std.debug.panic( - "Error: expected Value() at stack depth {} but was {}\n", - .{ self._token, stack_depth }, - ); - } - } - c.___tracy_emit_zone_value(self._zone, val); - } - pub inline fn End(self: ZoneCtx) void { - if (debug_verify_stack_order) { - if (stack_depth != self._token) { - std.debug.panic( - "Error: expected End() at stack depth {} but was {}\n", - .{ self._token, stack_depth }, - ); - } - stack_depth -= 1; - } - c.___tracy_emit_zone_end(self._zone); - } - }; - - inline fn initZone(comptime src: Src, name: ?[*:0]const u8, color: u32, depth: c_int) ZoneCtx { - // Tracy uses pointer identity to identify contexts. - // The `src` parameter being comptime ensures that - // each zone gets its own unique global location for this - // struct. - const static = struct { - var loc: c.___tracy_source_location_data = undefined; - - // Ensure that a unique struct type is generated for each unique `src`. 
See - // https://github.com/ziglang/zig/issues/18816 - comptime { - // https://github.com/ziglang/zig/issues/19274 - _ = @sizeOf(@TypeOf(src)); - } - }; - static.loc = .{ - .name = name, - .function = src.fn_name.ptr, - .file = src.file.ptr, - .line = src.line, - .color = color, - }; - - const zone = if (has_callstack_support) - c.___tracy_emit_zone_begin_callstack(&static.loc, depth, 1) - else - c.___tracy_emit_zone_begin(&static.loc, 1); - - if (debug_verify_stack_order) { - stack_depth += 1; - return ZoneCtx{ ._zone = zone, ._token = stack_depth }; - } else { - return ZoneCtx{ ._zone = zone, ._token = {} }; - } - } - - pub inline fn SetThreadName(name: [*:0]const u8) void { - c.___tracy_set_thread_name(name); - } - - pub inline fn Zone(comptime src: Src) ZoneCtx { - return initZone(src, null, 0, callstack_depth); - } - pub inline fn ZoneN(comptime src: Src, name: [*:0]const u8) ZoneCtx { - return initZone(src, name, 0, callstack_depth); - } - pub inline fn ZoneC(comptime src: Src, color: u32) ZoneCtx { - return initZone(src, null, color, callstack_depth); - } - pub inline fn ZoneNC(comptime src: Src, name: [*:0]const u8, color: u32) ZoneCtx { - return initZone(src, name, color, callstack_depth); - } - pub inline fn ZoneS(comptime src: Src, depth: i32) ZoneCtx { - return initZone(src, null, 0, depth); - } - pub inline fn ZoneNS(comptime src: Src, name: [*:0]const u8, depth: i32) ZoneCtx { - return initZone(src, name, 0, depth); - } - pub inline fn ZoneCS(comptime src: Src, color: u32, depth: i32) ZoneCtx { - return initZone(src, null, color, depth); - } - pub inline fn ZoneNCS(comptime src: Src, name: [*:0]const u8, color: u32, depth: i32) ZoneCtx { - return initZone(src, name, color, depth); - } - - pub inline fn Alloc(ptr: ?*const anyopaque, size: usize) void { - if (has_callstack_support) { - c.___tracy_emit_memory_alloc_callstack(ptr, size, callstack_depth, 0); - } else { - c.___tracy_emit_memory_alloc(ptr, size, 0); - } - } - pub inline fn Free(ptr: ?*const anyopaque) void { - if (has_callstack_support) { - c.___tracy_emit_memory_free_callstack(ptr, callstack_depth, 0); - } else { - c.___tracy_emit_memory_free(ptr, 0); - } - } - pub inline fn SecureAlloc(ptr: ?*const anyopaque, size: usize) void { - if (has_callstack_support) { - c.___tracy_emit_memory_alloc_callstack(ptr, size, callstack_depth, 1); - } else { - c.___tracy_emit_memory_alloc(ptr, size, 1); - } - } - pub inline fn SecureFree(ptr: ?*const anyopaque) void { - if (has_callstack_support) { - c.___tracy_emit_memory_free_callstack(ptr, callstack_depth, 1); - } else { - c.___tracy_emit_memory_free(ptr, 1); - } - } - pub inline fn AllocS(ptr: ?*const anyopaque, size: usize, depth: c_int) void { - if (has_callstack_support) { - c.___tracy_emit_memory_alloc_callstack(ptr, size, depth, 0); - } else { - c.___tracy_emit_memory_alloc(ptr, size, 0); - } - } - pub inline fn FreeS(ptr: ?*const anyopaque, depth: c_int) void { - if (has_callstack_support) { - c.___tracy_emit_memory_free_callstack(ptr, depth, 0); - } else { - c.___tracy_emit_memory_free(ptr, 0); - } - } - pub inline fn SecureAllocS(ptr: ?*const anyopaque, size: usize, depth: c_int) void { - if (has_callstack_support) { - c.___tracy_emit_memory_alloc_callstack(ptr, size, depth, 1); - } else { - c.___tracy_emit_memory_alloc(ptr, size, 1); - } - } - pub inline fn SecureFreeS(ptr: ?*const anyopaque, depth: c_int) void { - if (has_callstack_support) { - c.___tracy_emit_memory_free_callstack(ptr, depth, 1); - } else { - c.___tracy_emit_memory_free(ptr, 1); - } - } - - pub 
inline fn AllocN(ptr: ?*const anyopaque, size: usize, name: [*:0]const u8) void { - if (has_callstack_support) { - c.___tracy_emit_memory_alloc_callstack_named(ptr, size, callstack_depth, 0, name); - } else { - c.___tracy_emit_memory_alloc_named(ptr, size, 0, name); - } - } - pub inline fn FreeN(ptr: ?*const anyopaque, name: [*:0]const u8) void { - if (has_callstack_support) { - c.___tracy_emit_memory_free_callstack_named(ptr, callstack_depth, 0, name); - } else { - c.___tracy_emit_memory_free_named(ptr, 0, name); - } - } - pub inline fn SecureAllocN(ptr: ?*const anyopaque, size: usize, name: [*:0]const u8) void { - if (has_callstack_support) { - c.___tracy_emit_memory_alloc_callstack_named(ptr, size, callstack_depth, 1, name); - } else { - c.___tracy_emit_memory_alloc_named(ptr, size, 1, name); - } - } - pub inline fn SecureFreeN(ptr: ?*const anyopaque, name: [*:0]const u8) void { - if (has_callstack_support) { - c.___tracy_emit_memory_free_callstack_named(ptr, callstack_depth, 1, name); - } else { - c.___tracy_emit_memory_free_named(ptr, 1, name); - } - } - pub inline fn AllocNS(ptr: ?*const anyopaque, size: usize, depth: c_int, name: [*:0]const u8) void { - if (has_callstack_support) { - c.___tracy_emit_memory_alloc_callstack_named(ptr, size, depth, 0, name); - } else { - c.___tracy_emit_memory_alloc_named(ptr, size, 0, name); - } - } - pub inline fn FreeNS(ptr: ?*const anyopaque, depth: c_int, name: [*:0]const u8) void { - if (has_callstack_support) { - c.___tracy_emit_memory_free_callstack_named(ptr, depth, 0, name); - } else { - c.___tracy_emit_memory_free_named(ptr, 0, name); - } - } - pub inline fn SecureAllocNS(ptr: ?*const anyopaque, size: usize, depth: c_int, name: [*:0]const u8) void { - if (has_callstack_support) { - c.___tracy_emit_memory_alloc_callstack_named(ptr, size, depth, 1, name); - } else { - c.___tracy_emit_memory_alloc_named(ptr, size, 1, name); - } - } - pub inline fn SecureFreeNS(ptr: ?*const anyopaque, depth: c_int, name: [*:0]const u8) void { - if (has_callstack_support) { - c.___tracy_emit_memory_free_callstack_named(ptr, depth, 1, name); - } else { - c.___tracy_emit_memory_free_named(ptr, 1, name); - } - } - - pub inline fn Message(text: []const u8) void { - c.___tracy_emit_message(text.ptr, text.len, callstack_depth); - } - pub inline fn MessageL(text: [*:0]const u8, color: u32) void { - c.___tracy_emit_messageL(text, color, callstack_depth); - } - pub inline fn MessageC(text: []const u8, color: u32) void { - c.___tracy_emit_messageC(text.ptr, text.len, color, callstack_depth); - } - pub inline fn MessageLC(text: [*:0]const u8, color: u32) void { - c.___tracy_emit_messageLC(text, color, callstack_depth); - } - pub inline fn MessageS(text: []const u8, depth: c_int) void { - const inner_depth: c_int = if (has_callstack_support) depth else 0; - c.___tracy_emit_message(text.ptr, text.len, inner_depth); - } - pub inline fn MessageLS(text: [*:0]const u8, depth: c_int) void { - const inner_depth: c_int = if (has_callstack_support) depth else 0; - c.___tracy_emit_messageL(text, inner_depth); - } - pub inline fn MessageCS(text: []const u8, color: u32, depth: c_int) void { - const inner_depth: c_int = if (has_callstack_support) depth else 0; - c.___tracy_emit_messageC(text.ptr, text.len, color, inner_depth); - } - pub inline fn MessageLCS(text: [*:0]const u8, color: u32, depth: c_int) void { - const inner_depth: c_int = if (has_callstack_support) depth else 0; - c.___tracy_emit_messageLC(text, color, inner_depth); - } - - pub inline fn FrameMark() void { - 
c.___tracy_emit_frame_mark(null); - } - pub inline fn FrameMarkNamed(name: [*:0]const u8) void { - c.___tracy_emit_frame_mark(name); - } - pub inline fn FrameMarkStart(name: [*:0]const u8) void { - c.___tracy_emit_frame_mark_start(name); - } - pub inline fn FrameMarkEnd(name: [*:0]const u8) void { - c.___tracy_emit_frame_mark_end(name); - } - pub inline fn FrameImage(image: ?*const anyopaque, width: u16, height: u16, offset: u8, flip: c_int) void { - c.___tracy_emit_frame_image(image, width, height, offset, flip); - } - - pub inline fn FiberEnter(name: [*:0]const u8) void { - c.___tracy_fiber_enter(name); - } - pub inline fn FiberLeave() void { - c.___tracy_fiber_leave(); - } - - pub inline fn PlotF(name: [*:0]const u8, val: f64) void { - c.___tracy_emit_plot(name, val); - } - pub inline fn PlotU(name: [*:0]const u8, val: u64) void { - c.___tracy_emit_plot(name, @as(f64, @floatFromInt(val))); - } - pub inline fn PlotI(name: [*:0]const u8, val: i64) void { - c.___tracy_emit_plot(name, @as(f64, @floatFromInt(val))); - } - pub inline fn AppInfo(text: []const u8) void { - c.___tracy_emit_message_appinfo(text.ptr, text.len); - } - - pub const TracyAllocator = struct { - child_allocator: std.mem.Allocator, - - pub fn init(child_allocator: std.mem.Allocator) TracyAllocator { - return .{ - .child_allocator = child_allocator, - }; - } - - pub fn allocator(self: *TracyAllocator) std.mem.Allocator { - return .{ - .ptr = self, - .vtable = &.{ - .alloc = alloc, - .resize = resize, - .remap = remap, - .free = free, - }, - }; - } - - fn alloc( - ctx: *anyopaque, - len: usize, - alignment: std.mem.Alignment, - ra: usize, - ) ?[*]u8 { - const self: *TracyAllocator = @ptrCast(@alignCast(ctx)); - const result = self.child_allocator.rawAlloc(len, alignment, ra); - if (result) |addr| { - Alloc(addr, len); - } else { - var buffer: [128]u8 = undefined; - const msg = std.fmt.bufPrint(&buffer, "alloc failed requesting {d}", .{len}) catch return result; - Message(msg); - } - return result; - } - - fn resize( - ctx: *anyopaque, - buf: []u8, - alignment: std.mem.Alignment, - new_len: usize, - ra: usize, - ) bool { - const self: *TracyAllocator = @ptrCast(@alignCast(ctx)); - const result = self.child_allocator.rawResize(buf, alignment, new_len, ra); - if (result) { - Free(buf.ptr); - Alloc(buf.ptr, new_len); - } else { - var buffer: [128]u8 = undefined; - const msg = std.fmt.bufPrint(&buffer, "resize failed requesting {d} -> {d}", .{ buf.len, new_len }) catch return result; - Message(msg); - } - return result; - } - - fn remap( - ctx: *anyopaque, - buf: []u8, - alignment: std.mem.Alignment, - new_len: usize, - ra: usize, - ) ?[*]u8 { - const self: *TracyAllocator = @ptrCast(@alignCast(ctx)); - const result = self.child_allocator.rawRemap(buf, alignment, new_len, ra); - if (result) |data| { - Free(buf.ptr); - Alloc(data, new_len); - } else { - var buffer: [128]u8 = undefined; - const msg = std.fmt.bufPrint(&buffer, "remap failed requesting {d} -> {d}", .{ buf.len, new_len }) catch return result; - Message(msg); - } - return result; - } - - fn free( - ctx: *anyopaque, - buf: []u8, - alignment: std.mem.Alignment, - ra: usize, - ) void { - const self: *TracyAllocator = @ptrCast(@alignCast(ctx)); - self.child_allocator.rawFree(buf, alignment, ra); - Free(buf.ptr); - } - }; -}; +const stub = @import("stub.zig"); +const impl = @import("impl.zig"); + +pub const ZoneCtx = if (enabled) impl.ZoneCtx else stub.ZoneCtx; + +pub const SetThreadName = if (enabled) impl.SetThreadName else stub.SetThreadName; + +pub const Zone = if 
(enabled) impl.Zone else stub.Zone; +pub const ZoneN = if (enabled) impl.ZoneN else stub.ZoneN; +pub const ZoneC = if (enabled) impl.ZoneC else stub.ZoneC; +pub const ZoneNC = if (enabled) impl.ZoneNC else stub.ZoneNC; +pub const ZoneS = if (enabled) impl.ZoneS else stub.ZoneS; +pub const ZoneNS = if (enabled) impl.ZoneNS else stub.ZoneNS; +pub const ZoneCS = if (enabled) impl.ZoneCS else stub.ZoneCS; +pub const ZoneNCS = if (enabled) impl.ZoneNCS else stub.ZoneNCS; + +pub const Alloc = if (enabled) impl.Alloc else stub.Alloc; +pub const Free = if (enabled) impl.Free else stub.Free; +pub const SecureAlloc = if (enabled) impl.SecureAlloc else stub.SecureAlloc; +pub const SecureFree = if (enabled) impl.SecureFree else stub.SecureFree; + +pub const AllocS = if (enabled) impl.AllocS else stub.AllocS; +pub const FreeS = if (enabled) impl.FreeS else stub.FreeS; +pub const SecureAllocS = if (enabled) impl.SecureAllocS else stub.SecureAllocS; +pub const SecureFreeS = if (enabled) impl.SecureFreeS else stub.SecureFreeS; + +pub const AllocN = if (enabled) impl.AllocN else stub.AllocN; +pub const FreeN = if (enabled) impl.FreeN else stub.FreeN; +pub const SecureAllocN = if (enabled) impl.SecureAllocN else stub.SecureAllocN; +pub const SecureFreeN = if (enabled) impl.SecureFreeN else stub.SecureFreeN; + +pub const AllocNS = if (enabled) impl.AllocNS else stub.AllocNS; +pub const FreeNS = if (enabled) impl.FreeNS else stub.FreeNS; +pub const SecureAllocNS = if (enabled) impl.SecureAllocNS else stub.SecureAllocNS; +pub const SecureFreeNS = if (enabled) impl.SecureFreeNS else stub.SecureFreeNS; + +pub const Message = if (enabled) impl.Message else stub.Message; +pub const MessageL = if (enabled) impl.MessageL else stub.MessageL; +pub const MessageC = if (enabled) impl.MessageC else stub.MessageC; +pub const MessageLC = if (enabled) impl.MessageLC else stub.MessageLC; +pub const MessageS = if (enabled) impl.MessageS else stub.MessageS; +pub const MessageLS = if (enabled) impl.MessageLS else stub.MessageLS; +pub const MessageCS = if (enabled) impl.MessageCS else stub.MessageCS; +pub const MessageLCS = if (enabled) impl.MessageLCS else stub.MessageLCS; + +pub const FrameMark = if (enabled) impl.FrameMark else stub.FrameMark; +pub const FrameMarkNamed = if (enabled) impl.FrameMarkNamed else stub.FrameMarkNamed; +pub const FrameMarkStart = if (enabled) impl.FrameMarkStart else stub.FrameMarkStart; +pub const FrameMarkEnd = if (enabled) impl.FrameMarkEnd else stub.FrameMarkEnd; +pub const FrameImage = if (enabled) impl.FrameImage else stub.FrameImage; + +pub const FiberEnter = if (enabled) impl.FiberEnter else stub.FiberEnter; +pub const FiberLeave = if (enabled) impl.FiberLeave else stub.FiberLeave; + +pub const PlotF = if (enabled) impl.PlotF else stub.PlotF; +pub const PlotU = if (enabled) impl.PlotU else stub.PlotU; +pub const PlotI = if (enabled) impl.PlotI else stub.PlotI; + +pub const AppInfo = if (enabled) impl.AppInfo else stub.AppInfo; + +pub const TracyAllocator = if (enabled) impl.TracyAllocator else stub.TracyAllocator; test { std.testing.refAllDeclsRecursive(@This());